In [25]:
import pandas as pd
from textblob import TextBlob
import glob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

### Load datasets

In [26]:
all_files = glob.glob("./dataset_cleaning/*.pkl")
all_files.sort()
test_filename= "./dataset_cleaning/dataset1Cleaned.pkl"
li = []

for filename in all_files:
    
    if(filename != test_filename):

        df = pd.read_pickle(filename)
        li.append(df)

train_df = pd.concat(li, axis=0, ignore_index=True)
test_df = pd.read_pickle(test_filename)

print("Number of train features: {} \nNumber of test features: {}".format(len(train_df), len(test_df)))

Number of train features: 47317 
Number of test features: 18154


In [30]:
# from sklearn.model_selection import train_test_split
# df_merged = pd.concat([train_df, test_df])
# df_merged = df_merged.sample(frac=1).reset_index(drop=True)
# train_df, test_df= train_test_split(df_merged, test_size=0.3, random_state=42)
# print("Number of train features: {} \nNumber of test features: {}".format(len(train_df), len(test_df)))

Number of train features: 45829 
Number of test features: 19642


In [31]:
train_df.head()

Unnamed: 0,id,Informativeness,text,positive_score,negative_score,emotional_devergence_score
43067,243372363088003072,1,BREAKING NEWS...\n\nA powerful 7.6-magnitude e...,1,-1,0.2
11196,409458777738731520,1,"Funeral held for Kisook Ahn, one of four passe...",1,-4,0.5
2457,390774640202629120,0,Thank you to the thousands of firefighters who...,3,-3,0.6
55849,348128787155795971,0,PHOTO: The red circle is the train tunnel into...,1,-1,0.2
34437,911633318151544833,1,RT @mashable: Beer company fills its cans with...,1,-3,0.4


In [32]:
test_df.head()

Unnamed: 0,id,Informativeness,text,positive_score,negative_score,emotional_devergence_score
23069,218391134605545472,1,Pic: #CoFire: Helicopter drops water on #Waldo...,1,-1,0.2
57851,1245917794815754240,1,#coronavirus cases surging to over 1 million w...,1,-2,0.3
62987,541199590309912576,1,RT @ABSCBNChannel2: #Hagupit approaches the Ph...,2,-1,0.3
47132,466259645603274752,1,2 in Florida Show Symptoms of Deadly Middle Ea...,1,-1,0.2
60378,347833028371968000,1,The town of #Sundre has declared a state of lo...,1,-2,0.3


In [33]:
train_df["Informativeness"].value_counts()

1    32332
0    13497
Name: Informativeness, dtype: int64

In [34]:
test_df["Informativeness"].value_counts()

1    13756
0     5886
Name: Informativeness, dtype: int64

In [35]:
train_df.drop_duplicates(subset=["text"], keep="first", inplace=True)
test_df.drop_duplicates(subset=["text"], keep="first", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


### Process the text

In [36]:

def text_processing(tweet):
    
    tweet= tweet.lower()
    
    #Removing hyperlinks from the tweet
    tweet_no_links=re.sub(r'http\S+', '', tweet)
    
    #Generating the list of words in the tweet (hashtags and other punctuations removed)
    def form_sentence(tweet):
        tweet_blob = TextBlob(tweet)
        return ' '.join(tweet_blob.words)
    new_tweet = form_sentence(tweet_no_links)
    
    #Removing stopwords and words with unusual symbols
    def no_user_alpha(tweet):
        tweet_list = [ele for ele in tweet.split() if ele != 'user']
        clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
        clean_s = ' '.join(clean_tokens)
        clean_mess = [word for word in clean_s.split() if word not in stopwords.words('english')]
        return clean_mess
    no_punc_tweet = no_user_alpha(new_tweet)
    
    #Normalizing the words in tweets 
    def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return " ".join(normalized_tweet)
    
    
    return normalization(no_punc_tweet)

In [37]:
train_df['text_processed']=train_df['text'].apply(text_processing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [38]:
test_df['text_processed']=test_df['text'].apply(text_processing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [39]:
pd.set_option('display.max_colwidth', -1)
train_df.iloc[3]

  """Entry point for launching an IPython kernel.


id                            348128787155795971                                                                                                 
Informativeness               0                                                                                                                  
text                          PHOTO: The red circle is the train tunnel into downtown #Calgary. #yycflood http://t.co/5zGG6LOBDD - @christiefaber
positive_score                1                                                                                                                  
negative_score                -1                                                                                                                 
emotional_devergence_score    0.2                                                                                                                
text_processed                photo red circle train tunnel downtown calgary yycflood christiefaber                         

In [40]:
#Randomise order of train dataset
train_df = sklearn.utils.shuffle(train_df)

Merge with new data



In [None]:
authors=pd.read_csv('./dataset_cleaning/tj/parsed/tweet_metadata.csv')
users=pd.read_csv('./dataset_cleaning/tj/parsed/twitter_user.csv')

In [85]:
#Temporary


datasets = [train_df, test_df]
authors=pd.read_csv('./dataset_cleaning/tj/tweet_metadata_full.csv')
users=pd.read_csv('./dataset_cleaning/tj/twitter_user_full.csv')
authors.drop(columns=['Unnamed: 0'], inplace=True)
users.drop(columns=['Unnamed: 0','created_at', 'lang', 'name', 'screen_name', 'location','access'], inplace=True)
users.columns=['author_id', 'has_description', 'bio_has_url', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'statuses_count', 'protected',
       'verified', 'default_profile', 'default_profile_image']


mean=['retweet_count','favorite_count','followers_count', 'friends_count', 'listed_count', 'favourites_count', 'statuses_count']
median = ['tweet_type', 'has_description', 'bio_has_url', 'protected', 'verified','default_profile', 'default_profile_image']


for i,ds in enumerate(datasets):
    print("Processing dataset {}".format(i+1))
    ds = pd.merge(ds, authors, on='id', how='left')

    ds.drop(ds.columns.difference(['id', 'Informativeness', 'text_processed', 'author_id', 'tweet_type', 'retweet_count', 'favorite_count']), 1, inplace=True)



    ds = pd.merge(ds, users, on='author_id', how='left')


    for m in mean:
        ds[m][ds['Informativeness'] == 1] = ds[m][ds['Informativeness'] == 1].fillna((ds[m][ds['Informativeness'] == 1].mean()))
        ds[m][ds['Informativeness'] == 0] = ds[m][ds['Informativeness'] == 0].fillna((ds[m][ds['Informativeness'] == 0].mean()))

    for m in median:
        ds[m][ds['Informativeness'] == 1] = ds[m][ds['Informativeness'] == 1].fillna((ds[m][ds['Informativeness'] == 1].value_counts().idxmax()))
        ds[m][ds['Informativeness'] == 0] = ds[m][ds['Informativeness'] == 0].fillna((ds[m][ds['Informativeness'] == 0].value_counts().idxmax()))

    ds["has_description"] = ds["has_description"].apply(lambda x: 0 if x is np.nan else 1)
    ds["tweet_type"] = ds["tweet_type"].apply(lambda x: 0 if x =='tweet' else 1)
    ds["bio_has_url"] = ds["bio_has_url"].apply(lambda x: 0 if x is np.nan else 1)

    ds = ds.drop(columns=['author_id'])

    datasets[i] = ds
train_df_extended =datasets[0]
test_df_extended =datasets[1]

Processing dataset 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Processing dataset 2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [65]:
train_df_extended=pd.merge(train_df, authors, on='id')
test_df_extended=pd.merge(test_df, authors, on='id')

In [67]:
train_df_extended.drop(train_df_extended.columns.difference(['id', 'Informativeness', 'text_processed', 'author_id', 'tweet_type', 'retweet_count', 'favorite_count']), 1, inplace=True)
test_df_extended.drop(test_df_extended.columns.difference(['id', 'Informativeness', 'text_processed', 'author_id', 'tweet_type', 'retweet_count', 'favorite_count']), 1, inplace=True)

In [16]:
users.drop(columns=['created_at', 'lang', 'name', 'screen_name', 'location','access'], inplace=True)
users.columns=['author_id', 'has_description', 'bio_has_url', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'statuses_count', 'protected',
       'verified', 'default_profile', 'default_profile_image']

In [69]:
train_df_extended=pd.merge(train_df_extended, users, on='author_id')
test_df_extended=pd.merge(test_df_extended, users, on='author_id')

In [60]:
train_df_extended["has_description"] = train_df_extended["has_description"].apply(lambda x: 0 if x is np.nan else 1)
test_df_extended["has_description"] = test_df_extended["has_description"].apply(lambda x: 0 if x is np.nan else 1)

In [61]:
train_df_extended["tweet_type"] = train_df_extended["tweet_type"].apply(lambda x: 0 if x =='tweet' else 1)
test_df_extended["tweet_type"] = test_df_extended["tweet_type"].apply(lambda x: 0 if x == 'tweet' else 1)

In [62]:
train_df_extended["bio_has_url"] = train_df_extended["bio_has_url"].apply(lambda x: 0 if x is np.nan else 1)
test_df_extended["bio_has_url"] = test_df_extended["bio_has_url"].apply(lambda x: 0 if x is np.nan else 1)

In [63]:
train_df_extended = train_df_extended.drop(columns=['id', 'author_id'])
test_df_extended = test_df_extended.drop(columns=['id', 'author_id'])

## Baseline model

Split data into X and Y, as well as train/test sets

In [22]:
# from sklearn.model_selection import train_test_split



# sentences = main_df['text'].values
# y = main_df['Informativeness'].values

# sentences_train, sentences_test, y_train, y_test = train_test_split(
#    sentences, y, test_size=0.25, random_state=1000)


In [15]:
sentences_train = train_df['text_processed'].values
sentences_test = test_df['text_processed'].values

y_train = train_df['Informativeness'].values
y_test = test_df['Informativeness'].values




In [24]:
sentences_train

array(['pathological narcissist down st never take deaths chin admit direct responsibility gross negligence cause deaths absolutely wrong uk coronavirus test strategy unravel',
       'nearly customers without power state hurricane sandy impact nj',
       'fema epa gird irma house preps vote harvey aid', ...,
       'usslakeerie service members assist srilanka devastate flood',
       'pamnparsons sure question pam big yes',
       'interest fact kuwait light coronavirus outbreak hundreds years ago male kuwaiti sailors return kuwait amp find go women amp children die due plague start bring new wive iran iraq etc'],
      dtype=object)

 Old way: Vectorize sentances using CountVectorizer  and do IDF "manualy"

In [25]:
# from sklearn.feature_extraction.text import CountVectorizer

# # Vevotize each sentance
# vectorizer = CountVectorizer()
# vectorizer.fit(sentences_train)

# # Too look at the vocabulary encoding
# # vectorizer.vocabulary_

# X_train = vectorizer.transform(sentences_train)
# X_test  = vectorizer.transform(sentences_test)

In [26]:
# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# tfidf_transformer.fit(X_train)
# X_train_tfidf = tfidf_transformer.transform(X_train)
# X_test_tfidf = tfidf_transformer.transform(X_test)

### Logistic Regression

In [86]:
X_train = train_df_extended.drop(columns=['Informativeness'])
X_test = test_df_extended.drop(columns=['Informativeness'])


y_train = train_df_extended['Informativeness']
y_test = test_df_extended['Informativeness']

In [87]:
X_train_text=X_train['text_processed']
X_train_numerical= X_train.drop(columns=["text_processed"])

X_test_text=X_test['text_processed']
X_test_numerical= X_test.drop(columns=["text_processed"])

In [88]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from scipy.sparse import hstack
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale

def grid_search_text_and_numerical(model, parameters, X_train_text, X_train_numerical):
    best_score=0
    best_params={}
    best_clf= None
    best_vectorizer = None
    best_tfIdfTransformer=None
    for ngram_range in [(1, 1), (1, 2)]:
        for use_tf_idf in [True, False]:
            for normalization_norm in ["max", "l2", "none"]:

                print("Iteration with {}, {}, {}".format(ngram_range, use_tf_idf, normalization_norm))
                vectorizer = CountVectorizer(ngram_range=ngram_range)
                vectorizer.fit(X_train_text)
                X_train_text_vectorized = vectorizer.transform(X_train_text)

                tfidf_transformer = TfidfTransformer(use_idf = use_tf_idf)
                tfidf_transformer.fit(X_train_text_vectorized)
                X_train_text_tfidf = tfidf_transformer.transform(X_train_text_vectorized)


                X_train_merged = hstack((X_train_text_tfidf,np.array(np.array(X_train_numerical).astype(np.float))))

                if(normalization_norm!="none"):
                    X_train_merged = normalize(X_train_merged, norm=normalization_norm, axis=0)
    #             X_train_merged = scale(X_train_merged, with_mean= False, axis = 0)

                clf = GridSearchCV(model, parameters, scoring='roc_auc', n_jobs=-1, verbose=2)
                clf.fit(X_train_merged, y_train)

                if(best_score < clf.best_score_):
                    best_score = clf.best_score_
                    best_params = clf.best_params_
                    best_params["ngram_range"] = ngram_range
                    best_params["use_tf_idf"]= use_tf_idf
                    best_params["normalization_norm"] = normalization_norm
                    best_clf=clf

                    best_vectorizer = vectorizer
                    best_tfIdfTransformer=tfidf_transformer
                
    return best_score, best_params, best_clf, best_vectorizer, best_tfIdfTransformer
            

In [89]:
from sklearn.linear_model import LogisticRegression

lr_parameters = {'penalty':('l2',)}
lr_model = LogisticRegression( max_iter=1000)

lr={}
lr["model"]=lr_model
lr["parameters"] =lr_parameters

lr_best_score, lr_best_params, lr_best_clf, lr_best_vectorizer, lr_best_tfIdfTransformer = grid_search_text_and_numerical(lr["model"], lr["parameters"], X_train_text, X_train_numerical)
    
print("Best score found while training: {}".format(lr_best_score))
print(lr_best_params)

Iteration with (1, 1), True, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   11.0s finished


Iteration with (1, 1), True, l2
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished


Iteration with (1, 1), True, none
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.8s finished


Iteration with (1, 1), False, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.9s finished


Iteration with (1, 1), False, l2
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.3s finished


Iteration with (1, 1), False, none
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    0.9s finished


Iteration with (1, 2), True, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   39.5s finished


Iteration with (1, 2), True, l2
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.7s finished


Iteration with (1, 2), True, none
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    4.2s finished


Iteration with (1, 2), False, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   38.8s finished


Iteration with (1, 2), False, l2
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.1s finished


Iteration with (1, 2), False, none
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    3.6s finished


Best score found while training: 0.8980010811308168
{'penalty': 'l2', 'ngram_range': (1, 2), 'use_tf_idf': False, 'normalization_norm': 'max'}


In [91]:
def merge_test_and_score(X_test_text, X_test_numerical, best_vecotorizer, best_tfIdfTransformer, best_params, best_clf):

    X_test_text_vectorized = best_vecotorizer.transform(X_test_text)
    X_test_text_tfidf = best_tfIdfTransformer.transform(X_test_text_vectorized)


    X_test_merged = hstack((X_test_text_tfidf,np.array(np.array(X_test_numerical).astype(np.float))))
    
    if(best_params["normalization_norm"]!="none"):
        X_test_merged = normalize(X_test_merged, norm=best_params["normalization_norm"], axis=0)
    # X_test_merged = scale(X_test_merged, with_mean= False, axis = 0)

    return best_clf.score(X_test_merged, y_test)

In [92]:
merge_test_and_score(X_test_text, X_test_numerical, lr_best_vectorizer, lr_best_tfIdfTransformer, lr_best_params, lr_best_clf)

0.8988659144450964

In [33]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

text_clf_lr = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-lr', LogisticRegression( max_iter=1000))])

parameters_lr = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-lr__penalty': ('l2', 'l1')}

gs_clf_lr = GridSearchCV(text_clf_lr, parameters_lr ,scoring='roc_auc', n_jobs=-1, verbose=2)
gs_clf_lr = gs_clf_lr.fit(X_train_text, y_train)


print("Best score: ", gs_clf_lr.best_score_)
print("Best parameters: ", gs_clf_lr.best_params_)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.1min finished


Best score:  0.8874170859369963
Best parameters:  {'clf-lr__penalty': 'l2', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 1)}


In [34]:
gs_clf_lr.score(X_test_text, y_test)

0.7076308778130103

In [35]:
y_pred = gs_clf_lr.predict(sentences_test)
y_pred_prob=gs_clf_lr.predict_proba(sentences_test)[:,1]

precision_LR = precision_score(y_test, y_pred)
recall_LR = recall_score(y_test, y_pred)
f1_LR= f1_score(y_test, y_pred)
roc_LR = roc_auc_score(y_test, y_pred_prob)

fpr_LR, tpr_LR, thresholds_LR = roc_curve(y_test, y_pred_prob)

print("Precision: {} \nRecall:{} \nF1 Score: {} \nAUC: {}".format(precision_LR, recall_LR, f1_LR, roc_LR))

Precision: 0.645826025955805 
Recall:0.6338209982788297 
F1 Score: 0.6397671994440585 
AUC: 0.5016850207165987


### SVM 

In [36]:
from sklearn.linear_model import SGDClassifier


text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier( random_state=42, early_stopping=True))])

parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),
                  'clf-svm__loss':('hinge','log',),'clf-svm__max_iter': (100,1000),
                 'clf-svm__alpha':(1e-3,1e-6, 1e-10),'clf-svm__penalty':('l2',),
                 }






gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm,scoring='roc_auc', n_jobs=-1, verbose=10)
gs_clf_svm = gs_clf_svm.fit(sentences_train, y_train)


print("Best score: ", gs_clf_svm.best_score_)
print("Best parameters: ", gs_clf_svm.best_params_)

ValueError: Found input variables with inconsistent numbers of samples: [47256, 34825]

In [None]:
feature_names = gs_clf_svm.best_estimator_[0].get_feature_names() 
coefs = gs_clf_svm.best_estimator_[-1].coef_[0]

coefs_with_fns = sorted(zip(coefs, feature_names)) 
df=pd.DataFrame(coefs_with_fns)
df.columns='coefficient','word'
df.sort_values(by='coefficient')

In [None]:
gs_clf_svm.score(sentences_test, y_test)

In [None]:
y_pred = gs_clf_svm.predict(sentences_test)
y_pred_prob=gs_clf_svm.predict_proba(sentences_test)[:,1]

precision_SVM = precision_score(y_test, y_pred)
recall_SVM = recall_score(y_test, y_pred)
f1_SVM= f1_score(y_test, y_pred)
roc_SVM = roc_auc_score(y_test, y_pred_prob)

fpr_SVM, tpr_SVM, thresholds_SVM = roc_curve(y_test, y_pred_prob)

print("Precision: {} \nRecall:{} \nF1 Score: {} \nAUC: {}".format(precision_SVM, recall_SVM, f1_SVM, roc_SVM))

### Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB


text_clf_nb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-nb', MultinomialNB())])

parameters_nb = {'vect__ngram_range': [(1, 1), (1, 2),(2,2)], 'tfidf__use_idf': (True, False),
                  'clf-nb__alpha': (1,1e-1, 1e-3)}

gs_clf_nb = GridSearchCV(text_clf_nb, parameters_nb,scoring='roc_auc', n_jobs=-1, verbose=5)
gs_clf_nb = gs_clf_nb.fit(sentences_train, y_train)


print("Best score: ", gs_clf_nb.best_score_)
print("Best parameters: ", gs_clf_nb.best_params_)

In [None]:
gs_clf_nb.score(sentences_test, y_test)

In [None]:
y_pred = gs_clf_nb.predict(sentences_test)
y_pred_prob=gs_clf_nb.predict_proba(sentences_test)[:,1]

accuracy_NB = accuracy_score(y_test, y_pred)
precision_NB = precision_score(y_test, y_pred)
recall_NB = recall_score(y_test, y_pred)
f1_NB= f1_score(y_test, y_pred)
roc_NB = roc_auc_score(y_test, y_pred_prob)

fpr_NB, tpr_NB, thresholds_NB = roc_curve(y_test, y_pred_prob)

print("Accuracy: {} \nPrecision: {} \nRecall:{} \nF1 Score: {} \nAUC: {}".format(accuracy_NB,precision_NB, recall_NB, f1_NB, roc_NB))

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

text_clf_dt = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-dt', DecisionTreeClassifier(  ))])

parameters_dt = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, ),
                'clf-dt__max_depth':[200, 400, 1000]}

gs_clf_dt = GridSearchCV(text_clf_dt, parameters_dt,scoring='roc_auc', n_jobs=-1, verbose=5)
gs_clf_dt = gs_clf_dt.fit(sentences_train, y_train)


print("Best score: ", gs_clf_dt.best_score_)
print("Best parameters: ", gs_clf_dt.best_params_)

In [None]:
gs_clf_dt.score(sentences_test, y_test)

In [None]:
y_pred = gs_clf_dt.predict(sentences_test)
y_pred_prob=gs_clf_dt.predict_proba(sentences_test)[:,1]

accuracy_DT = accuracy_score(y_test, y_pred)
precision_DT = precision_score(y_test, y_pred)
recall_DT = recall_score(y_test, y_pred)
f1_DT= f1_score(y_test, y_pred)
roc_DT = roc_auc_score(y_test, y_pred_prob)

fpr_DT, tpr_DT, thresholds_DT = roc_curve(y_test, y_pred_prob)

print("Accuracy: {} \nPrecision: {} \nRecall:{} \nF1 Score: {} \nAUC: {}".format(accuracy_DT,precision_DT, recall_DT, f1_DT, roc_DT))

### Random Forests

In [None]:
from sklearn.ensemble import RandomForestClassifier


text_clf_rf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-rf', RandomForestClassifier( ))])

parameters_rf = {'vect__ngram_range': [(1, 1),(2,2)], 'tfidf__use_idf': (True, False),
                'clf-rf__n_estimators':[800,], 'clf-rf__min_samples_split':[4,10]}

gs_clf_rf = GridSearchCV(text_clf_rf, parameters_rf, scoring='roc_auc', n_jobs=-1, verbose=10)
gs_clf_rf = gs_clf_rf.fit(sentences_train, y_train)


print("Best score: ", gs_clf_rf.best_score_)
print("Best parameters: ", gs_clf_rf.best_params_)

In [None]:
gs_clf_rf.score(sentences_test, y_test)

# Using embeddings (Glove)

In [16]:
# This time don't vectorize, but tokenize. Same idea but runs better with Keras and tokenizer assings
# index 1 to most frequent word and so on. While vecotrizer makes each sentcance to a vector of the same
# size (the vocabulary size) and assigns count of how many time a word appears

from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(sentences_train)

X_train = tokenizer.texts_to_sequences(sentences_train)
X_test = tokenizer.texts_to_sequences(sentences_test)

vocab_size = len(tokenizer.word_index) + 1  # Adding 1 because of reserved 0 index

print(sentences_train[1])
print(X_train[1])



photos deadly wildfires rage california
[290, 103, 90, 4049, 13]


In [17]:
# Pad with trailing 0 every tokenized sentance so that they have the same length (the length of the longest sentance)

from keras.preprocessing.sequence import pad_sequences

maxlen = len(max(X_train, key=len)) #length of the longest sentance

X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

print(X_train[0, :])

[   1   90 1764  108   13 1349    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0 

In [18]:
import numpy as np

def create_embedding_matrix(filepath, word_index, embedding_dim):
    vocab_size = len(word_index) + 1  # Adding again 1 because of reserved 0 index
    embedding_matrix = np.zeros((vocab_size, embedding_dim))

    with open(filepath) as f:
        for line in f:
            word, *vector = line.split()
            if word in word_index:
                idx = word_index[word] 
                embedding_matrix[idx] = np.array(
                    vector, dtype=np.float32)[:embedding_dim]

    return embedding_matrix

Load pretrained data

In [19]:
# Download here: https://nlp.stanford.edu/projects/glove/

embedding_dim = 300
embedding_matrix = create_embedding_matrix(
    'glove_data/glove.6B.300d.txt',
    tokenizer.word_index, embedding_dim)



Percantage of vocabulary covered in the pretrained model

In [20]:
nonzero_elements = np.count_nonzero(np.count_nonzero(embedding_matrix, axis=1))
nonzero_elements / vocab_size

0.5496727877607301

In [21]:
from keras.models import Sequential
from keras import layers
import keras

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=False))

model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=[keras.metrics.AUC()])
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 415, 300)          13706700  
_________________________________________________________________
global_max_pooling1d (Global (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 10)                3010      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 11        
Total params: 13,709,721
Trainable params: 3,021
Non-trainable params: 13,706,700
_________________________________________________________________


In [22]:
history = model.fit(X_train, y_train,
                    epochs=30,
                    verbose=1,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {}".format(accuracy))
# plot_history(history)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Training Accuracy: 0.895510196685791
Testing Accuracy:  0.7146984934806824


In [23]:
y_pred =(model.predict(X_test) > 0.5).astype("int32")
y_pred_prob = model.predict(X_test)

precision_Glove_Untrainable = precision_score(y_test, y_pred)
recall_Glove_Untrainable = recall_score(y_test, y_pred)
f1_Glove_Untrainable= f1_score(y_test, y_pred)
roc_Glove_Untrainable = roc_auc_score(y_test, y_pred_prob)

fpr_Glove_Untrainable, tpr_Glove_Untrainable, thresholds_Glove_Untrainable = roc_curve(y_test, y_pred_prob)

print("Precision: {} \nRecall:{} \nF1 Score: {} \nAUC: {}".format(precision_Glove_Untrainable, recall_Glove_Untrainable, f1_Glove_Untrainable, roc_Glove_Untrainable))


Precision: 0.720785786508739 
Recall:0.8588640275387264 
F1 Score: 0.7837901515746487 
AUC: 0.7147341958246786


In [None]:
from plot_keras_history import plot_history
import matplotlib.pyplot as plt


plot_history(history.history)
plt.show()
plt.close()

### Do the same thing with glove, but this time all parameters are trainable (takes much longer to run)

In [None]:
from keras.models import Sequential
from keras import layers

model = Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, 
                           weights=[embedding_matrix], 
                           input_length=maxlen, 
                           trainable=True))

model.add(layers.GlobalMaxPool1D())
model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(X_train, y_train,
                    epochs=20,
                    verbose=1,
                    validation_data=(X_test, y_test),
                    batch_size=10)
loss, accuracy = model.evaluate(X_train, y_train, verbose=False)
print("Training Accuracy: {}".format(accuracy))
loss, accuracy = model.evaluate(X_test, y_test, verbose=False)
print("Testing Accuracy:  {}".format(accuracy))
plot_history(history.history)

In [None]:
y_pred =(model.predict(X_test) > 0.5).astype("int32")
y_pred_prob = model.predict(X_test)

precision_Glove = precision_score(y_test, y_pred)
recall_Glove = recall_score(y_test, y_pred)
f1_Glove = f1_score(y_test, y_pred)
roc_Glove = roc_auc_score(y_test, y_pred_prob)

fpr_Glove, tpr_Glove, thresholds_Glove= roc_curve(y_test, y_pred_prob)

print("Precision: {} \nRecall:{} \nF1 Score: {} \nAUC: {}".format(precision_Glove, recall_Glove, f1_Glove, roc_Glove))


In [None]:
plt.figure(1)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_Glove, tpr_Glove, label='Glove Trainable (area = {:.3f})'.format(roc_Glove))
plt.plot(fpr_Glove_Untrainable, tpr_Glove_Untrainable, label='Glove Untrainable (area = {:.3f})'.format(roc_Glove_Untrainable))
plt.plot(fpr_LR, tpr_LR, label='Logistic Regression (area = {:.3f})'.format(roc_LR))
# plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
plt.show()
# Zoom in view of the upper left corner.
# plt.figure(2)
# plt.xlim(0, 0.2)
# plt.ylim(0.8, 1)
# plt.plot([0, 1], [0, 1], 'k--')
# plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
# # plt.plot(fpr_rf, tpr_rf, label='RF (area = {:.3f})'.format(auc_rf))
# plt.xlabel('False positive rate')
# plt.ylabel('True positive rate')
# plt.title('ROC curve (zoomed in at top left)')
# plt.legend(loc='best')
# plt.show()

### ROBERTA

In [None]:
import simpletransformers
from simpletransformers.classification import ClassificationModel

In [None]:
model = ClassificationModel('roberta', 'roberta-base', use_cuda=False)

In [None]:
len(sentences_train)

In [None]:
len(y_train)

In [None]:
trainNew=pd.DataFrame({ 'text':sentences_train,'labels':y_train})
testNew=pd.DataFrame({ 'text':sentences_test,'labels':y_test})

In [None]:
model.train_model(trainNew)

In [None]:
model.predict(['dead pray for victim Some arbitary sentence'])

In [None]:
import sklearn


result, model_outputs, wrong_predictions = model.eval_model(testNew, acc=sklearn.metrics.accuracy_score)

In [None]:
result

In [None]:
wrong_predictions[0]