In [2]:
import pandas as pd
from textblob import TextBlob
import glob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Human-readable dataset labels, keyed by integer position; they are used further
# down to label the rows and columns of the result heat maps.
dataset_names = {
    0: "crisislex",
    1: "crisisnlp_crisismmd",
    2: "news_notNews",
    3: "Covid_WNUT",
    4: "crisisnlp_Sandy_Japolin",
    5: "crisisnlp_19Crisis",
}

In [4]:
# Load every pickled dataset found in ./dataset_cleaning, in sorted file-name order,
# so that position k in `datasets` always corresponds to the same file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
all_files = sorted(glob.glob("./dataset_cleaning/*.pkl"))
datasets = []

for filename in all_files:
    # Echo the path first so a failing file is easy to identify.
    print(filename)
    datasets.append(pd.read_pickle(filename))



./dataset_cleaning/dataset1Cleaned.pkl
./dataset_cleaning/dataset2Cleaned.pkl
./dataset_cleaning/dataset3Cleaned.pkl
./dataset_cleaning/dataset4Cleaned.pkl
./dataset_cleaning/dataset5Cleaned.pkl
./dataset_cleaning/dataset6Cleaned.pkl


In [5]:

_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_processing(tweet):
    """Clean a single tweet and return it as one space-separated string.

    Steps, in order:
      1. lower-case the text;
      2. delete every ``http...`` link (up to the next whitespace);
      3. tokenise with TextBlob and keep its ``words``;
      4. drop the literal token ``user`` and every token that contains anything
         other than letters or underscores (digits, symbols, ...);
      5. drop English stopwords;
      6. lemmatise each remaining token as a verb.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces (possibly an empty string).

    Raises
    ------
    AttributeError
        If ``tweet`` has no ``lower`` method (for example a NaN float).
    """
    tweet = tweet.lower()

    # Removing hyperlinks from the tweet
    tweet_no_links = re.sub(r'http\S+', '', tweet)

    # Tokenising with TextBlob; per the original author's note this removes
    # hashtag marks and other punctuation.
    tokens = ' '.join(TextBlob(tweet_no_links).words).split()

    # 'user' is presumably the anonymised @mention placeholder -- TODO confirm.
    # The pattern accepts only tokens made entirely of letters/underscores.
    alpha_tokens = [
        token for token in tokens
        if token != 'user' and re.match(r'[^\W\d]*$', token)
    ]

    # The stopword list is loaded once and held in a set: the previous version
    # called stopwords.words('english') again for every single token.
    stop_set = _english_stopwords()
    content_tokens = [token for token in alpha_tokens if token not in stop_set]

    # Normalising the words: every token is lemmatised with the verb POS tag.
    lem = WordNetLemmatizer()
    return " ".join(lem.lemmatize(token, 'v') for token in content_tokens)

In [6]:
# Preview the first rows of the first loaded dataset.
datasets[0].head()

Unnamed: 0,id,Informativeness,text
0,211040709124440064,0,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...
1,210864180218167296,0,Get in on the fun every Thursday with the @csi...
2,211157222699433985,0,Welcome to our newest STUDENTathlete- Reagan B...
3,211162553659830272,0,Denver Post: #Colorado governor signs bill cre...
4,211216962162933761,0,Pretty sure I'm going to live in Manitou Sprin...


In [7]:
# Give every dataset a `text_processed` column holding the cleaned version of `text`.
# The frames are modified in place, so `datasets` itself is updated.
for position, frame in enumerate(datasets, start=1):
    print("Processing dataset {}".format(position))
    frame['text_processed'] = frame['text'].apply(text_processing)

Processing dataset 1
Processing dataset 2
Processing dataset 3
Processing dataset 4
Processing dataset 5
Processing dataset 6


Merge with new data: tweet metadata and author (user) profile features

In [8]:
# Tweet-level metadata; merged into each dataset on the tweet `id` in the next cell.
authors = pd.read_csv('./dataset_cleaning/tj/parsed/tweet_metadata.csv')

# Account-level data; merged on `author_id` in the next cell.
users = pd.read_csv('./dataset_cleaning/tj/parsed/twitter_user.csv')
users = users.drop(
    columns=['created_at', 'lang', 'name', 'screen_name', 'location', 'access']
)

# Positional rename of the columns that remain.
# NOTE(review): this assumes the CSV columns arrive in exactly this order -- confirm
# against the file header, since a reordered file would silently mislabel features.
users.columns = [
    'author_id',
    'has_description',
    'bio_has_url',
    'followers_count',
    'friends_count',
    'favourites_count',
    'listed_count',
    'statuses_count',
    'protected',
    'verified',
    'default_profile',
    'default_profile_image',
]

  interactivity=interactivity, compiler=compiler, result=result)


In [9]:
# Attach tweet-level and author-level features to every dataset and turn the
# free-text / categorical fields into 0/1 indicators.
#
# NOTE(review): this cell is not re-runnable as is -- it drops `id` and stores the
# result back into `datasets`, so a second run has no `id` column to merge on.
# Both merges use pandas' default inner join, so tweets without metadata (or
# without a matching user row) are dropped.
for i, ds in enumerate(datasets):
    print("Processing dataset {}".format(i+1))
    ds = pd.merge(ds, authors, on='id')

    # Keep only the columns needed downstream. `columns=` replaces the positional
    # axis argument (`drop(labels, 1)`), which newer pandas releases reject.
    keep_columns = ['id', 'Informativeness', 'text_processed', 'author_id',
                    'tweet_type', 'retweet_count', 'favorite_count']
    ds = ds.drop(columns=ds.columns.difference(keep_columns))

    ds = pd.merge(ds, users, on='author_id')

    # 1 when a value is present, 0 when it is missing. `notna()` recognises every
    # missing value, whereas the previous `x is np.nan` identity test only matched
    # the np.nan singleton and reported 1 for any other NaN object.
    ds["has_description"] = ds["has_description"].notna().astype(int)
    # 0 for an original tweet, 1 for every other tweet type (including missing).
    ds["tweet_type"] = (ds["tweet_type"] != 'tweet').astype(int)
    ds["bio_has_url"] = ds["bio_has_url"].notna().astype(int)

    # The identifiers were only needed as join keys.
    ds = ds.drop(columns=['id', 'author_id'])

    datasets[i] = ds
    


Processing dataset 1
Processing dataset 2
Processing dataset 3
Processing dataset 4
Processing dataset 5
Processing dataset 6


In [24]:
# List the columns of the first dataset after the metadata merge.
datasets[0].columns

Index(['Informativeness', 'text_processed', 'tweet_type', 'retweet_count',
       'favorite_count', 'has_description', 'bio_has_url', 'followers_count',
       'friends_count', 'favourites_count', 'listed_count', 'statuses_count',
       'protected', 'verified', 'default_profile', 'default_profile_image'],
      dtype='object')

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from scipy.sparse import hstack
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale

def grid_search_text_and_numerical(model, parameters, X_train_text, X_train_numerical, y=None):
    """Grid-search ``model`` on text features combined with numerical features.

    For every combination of n-gram range and IDF setting, the text is vectorised
    (token counts followed by ``TfidfTransformer``), stacked column-wise with the
    numerical features, normalised per column, and handed to a ``GridSearchCV``
    scored with ROC AUC. The best-scoring combination is returned.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``GridSearchCV``.
    parameters : dict
        Parameter grid for ``model``.
    X_train_text : iterable of str
        Pre-processed training texts.
    X_train_numerical : array-like
        Numerical training features; everything must be convertible to float.
    y : array-like, optional
        Training labels. When omitted, the notebook-level variable ``y_train`` is
        used, which is how existing callers supply the labels.

    Returns
    -------
    tuple
        ``(best_score, best_params, best_clf, best_vectorizer, best_tfIdfTransformer)``.
        ``best_params`` is the winning estimator grid point plus the keys
        ``ngram_range``, ``use_tf_idf`` and ``normalization_norm``. The last three
        tuple items are the fitted objects of the winning combination; they stay
        ``None`` if no combination scores above 0.

    Raises
    ------
    NameError
        If ``y`` is omitted and no global ``y_train`` exists.

    Notes
    -----
    NOTE(review): the vectoriser, tf-idf transformer and column normalisation are
    all fitted on the full training matrix before ``GridSearchCV`` creates its
    folds, so validation folds contribute to those statistics.
    """
    if y is None:
        # Backward compatibility: callers historically relied on a global `y_train`.
        y = y_train

    # Loop-invariant, so converted once. The builtin `float` replaces the
    # `np.float` alias, which no longer exists in current NumPy releases.
    numerical_features = np.asarray(X_train_numerical).astype(float)

    best_score = 0
    best_params = {}
    best_clf = None
    best_vectorizer = None
    best_tfIdfTransformer = None
    for ngram_range in [(1, 1), (1, 2)]:
        # Despite the name, this flag only switches the IDF weighting on or off;
        # the TfidfTransformer is applied in both cases.
        for use_tf_idf in [True, False]:
            for normalization_norm in ["max"]:

                print("Iteration with {}, {}, {}".format(ngram_range, use_tf_idf, normalization_norm))
                vectorizer = CountVectorizer(ngram_range=ngram_range)
                X_train_text_vectorized = vectorizer.fit_transform(X_train_text)

                tfidf_transformer = TfidfTransformer(use_idf=use_tf_idf)
                X_train_text_tfidf = tfidf_transformer.fit_transform(X_train_text_vectorized)

                X_train_merged = hstack((X_train_text_tfidf, numerical_features))

                # axis=0 normalises every feature column independently.
                X_train_merged = normalize(X_train_merged, norm=normalization_norm, axis=0)

                clf = GridSearchCV(model, parameters, scoring='roc_auc', n_jobs=-1, verbose=2)
                clf.fit(X_train_merged, y)

                if best_score < clf.best_score_:
                    best_score = clf.best_score_
                    # Copy before adding keys so the search object's own
                    # `best_params_` is left untouched.
                    best_params = dict(clf.best_params_)
                    best_params["ngram_range"] = ngram_range
                    best_params["use_tf_idf"] = use_tf_idf
                    best_params["normalization_norm"] = normalization_norm
                    best_clf = clf

                    best_vectorizer = vectorizer
                    best_tfIdfTransformer = tfidf_transformer

    return best_score, best_params, best_clf, best_vectorizer, best_tfIdfTransformer
            

In [11]:
def merge_test_and_score(X_test_text, X_test_numerical, best_vecotorizer, best_tfIdfTransformer, best_params, best_clf, y=None):
    """Score a fitted search object on a test set built like the training matrix.

    The test texts go through the already-fitted vectoriser and tf-idf
    transformer, are stacked column-wise with the numerical features, normalised
    per column and passed to ``best_clf.score``.

    Parameters
    ----------
    X_test_text : iterable of str
        Pre-processed test texts.
    X_test_numerical : array-like
        Numerical test features; everything must be convertible to float.
    best_vecotorizer : fitted vectoriser
        Only ``transform`` is called on it. (The parameter name is kept as is for
        compatibility with existing callers.)
    best_tfIdfTransformer : fitted transformer
        Only ``transform`` is called on it.
    best_params : dict
        Must contain the key ``"normalization_norm"``.
    best_clf : fitted estimator or search object
        Object whose ``score`` method is called.
    y : array-like, optional
        Test labels. When omitted, the notebook-level variable ``y_test`` is used,
        which is how existing callers supply the labels.

    Returns
    -------
    float
        Whatever ``best_clf.score`` returns; for a ``GridSearchCV`` built with
        ``scoring='roc_auc'`` that should be the ROC AUC -- see the scikit-learn docs.

    Raises
    ------
    NameError
        If ``y`` is omitted and no global ``y_test`` exists.
    """
    if y is None:
        # Backward compatibility: callers historically relied on a global `y_test`.
        y = y_test

    X_test_text_vectorized = best_vecotorizer.transform(X_test_text)
    X_test_text_tfidf = best_tfIdfTransformer.transform(X_test_text_vectorized)

    # The builtin `float` replaces the `np.float` alias, which no longer exists in
    # current NumPy releases.
    numerical_features = np.asarray(X_test_numerical).astype(float)
    X_test_merged = hstack((X_test_text_tfidf, numerical_features))

    # NOTE(review): `normalize(..., axis=0)` rescales each column using the values
    # of this test matrix, not the training one, so a feature can end up on a
    # different scale than the model saw during fitting. A scaler fitted on the
    # training data would avoid that, but needs a change to the search function too.
    X_test_merged = normalize(X_test_merged, norm=best_params["normalization_norm"], axis=0)

    return best_clf.score(X_test_merged, y)

In [None]:
from sklearn.linear_model import SGDClassifier


# Text-only pipeline: token counts -> tf-idf -> linear classifier trained with SGD.
text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-svm', SGDClassifier(penalty='l2', random_state=42)),
    ]
)

# Search space; keys follow the `<step name>__<parameter>` convention of Pipeline.
# Only the 'hinge' loss is a linear SVM; the other losses give other linear models.
parameters_svm = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf-svm__alpha': (1e-5, 1e-3),
    'clf-svm__loss': ('hinge', 'log', 'perceptron'),
    'clf-svm__max_iter': (10, 100),
}
gs_clf_svm = GridSearchCV(
    estimator=text_clf_svm,
    param_grid=parameters_svm,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=10,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Text-only pipeline: token counts -> tf-idf -> logistic regression.
text_clf_lr = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-lr', LogisticRegression( max_iter=1000))])

# The search space is split in two because the default LogisticRegression solver
# ('lbfgs') does not support penalty='l1': with a single grid, every 'l1'
# candidate fails to fit and only wastes search time. 'l1' is therefore paired
# with the 'liblinear' solver, while 'l2' keeps the default solver as before.
parameters_lr = [
    {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),
     'clf-lr__penalty': ('l2',)},
    {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),
     'clf-lr__penalty': ('l1',), 'clf-lr__solver': ('liblinear',)},
]

gs_clf_lr = GridSearchCV(text_clf_lr, parameters_lr ,scoring='roc_auc', n_jobs=-1, verbose=2)

In [None]:
from sklearn.naive_bayes import MultinomialNB


# Text-only pipeline: token counts -> tf-idf -> multinomial naive Bayes.
text_clf_nb = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-nb', MultinomialNB()),
    ]
)

# Search space; besides unigrams and uni+bigrams this one also tries bigrams only.
parameters_nb = {
    'vect__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'tfidf__use_idf': (True, False),
    'clf-nb__alpha': (1, 1e-1, 1e-3),
}

gs_clf_nb = GridSearchCV(
    estimator=text_clf_nb,
    param_grid=parameters_nb,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=5,
)

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Text-only pipeline: token counts -> tf-idf -> decision tree.
text_clf_dt = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-dt', DecisionTreeClassifier()),
    ]
)

# Search space; IDF weighting is always on here, only the n-gram range and the
# maximum tree depth vary.
parameters_dt = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True,),
    'clf-dt__max_depth': [200, 400, 1000],
}

gs_clf_dt = GridSearchCV(
    estimator=text_clf_dt,
    param_grid=parameters_dt,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=5,
)

In [12]:
from sklearn.ensemble import RandomForestClassifier


# Text-only pipeline: token counts -> tf-idf -> random forest.
text_clf_rf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf-rf', RandomForestClassifier()),
    ]
)

# Search space; the forest size is fixed at 800 trees, so only the n-gram range,
# the IDF switch and `min_samples_split` vary.
parameters_rf = {
    'vect__ngram_range': [(1, 1), (2, 2)],
    'tfidf__use_idf': (True, False),
    'clf-rf__n_estimators': [800],
    'clf-rf__min_samples_split': [4, 10],
}

gs_clf_rf = GridSearchCV(
    estimator=text_clf_rf,
    param_grid=parameters_rf,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=10,
)

In [None]:
# The five text-only grid searches defined above, collected in one list.
# NOTE(review): `models` is not referenced again in the visible part of the notebook.
models = [gs_clf_svm, gs_clf_lr, gs_clf_nb, gs_clf_dt, gs_clf_rf]

In [12]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression candidate for the text + numerical grid search: an estimator
# together with the parameter grid to explore for it.
lr_parameters = {'penalty': ('l2',)}
lr_model = LogisticRegression(max_iter=1000)

lr = {"model": lr_model, "parameters": lr_parameters}

In [13]:
from sklearn.linear_model import SGDClassifier

# SGD-trained linear classifier candidate: an estimator together with its grid.
# NOTE(review): the `svd` prefix is kept because other cells refer to it, but the
# estimator is an SGDClassifier.
svd_parameters = {
    'loss': ('hinge', 'log'),
    'max_iter': (100, 1000),
    'alpha': (1e-3, 1e-6, 1e-10),
    'penalty': ('l2',),
}
svd_model = SGDClassifier(random_state=42, early_stopping=True)

svd = {"model": svd_model, "parameters": svd_parameters}

In [14]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes candidate: an estimator together with its smoothing grid.
nb_parameters = {'alpha': (1, 1e-1, 1e-3)}
nb_model = MultinomialNB()

nb = {"model": nb_model, "parameters": nb_parameters}

In [15]:
from sklearn.tree import DecisionTreeClassifier

# Decision-tree candidate: an estimator together with the tree depths to try.
dt_parameters = {'max_depth': [200, 400, 1000]}
dt_model = DecisionTreeClassifier()

dt = {"model": dt_model, "parameters": dt_parameters}

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Random-forest candidate: 800 trees, searching over `min_samples_split` only.
rf_parameters = {'n_estimators': [800], 'min_samples_split': [4, 10]}
rf_model = RandomForestClassifier()

rf = {"model": rf_model, "parameters": rf_parameters}

In [19]:
# Candidates evaluated by the loop below; only the random forest is enabled here.
# Each entry is a dict with the keys "model" and "parameters".
algorithms = [rf]

In [20]:
# Cross-dataset evaluation: for every algorithm, tune on each dataset in turn and
# score the tuned model on all datasets.
# Result layout: results_per_model[algorithm][training dataset][test dataset].
results_per_model=[]
# feature_importance=[]

for algo in algorithms:
    print("------Algo:", algo)
    # One row per training dataset for the current algorithm.
    results=[]
    for i,ds_train in enumerate(datasets):

        print("Training on Dataset {}".format(i+1))
        # Scores of the model tuned on dataset i, one entry per test dataset.
        row=[]

        
        X_train = ds_train.drop(columns=['Informativeness'])
        # The search helper is called without labels and picks `y_train` up from
        # the notebook's global scope, so this name must not be renamed.
        y_train = ds_train['Informativeness']
        X_train_text=X_train['text_processed']
        # Every column except the label and the text is used as a numerical feature.
        X_train_numerical= X_train.drop(columns=["text_processed"])
        
        best_score, best_params, best_clf, best_vectorizer, best_tfIdfTransformer = grid_search_text_and_numerical(algo["model"], algo["parameters"], X_train_text, X_train_numerical)
        
        




# Disabled feature-importance extraction. It refers to `gs_clf`, which this loop
# does not define, so it cannot simply be uncommented.
#         feature_names = gs_clf.best_estimator_[0].get_feature_names() 
#         coefs = gs_clf.best_estimator_[-1].coef_[0]
#         coefs_with_fns = sorted(zip(coefs, feature_names)) 
#         df=pd.DataFrame(coefs_with_fns)
#         df.columns='coefficient','word'
#         df.sort_values(by='coefficient')
#         feature_importance.append(df)

        # The test loop also covers the training dataset itself (j == i), so the
        # diagonal of the result matrix is an in-sample score.
        for j,ds_test in enumerate(datasets):

            print("testing on Dataset {}".format(j+1))
            
            X_test = ds_test.drop(columns=['Informativeness'])
            # The scoring helper is called without labels and picks `y_test` up
            # from the notebook's global scope, so this name must not be renamed.
            y_test = ds_test['Informativeness']
            X_test_text=X_test['text_processed']
            X_test_numerical= X_test.drop(columns=["text_processed"])

            row.append(merge_test_and_score(X_test_text, X_test_numerical, best_vectorizer, best_tfIdfTransformer, best_params, best_clf))
        results.append(row)
    results_per_model.append(results)
    
    

------Algo: {'model': RandomForestClassifier(), 'parameters': {'n_estimators': [800], 'min_samples_split': [4, 10]}}
Training on Dataset 1
Iteration with (1, 1), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 10.9min finished


Iteration with (1, 1), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 10.9min finished


Iteration with (1, 2), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 39.5min finished


Iteration with (1, 2), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 39.8min finished


testing on Dataset 1
testing on Dataset 2
testing on Dataset 3
testing on Dataset 4
testing on Dataset 5
testing on Dataset 6
Training on Dataset 2
Iteration with (1, 1), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.0min finished


Iteration with (1, 1), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.1min finished


Iteration with (1, 2), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 22.3min finished


Iteration with (1, 2), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 22.5min finished


testing on Dataset 1
testing on Dataset 2
testing on Dataset 3
testing on Dataset 4
testing on Dataset 5
testing on Dataset 6
Training on Dataset 3
Iteration with (1, 1), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   43.8s finished


Iteration with (1, 1), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   42.4s finished


Iteration with (1, 2), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.4min finished


Iteration with (1, 2), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.4min finished


testing on Dataset 1
testing on Dataset 2
testing on Dataset 3
testing on Dataset 4
testing on Dataset 5
testing on Dataset 6
Training on Dataset 4
Iteration with (1, 1), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.1min finished


Iteration with (1, 1), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  3.0min finished


Iteration with (1, 2), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 11.5min finished


Iteration with (1, 2), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 11.4min finished


testing on Dataset 1
testing on Dataset 2
testing on Dataset 3
testing on Dataset 4
testing on Dataset 5
testing on Dataset 6
Training on Dataset 5
Iteration with (1, 1), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.6s finished


Iteration with (1, 1), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.1s finished


Iteration with (1, 2), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.1s finished


Iteration with (1, 2), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    6.1s finished


testing on Dataset 1
testing on Dataset 2
testing on Dataset 3
testing on Dataset 4
testing on Dataset 5
testing on Dataset 6
Training on Dataset 6
Iteration with (1, 1), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  7.3min finished


Iteration with (1, 1), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  8.0min finished


Iteration with (1, 2), True, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 29.3min finished


Iteration with (1, 2), False, max
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 25.8min finished


testing on Dataset 1
testing on Dataset 2
testing on Dataset 3
testing on Dataset 4
testing on Dataset 5
testing on Dataset 6


In [21]:
# Show the raw scores, nested as [algorithm][training dataset][test dataset].
results_per_model

[[[0.9999984435325532,
   0.6532139307410065,
   0.672994735650402,
   0.6087561819844832,
   0.8049242424242424,
   0.7016863533988655],
  [0.743283233626691,
   1.0,
   0.741802402920865,
   0.5504011850562922,
   0.6477272727272727,
   0.7795283312632846],
  [0.7243200555400572,
   0.6865880475988779,
   0.9999982310653232,
   0.5803350221231633,
   0.7803030303030303,
   0.7021284393534643],
  [0.6525750197439552,
   0.5585715260805513,
   0.674447384806974,
   0.9999993363631673,
   0.6458333333333334,
   0.6264101429374167],
  [0.6969223339964548,
   0.5579042731569808,
   0.6049618617683686,
   0.5606757349321672,
   1.0,
   0.5620334946047189],
  [0.6638264049999828,
   0.683753170215264,
   0.6559308841843088,
   0.6451131110913126,
   0.6363636363636365,
   0.9999915574999048]]]

In [None]:
# Displays the three candidate configurations that are not in `algorithms`.
# NOTE(review): presumably a reminder of the algorithms still to be run -- confirm.
lr, svd, dt

In [22]:
# Display name for each entry of `results_per_model`, keyed by its position.
# NOTE(review): 'forrest' is presumably a typo for 'forest'. It is left unchanged
# because the value is used both as the heat-map title and in the saved file name.
model_names={0:'random forrest'}

In [23]:
import matplotlib.pyplot as plt
# One annotated heat map per algorithm: rows are the training datasets, columns
# the test datasets. Each figure is written to ./heatmaps/<model name>.png.
for model_index, score_matrix in enumerate(results_per_model):
    df = pd.DataFrame(score_matrix)
    df.columns = [dataset_names[position] for position in df.columns]
    df.index = [dataset_names[position] for position in df.index]

    title = model_names[model_index]
    heatmap_axes = sns.heatmap(df, annot=True)
    heatmap_axes.set_title(title)
    heatmap_axes.get_figure().savefig('./heatmaps/{}.png'.format(title), dpi=400)

    # Clear the current figure so the next heat map starts from an empty canvas.
    plt.clf()

<Figure size 432x288 with 0 Axes>

In [None]:
# Load previously saved results from disk. This rebinds `results`, the name the
# evaluation loop above used for its per-algorithm rows.
# NOTE(review): unpickling can execute arbitrary code; only load a trusted file.
results= pd.read_pickle('./reslist.pkl')

In [None]:
# Keep only the 'auc' entry of every cell, preserving the row/column layout of `results`.
res_auc = [[cell['auc'] for cell in row] for row in results]

In [None]:
# Turn the AUC matrix into a frame whose rows and columns carry the dataset names.
df = pd.DataFrame(res_auc)
df.columns = [dataset_names[position] for position in df.columns]
df.index = [dataset_names[position] for position in df.index]

In [None]:
# Annotated heat map of the AUC frame built in the previous cell, titled 'DistilBERT'.
sns.heatmap(df, annot=True).set_title('DistilBERT')


In [None]:
# Write one CSV of feature importances per dataset to ./features/.
# NOTE(review): `feature_importance` is never assigned in the visible cells (its
# initialisation and the code that fills it are commented out in the evaluation
# loop), so this cell raises NameError unless that code is restored.
for i, feature_importance_dataset in enumerate(feature_importance):
    feature_importance_dataset.to_csv('./features/{}_features.csv'.format(dataset_names[i]))