In [1]:
import pandas as pd
from textblob import TextBlob
import glob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import sklearn
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
import seaborn as sns

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
dataset_names={0:"crisislex",1:"crisisnlp_crisismmd",2:"news_notNews",3:"Covid_WNUT", 4:"crisisnlp_Sandy_Japolin", 5:"crisisnlp_19Crisis"}

In [9]:
all_files = glob.glob("./dataset_cleaning/*.pkl")
all_files.sort()
datasets = []

for filename in all_files:
    print(filename)
    df = pd.read_pickle(filename)
    datasets.append(df)

datasets=datasets[:-1]

./dataset_cleaning/dataset1Cleaned.pkl
./dataset_cleaning/dataset2Cleaned.pkl
./dataset_cleaning/dataset3Cleaned.pkl
./dataset_cleaning/dataset4Cleaned.pkl
./dataset_cleaning/dataset5Cleaned.pkl
./dataset_cleaning/dataset6Cleaned.pkl
./dataset_cleaning/dataset7Cleaned.pkl


In [10]:

def text_processing(tweet):
    
    tweet= tweet.lower()
    
    #Removing hyperlinks from the tweet
    tweet_no_links=re.sub(r'http\S+', '', tweet)
    
    #Generating the list of words in the tweet (hashtags and other punctuations removed)
    def form_sentence(tweet):
        tweet_blob = TextBlob(tweet)
        return ' '.join(tweet_blob.words)
    new_tweet = form_sentence(tweet_no_links)
    
    #Removing stopwords and words with unusual symbols
    def no_user_alpha(tweet):
        tweet_list = [ele for ele in tweet.split() if ele != 'user']
        clean_tokens = [t for t in tweet_list if re.match(r'[^\W\d]*$', t)]
        clean_s = ' '.join(clean_tokens)
        clean_mess = [word for word in clean_s.split() if word not in stopwords.words('english')]
        return clean_mess
    no_punc_tweet = no_user_alpha(new_tweet)
    
    #Normalizing the words in tweets 
    def normalization(tweet_list):
        lem = WordNetLemmatizer()
        normalized_tweet = []
        for word in tweet_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_tweet.append(normalized_text)
        return " ".join(normalized_tweet)
    
    
    return normalization(no_punc_tweet)

In [11]:
datasets[0].head()

Unnamed: 0,id,Informativeness,text,positive_score,negative_score,emotional_devergence_score
0,211040709124440064,0,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...,1,-1,0.2
1,210864180218167296,0,Get in on the fun every Thursday with the @csi...,2,-1,0.3
2,211157222699433985,0,Welcome to our newest STUDENTathlete- Reagan B...,2,-1,0.3
3,211162553659830272,0,Denver Post: #Colorado governor signs bill cre...,1,-1,0.2
4,211216962162933761,0,Pretty sure I'm going to live in Manitou Sprin...,3,-1,0.4


In [12]:
for i,ds in enumerate(datasets):
    print("Processing dataset {}".format(i+1))
    ds['text_processed']=ds['text'].apply(text_processing)

Processing dataset 1
Processing dataset 2
Processing dataset 3
Processing dataset 4
Processing dataset 5
Processing dataset 6


Merge with new data

In [13]:
authors=pd.read_csv('./dataset_cleaning/tj/parsed/tweet_metadata.csv')
users=pd.read_csv('./dataset_cleaning/tj/parsed/twitter_user.csv')
users.drop(columns=['created_at', 'lang', 'name', 'screen_name', 'location','access'], inplace=True)
users.columns=['author_id', 'has_description', 'bio_has_url', 'followers_count', 'friends_count',
       'favourites_count', 'listed_count', 'statuses_count', 'protected',
       'verified', 'default_profile', 'default_profile_image']

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
mean=['retweet_count','favorite_count','followers_count', 'friends_count', 'listed_count', 'favourites_count', 'statuses_count']
median = ['tweet_type', 'has_description', 'bio_has_url', 'protected', 'verified','default_profile', 'default_profile_image']


for i,ds in enumerate(datasets):
    print("Processing dataset {}".format(i+1))
    ds = pd.merge(ds, authors, on='id', how='left')

    ds.drop(ds.columns.difference(['id', 'Informativeness', 'text_processed', 'author_id', 'tweet_type', 'retweet_count', 'favorite_count']), 1, inplace=True)



    ds = pd.merge(ds, users, on='author_id', how='left')


    for m in mean:
        ds[m][ds['Informativeness'] == 1] = ds[m][ds['Informativeness'] == 1].fillna((ds[m][ds['Informativeness'] == 1].mean()))
        ds[m][ds['Informativeness'] == 0] = ds[m][ds['Informativeness'] == 0].fillna((ds[m][ds['Informativeness'] == 0].mean()))

    for m in median:
        ds[m][ds['Informativeness'] == 1] = ds[m][ds['Informativeness'] == 1].fillna((ds[m][ds['Informativeness'] == 1].value_counts().idxmax()))
        ds[m][ds['Informativeness'] == 0] = ds[m][ds['Informativeness'] == 0].fillna((ds[m][ds['Informativeness'] == 0].value_counts().idxmax()))

    ds["has_description"] = ds["has_description"].apply(lambda x: 0 if x is np.nan else 1)
    ds["tweet_type"] = ds["tweet_type"].apply(lambda x: 0 if x =='tweet' else 1)
    ds["bio_has_url"] = ds["bio_has_url"].apply(lambda x: 0 if x is np.nan else 1)
#     PUT THIS BACK
#     ds = ds.drop(columns=['id', 'author_id'])
    ds = ds.drop(columns=[ 'author_id'])


    datasets[i] = ds
    


Processing dataset 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Processing dataset 2


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Processing dataset 3


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Processing dataset 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Processing dataset 5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

Processing dataset 6


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#r

In [None]:
context=pd.read_csv('./dataset_cleaning/tj/context_annotations.csv')
context =context[["id",'domain_name' ,"entity_name"]]
# context.drop_duplicates(subset=['domain_name' ,"entity_name"])

In [None]:
context.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Vevotize each sentance
vectorizer_entity = CountVectorizer()
vectorizer_entity.fit(context['entity_name'])

vectorizer_domain = CountVectorizer()
vectorizer_domain.fit(context['domain_name'])

In [None]:
grouped_entity = context.groupby(by= 'id')['entity_name'].apply(set)
grouped_domain = context.groupby(by= 'id')['domain_name'].apply(set)

In [None]:
grouped_entity_vectorized = grouped_entity.apply(lambda x: vectorizer_entity.transform(x).toarray().sum(axis=0))
grouped_domain_vectorized = grouped_domain.apply(lambda x: vectorizer_domain.transform(x).toarray().sum(axis=0))

In [None]:
for i, ds in enumerate(datasets):

    print("Processing dataset: {}".format(i+1))
    ds = pd.merge(ds, grouped_domain_vectorized, on='id', how='left')
    tostack_domain =ds['domain_name'].apply(pd.Series)
    tostack_domain = tostack_domain.fillna(0)
    ds = pd.concat([ds, tostack_domain], axis=1)
    ds = pd.concat([ds,pd.DataFrame(np.zeros((tostack_domain.shape[0],len(vectorizer_domain.vocabulary_)-tostack_domain.shape[1])))], axis=1)
    
    ds = pd.merge(ds, grouped_entity_vectorized, on='id', how='left')
    tostack_entity =ds['entity_name'].apply(pd.Series)
    tostack_entity = tostack_entity.fillna(0)
    ds = pd.concat([ds, tostack_entity], axis=1)
    ds = pd.concat([ds,pd.DataFrame(np.zeros((tostack_entity.shape[0],len(vectorizer_entity.vocabulary_)-tostack_entity.shape[1])))], axis=1)
    
    
    ds.drop(columns=['domain_name', 'entity_name'], inplace=True)
    datasets[i] = ds

In [None]:
for ds in datasets:
    print(ds.shape)

In [None]:
for i, filename in enumerate(all_files):
    print("Dataset {}, {}:".format(i+1, dataset_names[i]))
    print("Size when keeping only english tweets: {}, \nSize when keeping only tweets for which we have additional features on the users/tweets: {}, \nSize of dataset for which we also have context annotation: {}\n ".format(len(pd.read_pickle(filename)), len(datasets[i]), len(pd.merge(datasets[i], context, on='id').drop_duplicates(subset=['id']))))

In [None]:
for i in datasets:
    
    print(len(i), len(pd.merge(i, context, on='id')))

In [None]:
testdwe= pd.merge(datasets[1], context, on='id')
print(len(testdwe))
print(len(testdwe.drop_duplicates(subset=['id'])))

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from scipy.sparse import hstack
from sklearn.preprocessing import normalize
from sklearn.preprocessing import scale

def grid_search_text_and_numerical(model, parameters, X_train_text, X_train_numerical):
    best_score=0
    best_params={}
    best_clf= None
    best_vectorizer = None
    best_tfIdfTransformer=None
    for ngram_range in [(1, 1), (1, 2)]:
        for use_tf_idf in [True, False]:
            for normalization_norm in ["max"]:

                print("Iteration with {}, {}, {}".format(ngram_range, use_tf_idf, normalization_norm))
                vectorizer = CountVectorizer(ngram_range=ngram_range)
                vectorizer.fit(X_train_text)
                X_train_text_vectorized = vectorizer.transform(X_train_text)

                tfidf_transformer = TfidfTransformer(use_idf = use_tf_idf)
                tfidf_transformer.fit(X_train_text_vectorized)
                X_train_text_tfidf = tfidf_transformer.transform(X_train_text_vectorized)


                X_train_merged = hstack((X_train_text_tfidf,np.array(np.array(X_train_numerical).astype(np.float))))

                X_train_merged = normalize(X_train_merged, norm=normalization_norm, axis=0)
    #             X_train_merged = scale(X_train_merged, with_mean= False, axis = 0)

                clf = GridSearchCV(model, parameters, scoring='roc_auc', n_jobs=-1, verbose=2)
                clf.fit(X_train_merged, y_train)

                if(best_score < clf.best_score_):
                    best_score = clf.best_score_
                    best_params = clf.best_params_
                    best_params["ngram_range"] = ngram_range
                    best_params["use_tf_idf"]= use_tf_idf
                    best_params["normalization_norm"] = normalization_norm
                    best_clf=clf

                    best_vectorizer = vectorizer
                    best_tfIdfTransformer=tfidf_transformer
                
    return best_score, best_params, best_clf, best_vectorizer, best_tfIdfTransformer
            

In [16]:
def merge_test_and_score(X_test_text, X_test_numerical, best_vecotorizer, best_tfIdfTransformer, best_params, best_clf):

    X_test_text_vectorized = best_vecotorizer.transform(X_test_text)
    X_test_text_tfidf = best_tfIdfTransformer.transform(X_test_text_vectorized)


    X_test_merged = hstack((X_test_text_tfidf,np.array(np.array(X_test_numerical).astype(np.float))))
    
    X_test_merged = normalize(X_test_merged, norm=best_params["normalization_norm"], axis=0)
    # X_test_merged = scale(X_test_merged, with_mean= False, axis = 0)

    return best_clf.score(X_test_merged, y_test)

In [17]:
from sklearn.linear_model import SGDClassifier


text_clf_svm = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-svm', SGDClassifier( penalty='l2', random_state=42))])

parameters_svm = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-5, 1e-3),
                  'clf-svm__loss':('hinge','log','perceptron'),'clf-svm__max_iter': (10, 100)}
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm,scoring='roc_auc', n_jobs=-1, verbose=10)

In [18]:
from sklearn.linear_model import LogisticRegression

text_clf_lr = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-lr', LogisticRegression( max_iter=1000))])

parameters_lr = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, False),'clf-lr__penalty': ('l2', 'l1')}

gs_clf_lr = GridSearchCV(text_clf_lr, parameters_lr ,scoring='roc_auc', n_jobs=-1, verbose=2)

In [19]:
from sklearn.naive_bayes import MultinomialNB


text_clf_nb = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-nb', MultinomialNB())])

parameters_nb = {'vect__ngram_range': [(1, 1), (1, 2),(2,2)], 'tfidf__use_idf': (True, False),
                  'clf-nb__alpha': (1,1e-1, 1e-3)}

gs_clf_nb = GridSearchCV(text_clf_nb, parameters_nb,scoring='roc_auc', n_jobs=-1, verbose=5)

In [20]:
from sklearn.tree import DecisionTreeClassifier

text_clf_dt = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-dt', DecisionTreeClassifier(  ))])

parameters_dt = {'vect__ngram_range': [(1, 1), (1, 2)], 'tfidf__use_idf': (True, ),
                'clf-dt__max_depth':[200, 400, 1000]}

gs_clf_dt = GridSearchCV(text_clf_dt, parameters_dt,scoring='roc_auc', n_jobs=-1, verbose=5)

In [21]:
from sklearn.ensemble import RandomForestClassifier


text_clf_rf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                         ('clf-rf', RandomForestClassifier( ))])

parameters_rf = {'vect__ngram_range': [(1, 1),(2,2)], 'tfidf__use_idf': (True, False),
                'clf-rf__n_estimators':[800,], 'clf-rf__min_samples_split':[4,10]}

gs_clf_rf = GridSearchCV(text_clf_rf, parameters_rf, scoring='roc_auc', n_jobs=-1, verbose=10)

In [22]:
models = [gs_clf_svm, gs_clf_lr, gs_clf_nb, gs_clf_dt, gs_clf_rf]


In [23]:
models = [gs_clf_lr]

In [24]:
from sklearn.linear_model import LogisticRegression

lr_parameters = {'penalty':('l2',)}
lr_model = LogisticRegression( max_iter=1000)

lr={}
lr["model"]=lr_model
lr["parameters"] =lr_parameters

In [25]:
from sklearn.linear_model import SGDClassifier

svd_parameters = { 'loss':('hinge','log',),'max_iter': (100,1000),
                 'alpha':(1e-3,1e-6, 1e-10),'penalty':('l2',)}
svd_model = SGDClassifier( random_state=42, early_stopping=True)

svd={}
svd["model"]=svd_model
svd["parameters"] =svd_parameters

In [26]:
from sklearn.naive_bayes import MultinomialNB

nb_parameters = { 'alpha': (1,1e-1, 1e-3)}
nb_model = MultinomialNB()

nb = {}
nb["model"]=nb_model
nb["parameters"] =nb_parameters

In [27]:
from sklearn.tree import DecisionTreeClassifier

dt_parameters = {'max_depth':[200, 400, 1000]}
dt_model = DecisionTreeClassifier(  )

dt={}
dt["model"]=dt_model
dt["parameters"] =dt_parameters

In [28]:
from sklearn.ensemble import RandomForestClassifier

rf_parameters = {'n_estimators':[800,], 'min_samples_split':[4,10]}
rf_model = RandomForestClassifier( )

rf={}
rf["model"]=rf_model
rf["parameters"] =rf_parameters

In [29]:
algorithms = [lr]

In [31]:
df_merged = pd.concat(datasets[:3] + (datasets[4:6]))
df_merged = df_merged.sample(frac=1).reset_index(drop=True)
datasets=[df_merged, datasets[3]]

In [36]:
results_per_model=[]
feature_importance=[]

for algo in algorithms:
    print("------Algo:", algo)
    results=[]
    for i,ds_train in enumerate(datasets):

        print("Training on Dataset {}".format(i+1))
        row=[]

        
        X_train = ds_train.drop(columns=['Informativeness'])
        y_train = ds_train['Informativeness']
        X_train_text=X_train['text_processed']
        X_train_numerical= X_train.drop(columns=["text_processed"])
        
        best_score, best_params, best_clf, best_vectorizer, best_tfIdfTransformer = grid_search_text_and_numerical(algo["model"], algo["parameters"], X_train_text, X_train_numerical)
        
        




#         feature_names = best_clf.best_estimator_[0].get_feature_names() 
#         coefs = best_clf.best_estimator_[-1].coef_[0]
#         coefs_with_fns = sorted(zip(coefs, feature_names)) 
#         df=pd.DataFrame(coefs_with_fns)
#         df.columns='coefficient','word'
#         df.sort_values(by='coefficient')
#         feature_importance.append(df)
        feature_importance.append((best_clf, best_vectorizer))

        for j,ds_test in enumerate(datasets):

            print("testing on Dataset {}".format(j+1))
            
            X_test = ds_test.drop(columns=['Informativeness'])
            y_test = ds_test['Informativeness']
            X_test_text=X_test['text_processed']
            X_test_numerical= X_test.drop(columns=["text_processed"])

            row.append(merge_test_and_score(X_test_text, X_test_numerical, best_vectorizer, best_tfIdfTransformer, best_params, best_clf))
        results.append(row)
    results_per_model.append(results)
    
    

------Algo: {'model': LogisticRegression(max_iter=1000), 'parameters': {'penalty': ('l2',)}}
Training on Dataset 1
Iteration with (1, 1), True, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   14.5s finished


Iteration with (1, 1), False, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   15.9s finished


Iteration with (1, 2), True, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.2min finished


Iteration with (1, 2), False, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished


testing on Dataset 1
testing on Dataset 2
Training on Dataset 2
Iteration with (1, 1), True, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.4s finished


Iteration with (1, 1), False, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.8s finished


Iteration with (1, 2), True, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.5s finished


Iteration with (1, 2), False, max
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.8s finished


testing on Dataset 1
testing on Dataset 2


In [60]:
best_clf, best_vectorizer = feature_importance[0]

In [61]:
len(best_vectorizer.get_feature_names())

329238

In [62]:
len(best_clf.best_estimator_.coef_[0])

329253

In [63]:
feature_names = best_vectorizer.get_feature_names()
feature_coef = best_clf.best_estimator_.coef_[0][-len(feature_names):]

In [64]:
best_clf.classes_

array([0, 1])

In [65]:
coefs_with_fns = sorted(zip(feature_coef, feature_names)) 
df=pd.DataFrame(coefs_with_fns)
df.columns='coefficient','word'
df.sort_values(by='coefficient')

Unnamed: 0,coefficient,word
0,-4.016589,moose stick
1,-3.387803,prayforbohol philippines
2,-2.936679,lourddv
3,-2.287554,rednationrising
4,-2.286578,prayforsingapore
...,...,...
329233,4.095931,merlindom
329234,4.125790,엑소
329235,4.388131,children vanuatu
329236,6.027916,prayforcarmenbohol


In [81]:
df[-40:]

Unnamed: 0,coefficient,word
329198,2.338436,watchdogs forum
329199,2.345567,evac zone
329200,2.353635,hurricaine odile
329201,2.383269,cabinet decide
329202,2.385434,destinations gt
329203,2.419677,donald summer
329204,2.421638,sushmaswaraj pmoindia
329205,2.422844,treat patrick
329206,2.445577,harvey unleash
329207,2.458835,irisheagle early


In [79]:
df[:30]

Unnamed: 0,coefficient,word
0,-4.016589,moose stick
1,-3.387803,prayforbohol philippines
2,-2.936679,lourddv
3,-2.287554,rednationrising
4,-2.286578,prayforsingapore
5,-2.223592,ftl could
6,-2.219645,prayforstockholm thing
7,-2.072539,nintendo go
8,-2.066508,russian region
9,-2.040945,loiz give


In [80]:
results

[[0.9991364477357076, 0.608193107959188],
 [0.5308242321168006, 0.9999991702703528]]

In [19]:
feature_names = best_clf.best_estimator_[0].get_feature_names() 
coefs = best_clf.best_estimator_[-1].coef_[0]

coefs_with_fns = sorted(zip(coefs, feature_names)) 
df=pd.DataFrame(coefs_with_fns)
df.columns='coefficient','word'
df.sort_values(by='coefficient')

TypeError: 'LogisticRegression' object is not subscriptable

In [None]:
lr, svd, dt

In [None]:
model_names={0:'random forrest'}

In [None]:
import matplotlib.pyplot as plt
for i,result in enumerate(results_per_model):
    df = pd.DataFrame(result)
    df.columns= map( lambda x: dataset_names[x],df.columns)
    df.index= map( lambda x: dataset_names[x],df.index)
    svm=sns.heatmap(df, annot=True).set_title(model_names[i])
    figure = svm.get_figure()    
    figure.savefig('./heatmaps/{}.png'.format(model_names[i]), dpi=400)
    plt.clf()

In [None]:
results= pd.read_pickle('./reslist.pkl')

In [None]:
res_auc=list(map( lambda x: list(map(lambda y: y['auc'],x)),results))

In [None]:
df = pd.DataFrame(res_auc)
df.columns= map( lambda x: dataset_names[x],df.columns)
df.index= map( lambda x: dataset_names[x],df.index)

In [None]:
sns.heatmap(df, annot=True).set_title('DistilBERT')


In [None]:
for i, feature_importance_dataset in enumerate(feature_importance):
    feature_importance_dataset.to_csv('./features/{}_features.csv'.format(dataset_names[i]))