In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive/MSc_project/.MAIN

Mounted at /content/drive
/content/drive/My Drive/MSc_project/.MAIN


In [2]:
import pandas as pd 
import pickle
import json
import os
import os.path
import numpy as np
import seaborn as sns
from collections import defaultdict
import joblib
import matplotlib.pyplot as plt
from matplotlib.ticker import (
                               FormatStrFormatter, 
                               AutoMinorLocator,
                               FuncFormatter,
                               )
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
%matplotlib inline
from scipy.spatial import distance
import math


import os
os.chdir('G:\My Drive\MSc_project\.MAIN')


In [None]:

hashtags = ['avengers','blm','brexit','climatechange','covid','gaza','loveisland','monkeypox','nhs','olivianewtonjohn','supercup','UkraineWar']


for h_i,hashtag in enumerate(hashtags):

    cats = ['sentiment', 'topic', 'topic_single', 'irony', 'emoji', 'offensive','gender', 'user_sentiment_mode']

    df = pd.read_csv(f'multisource_analysis/user_ft_data/{hashtag}_scored_tweets.csv')
    df.drop(['Unnamed: 0', 'text','hashtag','user_id']+cats, inplace = True, axis =1)

    # readability statistics are nan when the tweet contains no text. so change these scores to 0
    reads = ['ARI','LIX','RIX','complex_words','Coleman-Liau', 'FleschReadingEase', 'GunningFogIndex', 'LIX', 'SMOGIndex', 'RIX', 'DaleChallIndex']
    for r in reads:
        df[[r]] = df[[r]].fillna(value=0).copy()

    cases = list(set(df['target'].tolist()))

    target = df[df['is-target']==1]
    informers = df[df['is-informer']==1]
    infectors = df[df['is-infector']==1]

    n = len(infectors)

    # clean a little
    for sub in [infectors,informers,target]:
        sub.drop(['is-target','is-infector','is-informer'], inplace = True, axis =1)

    cols = infectors.columns.tolist()

    best_formers = []
    av_formers = []

    for case_id in cases:

        o_fect = infectors.copy()[infectors['target']==case_id]
        fect = o_fect.copy().drop('target', axis=1).to_numpy()
        o_formers = informers.copy()[informers['target']==case_id]
        formers = o_formers.copy().drop('target', axis=1).to_numpy()

        av_former = informers.copy()[informers['target']==case_id]
        av_former = av_former.copy().drop('target', axis=1).mean().to_numpy()

        # get the closest feature to the target per feature
        best_calc = abs(fect - formers)
        best_idx = [ np.argmin(abs(row.astype(float))) for row in best_calc.T ]
        best_former = np.array( [form[best_idx[i]] for i,form in enumerate(formers.T) ] ) # we keeep the informer with the closer scores per feature
        
        av_formers.append(av_former.tolist())
        best_formers.append(best_former.tolist())

    cols.remove('target')

    target.drop('target', inplace = True, axis =1)
    infectors.drop('target', inplace = True, axis =1)
    av_informers_df = pd.DataFrame(av_formers, columns = cols)
    best_informers_df = pd.DataFrame(best_formers, columns = cols)

    infectors.insert(0,column = 'hashtag', value= [hashtag]*n )
    target.insert(0,column = 'hashtag', value= [hashtag]*n )
    av_informers_df.insert(0,column = 'hashtag', value= [hashtag]*n )
    best_informers_df.insert(0,column = 'hashtag', value= [hashtag]*n )
    

    if h_i == 0:
        all_infector_df = infectors
        all_target_df = target
        all_av_informer_df = av_informers_df
        all_best_informer_df = best_informers_df
    else:   
        all_infector_df = pd.concat( [all_infector_df, infectors ], axis = 0 )
        all_target_df = pd.concat( [all_target_df, target ], axis = 0 )
        all_av_informer_df = pd.concat( [all_av_informer_df, av_informers_df ], axis = 0 )
        all_best_informer_df = pd.concat( [all_best_informer_df, best_informers_df ], axis = 0 )
    

    print(f'\n\nAdded {hashtag} to DATAFRAME\n----------------------------------------------------------------')

### user columns

In [9]:
user_cols = [ 'user_neg_mean',
 'user_neu_mean',
 'user_pos_mean',
 'user_comp_mean',
 'user_arts_&_culture_mean',
 'user_business_&_entrepreneurs_mean',
 'user_celebrity_&_pop_culture_mean',
 'user_diaries_&_daily_life_mean',
 'user_family_mean',
 'user_fashion_&_style_mean',
 'user_film_tv_&_video_mean',
 'user_fitness_&_health_mean',
 'user_food_&_dining_mean',
 'user_gaming_mean',
 'user_learning_&_educational_mean',
 'user_music_mean',
 'user_news_&_social_concern_mean',
 'user_other_hobbies_mean',
 'user_relationships_mean',
 'user_science_&_technology_mean',
 'user_sports_mean',
 'user_travel_&_adventure_mean',
 'user_youth_&_student_life_mean',
 'user_hateful_mean',
 'user_targeted_mean',
 'user_aggressive_mean',
 'user_joy_mean',
 'user_sadness_mean',
 'user_others_mean',
 'user_anger_mean',
 'user_surprise_mean',
 'user_disgust_mean',
 'user_fear_mean',
 'user_topic_mode',
 'user_negative_count',
 'user_neutral_count',
 'user_positive_count',
 'user_arts_&_culture_count',
 'user_business_&_entrepreneurs_count',
 'user_celebrity_&_pop_culture_count',
 'user_diaries_&_daily_life_count',
 'user_family_count',
 'user_fashion_&_style_count',
 'user_film_tv_&_video_count',
 'user_fitness_&_health_count',
 'user_food_&_dining_count',
 'user_gaming_count',
 'user_learning_&_educational_count',
 'user_music_count',
 'user_news_&_social_concern_count',
 'user_other_hobbies_count',
 'user_relationships_count',
 'user_science_&_technology_count',
 'user_sports_count',
 'user_travel_&_adventure_count',
 'user_youth_&_student_life_count',
 'user_politeness_mean',
 'user_polarity_mean',
 'user_subjectivity_mean']

### finding the difference

In [22]:
from scipy.stats import ks_2samp

best_scores = []

for col in user_cols:

    fec = all_infector_df[col].tolist()
    targ = all_target_df[col].tolist()
    av_form = all_av_informer_df[col].tolist()
    best_form = all_best_informer_df[col].tolist()

    fect_diff = abs(np.array(targ)-np.array(fec))
    inf_diff = abs(np.array(targ)-np.array(best_form))


    s = ks_2samp(fect_diff, inf_diff)
    best_scores.append([f'{"{:.4f}".format(s[0])}',f'{"{:.3f}".format(s[1])}'])
    

best_scores_df = pd.DataFrame(best_scores).T
best_scores_df.columns = user_cols
best_scores_df= best_scores_df.copy().T


In [25]:
# best_scores_df.to_csv('multisource_analysis/significance_tests/user_features_sig.csv')

best_scores_df

Unnamed: 0,0,1
user_neg_mean,0.0905,0.0
user_neu_mean,0.0492,0.0
user_pos_mean,0.0613,0.0
user_comp_mean,0.0873,0.0
user_arts_&_culture_mean,0.0536,0.0
user_business_&_entrepreneurs_mean,0.0586,0.0
user_celebrity_&_pop_culture_mean,0.0606,0.0
user_diaries_&_daily_life_mean,0.0718,0.0
user_family_mean,0.0464,0.0
user_fashion_&_style_mean,0.048,0.0


## AFTER COLLECTING ALL THE DIFFERENCES PER SCORES

In [26]:
hashtag = 'blm'
# hashtag_infector_df[hashtag_infector_df['hashtag']==hashtag]
infector_df = hashtag_infector_df[hashtag_infector_df['hashtag']==hashtag].copy()
infector_df.head(2)

Unnamed: 0,hashtag,metric,Gender,Sentiment,Polarity,Readability,Emotion,Hate,Topics,Extended Topics
0,blm,euclidean,49.497475,0.056569,0.610328,42.699972,0.016854,0.000391,0.141655,0.216753
1,blm,cosine,0.173026,0.001436,0.207376,0.084965,5.8e-05,0.001007,0.004387,0.014427


In [None]:
hashes = ['blm','nhs','gaza','supercup','climatechange']

from scipy.stats import ks_2samp
import numpy as np


cols = ['Gender','Sentiment','Polarity', 'Readability','Emotion','Hate','Topics ','Extended Topics']
scores = []

for hashtag in hashes:

    print(f'------------------------------')
    print(f'{hashtag}')
    cols = ['metric','Gender','Sentiment','Polarity', 'Readability','Emotion','Hate','Topics ','Extended Topics']
    cols2 = target.columns.tolist()

    infector_df = hashtag_infector_df[hashtag_infector_df['hashtag']==hashtag].copy()
    infector_df.drop('hashtag', inplace = True, axis=1)
    av_informer_df = hashtag_av_informer_df[hashtag_av_informer_df['hashtag']==hashtag].copy()
    av_informer_df.drop('hashtag', inplace = True,axis=1)
    best_informer_df = hashtag_all_best_informer_df[hashtag_all_best_informer_df['hashtag']==hashtag].copy()
    best_informer_df.drop('hashtag', inplace = True,axis=1)

    inf = {}
    av_form = {}
    best_form = {}

    cols = ['Gender','Sentiment','Polarity', 'Readability','Emotion','Hate','Topics ','Extended Topics']
    metrics = ['euclidean','cosine']

    for col in cols:

        for m in metrics:

            m_df = infector_df[ infector_df['metric'] ==m ].copy()
            inf.update( {col+m: m_df[col].tolist() } )

            n_df = av_informer_df[ av_informer_df['metric'] ==m ].copy()
            av_form.update( {col+m: n_df[col].tolist() } )

            o_df = best_informer_df[ best_informer_df['metric'] ==m ].copy()
            best_form.update( {col+m:  o_df[col].tolist() } )


    print(f'Performing KS-test on {hashtag} \n------')
    for m in metrics:
        m_scores = [hashtag, m]
        for col in cols:
            print('----\n')
            print(f'Difference in feature vectors: using {m} similarity metric')
            print(f'Feature Vector: {col}')
            print(ks_2samp(inf[col+m], best_form[col+m])) 
            print('----\n')

            s = ks_2samp(inf[col+m], best_form[col+m])
            m_scores.append([f'test statisic: {"{:.5f}".format(s[0])}',f'pvalue:{"{:.5f}".format(s[1])}'])
        
        scores.append(m_scores)

scores_df = pd.DataFrame(scores, columns = ['hashtag','metric']+cols)
scores_df

In [48]:
scores_df = pd.DataFrame(scores, columns = ['hashtag','metric']+cols)
scores_df

Unnamed: 0,hashtag,metric,Gender,Sentiment,Polarity,Readability,Emotion,Hate,Topics,Extended Topics
0,blm,euclidean,"[test statisic: 0.39384, pvalue:0.00000]","[test statisic: 0.55023, pvalue:0.00000]","[test statisic: 0.24158, pvalue:0.00000]","[test statisic: 0.73200, pvalue:0.00000]","[test statisic: 0.61626, pvalue:0.00000]","[test statisic: 0.63956, pvalue:0.00000]","[test statisic: 0.65303, pvalue:0.00000]","[test statisic: 0.65588, pvalue:0.00000]"
1,blm,cosine,"[test statisic: 0.38374, pvalue:0.00000]","[test statisic: 0.56732, pvalue:0.00000]","[test statisic: 0.08286, pvalue:0.00000]","[test statisic: 0.69083, pvalue:0.00000]","[test statisic: 0.62377, pvalue:0.00000]","[test statisic: 0.65898, pvalue:0.00000]","[test statisic: 0.65743, pvalue:0.00000]","[test statisic: 0.65303, pvalue:0.00000]"
2,nhs,euclidean,"[test statisic: 0.50296, pvalue:0.00000]","[test statisic: 0.56966, pvalue:0.00000]","[test statisic: 0.26699, pvalue:0.00000]","[test statisic: 0.74538, pvalue:0.00000]","[test statisic: 0.46728, pvalue:0.00000]","[test statisic: 0.49937, pvalue:0.00000]","[test statisic: 0.53828, pvalue:0.00000]","[test statisic: 0.54707, pvalue:0.00000]"
3,nhs,cosine,"[test statisic: 0.49812, pvalue:0.00000]","[test statisic: 0.58903, pvalue:0.00000]","[test statisic: 0.17536, pvalue:0.00000]","[test statisic: 0.73660, pvalue:0.00000]","[test statisic: 0.47588, pvalue:0.00000]","[test statisic: 0.54330, pvalue:0.00000]","[test statisic: 0.56482, pvalue:0.00000]","[test statisic: 0.54133, pvalue:0.00000]"
4,gaza,euclidean,"[test statisic: 0.40900, pvalue:0.00000]","[test statisic: 0.13312, pvalue:0.00000]","[test statisic: 0.18135, pvalue:0.00000]","[test statisic: 0.25723, pvalue:0.00000]","[test statisic: 0.12669, pvalue:0.00000]","[test statisic: 0.25788, pvalue:0.00000]","[test statisic: 0.24502, pvalue:0.00000]","[test statisic: 0.18971, pvalue:0.00000]"
5,gaza,cosine,"[test statisic: 0.39936, pvalue:0.00000]","[test statisic: 0.12733, pvalue:0.00000]","[test statisic: 0.16463, pvalue:0.00000]","[test statisic: 0.22958, pvalue:0.00000]","[test statisic: 0.14341, pvalue:0.00000]","[test statisic: 0.20836, pvalue:0.00000]","[test statisic: 0.26367, pvalue:0.00000]","[test statisic: 0.19228, pvalue:0.00000]"
6,supercup,euclidean,"[test statisic: 0.33589, pvalue:0.00000]","[test statisic: 0.30517, pvalue:0.00000]","[test statisic: 0.25199, pvalue:0.00000]","[test statisic: 0.48006, pvalue:0.00000]","[test statisic: 0.22600, pvalue:0.00000]","[test statisic: 0.41093, pvalue:0.00000]","[test statisic: 0.40857, pvalue:0.00000]","[test statisic: 0.34092, pvalue:0.00000]"
7,supercup,cosine,"[test statisic: 0.29010, pvalue:0.00000]","[test statisic: 0.30251, pvalue:0.00000]","[test statisic: 0.19764, pvalue:0.00000]","[test statisic: 0.48006, pvalue:0.00000]","[test statisic: 0.23191, pvalue:0.00000]","[test statisic: 0.42245, pvalue:0.00000]","[test statisic: 0.42009, pvalue:0.00000]","[test statisic: 0.33796, pvalue:0.00000]"
8,climatechange,euclidean,"[test statisic: 0.43579, pvalue:0.00000]","[test statisic: 0.31563, pvalue:0.00000]","[test statisic: 0.14698, pvalue:0.00000]","[test statisic: 0.58381, pvalue:0.00000]","[test statisic: 0.29861, pvalue:0.00000]","[test statisic: 0.37133, pvalue:0.00000]","[test statisic: 0.35482, pvalue:0.00000]","[test statisic: 0.35379, pvalue:0.00000]"
9,climatechange,cosine,"[test statisic: 0.42187, pvalue:0.00000]","[test statisic: 0.35173, pvalue:0.00000]","[test statisic: 0.04074, pvalue:0.08001]","[test statisic: 0.53584, pvalue:0.00000]","[test statisic: 0.31047, pvalue:0.00000]","[test statisic: 0.37287, pvalue:0.00000]","[test statisic: 0.36617, pvalue:0.00000]","[test statisic: 0.34863, pvalue:0.00000]"


## EUCLIDEAN

In [None]:
cols = ['Gender','Sentiment','Polarity', 'Readability','Emotion','Hate','Topics ','Extended Topics']
metrics = ['euclidean']

hashtags = ['blm','nhs']

for hashtag in hashtags:
    for col in cols:
        for m in metrics:
            df = pd.DataFrame( [inf[col+m], best_form[col+m]]).T 
            df.columns= ['Infector Similarity','Best Informer Similarity']
            df.plot( figsize = (10,5), title = f'Similarity comparison of {col} with {m} distance for {hashtag}', linestyle='-')

## COSINE

In [None]:
cols = ['Gender','Sentiment','Polarity', 'Readability','Emotion','Hate','Topics ','Extended Topics']
metrics = ['cosine']

hashtags = ['blm','nhs']

for hashtag in hashtags:
    for col in cols:
        for m in metrics:
            df = pd.DataFrame( [inf[col+m], best_form[col+m]]).T 
            df.columns= ['Infector Similarity','Best Informer Similarity']
            df.plot( figsize = (10,5), title = f'Similarity comparison of {col} with {m} distance for {hashtag}', linestyle='-')

## OLD

In [None]:
cols = ['Gender','Sentiment','Polarity', 'Readability','Emotion','Hate','Topics ','Extended Topics']
metrics = ['euclidean','cosine']

for col in cols:
    for m in metrics:
        df = pd.DataFrame( [inf[col+m], best_form[col+m]]).T 
        df.columns= ['Infector Similarity','Best Informer Similarity']
        df.plot( figsize = (5,10), title = f'Similarity comparison of {col} with {m} distance', linestyle='-')

In [None]:
for hashatg in hashtags:

    cols = ['metric','Gender','Sentiment','Polarity', 'Readability','Emotion','Hate','Topics ','Extended Topics']
    cols2 = target.columns.tolist()

    hashtag_infector_df[hashtag_infector_df['hashtag']==hashtag].copy().drop('hashtag', inplace = True)
    hashtag_av_informer_df[hashtag_av_informer_df['hashtag']==hashtag].copy().drop('hashtag', inplace = True)
    hashtag_all_best_informer_df[hashtag_all_best_informer_df['hashtag']==hashtag].copy().drop('hashtag', inplace = True)

    infector_df = hashtag_infector_df[hashtag_infector_df['hashtag']==hashtag].copy().drop('hashtag', inplace = True)
    av_informer_df = hashtag_av_informer_df[hashtag_av_informer_df['hashtag']==hashtag].copy().drop('hashtag', inplace = True)
    best_informer_df = hashtag_all_best_informer_df[hashtag_all_best_informer_df['hashtag']==hashtag].copy().drop('hashtag', inplace = True)

    inf = {}
    av_form = {}
    best_form = {}

    cols = ['Gender','Sentiment','Polarity', 'Readability','Emotion','Hate','Topics ','Extended Topics']
    metrics = ['euclidean','cosine']

    for col in cols:

        for m in metrics:

            m_df = infector_df[ infector_df['metric'] ==m ].copy()
            inf.update( {col+m: m_df[col].tolist() } )

            n_df = av_informer_df[ av_informer_df['metric'] ==m ].copy()
            av_form.update( {col+m: n_df[col].tolist() } )

            o_df = best_informer_df[ best_informer_df['metric'] ==m ].copy()
            best_form.update( {col+m:  o_df[col].tolist() } )
            

In [None]:
# infector_diff = pd.DataFrame(infector_diff_df)
# infector_diff.columns = cols2
# av_informer_diff = pd.DataFrame(av_informer_diff_df)
# av_informer_diff.columns = cols2
# best_informer_diff = pd.DataFrame(best_informer_diff_df)
# best_informer_diff.columns = cols2