In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/My Drive/MSc_project/.MAIN

Mounted at /content/drive
/content/drive/My Drive/MSc_project/.MAIN


In [2]:
import pandas as pd 
import pickle
import json
import os
import os.path
import numpy as np
import seaborn as sns
from collections import defaultdict
import joblib
import matplotlib.pyplot as plt
from matplotlib.ticker import (
                               AutoMinorLocator,
                               FuncFormatter,
                               )
import matplotlib.dates as mdates
from matplotlib.dates import DateFormatter
import torch
import re
%matplotlib inline

import os

os.chdir('G:\My Drive\MSc_project\.MAIN')

c:\Users\berke\anaconda3\envs\env-pytorch\lib\site-packages\numpy\.libs\libopenblas.EL2C6PLE4ZYW3ECEVIV3OXXGRN2NRFM2.gfortran-win_amd64.dll
c:\Users\berke\anaconda3\envs\env-pytorch\lib\site-packages\numpy\.libs\libopenblas.JPIJNSWNNAN3CE6LLI5FWSPHUT2VXMTH.gfortran-win_amd64.dll


## analysis

In [None]:
class Analyzer(object):
    def __init__(self):

        # self.hashtags = ['avengers','gaza','borisjohnson','brexit','climatechange','covid','gaza','loveisland','monkeypox','nhs','olivianewtonjohn','supercup','UkraineWar']
        self.hashtags = ['blm']

    @staticmethod
    def load_ms_cases(hashtag):
        path = f'tweets{os.path.sep}{hashtag}{os.path.sep}{hashtag}_ms_cases.json'
        with open(path) as jf:
            data = json.load(jf)
        return data


    @staticmethod
    def sort_cols(df):
        emos = ['others','joy','surprise','disgust','anger','fear','sadness']
        for emo in emos:
            df[emo] = df.apply( lambda row: re.search(f'{emo}: (\d*\.?\d+)', row.emo_output).group(1), axis=1)
            
        hates = ['hateful','targeted','aggressive']
        for hate in hates:
            df[hate] = df.apply( lambda row: re.search(f'{hate}: (\d*\.?\d+)', row.hate_output).group(1), axis=1)
        
        return df



    def load_scores_df(self,hashtag):
        save_path_all = f'tweets/{hashtag}/{hashtag}_TWEETS_scores.csv'
        df =  pd.read_csv(save_path_all)
        df.tweet_id.drop_duplicates(keep='first',inplace=True)
        df.set_index('tweet_id', inplace = True)


        save_path = f'tweets/{hashtag}/{hashtag}_infector_scores.csv'
        i_df =  pd.read_csv(save_path)
        i_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        i_df.set_index('tweet_id', inplace = True)


        save_gender = f'tweets/{hashtag}/{hashtag}_gender_scores.csv'
        g_df =  pd.read_csv(save_gender)
        g_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        g_df.drop(['text'], axis=1)
        g_df.set_index('tweet_id', inplace = True)

        fear_path = f'tweets/{hashtag}/{hashtag}_fear_scores.csv'
        fear_df =  pd.read_csv(fear_path)
        fear_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        fear_df.drop(['text','user_id'], axis=1,inplace=True)
        fear_df.set_index('tweet_id', inplace = True)

        sub_all =pd.concat( [df,i_df],axis=0 ) # adding the infector scores to the database. so new entries

        all = pd.concat( [sub_all,g_df, fear_df], axis = 1) # adding the gender scores to all types
        

        d = [ 'Unnamed: 0','clean_text','punct','tokenized','nonstop','stemmed','topic_tokens','cardiff_tokens','hate_output','emo_output','grammartext']

        
        all.drop(d,inplace=True,axis=1)

        all = all.loc[:,~all.columns.duplicated()].copy()

        print('loaded the scores')
        
        return all


    def get_the_tweets(self,hashtag):
        database = self.load_ms_cases(hashtag)
        cases = {}
        for key,value in database.items():

            inf = value['infector-info']
            k = list(inf.keys())[0]
            infector = inf[k]


            informers = [ int(inf['id']) for inf in value['informers-data']]

            dum =  {   int(key): { 'target':int(key) , 'infector': int(infector['id']), 'informers':informers } } # get the case breakdown of target infector informer
            cases.update( dum  )
            
        print('loaded the MS cases')
        return cases 
                   

    @staticmethod
    def arange_case(scores,cases):
        all = []
        target_count = 0 
        infector_count = 0 
        informer_count = 0

        count=0

        for key,value in cases.items():
            # each row is organised as follows
            # target id.    is target,    is infector, is informer, score

            target = key
            infector = value['infector']
            all_informers = [informer for informer in value['informers'] ]

            if target in scores:
                target_count += 1

            if infector in scores:
                infector_count += 1

            for informer in all_informers:
                if informer  in scores:
                    informer_count += 1
                

            # if (all(target, and infector in scores) and (any(informers in scores)) :
            if ( target and infector in scores ) and any( informer in scores.keys() for informer in all_informers ):
                count+=1
                
                all.append( [ key, 1, 0 ,0] + scores[key]  )                 # target row

                all.append( [ key, 0, 1 ,0] + scores[infector]  )         # infector row

                informers = [informer for informer in all_informers if informer in scores]

                for informer in informers:
                    all.append( [ key, 0, 0, 1] +  scores[informer]  )

        print(f'Out of {len(cases)} MS cases \nWe found. We got the scores for {count} of them.')
        print(f'target count: {target_count}')
        print(f'infector count: {infector_count}')
        print(f'informer count: {informer_count}')

        return all   



    def create_ms(self):

        for i, hashtag in enumerate(self.hashtags):

            #load scores
            scores_df = self.load_scores_df(hashtag) # these are indexed by integer number 1,2,3,4. Tweet ids here are integers.
            cases = self.get_the_tweets(hashtag) # indexed by tweet ids. INTEGER
            
            # index the df with the tweet id
            scores_df.index.drop_duplicates(keep='first')
            scores = scores_df.T.to_dict('list')

            h_list = self.arange_case(scores,cases)
            n = len(h_list)

            old_cols = scores_df.copy().columns.tolist()
            cols = ['hashtag','target','is-target','is-infector','is-informer'] + old_cols

            hdf = pd.DataFrame(h_list)

            hdf.insert(0,column = 'hashtag', value= [hashtag]*n )

            hdf.columns = cols

            if i== 0:
                df  = hdf.copy()
            else:
                df = pd.concat( [df,hdf], ignore_index=True, axis = 0)
                print(f'added {hashtag} to database\n')


            hdf.to_csv(f'multisource_analysis/data/{hashtag}_scored_tweets.csv')

        # df.to_csv('multisource_analysis/multisource_database.csv')


        return df

a = Analyzer()

df = a.create_ms()


loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 3862 MS cases 
We found. We got the scores for 3862 of them.
target count: 3862
infector count: 3862
informer count: 25355


In [None]:
df.head(5)

Unnamed: 0,hashtag,target,is-target,is-infector,is-informer,text,user_id,polarity,subjectivity,text_len,...,surprise,disgust,hateful,targeted,aggressive,grammar-sentence-score,gender,num_male,num_female,fear
0,avengers,1555684437077430272,1,0,0,"RT @Ickypoo82: ""Accept defeat!""\n\n@PlayAvenge...",1241298027731542019,0.0,0.0,97,...,0.003388,0.003023,0.00579,0.003569,0.003724,,1.0,100.0,0.0,0.001475
1,avengers,1555684437077430272,0,1,0,"""Accept defeat!""\n\n@PlayAvengers #PS5Share #M...",1397755845446877184,0.0,0.0,172,...,0.003122,0.00376,0.006412,0.003332,0.003664,0.0,1.0,100.0,0.0,0.001484
2,avengers,1555684437077430272,0,0,1,"Yes, I would call that a party foul!\nGame: @P...",1253824887166296065,-0.4,0.4,159,...,0.003085,0.00251,0.009454,0.002653,0.003465,,1.0,97.0,3.0,0.00149
3,avengers,1555684437077430272,0,0,1,RT @PlayAvengers616: “Shakespeare in The Park?...,1526161221770174464,0.2,0.7,91,...,0.003606,0.002124,0.008906,0.00456,0.002507,,1.0,89.0,11.0,0.001406
4,avengers,1555684437077430272,0,0,1,RT @swingwithspidey: Miles Morales 🕸\n.\n🎮: Sp...,1419712116425101318,0.0,0.0,86,...,0.002849,0.002674,0.005349,0.004129,0.003674,,1.0,99.0,1.0,0.001372


# USER SCORES

In [None]:
class Analyzer(object):
    def __init__(self):

        self.hashtags = ['avengers','gaza', 'blm','brexit','climatechange','covid','gaza','loveisland','monkeypox','nhs','olivianewtonjohn','supercup','UkraineWar']
        # self.hashtags = ['blm']

    @staticmethod
    def load_ms_cases(hashtag):
        path = f'tweets{os.path.sep}{hashtag}{os.path.sep}{hashtag}_ms_cases.json'
        with open(path) as jf:
            data = json.load(jf)
        return data


    def load_scores_df(self,hashtag):
        save_path_all = f'tweets/{hashtag}/{hashtag}_TWEETS_scores.csv'
        df =  pd.read_csv(save_path_all)
        df.tweet_id.drop_duplicates(keep='first',inplace=True)
        df.set_index('tweet_id', inplace = True)


        save_path = f'tweets/{hashtag}/{hashtag}_infector_scores.csv'
        i_df =  pd.read_csv(save_path)
        i_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        i_df.set_index('tweet_id', inplace = True)


        save_gender = f'tweets/{hashtag}/{hashtag}_gender_scores.csv'
        g_df =  pd.read_csv(save_gender)
        g_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        g_df.drop(['text'], axis=1)
        g_df.set_index('tweet_id', inplace = True)

        fear_path = f'tweets/{hashtag}/{hashtag}_fear_scores.csv'
        fear_df =  pd.read_csv(fear_path)
        fear_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        fear_df.drop(['text','user_id'], axis=1,inplace=True)
        fear_df.set_index('tweet_id', inplace = True)

        save_path = f'tweets/{hashtag}/{hashtag}_USER_scores.csv'
        u_df =  pd.read_csv(save_path)
        u_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        u_df.set_index('tweet_id', inplace = True)

        p_path = f'tweets/{hashtag}/{hashtag}_USER_POLITE_scores.csv'
        p_df =  pd.read_csv(p_path)
        p_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        p_df.set_index('tweet_id', inplace = True)     

        r_path = f'tweets/{hashtag}/{hashtag}_USER_READ_scores.csv'
        r_df =  pd.read_csv(r_path)
        r_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        r_df.set_index('tweet_id', inplace = True)        

        sub_all =pd.concat( [df,i_df],axis=0 ) # adding the infector scores to the database. so new entries

        all = pd.concat( [sub_all,g_df, fear_df, u_df, p_df, r_df], axis = 1) # adding the gender scores to all types
        

        d = [ 'Unnamed: 0','clean_text','punct','tokenized','nonstop','stemmed','topic_tokens','cardiff_tokens','hate_output','emo_output','grammartext']

        
        all.drop(d,inplace=True,axis=1)

        all = all.loc[:,~all.columns.duplicated()].copy()

        user_all = all[~all['user_ARI_mean'].isnull() ]

        print('loaded the scores')
        
        return user_all


    def get_the_tweets(self,hashtag):
        database = self.load_ms_cases(hashtag)
        cases = {}
        for key,value in database.items():

            inf = value['infector-info']
            k = list(inf.keys())[0]
            infector = inf[k]


            informers = [ int(inf['id']) for inf in value['informers-data']]

            dum =  {   int(key): { 'target':int(key) , 'infector': int(infector['id']), 'informers':informers } } # get the case breakdown of target infector informer
            cases.update( dum  )
            
        print('loaded the MS cases')
        return cases 
                   

    @staticmethod
    def arange_case(scores,cases):
        all = []
        target_count = 0 
        infector_count = 0 
        informer_count = 0

        count=0

        for key,value in cases.items():
            # each row is organised as follows
            # target id.    is target,    is infector, is informer, score

            target = key
            infector = value['infector']
            all_informers = [informer for informer in value['informers'] ]

            if target in scores:
                target_count += 1

            if infector in scores:
                infector_count += 1

            for informer in all_informers:
                if informer  in scores:
                    informer_count += 1
                

            # if (all(target, and infector in scores) and (any(informers in scores)) :
            if ( target in scores ) and ( infector in scores) and any( informer in scores.keys() for informer in all_informers ):
                count+=1
                
                all.append( [ key, 1, 0 ,0] + scores[key]  )                 # target row

                all.append( [ key, 0, 1 ,0] + scores[infector]  )         # infector row

                informers = [informer for informer in all_informers if informer in scores]

                for informer in informers:
                    all.append( [ key, 0, 0, 1] +  scores[informer]  )

        print(f'Out of {len(cases)} MS cases \nWe found. We got the scores for {count} of them.')
        print(f'target count: {target_count}')
        print(f'infector count: {infector_count}')
        print(f'informer count: {informer_count}')

        return all   



    def create_ms(self):

        for i, hashtag in enumerate(self.hashtags):

            #load scores
            scores_df = self.load_scores_df(hashtag) # these are indexed by integer number 1,2,3,4. Tweet ids here are integers.
            cases = self.get_the_tweets(hashtag) # indexed by tweet ids. INTEGER
            
            # index the df with the tweet id
            scores_df.index.drop_duplicates(keep='first')
            scores = scores_df.T.to_dict('list')

            h_list = self.arange_case(scores,cases)
            n = len(h_list)

            old_cols = scores_df.copy().columns.tolist()
            cols = ['hashtag','target','is-target','is-infector','is-informer'] + old_cols

            hdf = pd.DataFrame(h_list)

            hdf.insert(0,column = 'hashtag', value= [hashtag]*n )

            hdf.columns = cols

            if i== 0:
                df  = hdf.copy()
            else:
                df = pd.concat( [df,hdf], ignore_index=True, axis = 0)
                print(f'added {hashtag} to database\n')


            hdf.to_csv(f'multisource_analysis/user_ft_data/{hashtag}_scored_tweets2.csv')

        # df.to_csv('multisource_analysis/multisource_database.csv')


        return df

a = Analyzer()

df = a.create_ms()

In [3]:
class Analyzer(object):
    def __init__(self):

        self.hashtags = ['avengers','gaza', 'blm','brexit','climatechange','covid','gaza','loveisland','monkeypox','nhs','olivianewtonjohn','supercup','UkraineWar']
        # self.hashtags = ['blm']

    @staticmethod
    def load_ms_cases(hashtag):
        path = f'tweets{os.path.sep}{hashtag}{os.path.sep}{hashtag}_ms_cases.json'
        with open(path) as jf:
            data = json.load(jf)
        return data


    def load_scores_df(self,hashtag):
        save_path_all = f'tweets/{hashtag}/{hashtag}_TWEETS_scores.csv'
        df =  pd.read_csv(save_path_all)
        df.tweet_id.drop_duplicates(keep='first',inplace=True)
        df.set_index('tweet_id', inplace = True)


        save_path = f'tweets/{hashtag}/{hashtag}_infector_scores.csv'
        i_df =  pd.read_csv(save_path)
        i_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        i_df.set_index('tweet_id', inplace = True)


        save_gender = f'tweets/{hashtag}/{hashtag}_gender_scores.csv'
        g_df =  pd.read_csv(save_gender)
        g_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        g_df.drop(['text'], axis=1)
        g_df.set_index('tweet_id', inplace = True)

        fear_path = f'tweets/{hashtag}/{hashtag}_fear_scores.csv'
        fear_df =  pd.read_csv(fear_path)
        fear_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        fear_df.drop(['text','user_id'], axis=1,inplace=True)
        fear_df.set_index('tweet_id', inplace = True)

        save_path = f'tweets/{hashtag}/{hashtag}_USER_scores_10_feeds.csv'
        u_df =  pd.read_csv(save_path)
        u_df.tweet_id.drop_duplicates(keep='first',inplace=True)
        u_df.set_index('tweet_id', inplace = True)

        sub_all =pd.concat( [df,i_df],axis=0 ) # adding the infector scores to the database. so new entries

        all = pd.concat( [sub_all,g_df, fear_df, u_df], axis = 1) # adding the gender scores to all types
        

        d = [ 'Unnamed: 0','clean_text','punct','tokenized','nonstop','stemmed','topic_tokens','cardiff_tokens','hate_output','emo_output','grammartext']

        
        all.drop(d,inplace=True,axis=1)

        all = all.loc[:,~all.columns.duplicated()].copy()

        user_all = all[~all['user_ARI_mean'].isnull() ]

        print('loaded the scores')
        
        return user_all


    def get_the_tweets(self,hashtag):
        database = self.load_ms_cases(hashtag)
        cases = {}
        for key,value in database.items():

            inf = value['infector-info']
            k = list(inf.keys())[0]
            infector = inf[k]


            informers = [ int(inf['id']) for inf in value['informers-data']]

            dum =  {   int(key): { 'target':int(key) , 'infector': int(infector['id']), 'informers':informers } } # get the case breakdown of target infector informer
            cases.update( dum  )
            
        print('loaded the MS cases')
        return cases 
                   

    @staticmethod
    def arange_case(scores,cases):
        all = []
        target_count = 0 
        infector_count = 0 
        informer_count = 0

        count=0

        for key,value in cases.items():
            # each row is organised as follows
            # target id.    is target,    is infector, is informer, score

            target = key
            infector = value['infector']
            all_informers = [informer for informer in value['informers'] ]

            if target in scores:
                target_count += 1

            if infector in scores:
                infector_count += 1

            for informer in all_informers:
                if informer  in scores:
                    informer_count += 1
                

            # if (all(target, and infector in scores) and (any(informers in scores)) :
            if ( target in scores ) and ( infector in scores) and any( informer in scores.keys() for informer in all_informers ):
                count+=1
                
                all.append( [ key, 1, 0 ,0] + scores[key]  )                 # target row

                all.append( [ key, 0, 1 ,0] + scores[infector]  )         # infector row

                informers = [informer for informer in all_informers if informer in scores]

                for informer in informers:
                    all.append( [ key, 0, 0, 1] +  scores[informer]  )

        print(f'Out of {len(cases)} MS cases \nWe found. We got the scores for {count} of them.')
        print(f'target count: {target_count}')
        print(f'infector count: {infector_count}')
        print(f'informer count: {informer_count}')

        return all   



    def create_ms(self):

        for i, hashtag in enumerate(self.hashtags):

            #load scores
            scores_df = self.load_scores_df(hashtag) # these are indexed by integer number 1,2,3,4. Tweet ids here are integers.
            cases = self.get_the_tweets(hashtag) # indexed by tweet ids. INTEGER
            
            # index the df with the tweet id
            scores_df.index.drop_duplicates(keep='first')
            scores = scores_df.T.to_dict('list')

            h_list = self.arange_case(scores,cases)
            n = len(h_list)

            old_cols = scores_df.copy().columns.tolist()
            cols = ['hashtag','target','is-target','is-infector','is-informer'] + old_cols

            hdf = pd.DataFrame(h_list)

            hdf.insert(0,column = 'hashtag', value= [hashtag]*n )

            hdf.columns = cols

            if i== 0:
                df  = hdf.copy()
            else:
                df = pd.concat( [df,hdf], ignore_index=True, axis = 0)
                print(f'added {hashtag} to database\n')


            hdf.to_csv(f'multisource_analysis/user_ft_data/{hashtag}_scored_tweets_10_feeds.csv')

        # df.to_csv('multisource_analysis/multisource_database.csv')


        return df

a = Analyzer()

df = a.create_ms()

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 1362 MS cases 
We found. We got the scores for 1345 of them.
target count: 1351
infector count: 1356
informer count: 7955
loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 1555 MS cases 
We found. We got the scores for 1447 of them.
target count: 1455
infector count: 1545
informer count: 9160
added gaza to database

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 3862 MS cases 
We found. We got the scores for 2131 of them.
target count: 2205
infector count: 3525
informer count: 21761
added blm to database

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 3379 MS cases 
We found. We got the scores for 1987 of them.
target count: 2062
infector count: 3051
informer count: 26065
added brexit to database

loaded the scores
loaded the MS cases
Out of 1939 MS cases 
We found. We got the scores for 1571 of them.
target count: 1585
infector count: 1853
informer count: 11618


  scores = scores_df.T.to_dict('list')


added climatechange to database

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 1831 MS cases 
We found. We got the scores for 1564 of them.
target count: 1579
infector count: 1767
informer count: 10450
added covid to database

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 1555 MS cases 
We found. We got the scores for 1447 of them.
target count: 1455
infector count: 1545
informer count: 9160
added gaza to database

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 1489 MS cases 
We found. We got the scores for 1465 of them.
target count: 1467
infector count: 1487
informer count: 8580
added loveisland to database

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 4094 MS cases 
We found. We got the scores for 1673 of them.
target count: 1715
infector count: 3227
informer count: 22078
added monkeypox to database

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 5577 MS cases 
We found. We got the scores for 2329 of them.
target count: 2519
infector count: 4750
informer count: 34550
added nhs to database

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 1327 MS cases 
We found. We got the scores for 1304 of them.
target count: 1316
infector count: 1315
informer count: 6888
added olivianewtonjohn to database

loaded the scores
loaded the MS cases
Out of 3385 MS cases 
We found. We got the scores for 1688 of them.
target count: 1714
infector count: 3106
informer count: 14204


  scores = scores_df.T.to_dict('list')


added supercup to database

loaded the scores
loaded the MS cases


  scores = scores_df.T.to_dict('list')


Out of 2499 MS cases 
We found. We got the scores for 1586 of them.
target count: 1610
infector count: 2365
informer count: 14290
added UkraineWar to database

