In [1]:
import sys
import psutil
import numpy as np
import pandas as pd
import plotly.io as pio
from collections import Counter
import dataframe_image as dfi
import plotly.graph_objects as go


In [2]:
# !{sys.executable} -m pip install  -U imgkit


# Acoounts 

In [3]:
class accounts_split:
    
    def __init__(self, version):
        self.version = version
        self.langs = ['ar', 'en', 'fr', 'es']
        self.master_file = pd.read_excel(f'{self.version}/data_collection/BBCM - Masterfile.xlsx', engine='openpyxl')
        self.fb_collectable = pd.read_csv(f'{self.version}/data_collection/fb-accounts-v3-ct-imported-pages.csv')
        self.tw_collectable = pd.read_csv(f'{self.version}/data_collection/tw-accounts-v3-collectable.csv')
        self.fb_accounts_collections = pd.read_csv(f'{self.version}/data_collection/fb-v3-post-collections-20210402.csv')
        self.tw_accounts_collections = pd.read_csv(f'{self.version}/data_collection/tw-v3-tweet-collections.csv')
        self.consolidated_accounts_collection = pd.read_csv(f'{self.version}/data_collection/store-all-messages-v3.csv')
        self.acc_sum = self.accounts_summary(self.master_file, self.fb_collectable, self.tw_collectable, self.tw_accounts_collections, self.fb_accounts_collections, self.consolidated_accounts_collection)
        
    def annotate(self, folder):
        self.fig = self.accounts_annotation(self.acc_sum)
        pio.write_image(self.fig, f"{self.version}/{folder}/accounts_collection_split_v3.jpeg")
    
    def create_dict(self, lst):
        res_dct = {}
        for i,t in enumerate(lst):
            res_dct[t] = i
        return res_dct
        


    def accounts_summary(self, master, tw, fb, tw_coll, fb_coll, lang_coll):
        all_accounts = len(master.iloc[:,:22].iloc[:563])
        fb_accounts = len(fb)
        tw_accounts = len(tw)
        tw_collections = len(tw_coll)
        fb_collections = len(fb_coll)
        collectable = fb_accounts + tw_accounts
        not_collectable = all_accounts - collectable
        self.acc_coll_sum = self.accounts_collection_summary(lang_coll)

        return [all_accounts, fb_accounts, tw_accounts, collectable, not_collectable, tw_collections, fb_collections,  tw_collections + fb_collections] + self.acc_coll_sum

    def accounts_collection_summary(self, df):
        lang_collection = []
        for lang in self.langs:
            lang_collection.append(len(df[df['M52/language'] == lang]))
        lang_collection.append(np.sum(lang_collection))
        print(lang_collection)
        return lang_collection
    
    def account_prefixes(self, acc_sum, account_sources):
        annotated = []
        filtered_collections = self.account_collection_prefixes(self.acc_coll_sum, account_sources[-4:])
        annotated.append(f'{account_sources[0]} - {acc_sum[0]}')
        annotated.append(f'{account_sources[1]} - {acc_sum[3]}')
        annotated.append(f'{account_sources[2]} - {acc_sum[4] - 2}')
        annotated.append(f'{account_sources[3]} - {acc_sum[1]} ')
        annotated.append(f'{account_sources[4]} - {acc_sum[2]} ')
        annotated.append(f'{account_sources[5]} - {acc_sum[5]} ')
        annotated.append(f'{account_sources[6]} - {acc_sum[6]} ')
        annotated.append(f'{account_sources[7]} - {acc_sum[7]} ')
        return annotated + filtered_collections


    def account_collection_prefixes(self, acc_coll_sum, sources):
        annotated = []
        annotated.append(f'{sources[0]} - {acc_coll_sum[0]}')
        annotated.append(f'{sources[1]} - {acc_coll_sum[1]}')
        annotated.append(f'{sources[2]} - {acc_coll_sum[2]}')
        annotated.append(f'{sources[3]} - {acc_coll_sum[3]}')

        return annotated 
    
    def accounts_annotation(self, acc_sum):
        account_sources = ["source_accounts", "collectable", "not collectable", "Twitter accounts", "Facebook accounts", "Twitter collection", "Facebook collection", "Consolidate", "Arabic", "English", "Spanish", "French"]
        account_sources_map = self.create_dict(account_sources)
        account_labels = self.account_prefixes(acc_sum, account_sources)
        fig = go.Figure(data=[go.Sankey(
            arrangement = "snap",
            valueformat = ".0f",
            node = dict(
              pad = 15,
              thickness = 20,
              line = dict(color = "black", width = 0.5),
              label = account_labels,
              x =  [0, 0.22, 0.1, 0,  0,  0.61, 0.5, 0.84, 1, 1, 1, 1 ],
              y =  [0, 0.1,   1,  0, 0.9,  0.1, 0.7, 0.3, 0.6, 0.7, 0.8, 1 ],
            ),
            link = dict(
              source = [account_sources_map[account_sources[0]], account_sources_map[account_sources[0]], account_sources_map[account_sources[1]], account_sources_map[account_sources[1]], account_sources_map[account_sources[3]], account_sources_map[account_sources[4]], account_sources_map[account_sources[5]], account_sources_map[account_sources[6]], account_sources_map[account_sources[7]], account_sources_map[account_sources[7]], account_sources_map[account_sources[7]],  account_sources_map[account_sources[7]]  ], 
              target = [account_sources_map[account_sources[1]], account_sources_map[account_sources[2]], account_sources_map[account_sources[3]], account_sources_map[account_sources[4]], account_sources_map[account_sources[5]], account_sources_map[account_sources[6]], account_sources_map[account_sources[7]], account_sources_map[account_sources[7]], account_sources_map[account_sources[8]], account_sources_map[account_sources[9]], account_sources_map[account_sources[10]], account_sources_map[account_sources[11]]  ],
              value =  [acc_sum[0]/acc_sum[0],                   acc_sum[4]/acc_sum[0],                   acc_sum[2]/acc_sum[0],                   acc_sum[1]/acc_sum[0],                   acc_sum[5]/acc_sum[7],                   acc_sum[6]/acc_sum[7],                   acc_sum[5]/acc_sum[7],                   acc_sum[6]/acc_sum[7],                   acc_sum[8]/acc_sum[12],                  acc_sum[9]/acc_sum[12],                  acc_sum[10]/acc_sum[12],                  acc_sum[11]/acc_sum[12]   ]
          ))])

        fig.update_layout(title_text="New accounts collection", font_size=12)
        fig.show()
        return fig


# Keyword annotators

In [4]:
class keyword_annotator_split:
    
    def __init__(self, version):
        self.version = version
        self.langs = ['ar', 'en', 'fr', 'es']
        self.lang_map = {'ar': 'Arabic', 'en': 'English', 'es': 'Spanish', 'fr': 'French'}
        self.language_data = pd.read_csv(f'{self.version}/keyword_annotations/store-all-messages.csv')
        self.lang_sum = self.language_summary(self.language_data)
 
        
    def annotate(self, folder):
        langs_analysis = []
        for lang in self.langs:
            annotated_data = pd.read_csv(f'{self.version}/keyword_annotations/store-all-messages-annotated-{self.version}-{lang}.csv')
            self.themes = [t for t in list(annotated_data.columns) if t !='M52/language']
            self.themes_mapping =  {'keyword.match.theme/chinese-culture-and-people':'chinese culture and people',
                                     'keyword.match.theme/covid-19':'covid-19',
                                     'keyword.match.theme/economy':'economy',
                                     'keyword.match.theme/environment':'environment',
                                     'keyword.match.theme/geopolitics':'geopolitics',
                                     'keyword.match.theme/military-and-security':'military and security',
                                     'keyword.match.theme/politics-and-society':'politics and society',
                                     'keyword.match.theme/technology':'technology'}
            # rename columns
            annotated_renamed = self.rename_columns(annotated_data)
            # count the number of themes for each language 
            self.annotation_count(annotated_renamed)  
            self.unannotated_count(annotated_renamed)
            themes_count = self.theme_count
            # create sankey diagram, source prefixes is to generate output data such as econmy - 100 etc
            self.fig = self.keyword_annotation(self.lang_sum['all'], lang, self.lang_sum[lang], themes_count)
            pio.write_image(self.fig, f"{self.version}/{folder}/keyword_annotator_split_{self.version}_{lang}.png")
        
            
    def language_summary(self, language_data):
        langs_proportions = {}
        for lang in self.langs:
            langs_proportions[lang] = len(language_data[language_data['M52/language'] == lang])
        langs_proportions['all'] = len(language_data)
        return langs_proportions
        

    def rename_columns(self, df):
        renamed = df.rename(columns = self.themes_mapping).drop('M52/language', axis = 1)
        return renamed

    def annotation_count(self, df):
        self.theme_count = {}
        for t in list(self.themes_mapping.values()):
            self.theme_count[t] = df[t].value_counts()[1]
        return self.theme_count

    def unannotated_count(self, df):
        self.theme_count['unannotated'] = len(df[(~df).all(axis=1)])
        return self.theme_count
    
    # map between annotation counts and list of sources     
    def source_prefixes(self, all_proportion, proportion, sources, annotation_count):
        annotated = []
        annotated.append(f'{sources[0]} - {proportion} - {round((proportion/all_proportion)*100)}%')
        for i, ant in enumerate(sources[1:-1]):
            annotated.append(f'{ant} - {annotation_count[sources[i+1]]} - {round((annotation_count[sources[i+1]]/proportion)*100)}%')        
        return annotated + [f'{sources[-1:][0]} - {all_proportion}']
    
    
    def keyword_annotation(self, all_proportion, lang, porportion, annotation_count):
        sources = [ lang, "economy", "politics and society", "covid-19", "technology", "environment", "military and security", "geopolitics", "chinese culture and people", "unannotated", 'source data']    
        sources2idx = {k:v for v, k in enumerate(sources)}
        idx2sources = {k:v for v, k in sources2idx.items()}
        labels = self.source_prefixes(all_proportion,  porportion, sources, annotation_count)
        fig = go.Figure(data=[go.Sankey(
              arrangement = "snap",
              valueformat = ".0f",
              node = dict(
              pad = 15,
              thickness = 20,
              line = dict(color = "black", width = 0.5),
              label = labels,
              x = [0.3, 1, 1, 1, 1, 1, 1, 1, 1, 1],
              y = [0.2, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.7, 1],
            ),
            
            link = dict(
              source = [sources2idx[idx2sources[sources2idx['source data']]], sources2idx[idx2sources[sources2idx[lang]]],                      sources2idx[idx2sources[sources2idx[lang]]],                       sources2idx[idx2sources[sources2idx[lang]]],                                   sources2idx[idx2sources[sources2idx[lang]]],                          sources2idx[idx2sources[sources2idx[lang]]],                                         sources2idx[idx2sources[sources2idx[lang]]],                                    sources2idx[idx2sources[sources2idx[lang]]],                         sources2idx[idx2sources[sources2idx[lang]]],                          sources2idx[idx2sources[sources2idx[lang]]]                          ], 
              target = [sources2idx[idx2sources[sources2idx[lang]]],          sources2idx[idx2sources[sources2idx['economy']]],                 sources2idx[idx2sources[sources2idx['covid-19']]],                 sources2idx[idx2sources[sources2idx['politics and society']]],                 sources2idx[idx2sources[sources2idx['geopolitics']]],                 sources2idx[idx2sources[sources2idx['chinese culture and people']]],                 sources2idx[idx2sources[sources2idx['military and security']]],                 sources2idx[idx2sources[sources2idx['technology']]],                 sources2idx[idx2sources[sources2idx['environment']]],                 sources2idx[idx2sources[sources2idx['unannotated']]]                 ],
              value =  [porportion/all_proportion,                            annotation_count[idx2sources[sources2idx['economy']]]/porportion, annotation_count[idx2sources[sources2idx['covid-19']]]/porportion, annotation_count[idx2sources[sources2idx['politics and society']]]/porportion, annotation_count[idx2sources[sources2idx['geopolitics']]]/porportion, annotation_count[idx2sources[sources2idx['chinese culture and people']]]/porportion, annotation_count[idx2sources[sources2idx['military and security']]]/porportion, annotation_count[idx2sources[sources2idx['technology']]]/porportion, annotation_count[idx2sources[sources2idx['environment']]]/porportion, annotation_count[idx2sources[sources2idx['unannotated']]]/porportion ]
          ))])
        fig.update_layout(title_text=f"{self.lang_map[lang]} keyword annotation proportion {self.version}", font_size=12)
        fig.show()
        return fig


# Heatmap

In [5]:
# create a mask to remove half of the plot
class overlap_heatmap:
    
    def __init__(self, version):
        self.version = version
        self.langs = ['ar', 'en', 'fr', 'es']
        self.lang_map = {'ar': 'Arabic', 'en': 'English', 'es': 'Spanish', 'fr': 'French'}
        self.language_data = pd.read_csv(f'{version}/keyword_annotations/store-all-messages.csv')
        
    def annotate(self, version, folder):
        langs_analysis = []
        hms = []
        for lang in self.langs:
            annotated_data = pd.read_csv(f'{version}/keyword_annotations/store-all-messages-annotated-{self.version}-{lang}.csv')
            self.themes = [t for t in list(annotated_data.columns) if t !='M52/language']
            self.themes_mapping =  {'keyword.match.theme/chinese-culture-and-people':'chinese culture and people',
                                     'keyword.match.theme/covid-19':'covid-19',
                                     'keyword.match.theme/economy':'economy',
                                     'keyword.match.theme/environment':'environment',
                                     'keyword.match.theme/geopolitics':'geopolitics',
                                     'keyword.match.theme/military-and-security':'military and security',
                                     'keyword.match.theme/politics-and-society':'politics and society',
                                     'keyword.match.theme/technology':'technology'}
            annotated_renamed = self.rename_columns(annotated_data).drop('M52/language', axis = 1)
            df = self.analyze_overlap(annotated_renamed)
            self.build_heatmap(version, folder, df, lang)
            
        
            
    def analyze_overlap(self, df):
        df = (df.T @ df) + ((0-df.T)@(0-df))
        df[:] = np.tril(df.values, k=0)
        return df
    
    
    def color_white(self, val):
        """Color the nan text white"""
        if val == 0:
            return 'color: white'

    def color_white_background(self, val):
        if val == 0:
            return 'background-color: white'

    def rename_columns(self, df):
        renamed = df.rename(columns = self.themes_mapping)
        return renamed

    def build_heatmap(self, version, folder, df, lang):
        fig = df.style.background_gradient(cmap ='viridis')\
            .set_properties(**{'font-size': '20px'}).applymap(lambda x: self.color_white_background(x)).applymap(lambda x: self.color_white(x))
        fig.set_caption(f'{self.lang_map[lang]} overlap heatmap {self.version}').export_png(f'{self.version}/{folder}/{self.lang_map[lang]}_heatmap.png')
        display(fig)
        return fig


# Project level

In [6]:
version = 'v1'
class project_level:
    
    def __init__(self):
        self.dc = accounts_split(version)
        self.ka = keyword_annotator_split(version)
        self.hm = overlap_heatmap(version)
        
    def accounts_split(self, folder):
        return self.annotate(folder)
         
    def keyword_annotator_split(self, folder):
        self.ka.annotate(folder)
        
    def heatmap(self, folder):
        return self.hm.annotate(version, folder)
    
        

In [11]:
pl = project_level()     
# pl.keyword_annotator_split('sankey_diagrams')
# hms = pl.heatmap('heatmap')


[14994, 145052, 10750, 13356, 184152]


Unnamed: 0,politics and society,technology,covid-19,geopolitics,environment,military and security,economy,chinese culture and people
politics and society,3201,0,0,0,0,0,0,0
technology,1438,1857,0,0,0,0,0,0
covid-19,1931,1738,5129,0,0,0,0,0
geopolitics,2281,1764,2987,6604,0,0,0,0
environment,1711,1354,2261,2841,3916,0,0,0
military and security,1520,1719,1809,2039,1475,2222,0,0
economy,1353,1221,1600,2040,1425,1288,3031,0
chinese culture and people,1479,1716,1825,2278,1456,1777,1336,2600


Unnamed: 0,politics and society,technology,covid-19,geopolitics,environment,military and security,economy,chinese culture and people
politics and society,9276,0,0,0,0,0,0,0
technology,60,1296,0,0,0,0,0,0
covid-19,585,91,10094,0,0,0,0,0
geopolitics,4399,101,1733,15195,0,0,0,0
environment,22,15,32,34,656,0,0,0
military and security,353,33,163,722,8,2871,0,0
economy,1305,360,1124,3085,65,389,10236,0
chinese culture and people,2460,42,346,669,293,93,544,5404


Unnamed: 0,politics and society,technology,covid-19,geopolitics,environment,military and security,economy,chinese culture and people
politics and society,541,0,0,0,0,0,0,0
technology,35,260,0,0,0,0,0,0
covid-19,75,58,491,0,0,0,0,0
geopolitics,134,55,208,810,0,0,0,0
environment,2,0,0,0,13,0,0,0
military and security,26,92,26,37,0,114,0,0
economy,150,70,84,181,0,31,1011,0
chinese culture and people,55,15,15,38,0,7,42,387


Unnamed: 0,politics and society,technology,covid-19,geopolitics,environment,military and security,economy,chinese culture and people
politics and society,775,0,0,0,0,0,0,0
technology,0,28,0,0,0,0,0,0
covid-19,16,0,773,0,0,0,0,0
geopolitics,620,6,128,1892,0,0,0,0
environment,4,3,0,11,62,0,0,0
military and security,17,0,10,29,0,141,0,0
economy,150,6,39,275,0,25,1032,0
chinese culture and people,95,2,12,103,15,15,66,709


In [265]:

fig = go.Figure(go.Sankey(
    arrangement = "snap",
    node = {
        "label": ["A", "B", "C", "D", "E", "F"],
        "x": [0, 0, 0, 0, 0, 0.9],
        "y": [0, 0, 0, 0, 0, 0.8],
        'pad':10},  # 10 Pixels
    link = {
        "source": [1.8, 0, 1, 2, 5, 4, 3, 5],
        "target": [5, 3, 4, 3, 0, 2, 2, 3],
        "value": [1, 2, 1, 1, 1, 1, 1, 2]}))

fig.show()

In [259]:
# ['ar - 16528 - 10%', 'economy - 3200 - 19%', 'politics and society - 6446 - 39%', 'covid-19 - 5417 - 33%', 'technology - 2402 - 15%', 'environment - 4402 - 27%', 'military and security - 2450 - 15%', 'geopolitics - 6605 - 40%', 'chinese culture and people - 3378 - 20%', 'unannotated - 4106 - 25%', 'source data - 160276']



['ar - 16528 - 10%',
 'economy - 3200 - 19%',
 'politics and society - 6446 - 39%',
 'covid-19 - 5417 - 33%',
 'technology - 2402 - 15%',
 'environment - 4402 - 27%',
 'military and security - 2450 - 15%',
 'geopolitics - 6605 - 40%',
 'chinese culture and people - 3378 - 20%',
 'unannotated - 4106 - 25%',
 'source data - 160276']

In [260]:
# sources = ['source data', 'lang']
# def create_sources(inp, out, sources ):
#     return inp*[sources[0]] + out*['lang']

In [None]:
import numpy as np
foo_array = [38,26,14,55,31,0,15,8,0,0,0,18,40,27,3,19,0,49,29,21,5,38,29,17,16]
foo = np.array(foo_array)
# Compute the median of the non-zero elements
m = np.median(foo[foo > 0])
# Assign the median to the zero elements 
foo[foo == 0] = m

In [None]:
m


In [None]:
x =  np.tril(df.values, k=0)
x[x == 0] = 1
x

In [None]:
version = 'v2'
lang = 'fr'
def analyze_overlap(df):
    return (df.T @ df) + ((0-df.T)@(0-df))

def color_white(val):
        """Color the nan text white"""
        if val == 0:
            return 'color: white'

def color_white_background(val):
        if val == 0:
            return 'background-color: white'

def rename_columns(df):
    renamed = df.rename(columns = themes_mapping)
    return renamed
annotated_data = pd.read_csv(f'{version}/keyword_annotations/store-all-messages-annotated-{version}-{lang}.csv')
themes = [t for t in list(annotated_data.columns) if t !='M52/language']
themes_mapping =  {'keyword.match.theme/chinese-culture-and-people':'chinese culture and people',
                                     'keyword.match.theme/covid-19':'covid-19',
                                     'keyword.match.theme/economy':'economy',
                                     'keyword.match.theme/environment':'environment',
                                     'keyword.match.theme/geopolitics':'geopolitics',
                                     'keyword.match.theme/military-and-security':'military and security',
                                     'keyword.match.theme/politics-and-society':'politics and society',
                                     'keyword.match.theme/technology':'technology'}
annotated_renamed = rename_columns(annotated_data).drop('M52/language', axis = 1)
df = analyze_overlap(annotated_renamed)

df[:] = np.tril(df.values, k=0)
fig = df.style.background_gradient(cmap ='viridis').set_properties(**{'font-size': '20px'}).applymap(lambda x: color_white_background(x)).applymap(lambda x: color_white(x))
display(fig)

In [None]:
import numpy as np; np.random.seed(0)
import seaborn as sns; sns.set_theme()
from matplotlib import pyplot as plt
fig_dims = (15, 10)
# create mask 
mask = np.triu(np.ones_like(df, dtype=bool), k=1)
sns.set(font_scale=1.5)
#  add white space instead of the masked 
with sns.axes_style("white"):
    fig, ax = plt.subplots(figsize=fig_dims)
    ax = sns.heatmap(df, annot=True, fmt="d", mask= mask, annot_kws={"size": 15}, square=True)


In [None]:
len(annotated_renamed[annotated_renamed['politics and society']&annotated_renamed['geopolitics']])

In [None]:
len(annotated_renamed[annotated_renamed['environment'] & annotated_renamed['chinese culture and people']])


In [None]:
# test heatmap
version = 'v2'
lang = 'fr'
def rename_columns(df):
        renamed = df.rename(columns = themes_mapping)
        return renamed
def unannotated_count(df):
    return len(df[(~df).all(axis=1)])
annotated_data = pd.read_csv(f'{version}/keyword_annotations/store-all-messages-annotated-{version}-{lang}.csv')
themes = [t for t in list(annotated_data.columns) if t !='M52/language']
themes_mapping =  {'keyword.match.theme/chinese-culture-and-people':'chinese culture and people',
                                     'keyword.match.theme/covid-19':'covid-19',
                                     'keyword.match.theme/economy':'economy',
                                     'keyword.match.theme/environment':'environment',
                                     'keyword.match.theme/geopolitics':'geopolitics',
                                     'keyword.match.theme/military-and-security':'military and security',
                                     'keyword.match.theme/politics-and-society':'politics and society',
                                     'keyword.match.theme/technology':'technology'}
annotated_renamed  = rename_columns(annotated_data).drop('M52/language', axis = 1)
len(annotated_renamed[annotated_renamed['geopolitics'] & annotated_renamed['politics and society']])
# df = unannotated_count(annotated_renamed)

# debug accounts 

In [None]:
# f = pd.read_csv('bbcm-china-source-data_fb-accounts-v3-ct-imported-pages.csv')
# x= f['crowdtangle.account/id']
# l = pd.read_excel('BBCM - Masterfile.xlsx', engine='openpyxl')
# l = l.iloc[:,0:22].fillna('None')
# l = l[l['M52/platform']=='Facebook']
# l[l['M52/accountId']=='crowdtangle-3701035']
# y = l['M52/accountId'].str.replace('crowdtangle-','')
# y = y[:196] 
# b = [int(i) for i in y ]
# c = [int(j) for j in x ]
# for i, j in zip(sorted(b),sorted(c)):
#         print(i,j)


In [None]:
# fb_collectable = pd.read_csv('data_collection/fb-accounts-v3-ct-imported-pages.csv')
# tw_collectable = pd.read_csv('data_collection/tw-accounts-v3-collectable.csv')
# fb_accounts_collections = pd.read_csv('data_collection/fb-v3-post-collections-20210402.csv')
# tw_accounts_collections = pd.read_csv('data_collection/tw-v3-tweet-collections.csv')
# consolidated_accounts_collection = pd.read_csv('data_collection/store-all-messages-v3.csv')