In [3]:
import ast
import pandas as pd
import numpy as np
from collections import defaultdict
import re
from emoji import UNICODE_EMOJI
import demoji
import operator
from tqdm import tqdm


import string
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Connection settings for the tweets PostgreSQL database.
# The password is read from the environment so that it never lives in the
# notebook or its version history; the other settings can be overridden the
# same way and otherwise keep their previous values.
# NOTE(review): the password that used to be hard-coded here should be rotated.
import os

database = os.environ.get('TWEETS_DB_NAME', 'postgres')
user = os.environ.get('TWEETS_DB_USER', 'postgres')
password = os.environ.get('TWEETS_DB_PASSWORD', '')
host = os.environ.get('TWEETS_DB_HOST', '47.200.121.209')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/seancafferty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Preprocessing - PCA of TFIDF 

### Get the handles for preprocessing. In our case, we have them stored in a larger dataframe of information. 

In [4]:
# Load the handle/metadata table; rows without a handle get 0 as a sentinel
# so they can be filtered out with a plain `!= 0` comparison later on.
base = pd.read_csv('base_info_handles.csv')
base['HANDLE'] = base['HANDLE'].fillna(0)

In [1]:
#base.head()

In [6]:
# Missing handles are represented by 0 so they can be excluded below.
base['HANDLE'] = base['HANDLE'].fillna(0)
# Drop the first character of every stored handle (the lookup code elsewhere
# re-adds '@' in front of these names) to get the bare screen names per network.
list_of_rus_handles = [
    stored_handle[1:]
    for stored_handle in base.loc[(base['Source'] == 'RUS') & (base['HANDLE'] != 0), 'HANDLE'].tolist()
]
list_of_usa_handles = [
    stored_handle[1:]
    for stored_handle in base.loc[(base['Source'] == 'USA') & (base['HANDLE'] != 0), 'HANDLE'].tolist()
]

In [7]:
# Lookup table from the stored HANDLE value to the Country on the same row.
handle_dictionary = dict(zip(base['HANDLE'], base['Country']))

### Get user activity from the Tweets database

In [8]:
def get_all_user_activity(user_name):
    """Fetch every tweet written or retweeted by one account.

    Parameters
    ----------
    user_name : str
        Screen name as stored in ``twitter_profiles.screen_name``.

    Returns
    -------
    pandas.DataFrame
        All rows of ``tweets`` whose ``user_id`` is the account's id or whose
        ``tweet_id`` appears in ``retweets`` for that id.

    Raises
    ------
    KeyError
        If no profile with that screen name exists.
    """
    from urllib.parse import quote_plus
    from sqlalchemy import create_engine, text
    import pandas as pd

    # Credentials are URL-escaped so special characters cannot break the DSN.
    engine = create_engine(
        f'postgresql://{quote_plus(str(user))}:{quote_plus(str(password))}@{host}:5432/{database}'
    )
    try:
        # Bound parameters instead of string interpolation: a screen name that
        # contains a quote can no longer alter the SQL statement.
        profile = pd.read_sql_query(
            text("SELECT user_id FROM twitter_profiles WHERE screen_name = :screen_name"),
            con=engine,
            params={'screen_name': user_name},
        )
        if profile.empty:
            raise KeyError(f'No twitter profile found for screen name {user_name}')
        user_id = profile['user_id'].iloc[0]
        # Database drivers generally cannot adapt numpy scalars; unwrap to the
        # native Python value while keeping its type (int stays int, str stays str).
        if hasattr(user_id, 'item'):
            user_id = user_id.item()
        table_data = pd.read_sql_query(
            text(
                "SELECT * FROM tweets "
                "WHERE (tweet_id IN (SELECT tweet_id FROM retweets WHERE user_id = :user_id)) "
                "OR (user_id = :user_id)"
            ),
            con=engine,
            params={'user_id': user_id},
        )
    finally:
        # A new engine is created on every call, so release its connection pool.
        engine.dispose()
    return table_data

def get_network_df(embassy_list):
    """Download and stack the activity of every handle in ``embassy_list``.

    Each handle's tweets are fetched with ``get_all_user_activity`` and tagged
    with a ``Country`` column looked up in ``handle_dictionary`` under the key
    ``'@' + handle``.

    Parameters
    ----------
    embassy_list : list of str
        Screen names without the leading ``@``.

    Returns
    -------
    pandas.DataFrame
        The per-handle frames concatenated in input order (original row
        indices are kept). An empty frame is returned for an empty list.
    """
    frames = []
    for handle in tqdm(embassy_list):
        print(f'...getting data from {handle}')
        handle_df = get_all_user_activity(handle)
        handle_df['Country'] = handle_dictionary['@'+handle]
        frames.append(handle_df)
    if not frames:
        return pd.DataFrame()
    # Concatenate once at the end; growing a frame inside the loop copies all
    # previously collected rows on every iteration.
    return pd.concat(frames)

In [9]:
# df = get_network_df(list_of_usa_handles)
# df.to_csv('all_usa_tweets')
# df2 = get_network_df(list_of_rus_handles)
# df2.to_csv('all_rus_tweets')

In [10]:
#dff = get_all_user_activity('StateDept')

In [2]:
#usa = pd.read_csv('all_usa_tweets.csv')
#rus = pd.read_csv('all_rus_tweets.csv')

In [3]:
#len(usa[usa['user_id']==9624742])

In [13]:
## Get only English-language Tweets

In [15]:
# Keep only the English rows of `usa`, with their cleaned text and country label.
english_documents_usa = usa.loc[usa['lang'] == 'en', ['clean_text', 'Country']]
# The preprocessing helper reads the text from a column called 'documents'.
english_documents_usa_df = pd.DataFrame(english_documents_usa).rename(
    columns={'clean_text': 'documents'}
)

In [16]:
# Keep only the English rows of `rus`, with their cleaned text and country label.
english_documents_rus = rus.loc[rus['lang'] == 'en', ['clean_text', 'Country']]
# The preprocessing helper reads the text from a column called 'documents'.
english_documents_rus_df = pd.DataFrame(english_documents_rus).rename(
    columns={'clean_text': 'documents'}
)

In [17]:
### Preprocess Text 

In [19]:
def preprocess_text(document_df,stopword_lang):
    """Tokenise and clean the ``documents`` column of ``document_df``.

    Every document is tokenised, lower-cased, stripped of punctuation, reduced
    to purely alphabetic tokens and filtered against the NLTK stopword list
    for ``stopword_lang``.

    Parameters
    ----------
    document_df : pandas.DataFrame
        Frame with a ``documents`` column of raw text. It is modified in
        place: a ``preprocessed_text`` column is added.
    stopword_lang : str
        Name of the NLTK stopword corpus to use, e.g. ``'english'``.

    Returns
    -------
    pandas.DataFrame
        The same ``document_df`` object, with ``preprocessed_text`` holding
        one list of tokens per row. Rows without text get an empty list.
    """
    # Loop-invariant: build the punctuation table and stopword set only once.
    table = str.maketrans('','',string.punctuation)
    stop_words = set(stopwords.words(stopword_lang))

    sentences = list()
    for line in tqdm(document_df['documents'].tolist()):
        if not isinstance(line, str):
            # Missing text (NaN/None) contributes no tokens. Substituting a
            # placeholder string here would be iterated character by character
            # and leak stray single-letter tokens into the corpus.
            sentences.append([])
            continue
        stripped = (word.lower().translate(table) for word in word_tokenize(line))
        sentences.append([w for w in stripped if w.isalpha() and w not in stop_words])
    document_df['preprocessed_text'] = sentences
    return document_df

In [20]:
# Tokenise/clean the English `usa` documents; adds a 'preprocessed_text' column.
english_preprocessed_us = preprocess_text(english_documents_usa_df,'english')

100%|██████████| 311537/311537 [02:16<00:00, 2287.77it/s]


In [21]:
# Tokenise/clean the English `rus` documents; adds a 'preprocessed_text' column.
english_preprocessed_rus = preprocess_text(english_documents_rus_df,'english')

100%|██████████| 156613/156613 [01:10<00:00, 2215.92it/s]


In [16]:
## Make the tweets into TFIDF-friendly list of documents

In [23]:
def make_documents(df):
    """Merge all tokenised tweets of each country into one text document.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Country`` column and a ``preprocessed_text`` column
        holding one list of tokens per row.

    Returns
    -------
    list of str
        One space-joined document per country, in the order the countries
        first appear in ``df``.
    """
    countries = list(df.Country.unique())
    documents = []
    for country in tqdm(countries):
        country_entries = df[df['Country']==country].preprocessed_text.values
        # Join the tokens exactly once (no throw-away intermediate string).
        documents.append(" ".join(token for entry in country_entries for token in entry))
    return documents

In [24]:
# One merged document per country, built separately for each network.
documents_us = make_documents(english_preprocessed_us)
documents_rus = make_documents(english_preprocessed_rus)

100%|██████████| 162/162 [00:02<00:00, 54.73it/s]
100%|██████████| 130/130 [00:01<00:00, 101.15it/s]


In [25]:
# US documents first, then Russian ones; make_pca_tfidf builds its labels in the same order.
all_documents = documents_us + documents_rus 

In [32]:
def make_pca_tfidf(all_documents):
    """Project per-country TF-IDF vectors to 2-D and 3-D with PCA and save them.

    The documents are vectorised with uni- and bi-gram TF-IDF, reduced with two
    separate PCA fits (2 and 3 components) and written, together with
    per-country metadata, to ``tfidf_pca_df.csv``.

    Besides its argument the function reads the notebook globals
    ``english_preprocessed_us``, ``english_preprocessed_rus`` and ``base``.

    Parameters
    ----------
    all_documents : list of str
        One document per country: all US-network documents followed by all
        Russian-network documents.

    Returns
    -------
    None
        The return value of ``DataFrame.to_csv`` when given a path.
    """
    tfidfvectoriser=TfidfVectorizer(ngram_range=(1,2), max_features=10000, min_df=10)
    # PCA needs dense input. `.toarray()` gives a plain ndarray, whereas
    # `.todense()` gives an np.matrix, which recent scikit-learn refuses.
    X=tfidfvectoriser.fit_transform(all_documents).toarray()

    countries_us = list(english_preprocessed_us.Country.unique())
    countries_rus = list(english_preprocessed_rus.Country.unique())

    # Metadata rows of the accounts that actually have a handle, per network.
    # NOTE(review): these rows are paired with the countries above purely by
    # position - confirm that `base` lists countries in the same order as the
    # tweets' first appearance, otherwise continents/ISO codes are mislabelled.
    usa_rows = base[(base['Source']=='USA') & (base['HANDLE']!=0)]
    rus_rows = base[(base['Source']=='RUS') & (base['HANDLE']!=0)]

    countries = countries_us + countries_rus
    classes = ['America'] * len(countries_us) + ['Russia'] * len(countries_rus)
    binary_class = ['red'] * len(countries_us) + ['blue'] * len(countries_rus)
    continent_name = list(usa_rows['ContinentName']) + list(rus_rows['ContinentName'])
    iso = list(usa_rows['ISO_A3']) + list(rus_rows['ISO_A3'])

    pca2 = PCA(n_components=2).fit(X)
    pca3 = PCA(n_components=3).fit(X)
    data2D = pca2.transform(X)
    data3D = pca3.transform(X)
    pca_df2 = pd.DataFrame(data2D, columns = ['2D-0','2D-1'])
    pca_df3 = pd.DataFrame(data3D, columns = ['3D-0','3D-1','3D-2'])
    pca_df = pd.concat([pca_df2,pca_df3],axis = 1)
    pca_df['Country'] = countries
    pca_df['class'] = classes
    pca_df['binary_class'] = binary_class
    pca_df['continent_name'] = continent_name
    pca_df['ISO_A3'] = iso
    # Fixed colour per continent; a continent missing from this map raises KeyError.
    color_dict = {'Asia':'red','South America':'green','Central America':'lightgreen','Australia':'blue','Africa':'orange','North America':'yellow','Europe':'purple'}
    pca_df['continent_colors'] = [color_dict[cont] for cont in pca_df['continent_name']]

    return pca_df.to_csv('tfidf_pca_df.csv')


In [33]:
# Fit the TF-IDF/PCA projection and write it to 'tfidf_pca_df.csv'.
make_pca_tfidf(all_documents)

In [41]:
#pca_df.to_csv('tfidf_pca_df.csv')

In [4]:
#tfidf_pca_df = pd.read_csv('tfidf_pca_df.csv')

In [91]:
# NOTE(review): the only visible assignment of `tfidf_pca_df` is in a
# commented-out cell, so this likely fails on a fresh kernel - confirm.
len(tfidf_pca_df)

292

In [6]:
## Bert Sentences 

In [5]:
#bert = pd.read_csv('BERT_sentence_averages.csv')

In [93]:
# NOTE(review): the only visible assignment of `bert` is in a commented-out
# cell, so this likely fails on a fresh kernel - confirm.
len(bert)

292

In [249]:
# Rows x columns of the BERT table (depends on `bert` having been loaded).
bert.shape

(292, 771)

In [95]:
def make_pca_bert(df):
    """Reduce the BERT feature columns of ``df`` with PCA and save the result.

    The feature matrix is every column of ``df`` except the first one and the
    last two. It is projected with two independent PCA fits (2 and 3
    components); the projections are labelled with country, network class,
    plot colours, continent and ISO code and written to ``bert_pca_df.csv``.

    Labels come from the notebook globals ``english_preprocessed_us``,
    ``english_preprocessed_rus`` and ``base``, not from ``df`` itself.

    Returns whatever ``DataFrame.to_csv`` returns for a file path.
    """
    feature_names = list(df.columns[1:-2])
    X = df[feature_names]

    # Country labels per network, US block first.
    us_names = list(english_preprocessed_us.Country.unique())
    rus_names = list(english_preprocessed_rus.Country.unique())
    n_us, n_rus = len(us_names), len(rus_names)

    # Metadata rows of accounts that have a handle, per network.
    usa_meta = base[(base['Source']=='USA') & (base['HANDLE']!=0)]
    rus_meta = base[(base['Source']=='RUS') & (base['HANDLE']!=0)]
    continents = list(usa_meta['ContinentName']) + list(rus_meta['ContinentName'])
    iso_codes = list(usa_meta['ISO_A3']) + list(rus_meta['ISO_A3'])

    # Fit both reducers before transforming, then project.
    reducer_2d = PCA(n_components=2).fit(X)
    reducer_3d = PCA(n_components=3).fit(X)
    frame_2d = pd.DataFrame(reducer_2d.transform(X), columns = ['2D-0','2D-1'])
    frame_3d = pd.DataFrame(reducer_3d.transform(X), columns = ['3D-0','3D-1','3D-2'])

    pca_df = pd.concat([frame_2d, frame_3d], axis = 1)
    pca_df['Country'] = us_names + rus_names
    pca_df['class'] = ['America'] * n_us + ['Russia'] * n_rus
    pca_df['binary_class'] = ['red'] * n_us + ['blue'] * n_rus
    pca_df['continent_name'] = continents
    pca_df['ISO_A3'] = iso_codes

    continent_palette = {'Asia':'red','South America':'green','Central America':'lightgreen','Australia':'blue','Africa':'orange','North America':'yellow','Europe':'purple'}
    pca_df['continent_colors'] = [continent_palette[name] for name in pca_df['continent_name']]

    return pca_df.to_csv('bert_pca_df.csv')

In [96]:
# Fit the BERT/PCA projection and write it to 'bert_pca_df.csv'.
make_pca_bert(bert)

In [154]:
# Read the saved BERT/PCA projection back in; update_clusters() plots from this frame.
bert_pca_df = pd.read_csv('bert_pca_df.csv')

In [155]:
# Column check: 'Unnamed: 0' is the index column that to_csv wrote out.
bert_pca_df.columns

Index(['Unnamed: 0', '2D-0', '2D-1', '3D-0', '3D-1', '3D-2', 'Country',
       'class', 'binary_class', 'continent_name', 'ISO_A3',
       'continent_colors'],
      dtype='object')

## Test Visualizations of Semantic Vector Space 

In [265]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objects as go
from plotly.offline import plot


def update_clusters(color_scheme='network',data='BERT',n_clusters=3,dimension='2D',show_k=True, text=True):
    """Plot the PCA projection of the accounts, optionally with k-means centres.

    Parameters
    ----------
    color_scheme : {'network', 'continent'}
        Colour points by the ``binary_class`` or ``continent_colors`` column.
    data : {'BERT', 'TFIDF'}
        Plot from the global ``bert_pca_df`` or ``tfidf_pca_df`` frame.
    n_clusters : int
        Number of k-means clusters (only used when ``show_k`` is true).
    dimension : {'2D', '3D'}
        Use the ``2D-*`` columns with ``go.Scatter`` or the ``3D-*`` columns
        with ``go.Scatter3d``.
    show_k : bool
        Draw the k-means cluster centres as large green markers.
    text : bool
        Label every point with its country name.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    ValueError
        If ``color_scheme``, ``data`` or ``dimension`` is not a supported value.
    """
    # Fail early with a clear message instead of an UnboundLocalError later on.
    if data not in ('BERT', 'TFIDF'):
        raise ValueError(f"data must be 'BERT' or 'TFIDF', got {data!r}")
    if color_scheme not in ('network', 'continent'):
        raise ValueError(f"color_scheme must be 'network' or 'continent', got {color_scheme!r}")
    if dimension not in ('2D', '3D'):
        raise ValueError(f"dimension must be '2D' or '3D', got {dimension!r}")

    # Look the frames up lazily so that only the requested one has to exist.
    if data == 'BERT':
        dff = bert_pca_df
    else:
        dff = tfidf_pca_df

    marker_text = 'markers+text' if text else 'markers'

    if color_scheme == 'network':
        color_variable = dff['binary_class']
    else:
        color_variable = dff['continent_colors']

    # Everything that differs between the 2-D and the 3-D plot.
    if dimension == '2D':
        axis_columns = ['2D-0', '2D-1']
        trace_cls = go.Scatter
        point_size = 16
    else:
        axis_columns = ['3D-0', '3D-1', '3D-2']
        trace_cls = go.Scatter3d
        point_size = 6

    point_coords = {axis: dff[column] for axis, column in zip('xyz', axis_columns)}
    trace_1 = trace_cls(
        text=dff['Country'],
        textposition = 'top center',
        mode=marker_text,
        marker=dict(
            size=point_size,
            color=color_variable,                # set color to an array/list of desired values
            colorscale='spectral',   # choose a colorscale
            opacity=0.6,
            line=dict(
                    color='MediumPurple',
                    width=1
                )
        ),
        **point_coords
    )

    if show_k:
        km = KMeans(n_clusters)
        # Cluster the frame that is actually being plotted.
        km.fit(np.array(dff[axis_columns]))
        centers = km.cluster_centers_
        center_coords = {axis: centers[:, i] for i, axis in enumerate('xyz'[:len(axis_columns)])}
        trace_2 = trace_cls(
            mode='markers+text',
            marker=dict(
                size=123,
                color='green',
                opacity=0.3
            ),
            **center_coords
        )
        # Centres first so the data points are drawn on top of them.
        fig = go.Figure(data=[trace_2,trace_1])
    else:
        fig = go.Figure(data=[trace_1])

    fig.update_traces(textposition='top center')
    fig.update_layout(
            height=800,
            #title_text='TFIDF-Tweet Accounts'
        )
    # Initial viewpoint; this is a 3-D scene setting.
    camera = dict(
        eye=dict(x=2, y=2, z=0.1),
        center=dict(x=0.5, y=0.7, z=0),
        up=dict(x=0, y=0, z=1)
    )

    fig.update_layout(scene_camera=camera)

    return fig.show()

In [7]:
#update_clusters(color_scheme='continent',data='BERT',n_clusters=1,dimension='2D',show_k=False, text=False)

In [194]:
#list(bert.country)

In [355]:
def find_most_similar_messaging(df,country,network):
    """Rank every row of ``df`` by cosine similarity to one country's vector.

    The feature vector of a row is every column except the first one and the
    last two.

    Parameters
    ----------
    df : pandas.DataFrame
        Embedding table with ``country`` and ``network`` columns.
    country, network :
        Values identifying the reference row (the first match is used).

    Returns
    -------
    pandas.DataFrame
        Columns ``country``, ``network``, ``cosine_similarity``, sorted from
        most to least similar. The reference row itself is included.

    Raises
    ------
    IndexError
        If no row matches ``country`` and ``network``.
    """
    feature_columns = list(df.columns)[1:-2]
    target_rows = df[(df.country==country)&(df.network==network)]
    if target_rows.empty:
        raise IndexError(f'No row found for country={country!r}, network={network!r}')
    country_array = np.array(target_rows[feature_columns].iloc[0]).reshape(1,-1)

    # One vectorised call instead of one cosine_similarity call per row.
    sims = cosine_similarity(country_array, np.array(df[feature_columns]))[0]

    # `.values` pairs the labels with the scores by position; assigning the
    # Series directly would align on the index and yield NaN whenever `df`
    # does not carry a default 0..n-1 index.
    top_df = pd.DataFrame({
        'country': df['country'].values,
        'network': df['network'].values,
        'cosine_similarity': sims,
    })
    return top_df.sort_values(by='cosine_similarity',ascending=False)
     

In [8]:
#find_most_similar(bert,'YEMEN','RUS').head(10)

In [333]:
def find_most_different(df):
    """Rank countries by how much the USA and RUS vectors for them differ.

    For every country that has both a ``'USA'`` and a ``'RUS'`` row, the
    cosine similarity between the two feature vectors (every column except the
    first one and the last two) is computed.

    Returns
    -------
    pandas.DataFrame
        Columns ``country`` and ``cosine_similarity``, sorted ascending, i.e.
        the most divergent countries come first.
    """
    feature_columns = list(df.columns)[1:-2]
    usa_rows = df[df.network=='USA']
    rus_rows = df[df.network=='RUS']
    # A sorted list rather than a bare set: sets are unordered and are not
    # accepted as column values by pandas.
    countries = sorted(set(usa_rows.country) & set(rus_rows.country), key=str)

    records = []
    for country in countries:
        country_array_rus = np.array(rus_rows[rus_rows.country==country][feature_columns].iloc[0]).reshape(1,-1)
        country_array_usa = np.array(usa_rows[usa_rows.country==country][feature_columns].iloc[0]).reshape(1,-1)
        sim = cosine_similarity(country_array_rus, country_array_usa)
        records.append((country, sim[0][0]))

    top_df = pd.DataFrame(records, columns=['country','cosine_similarity'])
    return top_df.sort_values(by='cosine_similarity',ascending=True)

In [9]:
#find_most_different(bert)#.to_csv('BERT_divergence.csv')

In [343]:
#find_most_similar(bert,'SAUDI ARABIA','USA').head(10)

In [10]:
#bert.columns

In [230]:
def mahalanobis(x=None, data=None, cov=None):
    """Compute the squared Mahalanobis distance of each row of x from the data.

    x    : vector or matrix (array or DataFrame) with p columns.
    data : array or DataFrame with p columns describing the distribution the
           distances are measured against; its column means are the centre.
    cov  : covariance matrix (p x p) of the distribution. If None, it is
           computed from data.

    Returns a 1-D ndarray with one value per row of x: (x - mu) C^+ (x - mu)^T.
    """
    data_arr = np.asarray(data, dtype=float)
    x_arr = np.atleast_2d(np.asarray(x, dtype=float))
    # Explicit per-column mean: np.mean() on a DataFrame is pandas-version
    # dependent and may collapse to a single scalar.
    x_minus_mu = x_arr - data_arr.mean(axis=0)
    # `is None` rather than truthiness, which is ambiguous for an array.
    if cov is None:
        cov = np.cov(data_arr, rowvar=False)
    # The pseudo-inverse equals the inverse for a well-conditioned covariance
    # and stays defined when there are fewer observations than dimensions.
    inv_covmat = np.linalg.pinv(np.atleast_2d(np.asarray(cov, dtype=float)))
    # Row-wise quadratic form; avoids building the full n x n product just to
    # read its diagonal.
    return np.einsum('ij,jk,ik->i', x_minus_mu, inv_covmat, x_minus_mu)

import scipy as sp
# Module-level list of lower-cased, space-free country tokens.
countries = ['afghanistan', 'albania', 'algeria', 'angola', 'argentina', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bolivia', 'bosnia', 'botswana', 'brazil', 'brunei', 'bulgaria', 'burkina', 'burundi', 'cabo', 'cambodia', 'cameroon', 'canada', 'chad', 'chile', 'china', 'colombia', 'comoros', 'congo', 'costa', 'côte', 'croatia', 'cuba', 'cyprus', 'czechia', 'denmark', 'djibouti', 'dominican', 'ecuador', 'egypt', 'elsalvador', 'equatorialguinea', 'eritrea', 'estonia', 'eswatini', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'georgia', 'germany', 'ghana', 'greece', 'guatemala', 'guineabissau', 'guinea', 'guyana', 'haiti','vatican','honduras', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq', 'ireland', 'israel', 'italy', 'jamaica', 'japan', 'jordan', 'kazakhstan', 'kenya', 'northkorea','southkorea', 'kuwait', 'kyrgyzstan', 'lao', 'latvia', 'lebanon', 'lesotho', 'liberia', 'libya', 'lithuania', 'luxembourg', 'madagascar', 'malawi', 'malaysia', 'mali', 'malta', 'mauritania', 'mauritius', 'mexico', 'moldova', 'mongolia', 'montenegro', 'morocco', 'mozambique', 'myanmar', 'nepal', 'netherlands', 'newzealand', 'nicaragua', 'niger', 'nigeria', 'norway', 'oman', 'pakistan', 'palau', 'palestine', 'panama', 'paraguay', 'peru', 'philippines', 'poland', 'portugal', 'qatar', 'romania', 'russia', 'rwanda', 'saudi', 'saudiarabia','senegal', 'serbia', 'seychelles', 'sierra', 'singapore', 'slovakia', 'slovenia', 'somalia', 'spain', 'srilanka', 'sudan', 'suriname', 'sweden', 'switzerland', 'syria', 'tajikistan', 'tanzania', 'thailand', 'togo', 'trinidad', 'tunisia', 'turkey', 'turkmenistan', 'uganda', 'ukraine', 'uae', 'emirates', 'unitedstates', 'uruguay', 'uzbekistan', 'venezuela', 'vietnam', 'yemen', 'zambia', 'zimbabwe']
    
def get_mahalanobis_outlier_rank_BERT(df,network):
    """Rank one network's countries by the Mahalanobis score of their vectors.

    Parameters
    ----------
    df : pandas.DataFrame
        Embedding table with ``country`` and ``network`` columns; the feature
        vector of a row is every column except the first one and the last two.
    network :
        Value of the ``network`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by country, with columns ``network`` and ``mahalanobis``,
        sorted with the largest score first.
    """
    dff = df[df['network']==network]
    columns = list(dff.columns)[1:-2]
    matrix = np.array(dff[columns])
    vector_df = pd.DataFrame(matrix)
    # Take the width from the data instead of assuming a fixed embedding size.
    cols = list(range(matrix.shape[1]))
    vector_df['network'] = network
    vector_df.index = dff.country
    df_x = vector_df[cols]
    vector_df['mahalanobis'] = mahalanobis(x=df_x, data=vector_df[cols])

    return pd.DataFrame(vector_df[['network','mahalanobis']].sort_values(by='mahalanobis',ascending=False))

In [400]:
#get_mahalanobis_outlier_rank_BERT(bert,'USA')#.to_csv('BERT_outliers_usa.csv')

### Make Language Models - Word2Vec

In [93]:
## We're using the same dataframe of tweets that we used for TFIDF

In [100]:
import gensim 
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Context window size that w2v() passes to gensim's Word2Vec.
window = 7

# Tweet language codes and NLTK stopword corpus names (presumably paired by position).
# Note that w2v() has a parameter also called `stopword_lang`, which shadows this list.
languages = ['en','fr','es','ru']
stopword_lang = ['english','french','spanish','russian']
def w2v(df, language, stopword_lang): 
    """Train a Word2Vec model on the tweets of one language.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweets with ``lang`` and ``full_text`` columns.
    language : str
        Value of ``lang`` to keep, e.g. ``'en'``.
    stopword_lang : str
        Name of the NLTK stopword corpus to filter with, e.g. ``'english'``.

    Returns
    -------
    gensim.models.Word2Vec
        Model with 100-dimensional vectors, trained with the module-level
        ``window`` and ``min_count=1``. The vocabulary size is printed.
    """
    df = df[df['lang']==language]
    # Rows without text (NaN) cannot be tokenised, so drop them up front.
    lines = df['full_text'].dropna().values.tolist()
    ## remove urls
    lines = [re.sub(r"http\S+", "", x) for x in lines]

    # Loop-invariant: build the punctuation table and stopword set only once.
    table = str.maketrans('','',string.punctuation)
    stop_words = set(stopwords.words(stopword_lang))

    sentences = list()
    for line in tqdm(lines):
        stripped = (word.lower().translate(table) for word in word_tokenize(line))
        sentences.append([w for w in stripped if w.isalpha() and w not in stop_words])

    EMBEDDING_DIM = 100
    # NOTE(review): `size=` and `wv.vocab` look like the gensim 3.x names -
    # confirm the pinned gensim version before upgrading.
    model = gensim.models.Word2Vec(
            sentences = sentences,
            size = EMBEDDING_DIM,
            window = window,
            min_count = 1)
    words = list(model.wv.vocab)
    print("Test Vocabulary size: %d" % len(words))
    return model

In [95]:
#russian_embassies_w2v_english = w2v(rus, 'en', 'english')

In [96]:
#american_embassies_w2v_english = w2v(usa, 'en', 'english')

In [97]:
#american_embassies_w2v_english.save("us_en_w2v.model")

In [98]:
#russian_embassies_w2v_english.save("rus_en_w2v.model")

In [27]:
from gensim.models import Word2Vec
# Load the saved English Word2Vec models (one per network) from language_models/.
american_embassies_w2v_english = Word2Vec.load("language_models/us_en_w2v.model")
russian_embassies_w2v_english = Word2Vec.load("language_models/rus_en_w2v.model")

In [11]:
#american_embassies_w2v_english.wv.most_similar(['dprk'],topn=20)

In [30]:
def word_association_with_countries(word,model):
    """Score a fixed list of country tokens against ``word``.

    Uses ``model.wv.similarity(word, country)`` for every token in the list
    below and returns a dict mapping country token -> similarity, ordered from
    the highest to the lowest score.
    """
    countries = ['america','usa','afghanistan', 'albania', 'algeria', 'angola', 'argentina', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bolivia', 'bosnia', 'botswana', 'brazil', 'brunei', 'bulgaria', 'burkina', 'burundi', 'cabo', 'cambodia', 'cameroon', 'canada', 'chad', 'chile', 'china', 'colombia', 'comoros', 'congo', 'costa', 'côte', 'croatia', 'cuba', 'cyprus', 'czechia', 'denmark', 'djibouti', 'dominican', 'ecuador', 'egypt', 'elsalvador', 'equatorialguinea', 'eritrea', 'estonia', 'eswatini', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'georgia', 'germany', 'ghana', 'greece', 'guatemala', 'guineabissau', 'guinea', 'guyana', 'haiti', 'holysee','vatican','honduras', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq', 'ireland', 'israel', 'italy', 'jamaica', 'japan', 'jordan', 'kazakhstan', 'kenya', 'northkorea','southkorea', 'kuwait', 'kyrgyzstan', 'lao', 'latvia', 'lebanon', 'lesotho', 'liberia', 'libya', 'lithuania', 'luxembourg', 'madagascar', 'malawi', 'malaysia', 'mali', 'malta', 'mauritania', 'mauritius', 'mexico', 'micronesia', 'moldova', 'mongolia', 'montenegro', 'morocco', 'mozambique', 'myanmar', 'nepal', 'netherlands', 'newzealand', 'nicaragua', 'niger', 'nigeria', 'norway', 'oman', 'pakistan', 'palau', 'palestine', 'panama', 'papua', 'paraguay', 'peru', 'philippines', 'poland', 'portugal', 'qatar', 'romania', 'russia', 'rwanda', 'samoa', 'saudi', 'saudiarabia','senegal', 'serbia', 'seychelles', 'sierra', 'singapore', 'slovakia', 'slovenia', 'somalia', 'spain', 'srilanka', 'sudan', 'suriname', 'sweden', 'switzerland', 'syrian', 'tajikistan', 'tanzania', 'thailand', 'timor', 'togo', 'trinidad', 'tunisia', 'turkey', 'turkmenistan', 'uganda', 'ukraine', 'uae', 'unitedarabemirates', 'unitedstates', 'uruguay', 'uzbekistan', 'venezuela', 'viet', 'yemen', 'zambia', 'zimbabwe']
    # Similarity of the query word to every country token.
    scores = {name: model.wv.similarity(word, name) for name in countries}
    # Highest score first; equal scores keep the order of the list above.
    ranked_names = sorted(scores, key=lambda name: -scores[name])
    return {name: scores[name] for name in ranked_names}

In [32]:
#word_association_with_countries('dispute',american_embassies_w2v_english)

## VISUALIZE WORD2VEC with NETWORKS

In [36]:
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import plotly.graph_objects as go
from plotly.offline import plot
import networkx as nx
import numpy as np

def word2vec_network(model, word_list, threshhold=0.5):
    """Draw a similarity network of the given words with plotly.

    Words are nodes; two words are connected when the cosine similarity of
    their vectors is greater than ``threshhold``. Node colour encodes the
    number of neighbours, and hovering over the midpoint of an edge shows the
    pair and its similarity.

    Parameters
    ----------
    model :
        Object exposing ``model.wv.get_vector(word)`` (e.g. a gensim Word2Vec model).
    word_list : iterable of str
        Words to include. Words missing from the vocabulary are reported and skipped.
    threshhold : float
        Minimum (exclusive) cosine similarity for an edge.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.

    Raises
    ------
    ValueError
        If none of the words is in the model vocabulary.
    """
    words, vectors = [], []
    for item in word_list:
        try:
            vectors.append(model.wv.get_vector(item))
        except KeyError:
            # Only a vocabulary miss is expected here; let anything else surface.
            print(f'Word {item} not found in vocab.')
        else:
            words.append(item)
    if not vectors:
        raise ValueError('None of the requested words are in the model vocabulary.')

    sims = cosine_similarity(vectors, vectors)
    # Each unordered pair once and no self-pairs: keep the strict lower
    # triangle via an explicit mask, so the filter is also right for
    # thresholds at or below zero.
    lower_triangle = np.tril(np.ones(sims.shape, dtype=bool), k=-1)
    indices = np.argwhere(lower_triangle & (sims > threshhold))

    G = nx.Graph()
    for row, col in indices:
        G.add_edge(words[row], words[col], weight=sims[row, col])

    weight_values = nx.get_edge_attributes(G,'weight')
    positions = nx.spring_layout(G)
    nx.set_node_attributes(G,name='position',values=positions)

    # Edge polyline coordinates (None separates segments), edge midpoints for
    # the hover markers, and the hover labels.
    edge_x = []
    edge_y = []
    weights = []
    ave_x, ave_y = [], []
    for edge in G.edges():
        x0, y0 = G.nodes[edge[0]]['position']
        x1, y1 = G.nodes[edge[1]]['position']
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])
        ave_x.append(np.mean([x0, x1]))
        ave_y.append(np.mean([y0, y1]))
        weights.append(f'{edge[0]}, {edge[1]}: {weight_values[(edge[0], edge[1])]}')
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        opacity=0.3,
        line=dict(width=2, color='White'),
        hoverinfo=None,
        mode='lines')
    edge_trace.text = weights

    node_x = []
    node_y = []
    for node in G.nodes():
        x, y = G.nodes[node]['position']
        node_x.append(x)
        node_y.append(y)
    # All nodes share one marker size.
    sizes = [15] * len(node_x)

    # Node label and neighbour count, in graph order.
    node_adjacencies = []
    node_text = []
    for name, neighbours in G.adjacency():
        node_adjacencies.append(len(neighbours))
        node_text.append(name)

    # NOTE(review): `titleside` / `titlefont_size` below are older plotly
    # property names - confirm they are accepted by the installed plotly.
    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        textposition="top center",
        marker=dict(
            showscale=False,
            line=dict(color='White'),
            colorscale='RdBu',
            reversescale=False,
            color=[],
            opacity=0.9,
            size=sizes,
            colorbar=dict(
                thickness=15,
                title='Node Connections',
                xanchor='left',
                titleside='right'
            ),
            line_width=2
        )
    )
    node_trace.marker.color = node_adjacencies
    node_trace.text = node_text

    # Fully transparent markers at edge midpoints; they exist only to carry
    # the similarity hover text.
    invisible_similarity_trace = go.Scatter(
        x=ave_x, y=ave_y,
        mode='markers',
        hoverinfo='text',
        marker=dict(
            color=[],
            opacity=0,
        )
    )
    invisible_similarity_trace.text=weights

    fig = go.Figure(
        data=[edge_trace, node_trace, invisible_similarity_trace],
        layout=go.Layout(
            title=None,
            template='plotly_dark',
            titlefont_size=20,
            showlegend=False,
            coloraxis=None,
            hovermode='closest',
            margin=dict(b=20,l=20,r=20,t=40),
            annotations=[
                dict(
                    text='Word Associations',
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002 ) 
            ],
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
        )
    )
    #fig.update_coloraxes(showscale=False)
    #fig.update_layout(showscale=False, showlegend=False)

    return fig.show()

In [13]:
#word_list = list(american_embassies_w2v_english.wv.index2entity[:10])
#countries = ['afghanistan', 'albania', 'algeria', 'angola', 'argentina', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bolivia', 'bosnia', 'botswana', 'brazil', 'brunei', 'bulgaria', 'burkina', 'burundi', 'cabo', 'cambodia', 'cameroon', 'canada', 'chad', 'chile', 'china', 'colombia', 'comoros', 'congo', 'costa', 'côte', 'croatia', 'cuba', 'cyprus', 'czechia', 'denmark', 'djibouti', 'dominican', 'ecuador', 'egypt', 'elsalvador', 'equatorialguinea', 'eritrea', 'estonia', 'eswatini', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'georgia', 'germany', 'ghana', 'greece', 'guatemala', 'guineabissau', 'guinea', 'guyana', 'haiti', 'holysee','vatican','honduras', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq', 'ireland', 'israel', 'italy', 'jamaica', 'japan', 'jordan', 'kazakhstan', 'kenya', 'northkorea','southkorea', 'kuwait', 'kyrgyzstan', 'lao', 'latvia', 'lebanon', 'lesotho', 'liberia', 'libya', 'lithuania', 'luxembourg', 'madagascar', 'malawi', 'malaysia', 'mali', 'malta', 'mauritania', 'mauritius', 'mexico', 'micronesia', 'moldova', 'mongolia', 'montenegro', 'morocco', 'mozambique', 'myanmar', 'nepal', 'netherlands', 'newzealand', 'nicaragua', 'niger', 'nigeria', 'norway', 'oman', 'pakistan', 'palau', 'palestine', 'panama', 'paraguay', 'peru', 'philippines', 'poland', 'portugal', 'qatar', 'romania', 'russia', 'rwanda', 'samoa', 'saudi', 'saudiarabia','senegal', 'serbia', 'seychelles', 'sierra', 'singapore', 'slovakia', 'slovenia', 'somalia', 'spain', 'srilanka', 'sudan', 'suriname', 'sweden', 'switzerland', 'syrian', 'tajikistan', 'tanzania', 'thailand', 'timor', 'togo', 'trinidad', 'tunisia', 'turkey', 'turkmenistan', 'uganda', 'ukraine', 'uae', 'unitedstates', 'uruguay', 'uzbekistan', 'venezuela', 'vietnam', 'yemen', 'zambia', 'zimbabwe']
#word_associations = [word[0] for word in american_embassies_w2v_english.wv.most_similar(['russia'],topn=20)]

#word2vec_network(american_embassies_w2v_english, countries, threshhold=0.4)

In [39]:
def word2vec_word_association_network(word, model):
    """Plot the similarity network of the 40 nearest neighbours of ``word``.

    The word is lower-cased, its 40 most similar entries are looked up with
    ``model.wv.most_similar`` and handed to ``word2vec_network`` with an edge
    threshold of 0.7. Returns whatever ``word2vec_network`` returns.
    """
    query = word.lower()
    nearest = model.wv.most_similar([query],topn=40)
    # Each entry's first element is the neighbouring token.
    neighbour_tokens = []
    for entry in nearest:
        neighbour_tokens.append(entry[0])
    return word2vec_network(model, neighbour_tokens, threshhold=0.7)
    

In [14]:
#word2vec_word_association_network('covid', american_embassies_w2v_english)

In [41]:
def word2vec_word_association_with_countries(model):
    """Plot the similarity network of a fixed list of country tokens.

    Delegates to ``word2vec_network`` with an edge threshold of 0.3 and
    returns its result.
    """
    country_tokens = ['afghanistan', 'albania', 'algeria', 'angola', 'argentina', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bolivia', 'bosnia', 'botswana', 'brazil', 'brunei', 'bulgaria', 'burkina', 'burundi', 'cabo', 'cambodia', 'cameroon', 'canada', 'chad', 'chile', 'china', 'colombia', 'comoros', 'congo', 'costa', 'côte', 'croatia', 'cuba', 'cyprus', 'czechia', 'denmark', 'djibouti', 'dominican', 'ecuador', 'egypt', 'elsalvador', 'equatorialguinea', 'eritrea', 'estonia', 'eswatini', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'georgia', 'germany', 'ghana', 'greece', 'guatemala', 'guineabissau', 'guinea', 'guyana', 'haiti', 'holysee','vatican','honduras', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq', 'ireland', 'israel', 'italy', 'jamaica', 'japan', 'jordan', 'kazakhstan', 'kenya', 'northkorea','southkorea', 'kuwait', 'kyrgyzstan', 'lao', 'latvia', 'lebanon', 'lesotho', 'liberia', 'libya', 'lithuania', 'luxembourg', 'madagascar', 'malawi', 'malaysia', 'mali', 'malta', 'mauritania', 'mauritius', 'mexico', 'micronesia', 'moldova', 'mongolia', 'montenegro', 'morocco', 'mozambique', 'myanmar', 'nepal', 'netherlands', 'newzealand', 'nicaragua', 'niger', 'nigeria', 'norway', 'oman', 'pakistan', 'palau', 'palestine', 'panama', 'paraguay', 'peru', 'philippines', 'poland', 'portugal', 'qatar', 'romania', 'russia', 'rwanda', 'samoa', 'saudi', 'saudiarabia','senegal', 'serbia', 'seychelles', 'sierra', 'singapore', 'slovakia', 'slovenia', 'somalia', 'spain', 'srilanka', 'sudan', 'suriname', 'sweden', 'switzerland', 'syria', 'tajikistan', 'tanzania', 'thailand', 'timor', 'togo', 'trinidad', 'tunisia', 'turkey', 'turkmenistan', 'uganda', 'ukraine', 'uae', 'emirates', 'unitedstates', 'uruguay', 'uzbekistan', 'venezuela', 'vietnam', 'yemen', 'zambia', 'zimbabwe']
    edge_threshold = 0.3
    return word2vec_network(model, country_tokens, threshhold=edge_threshold)

In [42]:
def mahalanobis(x=None, data=None, cov=None):
    """Compute the squared Mahalanobis distance of each row of x from the data.

    x    : vector or matrix (array or DataFrame) with p columns.
    data : array or DataFrame with p columns describing the distribution the
           distances are measured against; its column means are the centre.
    cov  : covariance matrix (p x p) of the distribution. If None, it is
           computed from data.

    Returns a 1-D ndarray with one value per row of x: (x - mu) C^+ (x - mu)^T.
    """
    data_arr = np.asarray(data, dtype=float)
    x_arr = np.atleast_2d(np.asarray(x, dtype=float))
    # Explicit per-column mean: np.mean() on a DataFrame is pandas-version
    # dependent and may collapse to a single scalar.
    x_minus_mu = x_arr - data_arr.mean(axis=0)
    # `is None` rather than truthiness, which is ambiguous for an array.
    if cov is None:
        cov = np.cov(data_arr, rowvar=False)
    # The pseudo-inverse equals the inverse for a well-conditioned covariance
    # and stays defined when there are fewer observations than dimensions.
    inv_covmat = np.linalg.pinv(np.atleast_2d(np.asarray(cov, dtype=float)))
    # Row-wise quadratic form; avoids building the full n x n product just to
    # read its diagonal.
    return np.einsum('ij,jk,ik->i', x_minus_mu, inv_covmat, x_minus_mu)

import scipy as sp
# Module-level list of lower-cased, space-free country tokens.
countries = ['afghanistan', 'albania', 'algeria', 'angola', 'argentina', 'armenia', 'australia', 'austria', 'azerbaijan', 'bahamas', 'bahrain', 'bangladesh', 'barbados', 'belarus', 'belgium', 'belize', 'benin', 'bolivia', 'bosnia', 'botswana', 'brazil', 'brunei', 'bulgaria', 'burkina', 'burundi', 'cabo', 'cambodia', 'cameroon', 'canada', 'chad', 'chile', 'china', 'colombia', 'comoros', 'congo', 'costa', 'côte', 'croatia', 'cuba', 'cyprus', 'czechia', 'denmark', 'djibouti', 'dominican', 'ecuador', 'egypt', 'elsalvador', 'equatorialguinea', 'eritrea', 'estonia', 'eswatini', 'ethiopia', 'fiji', 'finland', 'france', 'gabon', 'georgia', 'germany', 'ghana', 'greece', 'guatemala', 'guineabissau', 'guinea', 'guyana', 'haiti','vatican','honduras', 'hungary', 'iceland', 'india', 'indonesia', 'iran', 'iraq', 'ireland', 'israel', 'italy', 'jamaica', 'japan', 'jordan', 'kazakhstan', 'kenya', 'northkorea','southkorea', 'kuwait', 'kyrgyzstan', 'lao', 'latvia', 'lebanon', 'lesotho', 'liberia', 'libya', 'lithuania', 'luxembourg', 'madagascar', 'malawi', 'malaysia', 'mali', 'malta', 'mauritania', 'mauritius', 'mexico', 'moldova', 'mongolia', 'montenegro', 'morocco', 'mozambique', 'myanmar', 'nepal', 'netherlands', 'newzealand', 'nicaragua', 'niger', 'nigeria', 'norway', 'oman', 'pakistan', 'palau', 'palestine', 'panama', 'paraguay', 'peru', 'philippines', 'poland', 'portugal', 'qatar', 'romania', 'russia', 'rwanda', 'saudi', 'saudiarabia','senegal', 'serbia', 'seychelles', 'sierra', 'singapore', 'slovakia', 'slovenia', 'somalia', 'spain', 'srilanka', 'sudan', 'suriname', 'sweden', 'switzerland', 'syria', 'tajikistan', 'tanzania', 'thailand', 'togo', 'trinidad', 'tunisia', 'turkey', 'turkmenistan', 'uganda', 'ukraine', 'uae', 'emirates', 'unitedstates', 'uruguay', 'uzbekistan', 'venezuela', 'vietnam', 'yemen', 'zambia', 'zimbabwe']
    
def get_mahalanobis_outlier_rank(model, category_list):
    """Rank the tokens of a category by the Mahalanobis score of their embeddings.

    Each token's vector is looked up with ``model.wv.get_vector``; the vectors are
    stacked into one table and scored with the notebook's ``mahalanobis`` helper,
    using the same table as both the points (``x``) and the reference data.

    Parameters
    ----------
    model : object exposing ``model.wv.get_vector(token)``
        NOTE(review): presumably a trained gensim Word2Vec model — confirm.
    category_list : list of str
        Tokens to compare against each other (e.g. the ``countries`` list).

    Returns
    -------
    pandas.DataFrame
        One column, ``'mahalanobis'``, indexed by token and sorted in descending
        order, so the highest-scoring tokens come first.

    Raises
    ------
    Whatever ``model.wv.get_vector`` raises for a token it cannot resolve
    (presumably ``KeyError`` for out-of-vocabulary tokens — confirm), and
    ``ValueError`` from ``np.vstack`` when ``category_list`` is empty.
    """
    matrix = np.vstack([model.wv.get_vector(item) for item in category_list])
    vector_df = pd.DataFrame(matrix, index=category_list)

    # Take the embedding width from the data instead of assuming 100 dimensions:
    # a hard-coded range(100) fails for narrower models and silently ignores the
    # extra dimensions of wider ones.
    columns = list(range(matrix.shape[1]))

    vector_df['mahalanobis'] = mahalanobis(x=vector_df[columns], data=vector_df[columns])
    return pd.DataFrame(vector_df['mahalanobis'].sort_values(ascending=False))

In [15]:
## Russia - Contextual Outliers
#df = get_mahalanobis_outlier_rank(russian_embassies_w2v_english,countries)#.head(10)

In [16]:
#df

In [17]:
## United States - Contextual Outliers
#get_mahalanobis_outlier_rank(american_embassies_w2v_english,countries).head(10)

In [62]:
#sorted(word2vec_word_association_with_countries(american_embassies_w2v_english),key=operator.itemgetter(3),reverse=False)

## Categorical Context Deviation Score? 
#### What does it tell us? -- The extent to which a token's context differs from that of ostensibly similar tokens. 
#### In this case, it shows which countries are referenced in relatively 'atypical' contexts. 

In [66]:
# Keep the output of word2vec_word_association_with_countries for the
# American-embassy word2vec model in `d` for inspection in the next cell.
# NOTE(review): the commented-out cell below iterates `d.items()`, so a dict is
# presumably expected here — confirm against the function's return type.
d = word2vec_word_association_with_countries(american_embassies_w2v_english)

In [18]:
#{k: np.round(len(d.keys())/v,3) for k, v in sorted(d.items(), key=lambda item: item[1],reverse=False)}

In [385]:
## BERT NETWORK

import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
import plotly.graph_objects as go
from plotly.offline import plot
import networkx as nx
import numpy as np

def _bert_similarity_graph(df, network, threshhold):
    """Build the weighted cosine-similarity graph for the rows of one network."""
    dff = df[df['network'] == network]
    if dff.empty:
        raise ValueError(f"no rows in df with network == {network!r}")

    # Embedding columns are everything except the first column and the last two.
    vectors = dff.iloc[:, 1:-2].to_numpy(dtype=float)
    # Labels must come from the filtered frame so they stay aligned with `vectors`.
    words = dff['country'].tolist()
    sims = cosine_similarity(vectors, vectors)

    graph = nx.Graph()
    # Visit each unordered pair once (strict lower triangle, row-major order).
    # Comparing only these entries avoids the zero-filled upper triangle and
    # diagonal passing the test when `threshhold` is negative.
    rows, cols = np.tril_indices(len(words), k=-1)
    for i, j in zip(rows, cols):
        if sims[i, j] > threshhold:
            graph.add_edge(words[i], words[j], weight=sims[i, j])
    return graph


def BERT_network(df, network, threshhold=0.5, searches=None):
    """Plot a similarity network of the embeddings belonging to one network.

    Rows of ``df`` whose ``'network'`` column equals ``network`` are selected; their
    embedding columns (all columns except the first and the last two) are compared
    pairwise with cosine similarity, and every pair whose similarity is strictly
    greater than ``threshhold`` becomes an edge between the rows' ``'country'``
    labels. The graph is laid out with ``nx.spring_layout`` and drawn with plotly.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``'network'`` and ``'country'`` columns plus the embedding
        columns described above.
    network : value compared against ``df['network']`` (e.g. ``'RUS'``).
    threshhold : float, default 0.5
        Minimum (exclusive) cosine similarity for an edge to be drawn.
    searches : iterable of labels, optional
        Node labels to emphasise; matching nodes are drawn with size 50 instead
        of 15. Defaults to no emphasised nodes.

    Returns
    -------
    The return value of ``fig.show()``; the figure is displayed as a side effect.

    Raises
    ------
    ValueError
        If no row of ``df`` belongs to ``network``.
    """
    highlighted = set(searches) if searches is not None else set()

    G = _bert_similarity_graph(df, network, threshhold)
    positions = nx.spring_layout(G)

    # Edge geometry: each edge contributes (start, end, None) so plotly draws
    # separate segments; the midpoint carries the hover label.
    edge_x, edge_y = [], []
    ave_x, ave_y = [], []
    weights = []
    for u, v, weight in G.edges(data='weight'):
        x0, y0 = positions[u]
        x1, y1 = positions[v]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]
        ave_x.append(np.mean([x0, x1]))
        ave_y.append(np.mean([y0, y1]))
        weights.append(f'{u}, {v}: {weight}')

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        opacity=0.3,
        line=dict(width=2, color='White'),
        mode='lines',
        # Three points per edge, so repeat each label to keep text aligned with points.
        text=[label for label in weights for _ in range(3)],
    )

    # Node geometry, sizes, labels and neighbour counts (used as marker colour).
    node_x, node_y, sizes = [], [], []
    node_text, node_adjacencies = [], []
    for node, neighbours in G.adjacency():
        x, y = positions[node]
        node_x.append(x)
        node_y.append(y)
        sizes.append(50 if node in highlighted else 15)
        node_text.append(node)
        node_adjacencies.append(len(neighbours))

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        hoverinfo='text',
        text=node_text,
        textposition="top center",
        marker=dict(
            showscale=False,
            colorscale='RdBu',
            reversescale=False,
            color=node_adjacencies,
            opacity=0.9,
            size=sizes,
            colorbar=dict(
                thickness=15,
                # `title.side` replaces the deprecated `titleside` attribute.
                title=dict(text='Node Connections', side='right'),
                xanchor='left',
            ),
            line=dict(color='White', width=2),
        ),
    )

    # Fully transparent markers at edge midpoints: they exist only to show the
    # similarity label when hovering over the middle of an edge.
    invisible_similarity_trace = go.Scatter(
        x=ave_x, y=ave_y,
        mode='markers',
        hoverinfo='text',
        text=weights,
        marker=dict(
            color=[],
            opacity=0,
        ),
    )

    fig = go.Figure(
        data=[edge_trace, node_trace, invisible_similarity_trace],
        layout=go.Layout(
            template='plotly_dark',
            # `title.font.size` replaces the deprecated `titlefont_size` attribute.
            title=dict(font=dict(size=20)),
            showlegend=False,
            hovermode='closest',
            margin=dict(b=20, l=20, r=20, t=40),
            annotations=[
                dict(
                    text='Word Associations',
                    showarrow=False,
                    xref="paper", yref="paper",
                    x=0.005, y=-0.002)
            ],
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        ),
    )

    return fig.show()

In [396]:
#bert[list(bert.columns)[1:-2]]

In [395]:
#BERT_network(bert,'RUS',threshhold=0.975)