In [1]:
import pandas as pd
from numpy.random import randint, choice, seed
from gensim.models import Word2Vec, KeyedVectors
from dask import delayed, compute
from dask.distributed import Client
import dask.dataframe as dd
from glob import glob
import re

In [2]:
# Directory containing the saved word-vector files. The trailing slash matters:
# the glob pattern in the next cell is built by plain string concatenation.
# NOTE(review): machine-specific absolute path — consider making it configurable.
vector_dir = 'G:/vectors/'

In [3]:
# Collect every file in vector_dir whose name ends with 'wordvectors'.
# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes
# the order in which tasks are later created reproducible across runs and machines.
vector_files = sorted(glob(vector_dir + '*wordvectors'))

In [4]:
# Wrap the path list in a Series so the `.str` accessor can be used later to
# select the files that belong to each month-year key.
vector_files = pd.Series(vector_files)

In [5]:
# Country list; only its 'country' column is used (see the next cell).
countries = pd.read_csv('countries.csv')

In [6]:
# Reduce the table to an array of distinct, lower-cased country names.
# This rebinds `countries` from the DataFrame loaded above, so the cell cannot
# be re-run on its own without re-running the load cell first.
countries = (
    countries['country']
    .str.lower()
    .unique()
)

In [7]:
# Threat labels; later cells read its 'month', 'year' and 'code' columns.
threat_labels = pd.read_csv('threat_labels.csv')

In [8]:
# Build a '<month>-<year>' text key; it is matched against the vector file
# names further down to pick the embedding model for each label row.
threat_labels['month_yr'] = (
    threat_labels['month'].astype(str)
    + '-'
    + threat_labels['year'].astype(str)
)

In [9]:
# Country-code lookup table; its 'country.name.en' and 'iso3c' columns are used
# in the merge below.
ccodes = pd.read_csv('ccodes.csv')

In [10]:
# Attach the English country name to each label row. No `on`/`how` is given, so
# pandas inner-joins on whichever column names the two frames have in common
# (presumably 'iso3c' — TODO confirm against threat_labels.csv).
threat_labels = threat_labels.merge(
    right = ccodes[['country.name.en', 'iso3c']]
)

In [11]:
# Keep only the three fields calculate_threat unpacks from each row, in the
# order it unpacks them: threat code, month-year key, country name.
threat_labels = threat_labels[[
    'code',
    'month_yr',
    'country.name.en'
]]

In [12]:
# Give the country-name column the short name used by the rest of the notebook.
# Reassigning (rather than inplace = True) avoids mutating a frame that was
# itself produced by column selection and keeps the step chainable.
threat_labels = threat_labels.rename(columns = {'country.name.en':'country'})

In [13]:
# Lower-case the names so they compare equal to the lower-cased `countries`
# array and can be looked up in the embedding vocabulary
# (presumably lower-cased as well — TODO confirm).
threat_labels['country'] = threat_labels['country'].str.lower()

In [14]:
def calculate_threat(filename, row, countries):
    """Score every non-country word of one embedding model against one labelled country.

    The model stored at ``filename`` is loaded, and the cosine similarity between
    ``country`` and each remaining vocabulary word is multiplied by the row's
    threat ``code``.

    Parameters
    ----------
    filename : str
        Path of a saved gensim ``KeyedVectors`` file.
    row : sequence
        Three items, unpacked as ``(code, month, country)``.
    countries : iterable of str
        Words to leave out of the scored vocabulary (the country names).

    Returns
    -------
    pandas.DataFrame
        Columns ``country`` (str), ``word`` (str), ``threat_level`` (float) and
        ``month`` (str), one row per scored word, in the model's own key order.
        The frame is empty (same columns and dtypes) when ``country`` is not in
        the model's vocabulary or when no non-country words remain.
    """
    code, month, country = row
    vectors = KeyedVectors.load(filename)

    words = []
    if country in vectors.key_to_index:
        excluded = set(countries)
        # Walk the model's key list instead of a set: set iteration order for
        # strings changes between interpreter runs, which made the row order
        # of the output irreproducible.
        words = [word for word in vectors.index_to_key if word not in excluded]

    if words:
        similarities = KeyedVectors.cosine_similarities(vectors[country], vectors[words])
        threat_levels = pd.DataFrame({'country':country,
                                      'word':words,
                                      'threat_level':code * similarities,
                                      'month':month})
    else:
        # Reached for an unknown country and for a vocabulary made up only of
        # country names; looking up an empty word list in the model would
        # fail when the vectors are stacked, so return the empty frame instead.
        threat_levels = pd.DataFrame(columns = ['country', 'word', 'threat_level', 'month'])
    return threat_levels.astype({'country':str, 'word':str, 'threat_level':float, 'month':str})

In [15]:
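# Optional: uncomment to run the delayed tasks on a dask.distributed Client with
# 8 workers (pair it with the client.close() cell at the end of the notebook).
# client = Client(n_workers = 8)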

In [16]:
# Holds the delayed calculate_threat tasks; filled in by the next cell.
threat_list = []

In [17]:
# Build one lazy calculate_threat task per (label row, matching vector file).
# The list is reset here so that re-running this cell cannot append a second
# copy of every task.
threat_list = []
for row in threat_labels.itertuples(index = False):
    # row[1] is the '<month>-<year>' key. Match it literally (re.escape) and only
    # where it is not embedded in a longer number, so that '1-2020' does not
    # also select '11-2020' files. A single leading zero is tolerated so that
    # zero-padded file names such as '01-2020' still match.
    month_pattern = r'(?<!\d)0?' + re.escape(row[1]) + r'(?!\d)'
    matching_files = vector_files.loc[vector_files.str.contains(month_pattern, regex = True)]
    threat_list.extend(delayed(calculate_threat)(filename, row, countries)
                       for filename in matching_files)

In [18]:
# Assemble the delayed frames into one lazy Dask DataFrame. `meta` declares the
# column names and dtypes up front; it mirrors the final astype() call in
# calculate_threat.
threat_df = dd.from_delayed(threat_list, meta = {'country':str, 'word':str, 
                                                 'threat_level':float, 'month':str})

In [19]:
# Regroup the data into partitions of roughly 100MB each.
# NOTE(review): Dask documents that `partition_size` triggers computation to
# measure partition sizes — confirm the tasks are not effectively run twice
# (once here, once on write).
threat_df = threat_df.repartition(partition_size = '100MB')

In [20]:
# Execute the task graph and write the result as a Parquet dataset.
# NOTE(review): machine-specific absolute output path — consider making it configurable.
threat_df.to_parquet('G:/threat_wordscale.parquet')

(None,)

In [21]:
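# Optional: uncomment together with the Client(...) cell above to shut the
# client down once the Parquet output has been written.
# client.close()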