In [None]:
import os
import re
from glob import glob

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask import delayed, compute
from dask.distributed import Client
from gensim.models import Word2Vec, KeyedVectors
from numpy.random import randint, choice, seed
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale

In [None]:
# Directory prefix for the saved word-vector files. It is concatenated directly
# with the glob pattern below, so the trailing slash is required.
vector_dir = 'G:/vectors/'

In [None]:
# Every path under vector_dir whose name ends in 'wordvectors'.
vector_files = glob(vector_dir + '*wordvectors')

In [None]:
# Wrap the list of paths in a Series; it is iterated when the delayed tasks are built.
vector_files = pd.Series(vector_files)

In [None]:
# Country table; only its 'country' column is used.
countries = pd.read_csv('countries.csv')

In [None]:
# Rebinds `countries` from the DataFrame to an array of unique lowercased names
# (presumably lowercased to match the vector vocabulary -- TODO confirm).
countries = countries['country'].str.lower().unique()

In [None]:
# Word-level threat ratings; must provide 'word' and 'threat_level' columns.
threat_df = pd.read_parquet('G:/threat_wordscale.parquet')

In [None]:
# Collapse to one row per word by averaging its threat_level values.
threat_levels = threat_df.groupby('word')['threat_level'].mean().reset_index()

In [None]:
# Standardize the averaged scores with sklearn's `scale` (zero mean, unit variance).
threat_levels['threat_level'] = scale(threat_levels['threat_level'])

In [None]:
# Index by word so get_vectors can look scores up by vocabulary entry.
# NOTE: this mutates in place, so re-running the cell without re-running the
# ones above raises KeyError ('word' is no longer a column).
threat_levels.set_index('word', inplace = True)

In [None]:
def train_model(X_train, y_train):
    """Fit a cross-validated elastic net and return the fitted estimator.

    Parameters
    ----------
    X_train : array-like
        Feature rows (word vectors at the call site in this notebook).
    y_train : array-like
        Target value for each row of ``X_train``.

    Returns
    -------
    ElasticNetCV
        The estimator after ``fit`` has been called on the training data.
    """
    # Candidate L1/L2 mixing ratios searched by cross-validation; 1 is pure lasso.
    l1_grid = [.1, .5, .7, .9, .95, .99, 1]
    settings = dict(
        l1_ratio = l1_grid,
        cv = 5,
        n_jobs = 1,
        random_state = 8265,
        max_iter = 10000,
        selection = 'random',
    )
    model = ElasticNetCV(**settings)
    model.fit(X_train, y_train)
    return model

In [None]:
def get_vectors(filename, countries, threat_levels):
    """Load one set of word vectors and pull out the rows needed for modelling.

    Parameters
    ----------
    filename : str
        Path handed to ``KeyedVectors.load``.
    countries : iterable of str
        Candidate country names; only those present in the vocabulary are kept.
    threat_levels : pandas.DataFrame
        Indexed by word, with a ``'threat_level'`` column.

    Returns
    -------
    X : numpy.ndarray
        Vectors of the vocabulary words that have a threat score (empty array
        when there is no overlap).
    y : pandas.Series or numpy.ndarray
        Threat scores aligned row-for-row with ``X`` (empty array when there is
        no overlap).
    countries : list of str
        Country names found in the vocabulary, sorted.
    country_vectors : numpy.ndarray
        Vectors for ``countries`` in the same order (empty array when none of
        the countries is in the vocabulary).
    """
    vectors = KeyedVectors.load(filename)
    vocab = set(vectors.key_to_index)
    # Set iteration order follows string hashes, which are randomized per
    # interpreter process. Sorting pins the row order so that the seeded
    # train/test split and CV folds downstream are reproducible across runs.
    countries = sorted(vocab.intersection(countries))
    vocab = sorted(vocab.intersection(threat_levels.index))
    if len(vocab) > 0:
        # reindex puts the scores in exactly the order of the rows of X.
        y = threat_levels['threat_level'].reindex(index = vocab)
        X = vectors[vocab]
    else:
        y = np.array([])
        X = np.array([])
    if len(countries) > 0:
        country_vectors = vectors[countries]
    else:
        country_vectors = np.array([])
    return X, y, countries, country_vectors

In [None]:
def estimate_threat(filename, countries, threat_levels, test_dir = 'G:/model_test/'):
    """Predict a threat-perception score per country from one vector file.

    Trains an elastic net on the word vectors that have a known threat score,
    applies it to the country vectors, and writes the held-out predictions to
    ``<test_dir>/<vector file stem>_test_set.csv``. The CSV is written even
    when no model could be trained (it is then empty apart from the header).

    Parameters
    ----------
    filename : str
        Path to a saved vector file. Its month tag (``M-YYYY`` or ``MM-YYYY``)
        is extracted from this string.
    countries : iterable of str
        Candidate country names.
    threat_levels : pandas.DataFrame
        Indexed by word, with a ``'threat_level'`` column.
    test_dir : str, optional
        Directory receiving the held-out-set CSV; created if missing.

    Returns
    -------
    pandas.DataFrame
        Columns ``country`` (str), ``threat_perception`` (float), ``month``
        (str); empty when the file has no scored words or no countries.

    Raises
    ------
    ValueError
        If a model can be trained but ``filename`` contains no month tag.
    """
    X, y, countries, country_vectors = get_vectors(filename, countries, threat_levels)
    if (X.size > 0) and (country_vectors.size > 0):
        month_match = re.search(r'\d{1,2}-\d{4}', filename)
        if month_match is None:
            # Fail with the offending path rather than an opaque AttributeError.
            raise ValueError(f'no M-YYYY month tag found in {filename!r}')
        month = month_match.group(0)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 680)
        model = train_model(X_train, y_train)
        threat_scores = pd.DataFrame({'country':countries,
                                      'threat_perception':model.predict(country_vectors),
                                      'month':month})
        test_set = pd.DataFrame({'word':y_test.index, 'actual':y_test,
                                 'predicted':model.predict(X_test), 'month':month})
    else:
        threat_scores = pd.DataFrame(columns = ['country', 'threat_perception', 'month'])
        test_set = pd.DataFrame(columns = ['word', 'actual', 'predicted', 'month'])
    # Fixed dtypes so empty and non-empty results agree on one schema.
    threat_scores = threat_scores.astype({'country':str, 'threat_perception':float, 'month':str})
    test_set = test_set.astype({'word':str, 'actual':float, 'predicted':float, 'month':str})
    # Last path component under either separator, cut at its first dot. This
    # does not depend on how many separators or dots the directory part has.
    stem = re.split(r'[\\/]', filename)[-1].split('.')[0]
    if test_dir:
        os.makedirs(test_dir, exist_ok = True)
    test_filename = os.path.join(test_dir, stem + '_test_set.csv')
    test_set.to_csv(test_filename, index = False)
    return threat_scores

In [None]:
# client = Client(n_workers = 8)

In [None]:
threat_list = [delayed(estimate_threat)(filename, countries, threat_levels) for filename in vector_files]

In [None]:
threat_df = dd.from_delayed(threat_list, meta = {'country':str, 'threat_perception':float, 'month':str})

In [None]:
threat_df = threat_df.repartition(partition_size = '100MB')

In [None]:
threat_df.to_parquet('G:/threat_perception.parquet')

In [None]:
# client.close()