In [1]:
import pandas as pd
from numpy.random import randint, choice, seed
from gensim.models import Word2Vec, KeyedVectors
from dask import delayed, compute
from dask.distributed import Client
import dask.dataframe as dd
from glob import glob
import re

In [2]:
# Directory that is globbed for '*wordvectors' files in the next cell.
# The trailing slash matters: the glob pattern is built by plain string concatenation.
vector_dir = 'G:/vectors/'

In [3]:
# glob() returns matches in arbitrary order; sort so that the task order (and
# therefore the partition order of the output) is the same on every run.
vector_files = sorted(glob(vector_dir + '*wordvectors'))
# Fail here with a clear message instead of surfacing later in the dask pipeline.
assert vector_files, f'no *wordvectors files found in {vector_dir}'

In [4]:
countries = pd.read_csv('countries.csv')

In [5]:
countries = countries['country'].str.lower().unique()

In [6]:
def calculate_sim(filename, countries):
    """Compute the similarity between 'india' and each country for one model.

    Loads the KeyedVectors stored at ``filename`` and, if 'india' is in its
    vocabulary, calls ``vectors.similarity('india', country)`` for every entry
    of ``countries`` that is also in the vocabulary.

    Parameters
    ----------
    filename : str
        Path of a saved KeyedVectors file. It must contain a substring of the
        form ``M-YYYY`` / ``MM-YYYY``, which is stored in the ``month`` column.
    countries : iterable of str
        Country tokens to compare against 'india'.

    Returns
    -------
    pandas.DataFrame
        Columns ``country`` (str), ``similarity`` (float) and ``month`` (str),
        one row per country found in the vocabulary. The frame is empty, but
        still has all three columns, when 'india' or every requested country
        is missing from the vocabulary.

    Raises
    ------
    AttributeError
        If 'india' is in the vocabulary but ``filename`` contains no
        ``M-YYYY`` pattern (``re.search`` returns ``None``).
    """
    vectors = KeyedVectors.load(filename)
    # Test membership on the key->index mapping itself rather than on a list
    # copy of its keys, so each lookup avoids a linear scan of the vocabulary.
    vocab = vectors.key_to_index
    if 'india' in vocab:
        records = [{'country': country, 'similarity': vectors.similarity('india', country)}
                   for country in countries if country in vocab]
        # Explicit columns keep the schema intact when `records` is empty;
        # otherwise the astype() below raises KeyError on the missing columns.
        similarities = pd.DataFrame(records, columns=['country', 'similarity'])
        similarities['month'] = re.search(r'\d{1,2}-\d{4}', filename).group(0)
    else:
        similarities = pd.DataFrame(columns=['country', 'similarity', 'month'])
    # Normalise dtypes so every partition matches the dask `meta` declaration.
    return similarities.astype({'country': str, 'similarity': float, 'month': str})

In [7]:
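# Optional: uncomment to run the delayed tasks on a local dask.distributed cluster
# (pair with the client.close() cell at the end of the notebook).
# client = Client(n_workers = 8)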

In [8]:
similarities = [delayed(calculate_sim)(filename, countries) for filename in vector_files]

In [9]:
similarities = dd.from_delayed(similarities, meta = {'country':str, 'similarity':float, 'month':str})

In [10]:
similarities = similarities.repartition(partition_size = '100MB')

In [11]:
similarities.to_parquet('G:/country_similarities.parquet')

(None,)

In [12]:
# Only needed if the Client(...) cell above was enabled.
# client.close()