In [1]:
import sys,os
sys.path.append("../")
import gensim
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastText as FT_gensim
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from IPython.display import HTML
%matplotlib inline
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from scipy import stats

In [2]:
%%time 
# Load word vectors from the text-format (binary=False) wiki.en.vec file;
# `limit` caps how many vectors are read from the file.
wiki_vec = KeyedVectors.load_word2vec_format(
    "../../wordvectors/wiki.en.vec",
    binary=False,
    limit=1000000,
)

CPU times: user 5min 40s, sys: 10min 1s, total: 15min 42s
Wall time: 15min 43s


In [3]:
def get_word_vecs(wiki_vec, vocab, tokens):
    """Return the unweighted mean embedding of the in-vocabulary tokens.

    Parameters
    ----------
    wiki_vec : object indexable by a sequence of tokens, with a
        ``vector_size`` attribute (e.g. the loaded KeyedVectors).
    vocab : array-like of tokens that have an embedding.
    tokens : sequence of tokens to embed.

    Returns
    -------
    numpy.ndarray
        Mean over the embeddings of the tokens found in ``vocab``; a zero
        vector of length ``wiki_vec.vector_size`` if none of them is found.
    """
    token_array = np.array(tokens)
    known = np.isin(token_array, vocab)
    # Guard clause: nothing to average when no token has an embedding.
    if not known.any():
        return np.zeros(wiki_vec.vector_size)
    return np.mean(wiki_vec[token_array[known]], axis=0)

In [4]:
def weighted_average_word_vec(wiki_vec, vocab, tokens):
    """Return a position-weighted mean embedding of the in-vocabulary tokens.

    The token at index ``i`` gets weight ``exp(-i**2 / 32)``, so tokens at the
    start of ``tokens`` dominate. Out-of-vocabulary tokens are skipped but
    still occupy a position, i.e. they do not shift the weights of the
    tokens that follow them.

    Parameters
    ----------
    wiki_vec : object indexable by a sequence of tokens, with a
        ``vector_size`` attribute (e.g. the loaded KeyedVectors).
    vocab : array-like of tokens that have an embedding.
    tokens : sequence of tokens, ordered from highest to lowest weight.

    Returns
    -------
    numpy.ndarray
        Weighted mean of the embeddings of the tokens found in ``vocab``; a
        zero vector of length ``wiki_vec.vector_size`` if ``tokens`` is empty
        or none of them is found.
    """
    tokens = np.array(tokens)
    if tokens.size == 0:
        return np.zeros(wiki_vec.vector_size)

    tokens_in_vocab_mask = np.isin(tokens, vocab)
    if not tokens_in_vocab_mask.any():
        return np.zeros(wiki_vec.vector_size)

    # Work with log-weights and shift them so the largest one is 0 before
    # exponentiating. exp(-i**2 / 32) underflows to exactly 0.0 from roughly
    # i = 155 on; if every in-vocabulary token sat beyond that point the
    # weights would sum to zero and np.average would raise ZeroDivisionError.
    # The shift rescales all weights by a common factor, which leaves the
    # normalised average unchanged.
    log_weights = -(np.arange(len(tokens), dtype=float) ** 2) / 32
    in_vocab_log_weights = log_weights[tokens_in_vocab_mask]
    in_vocab_weight = np.exp(in_vocab_log_weights - in_vocab_log_weights.max())

    in_vocab_tokens = tokens[tokens_in_vocab_mask]
    return np.average(wiki_vec[in_vocab_tokens], weights=in_vocab_weight, axis=0)

### Reading the change object and clustering

In [5]:
# Article whose change objects are processed in this notebook.
article_name = "Violence_against_Muslims_in_India"

# Input: HDF5 file with the article's change objects.
change_object_dir = "../data/change objects/"
filename = f"{article_name}_change.h5"
change_object_file = os.path.join(change_object_dir, filename)

# Output: .npz archive with the vectors built from the change objects.
# NOTE: despite its name, change_object_file_name is the *vector* file name.
change_vector_dir = "../data/change_vector/"
change_object_file_name = f"{article_name}_vec.npz"
change_vector_file = os.path.join(change_vector_dir, change_object_file_name)

In [6]:
%%time
# Fail fast with the offending path: merely printing a message would leave
# `change_object_dataframe` undefined and make every later cell fail with an
# unrelated NameError.
if not os.path.exists(change_object_file):
    raise FileNotFoundError(f"Change object file does not exist: {change_object_file}")

# The change objects are stored under the key "data" of the HDF5 store.
with pd.HDFStore(change_object_file, 'r') as store:
    change_object_dataframe = store.get("data")

CPU times: user 1.82 s, sys: 4.86 s, total: 6.69 s
Wall time: 8.49 s


In [12]:
# Sanity check on the loaded frame: (rows, columns).
change_object_dataframe.shape

(3933, 12)

## Make vectors from change objects

In [None]:
# Array of every token that has an embedding, used for membership tests.
# gensim >= 4.0 removed `KeyedVectors.vocab` in favour of `key_to_index`;
# fall back to `vocab` so the cell keeps working on gensim 3.x.
vocab = np.array(list(
    wiki_vec.key_to_index if hasattr(wiki_vec, "key_to_index") else wiki_vec.vocab
))

In [None]:
%%time
# One mean embedding per change object for its inserted tokens ...
ins_vec_list = [
    get_word_vecs(wiki_vec, vocab, inserted)
    for inserted in change_object_dataframe["ins_tokens"]
]
ins_matrix = np.c_[ins_vec_list]

# ... and one for its deleted tokens.
del_vec_list = [
    get_word_vecs(wiki_vec, vocab, deleted)
    for deleted in change_object_dataframe["del_tokens"]
]
del_matrix = np.c_[del_vec_list]

# Element-wise mean of the two matrices (despite the "sum" in the name).
ins_del_sum_matrix = (ins_matrix + del_matrix) / 2

# Drop the intermediates from the kernel namespace.
del ins_vec_list, del_vec_list, ins_matrix, del_matrix

In [None]:
%%time
# Context window of 10: the last 10 entries of `left_token` and the first 10
# entries of `right_token`, each reduced to one mean embedding per row.
left_vec_list = [
    get_word_vecs(wiki_vec, vocab, left_tokens[-10:])
    for left_tokens in change_object_dataframe["left_token"]
]
left_neighbour_matrix = np.c_[left_vec_list]

right_vec_list = [
    get_word_vecs(wiki_vec, vocab, right_tokens[:10])
    for right_tokens in change_object_dataframe["right_token"]
]
right_neighbour_matrix = np.c_[right_vec_list]

# Feature layout: [left | right] and [left | ins/del mean | right].
neighbour_10_matrix = np.hstack([left_neighbour_matrix, right_neighbour_matrix])
ins_del_10_sum_neighbour_matrix = np.hstack(
    [left_neighbour_matrix, ins_del_sum_matrix, right_neighbour_matrix]
)




In [None]:
%%time
# Context window of 4: the last 4 entries of `left_token` and the first 4
# entries of `right_token`, each reduced to one mean embedding per row.
left_vec_list = [
    get_word_vecs(wiki_vec, vocab, left_tokens[-4:])
    for left_tokens in change_object_dataframe["left_token"]
]
left_neighbour_matrix = np.c_[left_vec_list]

right_vec_list = [
    get_word_vecs(wiki_vec, vocab, right_tokens[:4])
    for right_tokens in change_object_dataframe["right_token"]
]
right_neighbour_matrix = np.c_[right_vec_list]

# Feature layout: [left | right] and [left | ins/del mean | right].
neighbour_4_matrix = np.hstack([left_neighbour_matrix, right_neighbour_matrix])
ins_del_4_sum_neighbour_matrix = np.hstack(
    [left_neighbour_matrix, ins_del_sum_matrix, right_neighbour_matrix]
)


In [None]:
%%time
# weighted_average_word_vec gives the largest weight to index 0 and decays
# from there. The fixed-window cells take the *last* entries of `left_token`
# (token_set[-10:], token_set[-4:]) and the *first* entries of `right_token`,
# i.e. the relevant end of the left context is its tail. The left sequence is
# therefore reversed before weighting; otherwise the left tokens at the far
# end of the context would receive the largest weights.
left_vec_list = [
    weighted_average_word_vec(wiki_vec, vocab, token_set[::-1])
    for token_set in change_object_dataframe["left_token"]
]
left_neighbour_matrix = np.c_[left_vec_list]

# The right context already starts at index 0, so it is passed through as is.
right_vec_list = [
    weighted_average_word_vec(wiki_vec, vocab, token_set)
    for token_set in change_object_dataframe["right_token"]
]
right_neighbour_matrix = np.c_[right_vec_list]

# Feature layout: [left | right] and [left | ins/del mean | right].
weighted_neighbour_matrix = np.concatenate([left_neighbour_matrix, right_neighbour_matrix], axis=1)

ins_del_weighted_neighbour_matrix = np.concatenate([left_neighbour_matrix, ins_del_sum_matrix, right_neighbour_matrix], axis=1)


In [None]:
# del left_vec_list
# del right_vec_list
# del right_neighbour_matrix
# del left_neighbour_matrix

## Saving change object vectors to file

In [None]:
# Archive key -> feature matrix; the keys are the names used when reading
# the .npz file back.
arrays_to_save = dict(
    neighbour_10=neighbour_10_matrix,
    ins_del_10_sum_neighbour=ins_del_10_sum_neighbour_matrix,
    neighbour_4=neighbour_4_matrix,
    ins_del_4_sum_neighbour=ins_del_4_sum_neighbour_matrix,
    weighted_neighbour_matrix=weighted_neighbour_matrix,
    ins_del_weighted_neighbour_matrix=ins_del_weighted_neighbour_matrix,
)

# Write all matrices into one uncompressed .npz archive.
with open(change_vector_file, "wb") as file:
    np.savez(file, **arrays_to_save)

In [None]:
# Read the saved archive back and pull out the "neighbour_10" array under a
# separate name (neighbour_10_matrix_1), leaving neighbour_10_matrix untouched
# -- presumably to compare the round-tripped array with the in-memory one.
with open(change_vector_file, "rb") as file:
    arrays_dict = np.load(file)
    # Index the archive while the file is still open.
    neighbour_10_matrix_1 = arrays_dict["neighbour_10"]

In [None]:
# Restore every saved feature matrix from the .npz archive. The arrays are
# indexed while the file is still open.
with open(change_vector_file, "rb") as file:
    arrays_dict = np.load(file)
    neighbour_10_matrix = arrays_dict["neighbour_10"]
    ins_del_10_sum_neighbour_matrix = arrays_dict["ins_del_10_sum_neighbour"]
    neighbour_4_matrix = arrays_dict["neighbour_4"]
    ins_del_4_sum_neighbour_matrix = arrays_dict["ins_del_4_sum_neighbour"]
    # Previously bound to the misspelt name `weighted_neighbour_matrix_matrix`,
    # so `weighted_neighbour_matrix` was never restored from the archive.
    weighted_neighbour_matrix = arrays_dict["weighted_neighbour_matrix"]
    ins_del_weighted_neighbour_matrix = arrays_dict["ins_del_weighted_neighbour_matrix"]

# Alias kept so that any cell still using the old, misspelt name keeps working.
weighted_neighbour_matrix_matrix = weighted_neighbour_matrix