In [1]:
import sys,os
sys.path.append("../")
import gensim
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastText as FT_gensim
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from IPython.display import HTML
%matplotlib inline
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from scipy import stats
import pickle



In [2]:
%%time 
wiki_vec = KeyedVectors.load_word2vec_format('../../wordvectors/wiki.en.vec', binary=False, limit=1000000)

CPU times: user 6min 50s, sys: 2min 57s, total: 9min 48s
Wall time: 9min 49s


In [3]:
# Materialise the vocabulary keys as a list so they can be sliced by position.
# NOTE(review): `.vocab` is the gensim 3.x attribute name - confirm the pinned
# gensim version before upgrading.
vocab_list = list(wiki_vec.vocab.keys())
vocab_list[:20]

[',',
 '.',
 'the',
 '</s>',
 'of',
 '-',
 'in',
 'and',
 "'",
 ')',
 '(',
 'to',
 'a',
 'is',
 'was',
 'on',
 's',
 'for',
 'as',
 'by']

In [4]:
# Skip the first 20 vocabulary entries (the punctuation and stop words shown
# in the preview above) and keep only tokens of at least four characters.
filtered_vocab = [
    token
    for token in vocab_list[20:]
    if len(token) >= 4
]

In [5]:
# Report how many tokens survived the filter, then preview the first 30.
display(f"length of vocabulary is {len(filtered_vocab)} words")
filtered_vocab[:30]

'lenght of vocabulary is 970137 words'

['that',
 'with',
 'from',
 'this',
 'talk',
 'which',
 'also',
 'were',
 'have',
 'first',
 'page',
 'they',
 'article',
 'their',
 'there',
 'been',
 'made',
 'people',
 'after',
 'other',
 'should',
 'score',
 'would',
 'more',
 'about',
 'when',
 'time',
 'team',
 'american',
 'such']

In [6]:
# Convert the token list to a NumPy array; this array is what gets pickled
# below and is later used for the vocabulary-membership test when the
# per-article masks are built.
filtered_vocab = np.array(filtered_vocab)

In [7]:
# Persist the filtered vocabulary array to disk as a pickle.
vocab_pickle_path = "../../wordvectors/vocabs.pkl"
with open(vocab_pickle_path, "wb") as vocab_file:
    pickle.dump(filtered_vocab, vocab_file)


In [8]:
def get_word_vecs(wiki_vec, masks, tokens):
    """Average the embeddings of the in-vocabulary tokens of one token sequence.

    Parameters
    ----------
    wiki_vec : embedding lookup
        Indexed with an array of tokens it must return one row per token, and
        it must expose ``vector_size``.
    masks : pandas.DataFrame
        Indexed by token string, with a boolean ``"mask"`` column that is True
        for tokens that have an embedding. Tokens are looked up with ``.loc``,
        so they are expected to be present in the index.
    tokens : sequence of str or None
        Tokens to embed (list, tuple or NumPy array). A leading ``"{st@rt}"``
        and a trailing ``"{$nd}"`` marker are dropped before the lookup.

    Returns
    -------
    numpy.ndarray
        Mean vector of shape ``(wiki_vec.vector_size,)``. All zeros when no
        token is in the vocabulary, including empty or ``None`` input.
    """
    # list() gives tuples, lists and NumPy arrays one code path; a bare
    # `if tokens` would raise on a multi-element array.
    tokens = [] if tokens is None else list(tokens)
    if tokens and tokens[0] == "{st@rt}":
        tokens = tokens[1:]
    if tokens and tokens[-1] == "{$nd}":
        tokens = tokens[:-1]
    if not tokens:
        # Nothing left to look up.
        return np.zeros(wiki_vec.vector_size)

    tokens = np.array(tokens)
    tokens_in_vocab_mask = np.asarray(masks.loc[tokens, "mask"].values, dtype=bool)
    if not tokens_in_vocab_mask.any():
        return np.zeros(wiki_vec.vector_size)
    return np.average(wiki_vec[tokens[tokens_in_vocab_mask]], axis=0)

### Reading the change object and clustering

In [19]:
# Article under analysis; every file name in this cell is derived from it.
article_name = "Berlin_Wall"
change_object_dir = "../data/change objects/"

# Input: HDF5 file holding the article's change objects.
filename = f"{article_name}_change.h5"
change_object_file = os.path.join(change_object_dir, filename)

# File name used when the computed vectors are saved as an .npz archive.
change_object_file_name = f"{article_name}_vec.npz"

change_object_file

'../data/change objects/Berlin_Wall_change.h5'

In [20]:
%%time
if os.path.exists(change_object_file):
    with pd.HDFStore(change_object_file, 'r') as store:
        change_object_dataframe = store.get("data")
else:
    print("file do not exist")

CPU times: user 420 ms, sys: 176 ms, total: 596 ms
Wall time: 2.45 s


In [21]:
%%time
content_dir = "../data/content/"
filename = article_name + ".h5"
filepath = os.path.join(content_dir, filename)
with pd.HDFStore(filepath, 'r') as store:
    all_rev = store.get("all_tokens")
unique_str = np.unique(all_rev.str)
masks_vec = np.isin(unique_str, filtered_vocab)
masks_df = pd.DataFrame({ "str":unique_str, "mask":masks_vec}).set_index("str")

CPU times: user 8min 50s, sys: 1.82 s, total: 8min 52s
Wall time: 8min 53s


In [22]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
change_object_dataframe.shape

(19641, 12)

## Make vectors from change objects

In [23]:
%%time
ins_vec_list = []
change_object_dataframe["ins_tokens"].apply(lambda token_set: ins_vec_list.append(get_word_vecs(wiki_vec, masks_df, token_set)))
ins_matrix = np.c_[ins_vec_list]

del_vec_list = []
change_object_dataframe["del_tokens"].apply(lambda token_set: del_vec_list.append(get_word_vecs(wiki_vec, masks_df, token_set)))
del_matrix = np.c_[del_vec_list]
ins_del_sum_matrix = (ins_matrix + del_matrix)/2

del ins_vec_list
del del_vec_list
del ins_matrix
del del_matrix

CPU times: user 1min 18s, sys: 608 ms, total: 1min 18s
Wall time: 1min 17s


In [24]:
%%time
left_vec_list = []
change_object_dataframe["left_token"].apply(lambda token_set: left_vec_list.append(get_word_vecs(wiki_vec, masks_df, token_set[-10:])))
left_neighbour_matrix = np.c_[left_vec_list]

right_vec_list = []
change_object_dataframe["right_token"].apply(lambda token_set: right_vec_list.append(get_word_vecs(wiki_vec, masks_df, token_set[:10])))
right_neighbour_matrix = np.c_[right_vec_list]

neighbour_10_matrix = np.concatenate([ left_neighbour_matrix, right_neighbour_matrix], axis=1)

ins_del_10_sum_neighbour_matrix = np.concatenate([left_neighbour_matrix, ins_del_sum_matrix, right_neighbour_matrix], axis=1)




CPU times: user 49.3 s, sys: 1 s, total: 50.3 s
Wall time: 48.9 s


In [25]:
%%time
left_vec_list = []
change_object_dataframe["left_token"].apply(lambda token_set: left_vec_list.append(get_word_vecs(wiki_vec, masks_df, token_set[-4:])))
left_neighbour_matrix = np.c_[left_vec_list]

right_vec_list = []
change_object_dataframe["right_token"].apply(lambda token_set: right_vec_list.append(get_word_vecs(wiki_vec, masks_df, token_set[:4])))
right_neighbour_matrix = np.c_[right_vec_list]

neighbour_4_matrix = np.concatenate([left_neighbour_matrix, right_neighbour_matrix], axis=1)
 
ins_del_4_sum_neighbour_matrix = np.concatenate([left_neighbour_matrix, ins_del_sum_matrix, right_neighbour_matrix], axis=1)


CPU times: user 43.5 s, sys: 548 ms, total: 44 s
Wall time: 43 s


In [26]:
def weighted_average_word_vec(wiki_vec, masks, tokens, decay=1 / 16):
    """Position-weighted mean of the in-vocabulary token embeddings.

    After the boundary markers are removed, the token at position ``i`` gets
    weight ``exp(-decay * i)``: index 0 counts most and later tokens count
    progressively less. Callers that want a particular end of the sequence to
    dominate must pass the tokens with that end first.

    Parameters
    ----------
    wiki_vec : embedding lookup
        Indexed with an array of tokens it must return one row per token, and
        it must expose ``vector_size``.
    masks : pandas.DataFrame
        Indexed by token string, with a boolean ``"mask"`` column that is True
        for tokens that have an embedding. Tokens are looked up with ``.loc``,
        so they are expected to be present in the index.
    tokens : sequence of str or None
        Tokens to embed (list, tuple or NumPy array). A leading ``"{st@rt}"``
        and a trailing ``"{$nd}"`` marker are dropped before weighting.
    decay : float, optional
        Exponential decay rate per position. The default ``1/16`` reproduces
        the previously hard-coded ``exp(-(1/32) * i * 2)`` weights exactly.

    Returns
    -------
    numpy.ndarray
        Weighted mean vector of shape ``(wiki_vec.vector_size,)``. All zeros
        when no token is in the vocabulary, including empty or ``None`` input.
    """
    # list() gives tuples, lists and NumPy arrays one code path; a bare
    # `if tokens` would raise on a multi-element array.
    tokens = [] if tokens is None else list(tokens)
    if tokens and tokens[0] == "{st@rt}":
        tokens = tokens[1:]
    if tokens and tokens[-1] == "{$nd}":
        tokens = tokens[:-1]
    if not tokens:
        # Nothing left to look up.
        return np.zeros(wiki_vec.vector_size)

    # Positions are counted after marker removal, over ALL remaining tokens;
    # out-of-vocabulary tokens still occupy a position.
    weights = np.exp(-decay * np.arange(len(tokens)))
    tokens = np.array(tokens)
    tokens_in_vocab_mask = np.asarray(masks.loc[tokens, "mask"].values, dtype=bool)
    if not tokens_in_vocab_mask.any():
        return np.zeros(wiki_vec.vector_size)
    return np.average(
        wiki_vec[tokens[tokens_in_vocab_mask]],
        weights=weights[tokens_in_vocab_mask],
        axis=0,
    )

In [27]:
%%time
left_vec_list = []
change_object_dataframe["left_token"].apply(lambda token_set: left_vec_list.append(weighted_average_word_vec(wiki_vec, masks_df, token_set)))
left_neighbour_matrix = np.c_[left_vec_list]

right_vec_list = []
change_object_dataframe["right_token"].apply(lambda token_set: right_vec_list.append(weighted_average_word_vec(wiki_vec, masks_df, token_set)))
right_neighbour_matrix = np.c_[right_vec_list]

weighted_neighbour_matrix = np.concatenate([left_neighbour_matrix, right_neighbour_matrix], axis=1)
ins_del_weighted_neighbour_matrix = np.concatenate([left_neighbour_matrix, ins_del_sum_matrix, right_neighbour_matrix], axis=1)

CPU times: user 1min 8s, sys: 644 ms, total: 1min 9s
Wall time: 1min 8s


## Saving change object vectors to file

In [28]:
change_vector_dir = "../data/change_vector_optimised/"
# Create the output directory up front so the write cannot fail on a missing
# folder after the long vector computations above.
os.makedirs(change_vector_dir, exist_ok=True)
change_vector_file = os.path.join(change_vector_dir, change_object_file_name)

# Archive key -> matrix; each key becomes an array name inside the .npz file.
arrays_to_save = {
    "neighbour_10": neighbour_10_matrix,
    "ins_del_10_sum_neighbour": ins_del_10_sum_neighbour_matrix,
    "neighbour_4": neighbour_4_matrix,
    "ins_del_4_sum_neighbour": ins_del_4_sum_neighbour_matrix,
    "weighted_neighbour_matrix": weighted_neighbour_matrix,
    "ins_del_weighted_neighbour_matrix": ins_del_weighted_neighbour_matrix
}

with open(change_vector_file, "wb") as file:
    np.savez(file, **arrays_to_save)