In [1]:
import sys,os
sys.path.append("../")
import gensim
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastText as FT_gensim
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from IPython.display import HTML
%matplotlib inline
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import pickle



In [2]:
%%time 
# Load the pre-trained vectors from the plain-text (non-binary) word2vec file.
# `limit` caps how many vectors are read from the file.
wiki_vec = KeyedVectors.load_word2vec_format(
    '../../wordvectors/wiki.en.vec',
    binary=False,
    limit=1000000,
)

CPU times: user 4min 45s, sys: 12.8 s, total: 4min 57s
Wall time: 6min 59s


In [3]:
# All tokens known to the embedding model, in the model's own order.
vocab_list = list(wiki_vec.vocab)
# "Filtered" vocabulary: skip the first 20 entries and keep only tokens longer
# than 3 characters (presumably to drop punctuation / very frequent short
# words -- TODO confirm the intent behind 20 and 3).
filtered_vocab = [t for t in vocab_list[20:] if len(t) > 3]

# Label the two counts differently so the outputs can be told apart.
display(f"length of vocabulary is {len(vocab_list)} words")
display(f"length of filtered vocabulary is {len(filtered_vocab)} words")

# Later cells use these as arrays for membership tests.
vocab_list = np.array(vocab_list)
filtered_vocab = np.array(filtered_vocab)

'lenght of vocabulary is 1000000 words'

'lenght of vocabulary is 970137 words'

In [4]:
# file_path = os.path.join(change_vector_dir
# with open("../../wordvectors/vocabs.pkl", "wb") as file:
# #     print(file)
#     pickle.dump(filtered_vocab, file)


In [6]:
def get_word_vecs(wiki_vec, masks, tokens):
    """Return the unweighted mean embedding of the in-vocabulary tokens.

    Parameters
    ----------
    wiki_vec
        Embedding lookup. Must expose ``vector_size`` and return the stacked
        vectors when indexed with an array of tokens.
    masks : pandas.DataFrame
        Frame indexed by token string with a boolean ``"mask"`` column that is
        True for tokens that may be looked up in ``wiki_vec``. Tokens missing
        from the index are not handled here.
    tokens : list or tuple of str
        Context tokens. A trailing ``"{st@rt}"`` and then a trailing
        ``"{$nd}"`` marker are stripped before embedding.

    Returns
    -------
    numpy.ndarray
        Mean of the selected vectors along axis 0, or a zero vector of length
        ``wiki_vec.vector_size`` when no token is left or none is in the
        vocabulary.
    """
    # Strip the boundary markers; they carry no lexical content.
    if tokens and tokens[-1] == "{st@rt}":
        tokens = tokens[:-1]
    if tokens and tokens[-1] == "{$nd}":
        tokens = tokens[:-1]
    if len(tokens) == 0:
        return np.zeros(wiki_vec.vector_size)

    tokens = np.array(tokens)
    tokens_in_vocab_mask = masks.loc[tokens, "mask"].values
    if not np.any(tokens_in_vocab_mask):
        # Nothing to average: fall back to the zero vector.
        return np.zeros(wiki_vec.vector_size)
    return np.average(wiki_vec[tokens[tokens_in_vocab_mask]], axis=0)

def weighted_average_word_vec(wiki_vec, masks, tokens):
    """Return a position-weighted mean embedding of the in-vocabulary tokens.

    After a trailing ``"{st@rt}"`` and then a trailing ``"{$nd}"`` marker are
    stripped, the token at 0-based position ``i`` of the remaining ``n``
    tokens gets weight ``(n - i) / n``: the first token weighs 1 and the last
    weighs ``1 / n``. Positions are counted over all remaining tokens,
    including those that are later dropped as out-of-vocabulary.

    ``masks`` is a frame indexed by token string with a boolean ``"mask"``
    column selecting the tokens that may be looked up in ``wiki_vec``.

    Returns a zero vector of length ``wiki_vec.vector_size`` when no token is
    left or none is in the vocabulary.
    """
    # Remove the boundary markers, in the same order as they are checked.
    for marker in ("{st@rt}", "{$nd}"):
        if tokens and tokens[-1] == marker:
            tokens = tokens[:-1]

    n_tokens = len(tokens)
    if n_tokens == 0:
        return np.zeros(wiki_vec.vector_size)

    # n, n-1, ..., 1 scaled by 1/n -> linearly decaying weights.
    position_weights = np.arange(n_tokens, 0, -1) / n_tokens

    token_array = np.array(tokens)
    known = masks.loc[token_array, "mask"].values
    known_tokens = token_array[known]
    known_weights = position_weights[known]

    if not np.any(known):
        return np.zeros(wiki_vec.vector_size)
    return np.average(wiki_vec[known_tokens], weights=known_weights, axis=0)

### Reading the change object and clustering

In [31]:
# Article to process and the directory holding its change-object files.
article_name = "Berlin_Wall"
change_object_dir = "../data/change objects/"

# File names derived from the article name.
change_object_file_name = article_name + "_vec.npz"
filename = article_name + "_change.h5"

# Full path of the HDF5 file that stores the change objects.
change_object_file = os.path.join(change_object_dir, filename)


In [32]:
%%time
# Fail fast when the input is missing. Otherwise the cell would continue with
# an undefined (or stale, from an earlier run) `change_object_dataframe`.
if not os.path.exists(change_object_file):
    raise FileNotFoundError(f"change object file does not exist: {change_object_file}")

# The change objects are stored under the "data" key of the HDF5 store.
with pd.HDFStore(change_object_file, 'r') as store:
    change_object_dataframe = store.get("data")
display(change_object_dataframe.shape)

(19641, 12)

CPU times: user 580 ms, sys: 96 ms, total: 676 ms
Wall time: 1.29 s


In [33]:
%%time
content_dir = "../data/content/"
filename = article_name + ".h5"
filepath = os.path.join(content_dir, filename)

# The tokens of all revisions are stored under the "all_tokens" key.
with pd.HDFStore(filepath, 'r') as store:
    all_rev = store.get("all_tokens")

# Distinct token strings of the article. The column is selected with ["str"]
# because `.str` reads like the pandas string accessor.
unique_str = np.unique(all_rev["str"])

# Hash-based membership test. `np.isin` on object-dtype string arrays falls
# back to one comparison pass per vocabulary entry, which made this cell run
# for many minutes; `Index.isin` returns the same boolean array.
str_in_vocab_mask = pd.Index(unique_str).isin(vocab_list)
str_in_filtered_vocab_mask = pd.Index(unique_str).isin(filtered_vocab)

# Lookup tables: token string -> "is this token in the (filtered) vocabulary?"
vocab_masks_df = pd.DataFrame({"str": unique_str, "mask": str_in_vocab_mask}).set_index("str")
filtered_vocab_masks_df = pd.DataFrame({"str": unique_str, "mask": str_in_filtered_vocab_mask}).set_index("str")





CPU times: user 15min 55s, sys: 264 ms, total: 15min 55s
Wall time: 15min 58s


## Make vectors from change objects

In [34]:
%%time

def _context_matrices(embed, masks, window):
    """Embed the ``window`` tokens on each side of every change object.

    embed  -- ``get_word_vecs`` or ``weighted_average_word_vec``
    masks  -- vocabulary mask frame handed through to ``embed``
    window -- number of context tokens kept on each side

    Returns ``(left, right)``: two matrices with one row per change object.
    """
    # The left context is reversed before slicing so that both sides are cut
    # from index 0 (assumes `left_token` is stored in reading order, i.e. the
    # token adjacent to the change comes last -- TODO confirm).
    left = np.stack(
        change_object_dataframe["left_token"]
        .apply(lambda token_set: embed(wiki_vec, masks, token_set[::-1][:window]))
        .values
    )
    # The right context is sliced without reversing. The original
    # `neighbour10_matrix` reversed it first, unlike every other variant in
    # this cell; that inconsistency is fixed here.
    right = np.stack(
        change_object_dataframe["right_token"]
        .apply(lambda token_set: embed(wiki_vec, masks, token_set[:window]))
        .values
    )
    return left, right


# Each matrix is the left-context embedding concatenated column-wise with the
# right-context embedding. `left_neighbour_matrix` / `right_neighbour_matrix`
# are kept as module-level names and hold the most recently computed pair.

# unweighted vectors, full vocabulary
left_neighbour_matrix, right_neighbour_matrix = _context_matrices(get_word_vecs, vocab_masks_df, 10)
neighbour10_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

left_neighbour_matrix, right_neighbour_matrix = _context_matrices(get_word_vecs, vocab_masks_df, 4)
neighbour4_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

# unweighted vectors, filtered vocabulary
left_neighbour_matrix, right_neighbour_matrix = _context_matrices(get_word_vecs, filtered_vocab_masks_df, 10)
filtered_neighbour10_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

left_neighbour_matrix, right_neighbour_matrix = _context_matrices(get_word_vecs, filtered_vocab_masks_df, 4)
filtered_neighbour4_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

# weighted vectors, full vocabulary
left_neighbour_matrix, right_neighbour_matrix = _context_matrices(weighted_average_word_vec, vocab_masks_df, 10)
weighted_neighbour10_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

left_neighbour_matrix, right_neighbour_matrix = _context_matrices(weighted_average_word_vec, vocab_masks_df, 4)
weighted_neighbour4_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

# weighted vectors, filtered vocabulary
left_neighbour_matrix, right_neighbour_matrix = _context_matrices(weighted_average_word_vec, filtered_vocab_masks_df, 10)
filtered_weighted_neighbour10_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

left_neighbour_matrix, right_neighbour_matrix = _context_matrices(weighted_average_word_vec, filtered_vocab_masks_df, 4)
filtered_weighted_neighbour4_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

CPU times: user 5min 30s, sys: 1.2 s, total: 5min 31s
Wall time: 5min 31s


In [35]:
def _full_context_matrices(embed, masks):
    """Embed the complete left and right context of every change object.

    embed -- ``get_word_vecs`` or ``weighted_average_word_vec``
    masks -- vocabulary mask frame handed through to ``embed``

    Returns ``(left, right)``: two matrices with one row per change object.
    The same ``embed`` function is applied to both sides.
    """
    # The left context is passed reversed, the right context as stored.
    left = np.stack(
        change_object_dataframe["left_token"]
        .apply(lambda token_set: embed(wiki_vec, masks, token_set[::-1]))
        .values
    )
    right = np.stack(
        change_object_dataframe["right_token"]
        .apply(lambda token_set: embed(wiki_vec, masks, token_set))
        .values
    )
    return left, right


# The two "not_weighted" matrices originally used the weighted average for
# their right half; both halves now use the unweighted `get_word_vecs`.
left_neighbour_matrix, right_neighbour_matrix = _full_context_matrices(get_word_vecs, vocab_masks_df)
not_filtered_not_weighted_neighbour30_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

left_neighbour_matrix, right_neighbour_matrix = _full_context_matrices(weighted_average_word_vec, vocab_masks_df)
not_filtered_weighted_neighbour30_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

left_neighbour_matrix, right_neighbour_matrix = _full_context_matrices(get_word_vecs, filtered_vocab_masks_df)
filtered_not_weighted_neighbour30_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

left_neighbour_matrix, right_neighbour_matrix = _full_context_matrices(weighted_average_word_vec, filtered_vocab_masks_df)
filtered_weighted_neighbour30_matrix = np.c_[left_neighbour_matrix, right_neighbour_matrix]

## Saving change object vectors to file

In [36]:
change_vector_dir = "../data/change_vector_optimised/"
change_vec_filename = f"{article_name}_comp_vec.npz"
change_vector_file = os.path.join(change_vector_dir, change_vec_filename)

# `open(..., "wb")` does not create missing directories. Make sure the target
# exists so the expensive matrices above are not lost to a FileNotFoundError.
os.makedirs(change_vector_dir, exist_ok=True)

# Archive keys -> matrices. In the key names "clean" corresponds to the
# filtered-vocabulary variants and the leading number to the context window.
arrays_to_save = {
    "4_clean_weighted": filtered_weighted_neighbour4_matrix,
    "10_clean_weighted": filtered_weighted_neighbour10_matrix,

    "4_clean_not_weighted": filtered_neighbour4_matrix,
    "10_clean_not_weighted": filtered_neighbour10_matrix,

    "4_notclean_weighted": weighted_neighbour4_matrix,
    "4_notclean_not_weighted": neighbour4_matrix,
    "10_notclean_weighted": weighted_neighbour10_matrix,
    "10_notclean_not_weighted": neighbour10_matrix,
    "not_filtered_not_weighted_neighbour30_matrix": not_filtered_not_weighted_neighbour30_matrix,
    "not_filtered_weighted_neighbour30_matrix": not_filtered_weighted_neighbour30_matrix,
    "filtered_weighted_neighbour30_matrix": filtered_weighted_neighbour30_matrix,
    "filtered_not_weighted_neighbour30_matrix": filtered_not_weighted_neighbour30_matrix
}
# Write all matrices into a single uncompressed .npz archive.
with open(change_vector_file, "wb") as file:
    np.savez(file, **arrays_to_save)