# Notebook to convert the change objects saved in `../data/change objects/` into change vectors.

In [1]:
import sys,os
sys.path.append("../")
import gensim
from gensim.models import KeyedVectors
from gensim.models.fasttext import FastText as FT_gensim
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from IPython.display import HTML
%matplotlib inline
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import pickle



In [2]:
def get_word_vecs(wiki_vec, masks, tokens, token_string_df):
    """Average the word vectors of the selected tokens of one context window.

    Parameters
    ----------
    wiki_vec : word-vector lookup
        Must expose ``vector_size`` and return one vector per key when indexed
        with a sequence of token strings (e.g. gensim ``KeyedVectors``).
    masks : pandas.DataFrame
        Indexed by token string, with a boolean ``"mask"`` column that is True
        for the strings allowed to contribute to the average.
    tokens : sequence of int
        Token ids of the window; may be a list, tuple or numpy array.
        A trailing ``-1`` and then a trailing ``-2`` are dropped before lookup
        (presumably boundary sentinels -- TODO confirm against the producer).
    token_string_df : pandas.Series
        Maps token id (index) to token string (values).

    Returns
    -------
    numpy.ndarray
        Mean vector of length ``wiki_vec.vector_size``; all zeros when the
        window is empty or none of its tokens is selected by ``masks``.
    """
    # Use len() rather than truthiness: `if tokens` raises for numpy arrays
    # holding more than one element.
    if len(tokens) > 0 and tokens[-1] == -1:
        tokens = tokens[:-1]
    if len(tokens) > 0 and tokens[-1] == -2:
        tokens = tokens[:-1]
    if len(tokens) == 0:
        return np.zeros(wiki_vec.vector_size)

    # Token ids -> token strings -> per-string "use it" flag.
    token_strings = token_string_df[np.array(tokens)].values
    selected = masks.loc[token_strings, "mask"].values
    if not np.any(selected):
        return np.zeros(wiki_vec.vector_size)

    return np.average(wiki_vec[token_strings[selected]], axis=0)


In [3]:
%%time 
# Load the pretrained vectors from the word2vec-format text file (binary=False).
# `limit=1000000` caps how many vectors are read; even so the recorded run
# below took several minutes, so avoid re-running this cell needlessly.
wiki_vec = KeyedVectors.load_word2vec_format('../../wordvectors/wiki.en.vec', binary=False, limit=1000000)

CPU times: user 3min 38s, sys: 3.47 s, total: 3min 41s
Wall time: 3min 43s


In [5]:
# Vocabulary keys of the loaded vectors.
# gensim >= 4 removed `KeyedVectors.vocab` (accessing it raises AttributeError)
# and exposes the keys as `index_to_key`; gensim 3.x only has `.vocab`.
try:
    vocab_list = list(wiki_vec.index_to_key)
except AttributeError:
    vocab_list = list(wiki_vec.vocab)

# Drop the first 20 vocabulary entries and every token of 3 characters or fewer.
filtered_vocab = [t for t in vocab_list[20:] if len(t) > 3]

# display(f"length of vocabulary is {len(vocab_list)} words")
# display(f"length of filtered vocabulary is {len(filtered_vocab)} words")

# Later cells use the vocabularies as numpy arrays (e.g. with np.isin).
vocab_list = np.array(vocab_list)
filtered_vocab = np.array(filtered_vocab)

### Reading the change objects and the token content.

In [7]:
%%time

article_name = "John_Logie_Baird"
change_object_dir =  "../data/change objects/"

change_object_file_name = f"{article_name}_vec.npz"
filename =  f"{article_name}_change.h5"
# change_file_name = f"{article_name}.pkl"
change_object_file = os.path.join(change_object_dir, filename)


if os.path.exists(change_object_file):
    with pd.HDFStore(change_object_file, 'r') as store:
        change_object_dataframe = store.get("data")
else:
    print("file do not exist")
# display(change_object_dataframe.shape)

CPU times: user 157 ms, sys: 27.4 ms, total: 184 ms
Wall time: 202 ms


In [8]:
%%time
content_dir = "../data/content/"
filename = article_name + ".h5"
filepath = os.path.join(content_dir, filename)
with pd.HDFStore(filepath, 'r') as store:
    all_rev = store.get("all_tokens")
unique_str = np.unique(all_rev.str)
str_in_filtered_vocab_mask = np.isin(unique_str, filtered_vocab, assume_unique=True)

filtered_vocab_masks_df = pd.DataFrame({ "str":unique_str, "mask":str_in_filtered_vocab_mask}).set_index("str")



token_string_df = all_rev.set_index("token_id")["str"]

CPU times: user 1min 3s, sys: 1.01 s, total: 1min 4s
Wall time: 1min 4s


## Build change vectors from the change objects.

In [9]:
def _neighbour_matrix(change_objects, window_size):
    """Build the change-vector matrix for one context window size.

    For every row of ``change_objects``:
    * ``left_token`` is reversed and its first ``window_size`` entries are kept;
    * ``right_token`` keeps its first ``window_size`` entries;
    * each side is reduced to a single vector with ``get_word_vecs``.

    Returns a 2-D array with one row per change object, holding the left
    vector followed by the right vector.
    """
    def embed(token_set):
        return get_word_vecs(wiki_vec, filtered_vocab_masks_df, token_set, token_string_df)

    left = np.stack(
        change_objects["left_token"]
        .apply(lambda token_set: token_set[::-1][:window_size])
        .apply(embed)
        .values
    )
    right = np.stack(
        change_objects["right_token"]
        .apply(lambda token_set: token_set[:window_size])
        .apply(embed)
        .values
    )
    return np.c_[left, right]


# Context window sizes (tokens per side) for which change vectors are built.
NEIGHBOUR_WINDOW_SIZES = (30, 25, 20, 15, 12, 10, 8, 6, 4, 2)

neighbour_matrices = {
    size: _neighbour_matrix(change_object_dataframe, size)
    for size in NEIGHBOUR_WINDOW_SIZES
}

# Keep the per-size names that the following cells refer to.
filtered_neighbour30_matrix = neighbour_matrices[30]
filtered_neighbour25_matrix = neighbour_matrices[25]
filtered_neighbour20_matrix = neighbour_matrices[20]
filtered_neighbour15_matrix = neighbour_matrices[15]
filtered_neighbour12_matrix = neighbour_matrices[12]
filtered_neighbour10_matrix = neighbour_matrices[10]
filtered_neighbour8_matrix = neighbour_matrices[8]
filtered_neighbour6_matrix = neighbour_matrices[6]
filtered_neighbour4_matrix = neighbour_matrices[4]
filtered_neighbour2_matrix = neighbour_matrices[2]

## Saving the change vectors to file

In [10]:
change_vector_dir = "../data/change_vector/"
change_vec_filename = f"{article_name}.npz"
change_vector_file = os.path.join(change_vector_dir, change_vec_filename)

# Create the output directory on first use; without it open() below raises
# FileNotFoundError.
os.makedirs(change_vector_dir, exist_ok=True)

# One array per context window size; the key prefix is the window size.
arrays_to_save = {
    "2_clean_not_weighted": filtered_neighbour2_matrix,
    "4_clean_not_weighted": filtered_neighbour4_matrix,
    "6_clean_not_weighted": filtered_neighbour6_matrix,
    "8_clean_not_weighted": filtered_neighbour8_matrix,
    "10_clean_not_weighted": filtered_neighbour10_matrix,
    "12_clean_not_weighted": filtered_neighbour12_matrix,
    "15_clean_not_weighted": filtered_neighbour15_matrix,
    "20_clean_not_weighted": filtered_neighbour20_matrix,
    "25_clean_not_weighted": filtered_neighbour25_matrix,
    "30_clean_not_weighted": filtered_neighbour30_matrix,
}
# All arrays go into a single uncompressed .npz archive.
with open(change_vector_file, "wb") as file:
    np.savez(file, **arrays_to_save)

In [11]:
# Sanity check: one row per change object; the columns are the left-context
# vector followed by the right-context vector.
filtered_neighbour25_matrix.shape

(4913, 600)

In [12]:
# NOTE(review): looks like a leftover scratch cell -- it only echoes the
# literal 9 and no visible cell depends on it; consider deleting it.
9

9