# Visualisation for clusters of an article
This notebook visualises and compares change objects as clustered according to **Bykau et al.** and using **Word2vec** for an article. Given an **article_name**, it shows the creation and deletion portions of each cluster, with the revision id, editor and time shown as the index.

In [1]:
import sys
import os
import pickle
from string import punctuation
from string import whitespace


import matplotlib.pyplot as plt
from IPython.display import HTML

from ipywidgets import interact, interactive, fixed, interact_manual, HBox

import ipywidgets as widgets
import pandas as pd
import numpy as np

sys.path.append("../")


In [2]:
# CSS rules expressed as selector/props dictionaries.
# NOTE(review): this looks like the format pandas' Styler.set_table_styles expects,
# but nothing in the visible cells applies it — confirm before relying on it.
table_style = [
    dict(
        selector='table',
        props=[('border', "6px double #696969")],
    ),
    dict(
        selector='th',
        props=[
            ('border', "2px solid #D3D3D3"),
            ("font-size", "100%"),
        ],
    ),
    dict(
        selector=".data",
        props=[
            ("text-align", "justify"),
            ('border', "1px solid #000"),
            ('margin', '4px 24px 4px 24px'),
            ("font-size", "8pt"),
        ],
    ),
]

# Inline styles for deleted (red) and inserted (blue) tokens; only the colour differs.
deleted_token_style = dict([("color", "red"), ("font-weight", "bold"), ("font-size", "100px")])
inserted_token_style = dict([("color", "blue"), ("font-weight", "bold"), ("font-size", "100px")])

In [3]:
# Load the pickled vocabulary and keep a set copy of it; the set is what later
# cells use for `token in vocab_set` membership checks.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# point this at a vocabs.pkl file you produced or otherwise trust.
with open("../../wordvectors/vocabs.pkl", "rb") as file:
    vocab = pickle.load(file)
vocab_set = set(vocab)

In [4]:
### Name of the article to visualise; the data file names read below are derived from it.
article_name = "Yugoslavia"

### We first read and visualise the clusters computed from vectors prepared using pre-trained word2vec vectors.

Word tokens in change objects are converted into vector space using 300-dimensional pre-trained fastText embedding vectors. The size of the loaded vocabulary is 1 million words.

Each change object is represented by concatenating the vectors representing the left neighbour tokens, the inserted and deleted tokens, and the right neighbour tokens. The insert and delete token vectors are the average of the fastText vector representations of each word in the tokens. The left and right neighbour vectors are prepared by taking a weighted average of the vectors given by the word2vec embeddings. The weights are created by an exponential decay function whose parameter is the neighbouring token's distance from the inserted and deleted words.


#### Reading change object

In [5]:
# Build the path of this article's change-object store.
change_object_dir = "../data/change objects/"
content_file = f"{article_name}_change.h5"
change_object_path = os.path.join(change_object_dir, content_file)

# Open the store read-only and load the table saved under the "data" key.
with pd.HDFStore(change_object_path, mode='r') as store:
    change_df = store["data"]
change_df.shape

(11960, 12)

## Remove larger change objects.

In [6]:
# Number of inserted / deleted tokens in each change object.
change_df["ins_length"] = change_df["ins_tokens"].map(len)
change_df["del_length"] = change_df["del_tokens"].map(len)

# Keep a change object only if neither its inserted nor its deleted side exceeds 20 tokens.
optimised_change_object_mask = change_df["ins_length"].le(20) & change_df["del_length"].le(20)

# Independent copy, so columns added to it later do not write back into change_df.
optimised_df = change_df.loc[optimised_change_object_mask].copy()

optimised_df.shape

(10492, 14)

In [7]:
# Token counts of the inserted / deleted parts, stored under a second pair of column names.
# NOTE(review): these appear to duplicate "ins_length"/"del_length" from the previous cell;
# they are added to change_df after optimised_df was copied from it, so optimised_df does
# not carry them, and no later visible cell reads them — confirm whether this cell is needed.
change_df["ins_token_len"] = change_df["ins_tokens"].str.len()
change_df["del_token_len"] = change_df["del_tokens"].str.len()

##### Reading revision clusters.

In [8]:
# Build the path of this article's cluster-assignment store.
cluster_dir = "../data/clusters/"
file_name = f"{article_name}_optimised_cluster.h5"

change_dataframe_path = os.path.join(cluster_dir, file_name)

# Open the store read-only and load the table saved under the "cluster" key.
with pd.HDFStore(change_dataframe_path, mode='r') as store:
    optimised_cluster_df = store["cluster"]

In [9]:
# cluster_dir = "../data/clusters/"
# file_name = f"{article_name}_cluster.h5"
# change_dataframe_path = os.path.join(cluster_dir,file_name)

# with pd.HDFStore(change_dataframe_path, 'r') as store:
#     cluster_df = store.get("cluster")

##### Merging change object with its clusters

In [52]:
# change_df_with_clusters = pd.concat([change_df, optimised_cluster_df], axis=1)
# change_df_with_clusters["edited_tokens"] = change_df_with_clusters["ins_tokens"] + change_df_with_clusters["del_tokens"]


# Put the cluster columns next to the change-object columns (column-wise concat on the index).
optimised_df_with_clusters = pd.concat([optimised_df, optimised_cluster_df], axis="columns")

# Per change object, the inserted tokens followed by the deleted tokens.
optimised_df_with_clusters["edited_tokens"] = (
    optimised_df_with_clusters["ins_tokens"]
    + optimised_df_with_clusters["del_tokens"]
)



### Make left, ins and delete string for visualisation

In [53]:
# optimised_df_with_clusters["left_string"] = optimised_df_with_clusters["left_token"].apply(lambda tokens: tuple(token for token in tokens if token.isalnum())).str.join(" ")
# optimised_df_with_clusters["del_string"] = optimised_df_with_clusters["ins_tokens"].apply(lambda tokens: tuple(token for token in tokens if token.isalnum())).str.join(" ")
# optimised_df_with_clusters["ins_string"] = optimised_df_with_clusters["del_tokens"].apply(lambda tokens: tuple(token for token in tokens if token.isalnum())).str.join(" ")
# optimised_df_with_clusters["right_string"] = optimised_df_with_clusters["right_token"].apply(lambda tokens: tuple(token for token in tokens if token.isalnum())).str.join(" ")

# Join every token sequence into one space-separated string for display.
for token_column, string_column in (
    ("left_token", "left_string"),
    ("ins_tokens", "ins_string"),
    ("del_tokens", "del_string"),
    ("right_token", "right_string"),
):
    optimised_df_with_clusters[string_column] = optimised_df_with_clusters[token_column].str.join(" ")

## Finding relative positions of change object

In [54]:
# Paths inside the content directory: the article content store and the
# pre-computed revision-length table.
content_dir = "../data/content/"
content_file = f"{article_name}.h5"
content_path = os.path.join(content_dir, content_file)
len_file = f"{article_name}_rev_len.h5"
len_file_path = os.path.join(content_dir, len_file)

# Only the revision-length table is read here; content_path is used solely by the
# commented-out code below that regenerates that table.
rev_len_df = pd.read_hdf(len_file_path, key="rev_len")



# with pd.HDFStore(content_path, 'r') as store:
#     #retrieving all rev list and change object from file
#     rev_list = store.get("rev_list")["id"].values.tolist()
#     keys = ["r" +  str(rev) for rev in rev_list]
#     rev_len_list = [store.get(key).shape[0] for key in keys]

# rev_len_df = pd.DataFrame({"rev_id":rev_list[:-1], "length": rev_len_list[:-1]})
# rev_len_df.to_hdf(len_file_path, "rev_len")


In [55]:
# Re-key the change objects by their source revision, then attach the columns of
# rev_len_df (keyed by "rev_id") to each row.
optimised_df_with_clusters = (
    optimised_df_with_clusters
    .reset_index()
    .set_index('from revision id')
    .join(rev_len_df.set_index("rev_id"))
)
optimised_df_with_clusters.index.name = "from revision id"

# (left_neigh + 1) divided by the revision length.
# NOTE(review): presumably left_neigh is the token offset of the left neighbour, which
# would make this the change's position as a fraction of the revision — confirm.
optimised_df_with_clusters["relative_position"] = (
    (optimised_df_with_clusters["left_neigh"] + 1)
    / optimised_df_with_clusters["length"]
)


# plt.scatter(np.arange(optimised_df_with_clusters["relative_position"].shape[0])+1,  optimised_df_with_clusters["relative_position"], linestyle="-")
# plt.xscale("log")
# plt.ylim([0,1])
# plt.xlim([1,optimised_df_with_clusters["relative_position"].shape[0]+1])

#### Grouping clusters and making html of each groups

In [221]:
# Names of the cluster-label columns that can be passed as `clustering_by`.
clustering_options = [
    'dbscan_weighted_neighbour',
    'dbscan_weighted_all',
    'cluster_weighted_neighbour',
    'cluster_4',
    'cluster_10',
    'cluster_4_full',
    'cluster_4_weighted',
]

In [67]:
clustering_by = "dbscan_weighted_neighbour"

# True for rows whose cluster label is not -1. Despite its name the mask tests
# against -1, and nothing in the visible cells uses it afterwards.
# NOTE(review): -1 is presumably the DBSCAN "noise" label — confirm.
non_zero_cluster_mask = (optimised_df_with_clusters[clustering_by] != -1)


def _token_freq_per_cluster(frame, token_column, cluster_column):
    """Count the in-vocabulary tokens of ``token_column`` within each cluster.

    Tokens that are not in ``vocab_set`` are dropped, the remaining tokens of all
    rows of a cluster are pooled, and ``value_counts`` yields their frequencies
    with the most frequent token first.

    Returns a Series indexed by (cluster label, token).
    """
    in_vocab_tokens = frame.set_index(cluster_column)[token_column].apply(
        lambda tokens: tuple(token for token in tokens if token in vocab_set)
    )
    return in_vocab_tokens.groupby(cluster_column).apply(
        lambda rows: pd.Series(np.concatenate(rows.values, axis=0)).value_counts(ascending=False)
    )


def _cluster_to_html(cluster_frame):
    """Render one cluster's rows as an HTML table via the pandas Styler."""
    styler = cluster_frame.style
    # Styler.render() was deprecated in pandas 1.4 and removed in 2.0; to_html()
    # replaces it. Fall back to render() for older pandas.
    if hasattr(styler, "to_html"):
        return styler.to_html()
    return styler.render()


edited_tokens_freq_per_group = _token_freq_per_cluster(optimised_df_with_clusters, "edited_tokens", clustering_by)
left_context_freq_per_group = _token_freq_per_cluster(optimised_df_with_clusters, "left_token", clustering_by)
right_context_freq_per_group = _token_freq_per_cluster(optimised_df_with_clusters, "right_token", clustering_by)

# Use revision ids, timestamp, editor and level_5 as the index so they appear as
# row labels in the rendered HTML tables.
optimised_df_with_clusters = optimised_df_with_clusters.reset_index().set_index(["from revision id","to revision id","timestamp", "editor", "level_5"])

repers_weighted = optimised_df_with_clusters.groupby(clustering_by)[["left_string", "del_string", "ins_string", "right_string"]].apply(_cluster_to_html)

# Switch to the shorter index; "timestamp" and "editor" become ordinary columns again.
optimised_df_with_clusters = optimised_df_with_clusters.reset_index().set_index(["from revision id", "level_5"])

change_grouped_by_tokens = optimised_df_with_clusters.groupby(clustering_by)



### Ranking
###### Ranking clustered groups on the following parameters.
1. Size of clusters
2. Number of unique editors in clusters
3. Total period of the cluster, i.e. the difference between its start and end dates.
4. Mean time gap in the cluster.

In [68]:
# Number of change objects per cluster, largest first.
rank_by_size = change_grouped_by_tokens.size().sort_values(ascending=False)

# One flattened, grouped view shared by the three rankings below.
_flat_cluster_groups = optimised_df_with_clusters.reset_index().groupby(clustering_by)

# Distinct editors per cluster, most first.
rank_by_uniq_editor = _flat_cluster_groups["editor"].nunique().sort_values(ascending=False)

# Span between the earliest and latest timestamp of each cluster, longest first.
rank_by_period = (
    _flat_cluster_groups["timestamp"]
    .apply(lambda stamps: stamps.max() - stamps.min())
    .sort_values(ascending=False)
)

# Mean "timegap" per cluster, largest mean first.
rank_by_rate = (
    _flat_cluster_groups["timegap"]
    .apply(lambda gaps: gaps.mean())
    .sort_values(ascending=False)
)

# unique_word_count_per_group = word_freq_per_group.groupby("cluster_4_weighted").apply(lambda x: x.index.shape[0]).sort_values(ascending=False)

## Visualisation for change objects clustered on neighbour vectors

The following visualisation can be used for comparison with the above visualisation, which uses neighbour vectors to cluster.



In [69]:
# Kept for any cell that still reads it; plot_freq itself uses its `number` argument.
count = 100
def plot_freq(gap_freq, left_context_freq, right_context_freq, timestamp, relative_position, number=100):
    """Plot word frequencies of one cluster and the relative positions of its changes.

    Two figures are created: a row of three horizontal bar charts (left context,
    gap, right context) and a scatter plot of ``relative_position`` in the order given.

    Parameters
    ----------
    gap_freq, left_context_freq, right_context_freq : pandas.Series
        Word counts indexed by word; the first ``number`` entries are plotted.
    timestamp : sequence
        One entry per point of ``relative_position``; used to label the x axis.
    relative_position : numpy.ndarray
        Values plotted on the y axis of the scatter plot.
    number : int, default 100
        How many leading entries of each frequency series to plot.

    Returns
    -------
    matplotlib.figure.Figure
        The bar-chart figure (the scatter figure is created but not returned).
    """
    fig, axs = plt.subplots(nrows=1, ncols=3, figsize=(35, 20))

    panels = (
        (left_context_freq, "left context"),
        (gap_freq, "gap"),
        (right_context_freq, "right context"),
    )
    for bar_ax, (freq, where) in zip(axs, panels):
        # Reverse so the first entry of the series ends up at the top of the chart.
        bar_ax.barh(freq.index[:number][::-1], freq.values[:number][::-1])
        bar_ax.set_title(f" frequency plot of top {number} words in {where}")
        bar_ax.set_xlabel("frequency")
        bar_ax.set_ylabel(f"unique words in {where} ")

    fig2, ax = plt.subplots(nrows=1, ncols=1, figsize=(35, 20))
    positions = np.arange(relative_position.shape[0]) + 1
    ax.scatter(positions, relative_position, c="red", marker="D", label="relative position with respect to timestamp")
    ax.set_title("Time scale invariant Plot of timestamp with relative position")
    ax.set_xlabel("Position with respect to time")
    ax.set_ylabel("relative position ")

    # set_xticklabels alone attaches labels to whatever default ticks exist, so the
    # timestamps would not line up with the points. Fix the tick positions first and
    # label only an evenly spaced subset to keep the axis readable.
    max_ticks = 20
    if positions.size:
        tick_idx = np.unique(
            np.linspace(0, positions.size - 1, num=min(positions.size, max_ticks)).astype(int)
        )
        ax.set_xticks(positions[tick_idx])
        ax.set_xticklabels([str(stamp) for stamp in np.asarray(timestamp)[tick_idx]], rotation=90)
    ax.legend()

    return fig
# _= plot_freq(edited_tokens_freq_per_group.loc[1], 
#             left_context_freq_per_group.loc[1], 
#             right_context_freq_per_group.loc[1],
#             change_grouped_by_tokens["timestamp"].get_group(2).values,
#             change_grouped_by_tokens["relative_position"].get_group(2).values
#             )

In [77]:
def display_article_content(index, change_html_series, edited_tokens_freq_per_group, left_context_freq_per_group, right_context_freq_per_group, change_grouped_by_tokens, out):
    """Show the plots and the HTML table of one cluster.

    Parameters
    ----------
    index
        Cluster label to display.
    change_html_series
        Series mapping a cluster label to its rendered HTML table.
    edited_tokens_freq_per_group, left_context_freq_per_group, right_context_freq_per_group
        Per-cluster word counts; a cluster missing from one of them is plotted as empty.
    change_grouped_by_tokens
        Grouped frame providing "timestamp" and "relative_position" for the cluster.
    out
        Output widget that is cleared and then receives the caption and the table.
    """
    def _freq_or_empty(freq_per_group):
        # Word counts of the selected cluster, or an empty Series when it has none.
        return freq_per_group.loc[index] if index in freq_per_group.index else pd.Series()

    with out:
        out.clear_output()

    change_html = (
        change_html_series.loc[index]
        if index in change_html_series.index
        else "<p>empty table according to cleanup</p>"
    )
    left_context_freq = _freq_or_empty(left_context_freq_per_group)
    right_context_freq = _freq_or_empty(right_context_freq_per_group)
    edited_tokens_freq = _freq_or_empty(edited_tokens_freq_per_group)

    # The figures are created outside the `out` context, exactly as before.
    cluster_timestamps = change_grouped_by_tokens["timestamp"].get_group(index).values
    cluster_positions = change_grouped_by_tokens["relative_position"].get_group(index).values
    plot_freq(edited_tokens_freq, left_context_freq, right_context_freq, cluster_timestamps, cluster_positions)

    with out:
        display(f"Word length distribution for {index}")
        display(HTML(change_html))
        

In [78]:
# Pairs of (rank position, cluster label), ordered by rank_by_rate; used as the
# options of the cluster dropdown below.
drop_down = list(zip(np.arange(rank_by_rate.size), rank_by_rate.index))

**Please rerun the next cell each time the page is reloaded.**

In [79]:
outp = widgets.Output(layout={'border': '1px solid black'})

# Only `index` is interactive (the cluster dropdown); every other argument is
# pinned with widgets.fixed.
_ = widgets.interact(
    display_article_content,
    index=drop_down,
    change_html_series=widgets.fixed(repers_weighted),
    edited_tokens_freq_per_group=widgets.fixed(edited_tokens_freq_per_group),
    left_context_freq_per_group=widgets.fixed(left_context_freq_per_group),
    right_context_freq_per_group=widgets.fixed(right_context_freq_per_group),
    change_grouped_by_tokens=widgets.fixed(change_grouped_by_tokens),
    out=widgets.fixed(outp),
);

interactive(children=(Dropdown(description='index', options=((0, 192), (1, 198), (2, 39), (3, 200), (4, 0), (5…

In [80]:
# Display the Output widget that display_article_content writes the cluster table into.
outp

Output(layout=Layout(border='1px solid black'), outputs=({'output_type': 'display_data', 'data': {'text/plain'…

In [None]:
# widgets.Dropdown(
#     options=['1', '2', '3'],
#     value='2',
#     description='Number:',
#     disabled=False,
# )

### **Bykau et al.** change objects and their clusters

In [None]:
# Build the path of the Bykau change-object store for this article.
bykau_dir = "../data/bykau_change_object/"
filename = article_name + "_change.h5"

change_object_file = os.path.join(bykau_dir, filename)

# Load the table saved under the "data" key.
bykau_change_df = pd.read_hdf(change_object_file, key="data")
bykau_change_df.shape

In [None]:
# Re-key the Bykau change objects by their source revision, then attach the columns
# of rev_len_df (keyed by "rev_id") to each row.
bykau_change_df = (
    bykau_change_df
    .reset_index()
    .set_index('from revision id')
    .join(rev_len_df.set_index("rev_id"))
)
bykau_change_df.index.name = "from revision id"

# (left_neigh + 1) divided by the revision length, as for the word2vec clusters.
bykau_change_df["relative_position"] = (
    (bykau_change_df["left_neigh"] + 1)
    / bykau_change_df["length"]
)

In [None]:
# Join every token sequence into one space-separated string for display.
for token_column, string_column in (
    ("left_token", "left_string"),
    ("ins_tokens", "ins_string"),
    ("del_tokens", "del_string"),
    ("right_token", "right_string"),
):
    bykau_change_df[string_column] = bykau_change_df[token_column].str.join(" ")

#### Grouping and ranking.

In [None]:
bykau_groups = bykau_change_df.groupby("reclustered_group")

# bykau_edited_freq = bykau_groups["ins_tokens"].apply(lambda x: find_freq_vocab_words(x, vocab))
# bykau_left_context_freq = bykau_groups["left_token"].apply(lambda x: find_freq_vocab_words(x, vocab))
# bykau_right_context_freq = bykau_groups["right_token"].apply(lambda x: find_freq_vocab_words(x, vocab))


def _bykau_token_counts(token_rows):
    """Pool the token sequences of one group and count each token, most frequent first."""
    pooled_tokens = np.concatenate(token_rows.values, axis=0)
    return pd.Series(pooled_tokens).value_counts(ascending=False)


# No vocabulary filter is applied here, and the "edited" counts use the inserted tokens only.
bykau_edited_freq = bykau_groups["ins_tokens"].apply(_bykau_token_counts)
bykau_left_context_freq = bykau_groups["left_token"].apply(_bykau_token_counts)
bykau_right_context_freq = bykau_groups["right_token"].apply(_bykau_token_counts)


In [None]:
# Number of change objects per Bykau group, largest first.
bykau_rank_by_size = bykau_groups.size().sort_values(ascending=False)

# One flattened, grouped view shared by the three rankings below.
_bykau_flat_groups = bykau_change_df.reset_index().groupby("reclustered_group")

# Distinct editors per group, most first.
bykau_rank_by_uniq_editor = _bykau_flat_groups["editor"].nunique().sort_values(ascending=False)

# Span between the earliest and latest timestamp of each group, longest first.
bykau_rank_by_period = (
    _bykau_flat_groups["timestamp"]
    .apply(lambda stamps: stamps.max() - stamps.min())
    .sort_values(ascending=False)
)

# Mean "timegap" per group, largest mean first.
bykau_rank_by_rate = (
    _bykau_flat_groups["timegap"]
    .apply(lambda gaps: gaps.mean())
    .sort_values(ascending=False)
)

# Index by (source revision, level_5) for the grouping and rendering cells below.
bykau_change_df = bykau_change_df.reset_index().set_index(["from revision id", "level_5"])


# unique_word_count_per_group = word_freq_per_group.groupby("cluster_4_weighted").apply(lambda x: x.index.shape[0]).sort_values(ascending=False)

#### Bykau Visualisation

This can be used to compare with our visualisation.

In [None]:
# Pairs of (rank position, group label), ordered by bykau_rank_by_period.
# NOTE(review): the word2vec dropdown above is ordered by rank_by_rate instead —
# confirm whether the two dropdowns are meant to use different rankings.
bykau_drop_down = list(zip(np.arange(bykau_rank_by_period.size), bykau_rank_by_period.index))

In [None]:
# Render each Bykau group as an HTML table of its left / deleted / inserted / right strings.
# Styler.render() was deprecated in pandas 1.4 and removed in 2.0; to_html() replaces it,
# with render() kept as a fallback for older pandas.
repers_bykau = bykau_change_df.groupby("reclustered_group")[["left_string", "del_string", "ins_string", "right_string"]].apply(
    lambda group: group.style.to_html() if hasattr(group.style, "to_html") else group.style.render()
)

**Please rerun the next cell each time the page is reloaded.**

In [None]:
bykau_outp = widgets.Output(layout={'border': '1px solid black'})

# _=widgets.interact(display_article_content, index=drop_down, change_html_series= widgets.fixed(repers_bykau), out=widgets.fixed(bykau_outp));

# Same viewer as for the word2vec clusters, fed with the Bykau groups; only `index`
# (the group dropdown) is interactive.
_ = widgets.interact(
    display_article_content,
    index=bykau_drop_down,
    change_html_series=widgets.fixed(repers_bykau),
    edited_tokens_freq_per_group=widgets.fixed(bykau_edited_freq),
    left_context_freq_per_group=widgets.fixed(bykau_left_context_freq),
    right_context_freq_per_group=widgets.fixed(bykau_right_context_freq),
    change_grouped_by_tokens=widgets.fixed(bykau_change_df.groupby("reclustered_group")),
    out=widgets.fixed(bykau_outp),
);

In [None]:
# Display the Output widget that receives the table of the selected Bykau group.
bykau_outp

In [None]:
# @interact( clusters_html=fixed(repers_4_neigh), group=range(groups.ngroups))
# def display_clusters(clusters_html, group):
#      return display(HTML(clusters_html.iloc[group]))

In [18]:
# drop_down = list(zip(rev_list.id, rev_list.index))