In [1]:
import sys
import os
sys.path.append("../")

import pandas as pd
import numpy as np
import pickle
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN


from sklearn.metrics import  silhouette_score, silhouette_samples
import matplotlib.pyplot as plt
from IPython.display import HTML
%matplotlib inline
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
from scipy import stats

## Reading the change object and clustering.

In [56]:
article_name = "Violence_against_Muslims_in_India"
change_object_dir =  "../data/change objects/"

change_object_file_name = f"{article_name}_vec.npz"
filename =  f"{article_name}_change.h5"

change_object_file = os.path.join(change_object_dir, filename)

change_vector_dir = "../data/change_vector_optimised/"
change_vector_file = os.path.join(change_vector_dir, change_object_file_name)

content_dir = "../data/content/"
len_file = article_name + "_rev_len.h5"
len_file_path = os.path.join(content_dir, len_file)

In [57]:
count = 100
def plot_freq(gap_freq, left_context_freq, right_context_freq, timestamp, relative_position, number=100):
    fig, axs = plt.subplots(nrows=1, ncols=3,figsize=(35, 20))
        
    axs[0].barh( left_context_freq.index[:count][::-1], left_context_freq.values[:count][::-1])
    axs[0].set_title(" frequency plot of top 100 words in left context")
    axs[0].set_xlabel("frequency")
    axs[0].set_ylabel("unique words in left context ")
    
    axs[1].barh( gap_freq.index[:count][::-1], gap_freq.values[:count][::-1])
    axs[1].set_title(" frequency plot of top 100 words in gap")
    axs[1].set_xlabel("frequency")
    axs[1].set_ylabel("unique words in gap ")
    
    axs[2].barh( right_context_freq.index[:count][::-1], right_context_freq.values[:count][::-1])
    axs[2].set_title(" frequency plot of top 100 words in right context")
    axs[2].set_xlabel("frequency")
    axs[2].set_ylabel("unique words in right context ")
    
    fig2, ax = plt.subplots(nrows=1, ncols=1,figsize=(35, 20))
    ax.scatter( np.arange(relative_position.shape[0])+1, relative_position, c="red",marker="D", label = "relative position with respect to timestamp")
    ax.set_title("Time scale invariant Plot of timestamp with relative position")
    ax.set_xlabel("Position with respect to time")
    ax.set_ylabel("relative position ")
    ax.set_xticklabels(timestamp)
    ax.legend()
#     axs[3].set_xscale("log")
#     axs[3].set_yscale("log")


    return fig
# _= plot_freq(edited_tokens_freq_per_group.loc[1], 
#             left_context_freq_per_group.loc[1], 
#             right_context_freq_per_group.loc[1],
#             change_grouped_by_tokens["timestamp"].get_group(2).values,
#             change_grouped_by_tokens["relative_position"].get_group(2).values
#             )

def display_article_content(index, change_html_series, edited_tokens_freq_per_group, left_context_freq_per_group, right_context_freq_per_group, change_grouped_by_tokens, out):
    with out:
        out.clear_output()
        
    change_html = change_html_series.loc[index]
    _ = plot_freq(edited_tokens_freq_per_group.loc[index], 
            left_context_freq_per_group.loc[index], 
            right_context_freq_per_group.loc[index],
            change_grouped_by_tokens["timestamp"].get_group(index).values,
            change_grouped_by_tokens["relative_position"].get_group(index).values)
    with out:
#         display(change_html)
        display(f"Word length distribution for {index}")
#         display(fig)
        display(HTML(change_html))
#     return out

In [58]:
%%time
if os.path.exists(change_object_file):
    with pd.HDFStore(change_object_file, 'r') as store:
        change_object_dataframe = store.get("data")
else:
    print("file do not exist")
rev_len_df = pd.read_hdf(len_file_path, key = "rev_len")



CPU times: user 332 ms, sys: 52 ms, total: 384 ms
Wall time: 383 ms


### Make left, ins and delete string for visualisation

In [59]:
change_object_dataframe["left_string"] = change_object_dataframe["left_token"].str.join(" ")
change_object_dataframe["ins_string"] = change_object_dataframe["ins_tokens"].str.join(" ")
change_object_dataframe["del_string"] = change_object_dataframe["del_tokens"].str.join(" ")
change_object_dataframe["right_string"] = change_object_dataframe["right_token"].str.join(" ")

## Remove bigger change object.

In [60]:
change_object_dataframe["ins_length"]= change_object_dataframe["ins_tokens"].apply(lambda x: len(x))
change_object_dataframe["del_length"]= change_object_dataframe["del_tokens"].apply(lambda x: len(x))

optimised_change_object_mask = ((change_object_dataframe["ins_length"] <= 20 ) & (change_object_dataframe["del_length"] <= 20))

change_object_dataframe = change_object_dataframe[optimised_change_object_mask]

change_object_dataframe.shape

(3181, 18)

## Read Vectors of change object.

In [61]:
%%time
with open(change_vector_file, "rb") as file:
    arrays_dict = np.load(file)
    neighbour_10_matrix = arrays_dict["neighbour_10"]
    ins_del_10_sum_neighbour_matrix = arrays_dict["ins_del_10_sum_neighbour"]
    neighbour_4_matrix = arrays_dict["neighbour_4"]
    ins_del_4_sum_neighbour_matrix = arrays_dict["ins_del_4_sum_neighbour"]
    weighted_neighbour_matrix = arrays_dict["weighted_neighbour_matrix"]
    ins_del_weighted_neighbour_matrix = arrays_dict["ins_del_weighted_neighbour_matrix"]

CPU times: user 400 ms, sys: 1 s, total: 1.4 s
Wall time: 8.21 s


In [62]:
# optimised_change_object["ins_token_len"]=optimised_change_object["ins_tokens"].str.len()
# optimised_change_object["del_token_len"]=optimised_change_object["del_tokens"].str.len()
weighted_neighbour_matrix.shape

(3933, 600)

### Remove vectors whose change object has been removed due to optimisation

In [63]:
optimised_ins_del_4_sum_neighbour_matrix = ins_del_4_sum_neighbour_matrix[optimised_change_object_mask,:]
optimised_ins_del_weighted_neighbour_matrix = ins_del_weighted_neighbour_matrix[optimised_change_object_mask,:]
optimised_neighbour_4_matrix = neighbour_4_matrix[optimised_change_object_mask,:]
optimised_ins_del_10_sum_neighbour_matrix = ins_del_10_sum_neighbour_matrix[optimised_change_object_mask,:]
optimised_neighbour_10_matrix = neighbour_10_matrix[optimised_change_object_mask,:]
optimised_weighted_neighbour_matrix  =  weighted_neighbour_matrix[optimised_change_object_mask,:]
optimised_weighted_neighbour_matrix.shape

(3181, 600)

## Clustering

In [64]:
table_style =     [
    {'selector': 'table', 'props': [('border', "6px double #696969")]},
    {'selector': 'th', 'props': [('border', "2px solid #D3D3D3"), ("font-size", "100%")]},
    {"selector":".data", "props":[("text-align", "justify"), ('border', "1px solid #000"), ('margin', '4px 24px 4px 24px' ), ("font-size", "8pt")]}
] 

deleted_token_style = {"color":"red", "font-weight": "bold","font-size": "100px"}
inserted_token_style = {"color":"blue", "font-weight": "bold","font-size": "100px"}

In [65]:
style_dict = {'border': "2px solid #000",
              "text-align": "justify"
    
}

### clustering using DB scan
#### clustering weighted neighbour using dbscan

In [89]:
%%time 
clusters = DBSCAN(eps=0.5, min_samples=4).fit(optimised_weighted_neighbour_matrix)
change_object_dataframe["dbscan_weighted_neighbour"] = pd.Series(clusters.labels_, index= change_object_dataframe.index)
repers_weighted = change_object_dataframe.groupby("dbscan_weighted_neighbour")[["left_string", "del_string", "ins_string", "right_string"]].apply(lambda x: x.style.render())


CPU times: user 8.29 s, sys: 4 ms, total: 8.3 s
Wall time: 8.29 s


In [90]:
@interact( clusters_html=fixed(repers_weighted), group=range(change_object_dataframe.groupby("dbscan_weighted_neighbour").ngroups))
def display_clusters(clusters_html, group):
     return display(HTML(clusters_html.iloc[group]))

interactive(children=(Dropdown(description='group', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…

#### All weighted vectors weighted_left + gap + weighted_right

In [93]:
%%time 
clusters = DBSCAN(eps=0.4, min_samples=4).fit(optimised_ins_del_weighted_neighbour_matrix)
change_object_dataframe["dbscan_weighted_all"] = pd.Series(clusters.labels_, index= change_object_dataframe.index)
repers_weigh_all = change_object_dataframe.groupby("dbscan_weighted_all")[["left_string", "del_string", "ins_string", "right_string"]].apply(lambda x: x.style.render())


CPU times: user 10.8 s, sys: 12 ms, total: 10.8 s
Wall time: 10.8 s


In [94]:
@interact( clusters_html=fixed(repers_weigh_all), group=range(change_object_dataframe.groupby("dbscan_weighted_all").ngroups))
def display_clusters(clusters_html, group):
     return display(HTML(clusters_html.iloc[group]))

interactive(children=(Dropdown(description='group', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…

#####  neghbours of size 4 vectors clusters

In [70]:
%%time

NO_OF_CLUSTERS = 70
km = KMeans(n_clusters= NO_OF_CLUSTERS, n_jobs=3, n_init=50)
clusters = km.fit(optimised_neighbour_4_matrix)

change_object_dataframe["cluster_4"] = pd.Series(clusters.labels_, index= change_object_dataframe.index)

repers_4_neigh = change_object_dataframe.groupby("cluster_4")[["left_string", "del_string", "ins_string", "right_string"]].apply(lambda x: x.style.render())




CPU times: user 1min 36s, sys: 7.49 s, total: 1min 44s
Wall time: 27.4 s


In [71]:
@interact( clusters_html=fixed(repers_4_neigh), group=range(change_object_dataframe.groupby("cluster_4").ngroups))
def display_clusters(clusters_html, group):
     return display(HTML(clusters_html.iloc[group]))

interactive(children=(Dropdown(description='group', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…

##### Cluster number of neighbour tokens=10, number of clusters =100 

In [72]:
%%time
NO_OF_CLUSTERS = 100
km = KMeans(n_clusters= NO_OF_CLUSTERS, n_jobs=3)
clusters_10 = km.fit(optimised_neighbour_10_matrix)

CPU times: user 24.9 s, sys: 1.66 s, total: 26.5 s
Wall time: 6.66 s


In [73]:
change_object_dataframe["cluster_10"] = pd.Series(clusters_10.labels_, index= change_object_dataframe.index)
change_grouped_by_tokens_10_neigh = change_object_dataframe.groupby("cluster_10")

In [74]:
repers_10_full_neigh = change_grouped_by_tokens_10_neigh[["left_string", "del_string", "ins_string", "right_string"]].apply(lambda x: x.style.render())

In [75]:
@interact( clusters_html=fixed(repers_10_full_neigh), group=range(change_grouped_by_tokens_10_neigh.ngroups))
def display_clusters(clusters_html, group):
     return display(HTML(clusters_html.iloc[group]))

interactive(children=(Dropdown(description='group', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…

### Clustering with vectors concatinated vectors average of 4 right and left neighbours and average of inserted and deleted tokens. 

In [76]:
%%time
NO_OF_CLUSTERS = 100
km = KMeans(n_clusters= NO_OF_CLUSTERS, n_jobs=3)

clusters_4_full = km.fit(optimised_ins_del_4_sum_neighbour_matrix)
change_object_dataframe["cluster_4_full"] = pd.Series(clusters_4_full.labels_, index= change_object_dataframe.index)

CPU times: user 37.8 s, sys: 2.75 s, total: 40.5 s
Wall time: 10.5 s


In [77]:
change_grouped_by_tokens_4_full = change_object_dataframe.groupby("cluster_4_full")

In [78]:
repers_4_full_neigh = change_grouped_by_tokens_4_full[["left_string", "del_string", "ins_string", "right_string"]].apply(lambda x: x.style.render())

In [79]:
@interact( clusters_html=fixed(repers_4_full_neigh), group=range(change_grouped_by_tokens_4_full.ngroups))
def display_clusters(clusters_html, group):
     return display(HTML(clusters_html.iloc[group]))

interactive(children=(Dropdown(description='group', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…

### Weighted neghbours vectors clusters

In [80]:
%%time
NO_OF_CLUSTERS = 70
km = KMeans(n_clusters= NO_OF_CLUSTERS, n_jobs=3, n_init=50)
clusters_neighbour = km.fit(optimised_weighted_neighbour_matrix)

change_object_dataframe["cluster_weighted_neighbour"] = pd.Series(clusters_neighbour.labels_, index= change_object_dataframe.index)


CPU times: user 1min 38s, sys: 7.35 s, total: 1min 45s
Wall time: 26.5 s


In [81]:
repers_weighted_neigh = change_object_dataframe.groupby("cluster_weighted_neighbour")[["left_string", "del_string", "ins_string", "right_string"]].apply(lambda x: x.style.render())
@interact( clusters_html=fixed(repers_4_neigh), group=range(change_object_dataframe.groupby("cluster_weighted_neighbour").ngroups))
def display_clusters(clusters_html, group):
     return display(HTML(clusters_html.iloc[group]))

interactive(children=(Dropdown(description='group', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…

### Clustering with vectors concatinated vectors weighted average of 4 right and left neighbours and average of inserted and deleted tokens. 

In [82]:
%%time
NO_OF_CLUSTERS = 100
km = KMeans(n_clusters= NO_OF_CLUSTERS, n_jobs=3)
clusters_4_full = km.fit(optimised_ins_del_weighted_neighbour_matrix)
change_object_dataframe["cluster_4_weighted"] = pd.Series(clusters_4_full.labels_, index= change_object_dataframe.index)

CPU times: user 37.6 s, sys: 2.6 s, total: 40.2 s
Wall time: 10.5 s


In [83]:
change_grouped_by_tokens_4_weighted = change_object_dataframe.groupby("cluster_4_weighted")

In [84]:
repers_4_weighted = change_grouped_by_tokens_4_weighted[["left_string", "del_string", "ins_string", "right_string"]].apply(lambda x: x.style.render())

In [85]:
@interact( clusters_html=fixed(repers_4_weighted), group=range(change_grouped_by_tokens_4_weighted.ngroups))
def display_clusters(clusters_html, group):
     return display(HTML(clusters_html.iloc[group]))

interactive(children=(Dropdown(description='group', options=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,…

### Saving the cluster with change object
###### TO-DO: save change object and cluster seperately.

In [95]:
cluster_dir = "../data/clusters/"

file_name = article_name + "_optimised_cluster.h5"
full_file_path = os.path.join(cluster_dir, file_name)
with pd.HDFStore(full_file_path, 'w') as store:
    store.put("cluster", change_object_dataframe[["dbscan_weighted_neighbour","dbscan_weighted_all","cluster_weighted_neighbour",'cluster_4', 'cluster_10',
       'cluster_4_full', 'cluster_4_weighted']], table=False)

### Ranking
###### Ranking clustered groups on following parameters.
1. Size of clusters
2. No of unique editors is clusters
3. Total period of cluster. i.e difference between start and end date.
4. Median length of edited token in each cluster.

In [None]:
rank_by_size = change_grouped_by_tokens_4_weighted.size().sort_values(ascending=False)

In [None]:
rank_by_uniq_editor = change_object_dataframe.reset_index().groupby("cluster_4_weighted")["editor"].nunique().sort_values(ascending=False)

In [None]:
rank_by_period = change_object_dataframe.reset_index().groupby("cluster_4_weighted")["timestamp"].apply(lambda x: x.max() - x.min()).sort_values(ascending=False)

In [None]:
rank_by_rate = change_object_dataframe.reset_index().groupby("cluster_4_weighted")["timegap"].apply(lambda x: x.mean()).sort_values(ascending=False)

In [None]:
rank_by_token_length = (change_grouped_by_tokens_4_weighted["ins_token_len"].median() + change_grouped_by_tokens_4_weighted["del_token_len"].median()).sort_values()
rank_by_token_length = rank_by_token_length /2

In [None]:
ins_tokens_per_group = change_grouped_by_tokens_4_weighted["ins_tokens"].apply(lambda x: pd.Series(np.concatenate(x.values,axis=0)))
del_tokens_per_group = change_grouped_by_tokens_4_weighted["del_tokens"].apply(lambda x: pd.Series(np.concatenate(x.values,axis=0)))