In [25]:
import sys,os
sys.path.append("../")

import pandas as pd
import numpy as np
import pickle
import itertools
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import pairwise_distances  
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import mutual_info_score
from sklearn.metrics import normalized_mutual_info_score
from scipy.stats import entropy
import seaborn as sns




import matplotlib.pyplot as plt
from IPython.display import HTML
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets





## Reading the change object and clustering.

In [2]:
# --- Article under study and the data directories it is read from ---------
article_name = "John_Logie_Baird"
change_object_dir = "../data/change objects/"
content_dir = "../data/content/"
change_vector_dir = "../data/change_vector/"

# Change-object store for this article (opened in the next step).
change_object_file_name = f"{article_name}_vec.npz"
filename = f"{article_name}_change.h5"
change_object_file = os.path.join(change_object_dir, filename)

# Token table of the article: read it and keep a token_id -> string Series.
filename = f"{article_name}.h5"
filepath = os.path.join(content_dir, filename)
with pd.HDFStore(filepath, 'r') as store:
    token_string_df = store.get("all_tokens")

token_string_df = token_string_df.set_index("token_id")["str"]
# Ids -1 and -2 get placeholder strings; presumably these are the start/end
# boundary markers used inside change objects -- TODO confirm.
token_string_df[-1] = "St@rt"
token_string_df[-2] = "$nd"

# Pre-computed change vectors and revision-length file locations.
change_vec_filename = f"{article_name}.npz"
change_vector_file = os.path.join(change_vector_dir, change_vec_filename)

len_file = f"{article_name}_rev_len.h5"
len_file_path = os.path.join(content_dir, len_file)


# Load the change objects. Fail fast with an explicit error when the file is
# missing: merely printing a message would leave `change_object_dataframe`
# undefined and surface later as an unrelated NameError.
if not os.path.exists(change_object_file):
    raise FileNotFoundError(
        f"change object file does not exist: {change_object_file}")

with pd.HDFStore(change_object_file, 'r') as store:
    change_object_dataframe = store.get("data")
    
    


def _token_ids_to_strings(token_ids):
    """Return the token strings for `token_ids` as a tuple (looked up in `token_string_df`)."""
    return tuple(token_string_df[np.array(token_ids)].tolist())


# String versions of the deleted / inserted token id lists of every change object.
change_object_dataframe["del_string_tokens"] = (
    change_object_dataframe["del_tokens"].apply(_token_ids_to_strings))
change_object_dataframe["ins_string_tokens"] = (
    change_object_dataframe["ins_tokens"].apply(_token_ids_to_strings))

# Element-wise tuple concatenation: inserted strings first, then deleted ones.
change_object_dataframe["edit_string_tokens"] = (
    change_object_dataframe["ins_string_tokens"]
    + change_object_dataframe["del_string_tokens"])


# rev_len_df = pd.read_hdf(len_file_path, key = "rev_len")
# Change vectors keyed by the integer prefix of their array name in the .npz
# archive ("<n>_clean_not_weighted"). The keys are later used as the
# "context" level of the parameter grid.
vectors = {}

with open(change_vector_file, "rb") as file:
    arrays_dict = np.load(file)
    # Arrays are read while the file is still open.
    for context_size in (2, 4, 6, 8, 10, 12, 15, 20, 25, 30):
        vectors[context_size] = arrays_dict[f"{context_size}_clean_not_weighted"]

#### Read annotations

In [3]:
# Manual annotations for the article; only the columns used below are kept.
annotation_dir = "../data/annotation/"
file_name = f"{article_name}_FULL.csv"
full_file_path = os.path.join(annotation_dir, file_name)

annotation_columns = ["revid_ctxt", "token_id", "rev_id", "nationality", "birth_place"]
annotation_df = pd.read_csv(full_file_path)
annotation_df = annotation_df[annotation_columns]

### Clustering


In [4]:
# Parameter grid for the DBSCAN sweep: every change-vector variant ("context")
# is clustered with every eps value and every min_samples value.
vector_names = list(vectors.keys())
context_array = vector_names
eps_array = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0]
min_samples_array = [2]

all_combinations = list(itertools.product(context_array, eps_array,
                                          min_samples_array))
dbscan_params = list(itertools.product(eps_array, min_samples_array))

idx = pd.MultiIndex.from_product([context_array, eps_array, min_samples_array],
                                 names=["context", "eps", "min_samples"])

# One column of cluster labels per parameter combination (filled by the sweep).
cluster_df = pd.DataFrame(columns=idx)

# One row of scores per parameter combination. Created as float64 (NaN-filled)
# rather than object dtype so numeric operations such as `.corr()` work on it
# directly, without an `astype` cast.
evaluation_df = pd.DataFrame(index=idx,
                             columns=["rand", "entropy", "token_entropy"],
                             dtype=np.float64)



In [5]:
%%time 
for cluster_by in vector_names:
    # The pairwise distance matrix is computed once per vector variant and
    # reused for every (eps, min_samples) setting.
    distances = pairwise_distances(vectors[cluster_by])
    for eps, min_samples in dbscan_params:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric="precomputed")
        cluster_df[cluster_by, eps, min_samples] = dbscan.fit_predict(distances)

CPU times: user 1min 37s, sys: 27 s, total: 2min 4s
Wall time: 52.4 s


In [6]:
def weighted_token_entropy(dataframe, group_by):
    """Sum of per-cluster token entropies, each scaled by its cluster size.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an "edit_string_tokens" column holding tuples of token
        strings, plus the column named by `group_by`.
    group_by : column label
        Column whose values identify the cluster of each row.

    Returns
    -------
    float
        sum over clusters of (rows in cluster) * (entropy of the token-string
        frequency counts pooled over the cluster's rows).
    """
    def _pooled_token_entropy(token_tuples):
        # Flatten all token tuples of the cluster into one list of strings.
        pooled = []
        for token_tuple in token_tuples.tolist():
            pooled.extend(token_tuple)
        return entropy(pd.Series(pooled).value_counts().values)

    grouped = dataframe.groupby(group_by)
    entropy_per_cluster = grouped["edit_string_tokens"].apply(_pooled_token_entropy)
    rows_per_cluster = grouped.size()
    return (rows_per_cluster * entropy_per_cluster).sum()

In [7]:
# Give the cluster labels the same row index as the change objects, then put
# both side by side: one row per change object, one extra column per
# parameter combination.
cluster_df.index = change_object_dataframe.index
dbscan_results = pd.concat((change_object_dataframe, cluster_df), axis="columns")

In [8]:
%%time
# entropy_series = pd.Series(index=all_combinations)

# Score every parameter combination by the size-weighted token entropy of its clusters.
for combination in all_combinations:
    evaluation_df.loc[combination, "token_entropy"] = weighted_token_entropy(
        dbscan_results, combination)
# all_combinations_without_optimization[0]

In [9]:
# entropy_series.sort_values()

NameError: name 'entropy_series' is not defined

In [None]:

# cluster_dir = "../data/clusters/"

# file_name = article_name + "_dbscan_cluster_4and10.h5"
# full_file_path = os.path.join(cluster_dir, file_name)
# with pd.HDFStore(full_file_path, 'w') as store:
#     store.put("cluster", change_object_dataframe[["clean_4", "clean_10"]], table=False)

In [None]:
# change_object_dataframe[["ins_tokens"]]

#### Splitting change object to match annotations

In [10]:
# Split every change object into its insert part and its delete part so that
# each part can be matched against the annotations, which are keyed by
# (revid_ctxt, rev_id).
has_insert = change_object_dataframe["ins_start_pos"].values != -1
has_delete = change_object_dataframe["del_start_pos"].values != -1
flat_change_objects = change_object_dataframe.reset_index()

# Inserted tokens are looked up in the "to" revision: that revision id is used
# both as the context id and as the revision id. Change objects with
# ins_start_pos == -1 are left out.
ins_array = flat_change_objects.loc[
    has_insert, ["to revision id", "ins_tokens", 'to revision id']].values
ins_cluster = cluster_df.loc[has_insert, :]

# Deleted tokens are looked up in the "from" revision (context id), while the
# revision id stays the "to" revision. Change objects with del_start_pos == -1
# are left out.
del_array = flat_change_objects.loc[
    has_delete, ["from revision id", "del_tokens", 'to revision id']].values
del_cluster = cluster_df.loc[has_delete, :]

# Insert rows first, then delete rows, indexed by (revid_ctxt, rev_id).
gap_array = np.concatenate([ins_array, del_array], axis=0)
gap_df = pd.DataFrame(
    gap_array, columns=["revid_ctxt", "token_id", "rev_id"]
).set_index(['revid_ctxt', 'rev_id'])

# Cluster labels stacked in the same row order as gap_df.
# `gap_cluster` keeps its original index; `gap_cluster_df` shares gap_df's index.
gap_cluster = pd.concat([ins_cluster, del_cluster], axis=0)
gap_cluster_df = pd.concat([ins_cluster, del_cluster], axis=0)
gap_cluster_df.index = gap_df.index

In [11]:
def token_in_gap(ann, gap_df, gap_cluster_df):
    """Return the cluster labels of the gap whose token list contains an annotated token.

    Parameters
    ----------
    ann : pandas.Series
        One annotation row; its 'revid_ctxt', 'rev_id' and 'token_id' entries
        are read.
    gap_df : pandas.DataFrame
        Gap table with a "token_id" column holding a container of token ids
        per row; rows are selected with the annotation's
        ('revid_ctxt', 'rev_id') values.
    gap_cluster_df : pandas.DataFrame
        Cluster labels for the gaps, selected with the same key as `gap_df`;
        its columns become the index of the returned Series.

    Returns
    -------
    pandas.Series
        Indexed by `gap_cluster_df.columns`. Holds the labels of the first
        matching gap, or -10 in every position when no selected gap contains
        the annotated token.
    """
    # Select the candidate gaps and their cluster labels for this annotation.
    # NOTE(review): `.loc` receives a two-element Series, not a tuple; confirm
    # that this selects the intended (revid_ctxt, rev_id) rows of the
    # MultiIndex and what happens when the key is absent.
    context_gap = gap_df.loc[ann[['revid_ctxt', 'rev_id']]]
    context_cluster = gap_cluster_df.loc[ann[['revid_ctxt', 'rev_id']]]
    # Keep only the label rows whose gap token list contains the annotated token id.
    clusters = context_cluster.loc[ context_gap["token_id"].apply(
            lambda x: ann["token_id"] in x),:].values
    if clusters.size >0:
            # If several gaps match, only the first matching row is used.
            clusters = pd.Series(clusters[0],index=gap_cluster_df.columns)
    else:
        # -10 marks "token not found in any selected gap" for every combination.
        clusters = pd.Series(-10, index=gap_cluster_df.columns)
    return clusters

In [12]:
def weighted_entropy(dataframe, entropy_column, group_columns="cluster"):
    """Mean over groups of (group size * entropy of `entropy_column` in the group).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding both the grouping column(s) and `entropy_column`.
    entropy_column : column label
        Column whose value distribution is measured inside every group.
    group_columns : column label, default "cluster"
        Passed unchanged to `DataFrame.groupby`.

    Returns
    -------
    float
        The size-scaled entropies averaged over the groups.
        NOTE(review): this averages over the number of groups (`.mean()`),
        whereas `weighted_token_entropy` sums; confirm this is intended.
    """
    grouped = dataframe.groupby(group_columns)
    label_entropy = grouped[entropy_column].apply(
        lambda labels: entropy(labels.value_counts().values))
    size_scaled_entropy = grouped.size() * label_entropy
    return size_scaled_entropy.mean()

In [13]:
# For every annotated token, find the gap that contains it and collect that
# gap's cluster label under each parameter combination (-10 when not found).
al_combination_clusters_df = annotation_df.apply(
    lambda annotation: token_in_gap(annotation, gap_df, gap_cluster_df),
    axis=1)

In [14]:
# Annotation columns followed by one cluster-label column per parameter combination.
annotation_clusters = pd.concat(
    (annotation_df, al_combination_clusters_df), axis="columns")

In [15]:


# Binary ground truth: 1.0 where the annotation marks "nationality" with "Y"
# (surrounding whitespace ignored), 0.0 otherwise.
# NOTE: this overwrites the text column, so the cell cannot be re-run without
# reloading `annotation_df` (the `.str` accessor needs the original strings).
true_labels = np.where(
    (annotation_df["nationality"].str.strip() == "Y").values, 1.0, 0.0)
annotation_df["nationality"] = true_labels


In [16]:
# Score every parameter combination against the annotations.
# NOTE(review): `annotation_clusters` was concatenated before
# annotation_df["nationality"] was replaced by 0/1 labels, so the entropy here
# appears to be computed on the raw annotation values -- confirm intended.
for combination in all_combinations:
    evaluation_df.loc[combination, "entropy"] = weighted_entropy(
        annotation_clusters,
        entropy_column="nationality",
        group_columns=combination)
    evaluation_df.loc[combination, "rand"] = adjusted_rand_score(
        annotation_clusters[combination], true_labels)

In [17]:
# Entropy of the min_samples == 2 runs, ascending, shown as (eps, context, entropy) rows.
entropy_by_eps_context = (
    evaluation_df.reset_index()
    .set_index(["min_samples", "eps", "context"])
    .loc[2]["entropy"]
    .sort_values()
)
print(entropy_by_eps_context.iloc[0:100].reset_index().values)

[[1.25 15 1.7330592020460496]
 [1.75 4 1.7475881220106997]
 [1.75 8 1.7718294347286676]
 [2.0 4 1.8060909013832354]
 [1.5 4 1.8137690872541257]
 [1.5 8 1.815186338360246]
 [1.25 4 1.8291930083932157]
 [0.5 4 1.8425632327146717]
 [0.75 4 1.8425632327146717]
 [1.0 4 1.842563232714672]
 [1.75 6 1.8700450817561856]
 [0.75 30 1.899235724069299]
 [1.5 12 1.914451024167899]
 [1.0 20 1.9206889660923419]
 [1.5 6 1.9252727512842187]
 [1.75 10 1.9379696940177065]
 [1.5 10 1.941768652083299]
 [0.75 20 1.9421001648571565]
 [1.25 8 1.9445861689142498]
 [0.75 25 1.9493524462194105]
 [1.25 6 1.9669072424638143]
 [2.0 6 1.9672492512095245]
 [1.25 12 1.987419707892861]
 [1.5 15 1.9930915013655888]
 [2.0 8 2.0033920820478905]
 [1.25 10 2.0097189912520936]
 [1.0 15 2.0104660278750632]
 [1.0 30 2.013985187940474]
 [1.75 12 2.020276231543493]
 [1.0 8 2.0240333545089304]
 [1.0 6 2.0556072678551103]
 [1.0 12 2.080924923767969]
 [0.5 30 2.1019599958377357]
 [0.75 6 2.101966880442974]
 [0.5 6 2.101966880442974]

In [18]:
# The 50 parameter combinations with the lowest annotation entropy.
evaluation_df["entropy"].sort_values().head(50)

context  eps   min_samples
15       1.25  2              1.73306
4        1.75  2              1.74759
8        1.75  2              1.77183
4        2.00  2              1.80609
         1.50  2              1.81377
8        1.50  2              1.81519
4        1.25  2              1.82919
         0.50  2              1.84256
         0.75  2              1.84256
         1.00  2              1.84256
6        1.75  2              1.87005
30       0.75  2              1.89924
12       1.50  2              1.91445
20       1.00  2              1.92069
6        1.50  2              1.92527
10       1.75  2              1.93797
         1.50  2              1.94177
20       0.75  2               1.9421
8        1.25  2              1.94459
25       0.75  2              1.94935
6        1.25  2              1.96691
         2.00  2              1.96725
12       1.25  2              1.98742
15       1.50  2              1.99309
8        2.00  2              2.00339
10       1.25  2       

In [19]:
# Number of distinct cluster labels the annotated tokens fall into, per
# (eps, context) for min_samples == 2, largest first.
(
    al_combination_clusters_df.nunique(axis=0)
    .reset_index()
    .set_index(["min_samples", "eps", "context"])
    .loc[2]
    .sort_values(0, ascending=False)
)



Unnamed: 0_level_0,Unnamed: 1_level_0,0
eps,context,Unnamed: 2_level_1
1.75,4,198
1.00,8,196
1.25,4,196
0.50,4,195
1.25,8,195
1.00,4,195
0.75,4,195
1.50,4,195
1.50,8,193
1.00,10,192


In [20]:
# Distinct cluster labels among the annotated tokens for the 60 parameter
# combinations with the lowest annotation entropy, in ascending-entropy order.
# (The earlier un-sliced variant of this expression was evaluated and
# discarded, so it has been removed.)
al_combination_clusters_df.nunique(axis=0)[evaluation_df["entropy"].sort_values().iloc[0:60].index]

context  eps   min_samples
15       1.25  2              182
4        1.75  2              198
8        1.75  2              184
4        2.00  2              189
         1.50  2              195
8        1.50  2              193
4        1.25  2              196
         0.50  2              195
         0.75  2              195
         1.00  2              195
6        1.75  2              185
30       0.75  2              184
12       1.50  2              172
20       1.00  2              185
6        1.50  2              188
10       1.75  2              172
         1.50  2              179
20       0.75  2              187
8        1.25  2              195
25       0.75  2              184
6        1.25  2              192
         2.00  2              177
12       1.25  2              177
15       1.50  2              173
8        2.00  2              167
10       1.25  2              191
15       1.00  2              181
30       1.00  2              169
12       1.75  2     

In [21]:
# result_file_name = f"{article_name}_evaluation.csv"
# result_file_path = os.path.join(annotation_dir, result_file_name)
# annotation_df.to_csv(result_file_path)

# nonoverlaping_clusters = set(annotation_df["cluster_10"].unique()) - set(annotation_df.loc[(true_lable_df["birth_place"].str.strip().values == "Y") | (true_lable_df["nationality"].str.strip().values == "Y") , 
#                   "cluster_10"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_10"], list(nonoverlaping_clusters)+[-1]),"cluster_10"] =-999

# nonoverlaping_clusters = set(annotation_df["cluster_4"].unique()) - set(annotation_df.loc[(
#                             true_lable_df["birth_place"].str.strip().values == "Y") | 
#                             (true_lable_df["nationality"].str.strip().values == "Y") , "cluster_4"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_4"], list(nonoverlaping_clusters)+[-1]),"cluster_4"] =-999

# nonoverlaping_clusters = set(annotation_df["cluster_10"].unique()) - set(annotation_df.loc[ (true_lable_df["nationality"].str.strip().values == "Y") , 
#                   "cluster_10"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_10"], list(nonoverlaping_clusters)+[-1]),"cluster_10"] =-999
# annotation_df.loc[annotation_df['cluster_10'] != -999,"cluster_10"] = 999

# nonoverlaping_clusters = set(annotation_df["cluster_4"].unique()) - set(annotation_df.loc[
#                             (true_lable_df["nationality"].str.strip().values == "Y") , "cluster_4"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_4"], list(nonoverlaping_clusters)+[-1]),"cluster_4"] =-999
# #annotation_df.loc[~np.isin(annotation_df["cluster_4"], list(nonoverlaping_clusters)+[-1]),"cluster_4"] =999
# annotation_df.loc[annotation_df['cluster_4'] != -999,"cluster_4"] = 999

# _tdf = annotation_df.merge(full_df.drop(columns=['cluster_4', 'cluster_10']), how='left', left_on=['context_id', 'rev_id', 'token_id'], right_on=['revid_ctxt', 'rev_id', 'token_id'])
# _tdf = _tdf[['context_id', 'token_id', 'rev_id','cluster_4', 'cluster_10','true_labels', "nationality", "birth_place"]]
# #_tdf['cluster_4_x'] - _tdf['cluster_10_y']
# #_tdf.shape
# #_tdf

# _tdf = annotation_df.merge(full_df.drop(columns=['cluster_4', 'cluster_10']), how='left', left_on=['context_id', 'rev_id', 'token_id'], right_on=['revid_ctxt', 'rev_id', 'token_id'])
# _tdf = _tdf[_tdf['Bulk'].str.strip() == 'N']
# _tdf = _tdf[['context_id', 'token_id', 'rev_id','cluster_4', 'cluster_10','true_labels', "nationality", "birth_place"]]

# evaluation_score = pd.Series(index=["rand_4", "rand_10", "mutual_info_4",  "mutual_info_10"])
# evaluation_score["rand_4"] = adjusted_rand_score( _tdf["cluster_4"], _tdf['true_labels'])
# evaluation_score["rand_10"] = adjusted_rand_score( _tdf["cluster_10"], _tdf['true_labels'])
# evaluation_score["mutual_info_4"] = adjusted_mutual_info_score(_tdf['true_labels'], 
#                                             _tdf["cluster_4"], average_method="max"  )
# evaluation_score["mutual_info_10"] = adjusted_mutual_info_score(_tdf['true_labels'],
#                                             _tdf["cluster_10"], average_method="max" )
# evaluation_score

# evaluation_score = pd.Series(index=["rand_4", "rand_10", "mutual_info_4",  "mutual_info_10"])
# evaluation_score["rand_4"] = adjusted_rand_score( annotation_df["cluster_4"], true_labels)
# evaluation_score["rand_10"] = adjusted_rand_score( annotation_df["cluster_10"], true_labels)
# evaluation_score["mutual_info_4"] = adjusted_mutual_info_score(true_labels, 
#                                             annotation_df["cluster_4"], average_method="max"  )
# evaluation_score["mutual_info_10"] = adjusted_mutual_info_score(true_labels,
#                                             annotation_df["cluster_10"], average_method="max" )
# evaluation_score

# normalized_mutual_info_score(true_labels, annotation_df["cluster_4"])

# set(annotation_df["cluster_10"].unique()) - set(annotation_df.loc[(true_lable_df["birth_place"].str.strip().values == "Y") | (true_lable_df["birth_place"].str.strip().values == "Y") , 
#                   "cluster_10"].unique())

# annotation_df.loc[annotation_df["cluster_10"] == -1,"cluster_10"] =-999

In [23]:
# First five rows of the score table.
evaluation_df.iloc[:5]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,rand,entropy,token_entropy
context,eps,min_samples,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,0.5,2,0.0866597,2.3735,18551.7
2,0.75,2,0.0866624,2.40557,18668.3
2,1.0,2,0.0866624,2.40557,18720.0
2,1.25,2,0.0866624,2.40557,18704.8
2,1.5,2,0.0863464,2.38552,18698.4


In [24]:
# The scores are written cell-by-cell into a frame created without a dtype,
# so the columns may be object-typed; cast to float before correlating so the
# correlation matrix is actually computed.
evaluation_df.astype(np.float64).corr()

In [26]:
# Pairwise correlation between the three evaluation scores.
evaluation_scores = evaluation_df.astype(np.float64)
evaluation_scores.corr()

Unnamed: 0,rand,entropy,token_entropy
rand,1.0,-0.615029,-0.488641
entropy,-0.615029,1.0,0.794375
token_entropy,-0.488641,0.794375,1.0
