In [55]:
import sys,os
sys.path.append("../")

import pandas as pd
import numpy as np
import pickle
import itertools
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import pairwise_distances  
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import mutual_info_score
from sklearn.metrics import normalized_mutual_info_score
from scipy.stats import entropy
import seaborn as sns

import matplotlib.pyplot as plt
from IPython.display import HTML
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

## Reading the change object and clustering.

In [56]:
article_name = "John_Logie_Baird"

# Input locations, all relative to the notebook directory.
change_object_dir =  "../data/change objects/"
content_dir = "../data/content/"
change_vector_dir = "../data/change_vector/"

# Not read anywhere in the visible notebook; kept so the name stays defined.
change_object_file_name = f"{article_name}_vec.npz"
change_object_file = os.path.join(change_object_dir, f"{article_name}_change.h5")

filename = article_name + ".h5"
filepath = os.path.join(content_dir, filename)

change_vec_filename = f"{article_name}.npz"
change_vector_file = os.path.join(change_vector_dir, change_vec_filename)

# Revision-length file: the path is built here but the file is not read in this cell.
len_file = article_name + "_rev_len.h5"
len_file_path = os.path.join(content_dir, len_file)

# --- token id -> token string lookup ---------------------------------------
with pd.HDFStore(filepath, 'r') as store:
    token_string_df = store.get("all_tokens")

token_string_df = token_string_df.set_index("token_id")["str"]
# Placeholder strings for the special token ids -1 and -2
# (presumably start / end markers -- confirm against the change-object producer).
token_string_df[-1] = "St@rt"
token_string_df[-2] = "$nd"

# --- change objects ---------------------------------------------------------
# Fail immediately with a clear message; previously a missing file was only
# printed and the next statement crashed with a NameError.
if not os.path.exists(change_object_file):
    raise FileNotFoundError(f"change object file does not exist: {change_object_file}")

with pd.HDFStore(change_object_file, 'r') as store:
    change_object_dataframe = store.get("data")

# Translate the deleted / inserted token ids of every change object into tuples
# of token strings ("del" first, then "ins", matching the original column order).
for kind in ("del", "ins"):
    change_object_dataframe[f"{kind}_string_tokens"] = change_object_dataframe[f"{kind}_tokens"].apply(
        lambda token_ids: tuple(token_string_df[np.array(token_ids)].tolist()))

# Tuples concatenate element-wise: inserted tokens followed by deleted tokens.
change_object_dataframe["edit_string_tokens"] = (
    change_object_dataframe["ins_string_tokens"] + change_object_dataframe["del_string_tokens"])

# --- change vectors, one array per context size -----------------------------
CONTEXT_SIZES = (2, 4, 6, 8, 10, 12, 15, 20, 25, 30)
vectors = {}

with open(change_vector_file, "rb") as file:
    arrays_dict = np.load(file)
    for context_size in CONTEXT_SIZES:
        vectors[context_size] = arrays_dict[f"{context_size}_clean_not_weighted"]

#### read annotations

In [86]:
file_name = article_name + "_FULL.csv"
annotation_dir = "../data/annotation/"
full_file_path = os.path.join(annotation_dir, file_name)

annotation_columns = ["revid_ctxt", "token_id",
                      "rev_id", "nationality", "birth_place", "Bulk"]
annotation_df = pd.read_csv(full_file_path)[annotation_columns].copy()

# The Y/N label columns are compared later both with and without `.str.strip()`.
# Trim surrounding whitespace once here so every comparison sees the same values.
# Columns without a string accessor (e.g. entirely empty ones) are left untouched.
for label_column in ("nationality", "birth_place", "Bulk"):
    if hasattr(annotation_df[label_column], "str"):
        annotation_df[label_column] = annotation_df[label_column].str.strip()

### clustering


In [58]:
# Context sizes for which change vectors were loaded.
vector_names = list(vectors.keys())
context_array = vector_names

# DBSCAN hyper-parameter grid.
eps_array = [0.00000001, 0.01, 0.1, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.5, 3, 4]
min_samples_array = [2]

# Every (context, eps, min_samples) configuration, and the DBSCAN-only part of it.
all_combinations = list(itertools.product(context_array, eps_array,
                                          min_samples_array))
dbscan_params = list(itertools.product(eps_array, min_samples_array))

idx = pd.MultiIndex.from_product([context_array, eps_array, min_samples_array],
                                 names=["context", "eps", "min_samples"])

# One column of cluster labels per configuration (filled by the clustering cell).
cluster_df = pd.DataFrame(columns=idx)

# One row of scores per configuration. Created as float64 so that numeric
# operations such as `.corr()` work directly instead of on object columns.
evaluation_df = pd.DataFrame(index=idx, columns=["rand", "entropy", "token_entropy"],
                             dtype=np.float64)



In [59]:
%%time 
for cluster_by in vector_names:
    distances = pairwise_distances(vectors[cluster_by])
    for eps, min_samples in dbscan_params:
        cluster_df[cluster_by,eps, min_samples] = DBSCAN(eps=eps, min_samples=min_samples, 
                                                         metric="precomputed").fit(distances).labels_

CPU times: user 2min 35s, sys: 1min 42s, total: 4min 18s
Wall time: 1min 51s


In [60]:
def weighted_token_entropy(dataframe, group_by):
    """Sum, over all clusters, of cluster size times the cluster's token entropy.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain an ``edit_string_tokens`` column whose cells are iterables
        of tokens, plus the column(s) named by ``group_by``.
    group_by : column key
        Passed straight to ``DataFrame.groupby``; each group is one cluster.

    Returns
    -------
    float
        ``sum(n_rows_in_cluster * entropy(token counts in cluster))``.  The sum
        is not divided by the total number of rows.
    """
    def _token_entropy(token_tuples):
        # Pool the tokens of every row in the cluster, then take the entropy
        # of their frequency distribution.
        pooled = []
        for token_tuple in token_tuples.tolist():
            for token in token_tuple:
                pooled.append(token)
        return entropy(pd.Series(pooled).value_counts().values)

    sizes = dataframe.groupby(group_by).size()
    per_cluster = dataframe.groupby(group_by)["edit_string_tokens"].apply(_token_entropy)
    return (sizes * per_cluster).sum()

In [61]:
# cluster_df was filled positionally (one label per change object); give it the
# change objects' index so the two frames line up when placed side by side.
cluster_df.index = change_object_dataframe.index
dbscan_results = pd.concat((change_object_dataframe, cluster_df), axis="columns")

In [62]:
%%time
# entropy_series = pd.Series(index=all_combinations)

for context, eps, min_samples in all_combinations:
    evaluation_df.loc[(context, eps, min_samples),"token_entropy"] = weighted_token_entropy(dbscan_results, (context, eps, min_samples))
# all_combinations_without_optimization[0]

CPU times: user 2min 29s, sys: 132 ms, total: 2min 29s
Wall time: 2min 28s


In [None]:
# entropy_series.sort_values()

In [None]:

# cluster_dir = "../data/clusters/"

# file_name = article_name + "_dbscan_cluster_4and10.h5"
# full_file_path = os.path.join(cluster_dir, file_name)
# with pd.HDFStore(full_file_path, 'w') as store:
#     store.put("cluster", change_object_dataframe[["clean_4", "clean_10"]], table=False)

In [None]:
# change_object_dataframe[["ins_tokens"]]

#### Splitting change object to match annotations

In [63]:
# Split every change object into an "insert gap" and a "delete gap" so that each
# gap can be looked up by the (revid_ctxt, rev_id) pair used in the annotations.
#
# Insert gaps: change objects whose ins_start_pos is not -1 (presumably -1 marks
# "no insertion" -- confirm). Both the context id and the rev id are taken from
# "to revision id".
ins_array = change_object_dataframe.reset_index().loc[
    change_object_dataframe["ins_start_pos"].values != -1, 
                  ["to revision id","ins_tokens", 'to revision id']].values
# Cluster labels of the same rows, selected with the same mask so the row order
# matches ins_array.
ins_cluster = cluster_df.loc[
    change_object_dataframe["ins_start_pos"].values != -1, :]

# Delete gaps: change objects whose del_start_pos is not -1. The context id is
# "from revision id", the rev id is "to revision id".
del_array = change_object_dataframe.reset_index().loc[
    change_object_dataframe["del_start_pos"].values != -1, 
                  ["from revision id","del_tokens", 'to revision id']].values
del_cluster = cluster_df.loc[
    change_object_dataframe["del_start_pos"].values != -1, :]

# Stack insert gaps first, then delete gaps; the three columns are renamed to
# the annotation column names.
gap_array = np.concatenate([ins_array,del_array], axis=0)
gap_df = pd.DataFrame(gap_array,columns=["revid_ctxt", "token_id",
                               "rev_id"])

# NOTE(review): gap_cluster is the same concatenation as gap_cluster_df below and
# is not used again in the visible cells.
gap_cluster= pd.concat([ins_cluster, del_cluster], axis=0)
gap_df = gap_df.set_index(['revid_ctxt', 'rev_id'])
# Cluster labels stacked in the same order as gap_df (inserts, then deletes).
gap_cluster_df = pd.concat([ins_cluster, del_cluster], axis=0)

# Overwrite the index positionally so gap_cluster_df can be looked up with the
# same (revid_ctxt, rev_id) keys as gap_df.
gap_cluster_df.index=gap_df.index

In [64]:
def token_in_gap(ann, gap_df, gap_cluster_df):
    """Return the cluster labels of the gap that contains an annotated token.

    Parameters
    ----------
    ann : row of the annotation frame (this function is applied with ``axis=1``);
        its ``revid_ctxt``, ``rev_id`` and ``token_id`` entries are read.
    gap_df : frame whose ``token_id`` column holds, per gap, a container of
        token ids that is tested with ``in``.
    gap_cluster_df : cluster labels per gap, one column per clustering
        configuration; it is indexed with the same key as ``gap_df`` and the
        selected rows are expected to line up with those of ``gap_df``.

    Returns
    -------
    pd.Series indexed by ``gap_cluster_df.columns``: the labels of the first
    matching gap, or ``-10`` in every position when no gap contains the token.
    """
    # NOTE(review): the indexer is a two-element Series, not a tuple. On a
    # MultiIndex pandas presumably treats that as a list of first-level labels
    # (rows whose revid_ctxt equals either value) rather than the single key
    # (revid_ctxt, rev_id) -- confirm this is the intended selection.
    context_gap = gap_df.loc[ann[['revid_ctxt', 'rev_id']]]
    context_cluster = gap_cluster_df.loc[ann[['revid_ctxt', 'rev_id']]]
    # Keep the cluster rows of those gaps whose token list contains the token.
    clusters = context_cluster.loc[ context_gap["token_id"].apply(
            lambda x: ann["token_id"] in x),:].values
    if clusters.size >0:
            # Several gaps may match; only the first one is used.
            clusters = pd.Series(clusters[0],index=gap_cluster_df.columns)
    else:
        # Sentinel label for "token not found in any gap".
        clusters = pd.Series(-10, index=gap_cluster_df.columns)
    return clusters

In [65]:
def weighted_entropy(dataframe, entropy_column, group_columns="cluster", ):
    """Average, across groups, of group size times the group's label entropy.

    Parameters
    ----------
    dataframe : pd.DataFrame
    entropy_column : column whose value distribution is measured inside each group.
    group_columns : column key passed to ``DataFrame.groupby`` (default ``"cluster"``).

    Returns
    -------
    float
        ``mean_over_groups(n_rows_in_group * entropy(value counts in group))``.
        Note that the average is taken over the number of groups, not over the
        number of rows, so the result is not bounded by the label entropy.
    """
    def _label_entropy(labels):
        return entropy(labels.value_counts().values)

    grouped = dataframe.groupby(group_columns)
    size_times_entropy = grouped.size() * grouped[entropy_column].apply(_label_entropy)
    return size_times_entropy.mean()

In [66]:
# For every annotated token, look up the cluster labels (one per clustering
# configuration) of the gap that contains it; -10 marks tokens found in no gap.
al_combination_clusters_df = annotation_df.apply(
    token_in_gap, axis=1, gap_df=gap_df, gap_cluster_df=gap_cluster_df)

In [87]:
# Annotation columns followed by one cluster-label column per configuration.
annotation_clusters = pd.concat((annotation_df, al_combination_clusters_df), axis="columns")

In [68]:
# Binary ground truth: 1.0 where the (whitespace-trimmed) nationality
# annotation is "Y", 0.0 otherwise.
is_nationality = (annotation_df["nationality"].str.strip() == "Y").values
true_labels = is_nationality.astype(np.float64)
# Overwrites the Y/N strings with the numeric labels, so this cell cannot be
# re-run without reloading annotation_df first.
annotation_df["nationality"] = true_labels
#true_labels[true_lable_df["birth_place"].str.strip() == "Y"] = 2


In [109]:
# Gap table with (revid_ctxt, rev_id) back as ordinary columns; the nationality
# flag starts at 0 and is filled in further down in this cell.
df2 = gap_df.reset_index()
df2['nationality'] = 0

# Annotations keyed by (revid_ctxt, rev_id), restricted to non-bulk rows and
# split by their nationality answer.
aci = annotation_clusters.set_index(['revid_ctxt', 'rev_id']).sort_index()
aci = aci.loc[aci['Bulk'] == 'N']
aci_y = aci.loc[aci['nationality'] == 'Y']
aci_n = aci.loc[aci['nationality'] == 'N']

# Running total updated by nat_val.
counter = 0
def nat_val(row):
    """Flag one gap by the nationality annotations of the tokens it contains.

    Parameters
    ----------
    row : sequence of three values, in this order: the context revision id, the
        revision id, and the collection of token ids of the gap (the
        ``revid_ctxt``, ``rev_id`` and ``token_id`` columns of ``df2``).

    Returns
    -------
    int
        ``1`` if the gap contains at least one token annotated 'Y' and none
        annotated 'N'; ``-1`` for the opposite case; ``0`` if it contains
        neither or both.

    Notes
    -----
    Reads the module-level frames ``aci_y`` and ``aci_n`` and adds the number
    of matching 'N' tokens to the module-level ``counter``.
    """
    global counter

    # Unpack by position. `row[0]` on a label-indexed Series relies on a
    # deprecated positional fallback, so iterate over the values instead.
    revid_ctxt, rev_id, token_ids = list(row)[:3]
    key = (revid_ctxt, rev_id)

    def _count_matches(annotations):
        """Number of annotated tokens under `key` that belong to this gap."""
        try:
            selected = annotations.loc[key, ['token_id']]
        except KeyError:
            # No annotation exists for this (revid_ctxt, rev_id) pair.
            return 0
        # `.values` gives a plain array whether `selected` is a Series (unique
        # key) or a DataFrame (repeated key), avoiding int() on a Series.
        return int(selected.isin(token_ids).values.sum())

    yes_matches = _count_matches(aci_y)
    no_matches = _count_matches(aci_n)
    counter += no_matches

    return int(yes_matches > 0) - int(no_matches > 0)
    
# Flag every gap (1 / -1 / 0, see nat_val), then put the gaps next to their
# cluster labels; both frames are in the same row order.
gap_key_columns = ['revid_ctxt', 'rev_id', 'token_id']
df2['nationality'] = df2[gap_key_columns].apply(nat_val, axis=1)
df3 = pd.concat((df2, gap_cluster_df.reset_index()), axis="columns")

In [108]:
# Number of gaps per nationality flag as assigned by nat_val:
# 1 = only 'Y' tokens, -1 = only 'N' tokens, 0 = neither or both.
df3['nationality'].value_counts()

 0    6241
 1     405
-1     272
Name: nationality, dtype: int64

In [110]:
entropies = []
print("Without bulks")
# One line per clustering configuration: "(context, eps, min_samples): score".
for context, eps, min_samples in all_combinations:
    params = (context, eps, min_samples)
    score = weighted_entropy(df3, entropy_column="nationality", group_columns=params)
    print(str(params) + ": " + str(score))

Without bulks
(2, 1e-08, 2): 1.9867810900482359
(2, 0.01, 2): 1.4858458222106017
(2, 0.1, 2): 1.4858458222106017
(2, 0.25, 2): 1.4858458222106017
(2, 0.5, 2): 1.4858458222106017
(2, 0.75, 2): 1.5071841634990952
(2, 1.0, 2): 1.513538453097494
(2, 1.25, 2): 1.5109643272732556
(2, 1.5, 2): 1.5046704314783486
(2, 1.75, 2): 1.5034484230771907
(2, 2.0, 2): 1.5452693225419276
(2, 2.5, 2): 1.7931284839353576
(2, 3, 2): 2.6117212958577536
(2, 4, 2): 7.789703046152093
(4, 1e-08, 2): 1.4214091160218247
(4, 0.01, 2): 1.0357292537540328
(4, 0.1, 2): 1.0357292537540328
(4, 0.25, 2): 1.0357292537540328
(4, 0.5, 2): 1.0357292537540328
(4, 0.75, 2): 1.0369956392785165
(4, 1.0, 2): 1.0395016797096868
(4, 1.25, 2): 1.0259857221530067
(4, 1.5, 2): 0.995497621989483
(4, 1.75, 2): 0.9976439747823428
(4, 2.0, 2): 1.0233821584226985
(4, 2.5, 2): 1.5667937904280422
(4, 3, 2): 2.928653680845209
(4, 4, 2): 18.964253900823497
(6, 1e-08, 2): 1.428937887669013
(6, 0.01, 2): 1.153708824190206
(6, 0.1, 2): 1.15370882

In [94]:
entropies = []
print("With bulks")
# NOTE(review): apart from the heading this cell is identical to the
# "Without bulks" cell and reads the same df3. Whether bulk annotations are
# included is decided where df3 is built, so verify df3 was rebuilt without the
# Bulk filter before trusting this heading.
for context, eps, min_samples in all_combinations:
    params = (context, eps, min_samples)
    score = weighted_entropy(df3, entropy_column="nationality", group_columns=params)
    print(str(params) + ": " + str(score))
    
    

With bulks
(2, 1e-08, 2): 2.772904646693351
(2, 0.01, 2): 2.3561279561235495
(2, 0.1, 2): 2.3561279561235495
(2, 0.25, 2): 2.3561279561235495
(2, 0.5, 2): 2.3561279561235495
(2, 0.75, 2): 2.397156331522571
(2, 1.0, 2): 2.410324952222437
(2, 1.25, 2): 2.40610784786207
(2, 1.5, 2): 2.3927716261593166
(2, 1.75, 2): 2.3916542088137605
(2, 2.0, 2): 2.4558325274589827
(2, 2.5, 2): 2.885547099996353
(2, 3, 2): 3.999872247057809
(2, 4, 2): 10.895400517397643
(4, 1e-08, 2): 2.050943681386586
(4, 0.01, 2): 1.6598129684962035
(4, 0.1, 2): 1.6598129684962035
(4, 0.25, 2): 1.6598129684962035
(4, 0.5, 2): 1.6598129684962035
(4, 0.75, 2): 1.6623432804183367
(4, 1.0, 2): 1.6673229471947226
(4, 1.25, 2): 1.640818986225281
(4, 1.5, 2): 1.601013180672444
(4, 1.75, 2): 1.6068964144409135
(4, 2.0, 2): 1.6329443722131844
(4, 2.5, 2): 2.396619406205798
(4, 3, 2): 4.415250558394471
(4, 4, 2): 24.430395840846845
(6, 1e-08, 2): 2.09705170742411
(6, 0.01, 2): 1.7726451369657767
(6, 0.1, 2): 1.7726451369657767
(6

In [100]:
# Annotation-level entropy of every configuration, non-bulk annotations only.
df4 = annotation_clusters.loc[annotation_clusters['Bulk'] == 'N']
for context, eps, min_samples in all_combinations:
    combination = (context, eps, min_samples)
    evaluation_df.loc[combination, "entropy"] = weighted_entropy(
        df4, entropy_column="nationality", group_columns=combination)

# The 50 lowest-entropy (eps, context) pairs for min_samples == 2.
(
    evaluation_df.reset_index()
    .set_index(["min_samples", "eps", "context"])
    .loc[2]["entropy"]
    .sort_values()
    .iloc[0:50]
)

eps   context
2.00  4          0.97293
1.75  4          0.97415
1.50  4          1.05774
1.00  4           1.0903
1.25  4           1.0903
0.75  4           1.0903
0.50  4           1.0903
0.25  4           1.0903
0.10  4           1.0903
0.01  4           1.0903
1.25  15         1.13612
1.75  8           1.1664
1.50  6          1.18681
1.25  6          1.19074
1.75  6          1.20398
1.50  8          1.22739
2.00  6          1.22824
1.00  6          1.23382
0.01  6          1.29177
0.25  6          1.29177
0.50  6          1.29177
0.75  6          1.29177
0.10  6          1.29177
1.00  15         1.29467
1.25  8           1.3073
2.00  8           1.3122
1.00  8          1.34942
1.50  12         1.35512
0.75  20         1.36965
1.25  12         1.37586
1.75  2          1.41378
1.50  2          1.41378
1.25  2          1.41378
1.00  2          1.41378
0.75  2          1.41378
0.50  2          1.41378
0.25  2          1.41378
0.10  2          1.41378
0.01  2          1.41378
0.75  25   

Y    1029
N     746
Name: Bulk, dtype: int64

In [49]:
# Score every configuration on all annotations (bulk rows included): the
# weighted nationality entropy and the adjusted Rand index against true_labels.
# This overwrites any "entropy" values stored in evaluation_df by earlier cells.
for context, eps, min_samples in all_combinations:
    combination = (context, eps, min_samples)
    evaluation_df.loc[combination, "entropy"] = weighted_entropy(
        annotation_clusters, entropy_column="nationality", group_columns=combination)
    evaluation_df.loc[combination, "rand"] = adjusted_rand_score(
        annotation_clusters[combination], true_labels)

In [50]:
# The 50 lowest "entropy" scores among the min_samples == 2 configurations,
# indexed by (eps, context).
evaluation_df.reset_index().set_index(["min_samples", "eps", "context"]).loc[2]["entropy"].sort_values().iloc[0:50]

eps   context
0.10  4          1.84256
0.25  4          1.84256
0.50  4          1.84256
0.75  4          1.84256
      30         1.89924
      20          1.9421
      25         1.94935
0.50  30         2.10196
0.10  6          2.10197
0.25  6          2.10197
0.50  6          2.10197
0.75  6          2.10197
0.50  25         2.21462
0.75  8          2.23276
      15         2.24167
      10         2.26895
0.50  8          2.27903
      20         2.29684
0.75  12         2.31711
0.10  8           2.3225
0.25  8           2.3225
      2           2.3735
0.50  2           2.3735
0.10  2           2.3735
0.75  2          2.40557
0.50  10         2.49668
0.25  10         2.49938
0.10  10         2.49938
0.50  15         2.54442
      12          2.6279
0.25  20         2.71612
      12         2.74786
0.10  12         2.74786
0.25  15         2.75932
0.10  20         2.77461
      15         2.77831
0.25  25         2.81524
      30         2.87173
0.10  25          2.8821
      30   

In [None]:
# All configurations ordered by ascending "entropy" score.
evaluation_df["entropy"].sort_values()#.iloc[0:50]

In [None]:
# Number of distinct cluster labels assigned to the annotated tokens under each
# configuration (the -10 "not found" sentinel counts as a label), restricted to
# min_samples == 2 and listed from most to fewest.
al_combination_clusters_df.nunique(axis=0).reset_index().set_index(["min_samples", "eps", "context"]).loc[2].sort_values(0, ascending=False)
#[0].sort_values().iloc[0:50]



In [None]:
# Distinct-cluster counts for the 60 configurations with the lowest "entropy"
# score, in ascending entropy order. (Only the last expression of a cell is
# displayed, so the earlier un-truncated variant of this line had no effect.)
al_combination_clusters_df.nunique(axis=0)[evaluation_df["entropy"].sort_values().iloc[0:60].index]

In [None]:
# result_file_name = f"{article_name}_evaluation.csv"
# result_file_path = os.path.join(annotation_dir, result_file_name)
# annotation_df.to_csv(result_file_path)

# nonoverlaping_clusters = set(annotation_df["cluster_10"].unique()) - set(annotation_df.loc[(true_lable_df["birth_place"].str.strip().values == "Y") | (true_lable_df["nationality"].str.strip().values == "Y") , 
#                   "cluster_10"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_10"], list(nonoverlaping_clusters)+[-1]),"cluster_10"] =-999

# nonoverlaping_clusters = set(annotation_df["cluster_4"].unique()) - set(annotation_df.loc[(
#                             true_lable_df["birth_place"].str.strip().values == "Y") | 
#                             (true_lable_df["nationality"].str.strip().values == "Y") , "cluster_4"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_4"], list(nonoverlaping_clusters)+[-1]),"cluster_4"] =-999

# nonoverlaping_clusters = set(annotation_df["cluster_10"].unique()) - set(annotation_df.loc[ (true_lable_df["nationality"].str.strip().values == "Y") , 
#                   "cluster_10"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_10"], list(nonoverlaping_clusters)+[-1]),"cluster_10"] =-999
# annotation_df.loc[annotation_df['cluster_10'] != -999,"cluster_10"] = 999

# nonoverlaping_clusters = set(annotation_df["cluster_4"].unique()) - set(annotation_df.loc[
#                             (true_lable_df["nationality"].str.strip().values == "Y") , "cluster_4"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_4"], list(nonoverlaping_clusters)+[-1]),"cluster_4"] =-999
# #annotation_df.loc[~np.isin(annotation_df["cluster_4"], list(nonoverlaping_clusters)+[-1]),"cluster_4"] =999
# annotation_df.loc[annotation_df['cluster_4'] != -999,"cluster_4"] = 999

# _tdf = annotation_df.merge(full_df.drop(columns=['cluster_4', 'cluster_10']), how='left', left_on=['context_id', 'rev_id', 'token_id'], right_on=['revid_ctxt', 'rev_id', 'token_id'])
# _tdf = _tdf[['context_id', 'token_id', 'rev_id','cluster_4', 'cluster_10','true_labels', "nationality", "birth_place"]]
# #_tdf['cluster_4_x'] - _tdf['cluster_10_y']
# #_tdf.shape
# #_tdf

# _tdf = annotation_df.merge(full_df.drop(columns=['cluster_4', 'cluster_10']), how='left', left_on=['context_id', 'rev_id', 'token_id'], right_on=['revid_ctxt', 'rev_id', 'token_id'])
# _tdf = _tdf[_tdf['Bulk'].str.strip() == 'N']
# _tdf = _tdf[['context_id', 'token_id', 'rev_id','cluster_4', 'cluster_10','true_labels', "nationality", "birth_place"]]

# evaluation_score = pd.Series(index=["rand_4", "rand_10", "mutual_info_4",  "mutual_info_10"])
# evaluation_score["rand_4"] = adjusted_rand_score( _tdf["cluster_4"], _tdf['true_labels'])
# evaluation_score["rand_10"] = adjusted_rand_score( _tdf["cluster_10"], _tdf['true_labels'])
# evaluation_score["mutual_info_4"] = adjusted_mutual_info_score(_tdf['true_labels'], 
#                                             _tdf["cluster_4"], average_method="max"  )
# evaluation_score["mutual_info_10"] = adjusted_mutual_info_score(_tdf['true_labels'],
#                                             _tdf["cluster_10"], average_method="max" )
# evaluation_score

# evaluation_score = pd.Series(index=["rand_4", "rand_10", "mutual_info_4",  "mutual_info_10"])
# evaluation_score["rand_4"] = adjusted_rand_score( annotation_df["cluster_4"], true_labels)
# evaluation_score["rand_10"] = adjusted_rand_score( annotation_df["cluster_10"], true_labels)
# evaluation_score["mutual_info_4"] = adjusted_mutual_info_score(true_labels, 
#                                             annotation_df["cluster_4"], average_method="max"  )
# evaluation_score["mutual_info_10"] = adjusted_mutual_info_score(true_labels,
#                                             annotation_df["cluster_10"], average_method="max" )
# evaluation_score

# normalized_mutual_info_score(true_labels, annotation_df["cluster_4"])

# set(annotation_df["cluster_10"].unique()) - set(annotation_df.loc[(true_lable_df["birth_place"].str.strip().values == "Y") | (true_lable_df["birth_place"].str.strip().values == "Y") , 
#                   "cluster_10"].unique())

# annotation_df.loc[annotation_df["cluster_10"] == -1,"cluster_10"] =-999

In [None]:
# First rows of the score table (rand / entropy / token_entropy per configuration).
evaluation_df.head()

In [None]:
# Pairwise correlation between the three scores.
# NOTE(review): if the score columns are stored as object dtype this call may
# return an empty frame or raise; the next cell casts to float64 first.
evaluation_df.corr()

In [None]:
# Pairwise correlation between the three scores after casting them to float64.
evaluation_df.astype(np.float64).corr()