In [33]:
import sys,os
sys.path.append("../")

import pandas as pd
import numpy as np
import pickle
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import pairwise_distances  
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import mutual_info_score
from sklearn.metrics import normalized_mutual_info_score


import matplotlib.pyplot as plt
from IPython.display import HTML
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets





## Reading the change object and clustering.

In [34]:
# Article under evaluation; every file name below is derived from it.
article_name = "John_Logie_Baird"

# Change objects of the article: an .h5 file inside the change-object directory.
change_object_dir = "../data/change objects/"
change_object_file_name = article_name + "_vec.npz"
filename = article_name + "_change.h5"
change_object_file = os.path.join(change_object_dir, filename)

# Change vectors of the article: an .npz archive inside the optimised-vector directory.
change_vector_dir = "../data/change_vector_optimised/"
change_vec_filename = article_name + "_comp_vec.npz"
change_vector_file = os.path.join(change_vector_dir, change_vec_filename)

# Path of the article's `*_rev_len.h5` file inside the content directory.
content_dir = "../data/content/"
len_file = f"{article_name}_rev_len.h5"
len_file_path = os.path.join(content_dir, len_file)


# Load the change objects stored under key "data" in the HDF5 file.
# Fail fast when the file is missing: merely printing a message would let the
# notebook carry on and either hit a NameError in a later cell or silently
# reuse a stale `change_object_dataframe` left over in the kernel.
if not os.path.exists(change_object_file):
    raise FileNotFoundError(f"change object file does not exist: {change_object_file}")

with pd.HDFStore(change_object_file, 'r') as store:
    change_object_dataframe = store.get("data")

# change_object_dataframe["edited_tokens"] = change_object_dataframe["ins_tokens"]  + change_object_dataframe["del_tokens"]
# rev_len_df = pd.read_hdf(len_file_path, key = "rev_len")

# Pull the two vector sets used for clustering out of the .npz archive.
# The arrays are read while the file is still open.
vectors = {}
with open(change_vector_file, "rb") as file:
    arrays_dict = np.load(file)
    vectors["clean_4"] = arrays_dict["4_clean_not_weighted"]
    vectors["clean_10"] = arrays_dict["10_clean_not_weighted"]

#### read annotations

In [3]:
# Locate the annotation CSV that belongs to the current article.
annotation_dir = "../data/annotation/"
annotation_file_name = article_name + ".csv"
annotation_file_path = os.path.join(annotation_dir, annotation_file_name)

# Read it and rename the columns positionally; the assignment below requires
# the file to have exactly five columns.
annotation_df = pd.read_csv(annotation_file_path)
annotation_df.columns = [
    "context_id",
    "token_id",
    "rev_id",
    "cluster_4",
    "cluster_10",
]

### clustering


In [4]:
%%time 
# Cluster every vector set with DBSCAN on a precomputed pairwise-distance
# matrix (pairwise_distances with its default metric) and attach the resulting
# labels to the change objects as columns "clean_4" / "clean_10".
vector_names = ["clean_4", "clean_10"]
# cluster_by= vector_names[0]
dbscan_param = {"eps": 1.5, "min_samples": 4}

labels_by_vector = {}
for cluster_by in vector_names:
    distances = pairwise_distances(vectors[cluster_by])
    dbscan = DBSCAN(metric="precomputed", **dbscan_param)
    labels_by_vector[cluster_by] = dbscan.fit(distances).labels_

# One label column per vector set, indexed like the change objects so the
# column assignment below aligns row by row.
cluster_df = pd.DataFrame(
    labels_by_vector,
    columns=vector_names,
    index=change_object_dataframe.index,
)
change_object_dataframe[vector_names] = cluster_df

CPU times: user 9.73 s, sys: 1.83 s, total: 11.6 s
Wall time: 5.08 s


In [32]:
# Persist the two DBSCAN label columns of the change objects to an HDF5 file.
cluster_dir = "../data/clusters/"

file_name = article_name + "_dbscan_cluster_4and10.h5"
full_file_path = os.path.join(cluster_dir, file_name)

# The frame is written under key "cluster" in fixed (non-table) format.
# `format="fixed"` is the supported spelling of the legacy `table=False` flag,
# which newer pandas releases no longer accept as a keyword of HDFStore.put.
with pd.HDFStore(full_file_path, 'w') as store:
    store.put("cluster", change_object_dataframe[["clean_4", "clean_10"]], format="fixed")

In [6]:
# Build one "gap" row per inserted / deleted token run. The five selected
# columns are renamed positionally to annotation_df's column names.
flat_changes = change_object_dataframe.reset_index()
label_columns = ["clean_4", "clean_10"]

# Inserts: "to revision id" is selected twice, so it fills both the first and
# the third column. Change objects whose ins_start_pos is -1 are left out
# (presumably they contain no insertion - TODO confirm).
has_insert = change_object_dataframe["ins_start_pos"].values != -1
ins_array = flat_changes.loc[
    has_insert,
    ["to revision id", "ins_tokens", 'to revision id'] + label_columns,
].values

# Deletes: "from revision id" fills the first column and "to revision id" the
# third. Change objects whose del_start_pos is -1 are left out (presumably
# they contain no deletion - TODO confirm).
has_delete = change_object_dataframe["del_start_pos"].values != -1
del_array = flat_changes.loc[
    has_delete,
    ["from revision id", "del_tokens", 'to revision id'] + label_columns,
].values

# Stack inserts on top of deletes and index the result by (context_id, rev_id).
gap_array = np.concatenate([ins_array, del_array], axis=0)
gap_df = pd.DataFrame(gap_array, columns=annotation_df.columns).set_index(['context_id', 'rev_id'])

In [7]:
def token_in_gap(ann, gap):
    """Return the cluster labels of the first gap row that contains a token.

    Parameters
    ----------
    ann : pandas.Series
        One annotation row; its 'context_id', 'rev_id' and 'token_id'
        entries are read.
    gap : pandas.DataFrame
        Gap rows with a 'token_id' column (a container tested with ``in``)
        and the columns 'cluster_4' and 'cluster_10'.

    Returns
    -------
    pandas.Series
        Indexed by 'cluster_4' and 'cluster_10'. Holds the values of the
        first matching gap row, or -10 for both when no row matches.
    """
    label_columns = ["cluster_4", "cluster_10"]

    # NOTE(review): the key is a Series, so .loc treats it as a list of labels;
    # on a MultiIndex that appears to match the first index level only rather
    # than the exact (context_id, rev_id) pair - confirm this is intended.
    candidates = gap.loc[ann[['context_id', 'rev_id']]]

    # Keep only the candidate rows whose token container holds the annotated token.
    holds_token = candidates["token_id"].apply(lambda tokens: ann["token_id"] in tokens)
    matches = candidates.loc[holds_token, label_columns].values

    if matches.size == 0:
        return pd.Series([-10, -10], index=label_columns)
    return pd.Series(matches[0], index=label_columns)

In [8]:
# For every annotated token, look up the cluster labels of the gap containing
# it (token_in_gap yields -10 for both labels when no gap matches), then
# overwrite the annotation's cluster columns with the result.
token_clusters = annotation_df.apply(token_in_gap, axis=1, args=(gap_df,))
annotation_df[["cluster_4", "cluster_10"]] = token_clusters

In [9]:
# Write the annotations, with their cluster columns, to
# "<article>_evaluation.csv" in the annotation directory.
result_file_name = article_name + "_evaluation.csv"
result_file_path = os.path.join(annotation_dir, result_file_name)
annotation_df.to_csv(result_file_path)

In [10]:

# Read the fully annotated CSV and keep the two ground-truth columns.
file_name = f"{article_name}_FULL.csv"
full_file_path = os.path.join(annotation_dir, file_name)
full_df = pd.read_csv(full_file_path)
true_lable_df = full_df[["nationality", "birth_place"]]

# Binary ground truth: 1.0 where "nationality" is "Y" (ignoring surrounding
# whitespace), 0.0 everywhere else.
is_nationality = true_lable_df["nationality"].str.strip() == "Y"
true_labels = np.where(is_nationality, 1.0, 0.0)
#true_labels[true_lable_df["birth_place"].str.strip() == "Y"] = 2

full_df['true_labels'] = true_labels
pd.Series(true_labels).value_counts()

0.0    1279
1.0     496
dtype: int64

In [11]:
# nonoverlaping_clusters = set(annotation_df["cluster_10"].unique()) - set(annotation_df.loc[(true_lable_df["birth_place"].str.strip().values == "Y") | (true_lable_df["nationality"].str.strip().values == "Y") , 
#                   "cluster_10"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_10"], list(nonoverlaping_clusters)+[-1]),"cluster_10"] =-999

# nonoverlaping_clusters = set(annotation_df["cluster_4"].unique()) - set(annotation_df.loc[(
#                             true_lable_df["birth_place"].str.strip().values == "Y") | 
#                             (true_lable_df["nationality"].str.strip().values == "Y") , "cluster_4"].unique())

# annotation_df.loc[np.isin(annotation_df["cluster_4"], list(nonoverlaping_clusters)+[-1]),"cluster_4"] =-999

In [12]:
# Collapse each cluster column to two values:
#   -999  clusters with no token annotated nationality == "Y", plus label -1
#    999  every other cluster
# NOTE(review): the mask is a plain boolean array built from true_lable_df and
# applied to annotation_df by position, so both frames must have the same row
# order and length - confirm.
is_nationality = true_lable_df["nationality"].str.strip().values == "Y"

for cluster_column in ("cluster_10", "cluster_4"):
    all_clusters = set(annotation_df[cluster_column].unique())
    nationality_clusters = set(annotation_df.loc[is_nationality, cluster_column].unique())
    nonoverlaping_clusters = all_clusters - nationality_clusters

    rejected = np.isin(annotation_df[cluster_column], list(nonoverlaping_clusters) + [-1])
    annotation_df.loc[rejected, cluster_column] = -999
    annotation_df.loc[annotation_df[cluster_column] != -999, cluster_column] = 999

In [13]:
# _tdf = annotation_df.merge(full_df.drop(columns=['cluster_4', 'cluster_10']), how='left', left_on=['context_id', 'rev_id', 'token_id'], right_on=['revid_ctxt', 'rev_id', 'token_id'])
# _tdf = _tdf[['context_id', 'token_id', 'rev_id','cluster_4', 'cluster_10','true_labels', "nationality", "birth_place"]]
# #_tdf['cluster_4_x'] - _tdf['cluster_10_y']
# #_tdf.shape
# #_tdf

In [20]:
# Left-join the annotations with the fully annotated frame. full_df's own
# cluster columns are dropped first so the ones from annotation_df are kept.
full_without_clusters = full_df.drop(columns=['cluster_4', 'cluster_10'])
_tdf = annotation_df.merge(
    full_without_clusters,
    how='left',
    left_on=['context_id', 'rev_id', 'token_id'],
    right_on=['revid_ctxt', 'rev_id', 'token_id'],
)
#_tdf = _tdf[_tdf['Bulk'].str.strip() == 'N']

# Keep only the identifier, cluster and ground-truth columns.
_tdf = _tdf[[
    'context_id', 'token_id', 'rev_id',
    'cluster_4', 'cluster_10',
    'true_labels', "nationality", "birth_place",
]]

In [21]:
# Agreement between the cluster columns of _tdf and its 'true_labels' column:
# adjusted Rand index and adjusted mutual information (average_method="max").
# The Series is built from a dict so it is float64 from the start;
# `pd.Series(index=[...])` without data or dtype is deprecated and produces an
# object-dtype Series on pandas >= 2.0.
evaluation_score = pd.Series(
    {
        "rand_4": adjusted_rand_score(_tdf["cluster_4"], _tdf['true_labels']),
        "rand_10": adjusted_rand_score(_tdf["cluster_10"], _tdf['true_labels']),
        "mutual_info_4": adjusted_mutual_info_score(
            _tdf['true_labels'], _tdf["cluster_4"], average_method="max"),
        "mutual_info_10": adjusted_mutual_info_score(
            _tdf['true_labels'], _tdf["cluster_10"], average_method="max"),
    },
    dtype="float64",
)
evaluation_score

rand_4            0.306269
rand_10           0.377451
mutual_info_4     0.178438
mutual_info_10    0.241638
dtype: float64

In [16]:
# Agreement between annotation_df's cluster columns and `true_labels`:
# adjusted Rand index and adjusted mutual information (average_method="max").
# The Series is built from a dict so it is float64 from the start;
# `pd.Series(index=[...])` without data or dtype is deprecated and produces an
# object-dtype Series on pandas >= 2.0.
evaluation_score = pd.Series(
    {
        "rand_4": adjusted_rand_score(annotation_df["cluster_4"], true_labels),
        "rand_10": adjusted_rand_score(annotation_df["cluster_10"], true_labels),
        "mutual_info_4": adjusted_mutual_info_score(
            true_labels, annotation_df["cluster_4"], average_method="max"),
        "mutual_info_10": adjusted_mutual_info_score(
            true_labels, annotation_df["cluster_10"], average_method="max"),
    },
    dtype="float64",
)
evaluation_score

rand_4            0.306269
rand_10           0.377451
mutual_info_4     0.178438
mutual_info_10    0.241638
dtype: float64

In [17]:
# Normalized mutual information between the ground truth and the cluster_4
# column, computed with the library's default settings.
nmi_cluster_4 = normalized_mutual_info_score(true_labels, annotation_df["cluster_4"])
nmi_cluster_4



0.17923641305186078

In [18]:
# cluster_10 values that contain no token annotated "Y" for birth_place or
# nationality. The mask previously OR-ed the birth_place test with itself, so
# nationality was never considered; the second operand now tests nationality.
# The mask is a plain boolean array and is applied to annotation_df by position.
is_labelled = (
    (true_lable_df["birth_place"].str.strip().values == "Y")
    | (true_lable_df["nationality"].str.strip().values == "Y")
)
set(annotation_df["cluster_10"].unique()) - set(annotation_df.loc[is_labelled, "cluster_10"].unique())

set()

In [19]:
# Relabel rows whose cluster_10 value is -1 as -999.
is_minus_one = annotation_df["cluster_10"] == -1
annotation_df.loc[is_minus_one, "cluster_10"] = -999