In [1]:
import sys,os
sys.path.append("../")

import pandas as pd
import numpy as np
import pickle

from scipy.stats import entropy
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances  
from sklearn.metrics import mutual_info_score

import matplotlib.pyplot as plt
from IPython.display import HTML
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

import seaborn as sns




## Reading the change object and clustering.

In [8]:
article_name = "John_Logie_Baird"
change_object_dir =  "../data/change objects/"

change_object_file_name = f"{article_name}_vec.npz"
filename =  f"{article_name}_change.h5"

change_object_file = os.path.join(change_object_dir, filename)


change_vector_dir = "../data/change_vector_optimised/"
change_vec_filename = f"{article_name}_comp_vec.npz"
change_vector_file = os.path.join(change_vector_dir, change_vec_filename)

content_dir = "../data/content/"
len_file = article_name + "_rev_len.h5"
len_file_path = os.path.join(content_dir, len_file)


In [9]:
%%time
if os.path.exists(change_object_file):
    with pd.HDFStore(change_object_file, 'r') as store:
        change_object_dataframe = store.get("data")
else:
    print("file do not exist")
    
rev_len_df = pd.read_hdf(len_file_path, key = "rev_len")
vectors ={}
change_object_dataframe["edit_tokens"] = change_object_dataframe["ins_tokens"] + change_object_dataframe["del_tokens"]
with open(change_vector_file, "rb") as file:
    arrays_dict = np.load(file)
    
    vectors["clean_weighted_4"] = arrays_dict["4_clean_weighted"]
    vectors["clean_notweighted_4"] = arrays_dict["4_clean_not_weighted"]
    vectors["notclean_weighted_4"] = arrays_dict["4_notclean_weighted"]
    vectors["notclean_notweighted_4"] = arrays_dict["4_notclean_not_weighted"]
    
    vectors["clean_weighted_10"] = arrays_dict["10_clean_weighted"]
    vectors["clean_notweighted_10"] = arrays_dict["10_clean_not_weighted"]
    vectors["notclean_weighted_10"] = arrays_dict["10_notclean_weighted"]
    vectors["notclean_notweighted_10"] = arrays_dict["10_notclean_not_weighted"]
    
    vectors["clean_weighted_30"] = arrays_dict["filtered_weighted_neighbour30_matrix"]
    vectors["clean_notweighted_30"] = arrays_dict["filtered_not_weighted_neighbour30_matrix"]
    vectors["notclean_weighted_30"] = arrays_dict[ "not_filtered_weighted_neighbour30_matrix"]
    vectors["notclean_notweighted_30"] = arrays_dict["not_filtered_not_weighted_neighbour30_matrix"]



CPU times: user 1.75 s, sys: 5.29 s, total: 7.04 s
Wall time: 7.05 s


## Finding relative positions of change object

In [4]:
change_object_dataframe = change_object_dataframe.reset_index().set_index('from revision id')
change_object_dataframe = change_object_dataframe.join(rev_len_df.set_index("rev_id"))
change_object_dataframe.index.name = "from revision id"

change_object_dataframe["relative_position"] =(change_object_dataframe["left_neigh"]+1)/(change_object_dataframe["length"]).round(3)

change_object_dataframe = change_object_dataframe.reset_index().set_index(["from revision id","timestamp", "level_5"])
# rel_pos = change_object_dataframe["relative_position"]
# rel_pos_r = rel_pos.round(4)
# rel_pos_r.size

In [5]:
def gini(array):
    # Number of array elements:
    n = array.shape[0]
    index = np.arange(1, n+1)
    # Gini coefficient:
    return ((np.sum((2 * index - n  - 1) * array)) / (n * np.sum(array)))

In [6]:
def evaluate(change_object_dataframe, clusters, evaluation_df, column_names):
    
    change_object_dataframe["cluster"] = pd.Series(clusters, index= change_object_dataframe.index)
    
    stats_series = pd.Series(index=column_names)
    
    non_negative_cluster_mask = clusters != -1
    non_neg_cluster_df = change_object_dataframe.loc[non_negative_cluster_mask, :]

    stats_series["no_of_outliers"]  = np.count_nonzero(~non_negative_cluster_mask)
    stats_series["no_of_clusters"] = np.unique(clusters[non_negative_cluster_mask]).size

    rank_by_size = non_neg_cluster_df.groupby("cluster").size().sort_values(ascending=False)

    relative_postion_std = non_neg_cluster_df.groupby("cluster")["relative_position"].std()
#     stats_series["relative_position_std_max"] = relative_postion_std.max()
#     stats_series["relative_position_std_min"] = relative_postion_std.min()
#     stats_series["relative_position_std_median"] = relative_postion_std.median()
#     stats_series["relative_position_std_skewness"] = relative_postion_std.skew()
#     stats_series["relative_position_std_kurtosis"] = relative_postion_std.kurt()

    stats_series["relative_position_std_less_than_.1"] = np.count_nonzero(relative_postion_std <.1)
    

    size_stats = rank_by_size.describe()
    if rank_by_size.shape[0] > 1:
        stats_series["top2_ratio"] = rank_by_size.iloc[1]/rank_by_size.iloc[0]
    else:
        stats_series["top2_ratio"] = 0
    stats_series["max_cluster_size"] = size_stats["max"]
    stats_series["min_cluster_size"] = size_stats["min"]
    stats_series["mean_cluster_size"] = size_stats["mean"]
    stats_series["median_cluster_size"] = size_stats["50%"]
    stats_series["inter_quartile_range_cluster_size"] = size_stats["75%"] - size_stats["25%"]

    stats_series["variance_cluster_size"] = rank_by_size.var()
    stats_series["standard_deviation_cluster_size"] = size_stats["std"]
    stats_series["skewness_cluster_size"] = rank_by_size.skew()
    stats_series["kurtosis_cluster_size"] = rank_by_size.kurt()
    
    cluster_sizes = non_neg_cluster_df.groupby("cluster").size().values
    cluster_sizes = cluster_sizes / cluster_sizes.sum()
    
    token_entropy_clusters = non_neg_cluster_df.groupby("cluster")["edit_tokens"].apply(
        lambda x: entropy(pd.Series(np.concatenate(x.values, axis=0)).value_counts().values))
    stats_series["token_entropy"] = (cluster_sizes * token_entropy_clusters).sum()
    
    position_entropy_clusters = non_neg_cluster_df.groupby("cluster")["relative_position"].apply(
        lambda x: entropy(x.value_counts().values))
    stats_series["position_entropy"] = (cluster_sizes * position_entropy_clusters).sum()

 

    
    stats_series["gini"] = gini(rank_by_size.values)
    
    change_object_dataframe  = change_object_dataframe.drop("cluster",axis=1)
    
    return stats_series

# Evaluation of DBSCAN


Following is the description of Pre evaluation metric 


### represents the noise in the tokens in the clusters. 
* __"token_entropy"__: Sum of Shanon entropies of cluster tokens weighted by cluster sizes. Weighting is done to give more weight to bigger clusters. 
As its easier to have smaller cluster with relatively less kind of data hence high shannon entropy

### These represent ability of DBSCAN to find clusters.
* "no_of_outliers": Total number of data points which were not put in any clusters by dbscan.
* "no_of_clusters": Total number of clusters found by the algorithm.

### These represents stability of clusters with respect to relative position of change object in its revision.
* __"position_entropy"__: Sum of Shanon entropies of relative position in a cluster weighted by cluster sizes. Weighting is done to give more weight to bigger clusters. As its easier to have smaller cluster with relatively less kind of data hence high shannon entropy.
* "relative_position_std_less_than_.1": Counts the clusters where relative position standard deviation is less thatn 0.1

### These represents  discriptive statistics representing distribution  of clusters sizes.
* "max_cluster_size": Gives size of points inside biggest clusters.
* "min_cluster_size": Gives size of points inside samllest clusters.
* "mean_cluster_size": Gives mean cluster size.
* "skewness_cluster_size": 
* "kurtosis_cluster_size":
* "median_cluster_size": Gives median cluster size.
* "inter_quartile_range_cluster_size": Gives inter quartile range of cluster size. i.e. Difference between 1st quartile and 3rd quartile. 
* "variance_cluster_size": Variance in cluster size values across all clusters.
* "standard_deviation_cluster_size": Square root of clusters. Easy to interpret as is direct representation of dispersion in distribution.

#### These Two represent distribution of cluster sizes
* "top2_ratio": Ratio of second biggest cluster size with the biggest cluster size
* "gini": Gini coeffecient of cluster size distribution among clusters.


---
---
__Shanon Entropy__ : Shanon's entropy formally $\displaystyle\sum_{i}p_i\log{p_i}$ where $p_i$ is probability of ith kind of data.
This value is zero when infromation is pure and it tends towards infinity as noise of type of data starts increasing.
Similar idea is used in decision tree to find pure classification.

__Total Entropy__: While calucalting total entropy we do not take oridinary sum because smaller clusters will tend to have less noise. 



In [7]:
change_object_dataframe["relative_position"] = (change_object_dataframe["relative_position"] - change_object_dataframe["relative_position"].mean())/change_object_dataframe["relative_position"].std()
dbscan_params =[
    { "eps": 0.5, "min_samples": 5 }, 
    { "eps": 1.0, "min_samples": 5 },
    { "eps": 1.5, "min_samples": 5 },
    { "eps": 2.0, "min_samples": 5 }
]

vector_names = ["clean_weighted_4", "clean_weighted_10", "clean_weighted_30",
                "clean_notweighted_4", "clean_notweighted_10", "clean_notweighted_30",
                "notclean_weighted_4", "notclean_weighted_10", "notclean_weighted_30",
                "notclean_notweighted_4", "notclean_notweighted_10",  "notclean_notweighted_30"  
               ]

# vector_names = ["clean_weighted_4", "clean_weighted_10", "clean_weighted_30",
#                 "clean_notweighted_4", "clean_notweighted_10",  "clean_notweighted_30"
#                ]

column_names = ["top2_ratio","no_of_outliers", "no_of_clusters",

                "relative_position_std_less_than_.1",
                "max_cluster_size", "min_cluster_size", "mean_cluster_size",
                "skewness_cluster_size", "kurtosis_cluster_size",
                 "median_cluster_size",  "inter_quartile_range_cluster_size",
                "variance_cluster_size", "standard_deviation_cluster_size",
                "gini", "token_entropy", "position_entropy"]
idx = pd.MultiIndex.from_product([vector_names, 
                                  [ param["eps"] for param in dbscan_params]],
                             names=['types', 'eps'])
evaluation_df = pd.DataFrame(index=idx, columns=column_names)
# sns.distplot(change_object_dataframe["relative_position"])



In [8]:
%%time
for cluster_by in vector_names:
    distances = pairwise_distances(vectors[cluster_by])
    for dbscan_param in dbscan_params:
        clusters = DBSCAN(**dbscan_param, metric="precomputed").fit(distances)
        evaluation_df.loc[(cluster_by,dbscan_param["eps"]),:] = evaluate(
            change_object_dataframe, clusters.labels_, evaluation_df, column_names)

CPU times: user 1min 12s, sys: 4min 17s, total: 5min 29s
Wall time: 2min 31s


In [9]:
split_df = evaluation_df.reset_index()["types"].str.split("_", expand = True)[[0,1,2]]
split_df.index = evaluation_df.index
evaluation_df[["cleaned", "weighted", "neighbour_size"]] = split_df
# evaluation_df = evaluation_df.reset_index().set_index(["cleaned", "weighing", "neighbour"])
evaluation_df=evaluation_df.reset_index().set_index(["cleaned", "weighted", "neighbour_size","eps"])

In [546]:
#uncomment to save again
# pre_evaluation_dir = "../data/pre_evaluation/"
# file_name = f"{article_name}.csv"
# full_file_path = os.path.join(pre_evaluation_dir, file_name)
# evaluation_df.to_csv(full_file_path)
# for name in column_names:
#     file_name = f"{article_name}_{name}.csv"
#     full_file_path = os.path.join(pre_evaluation_dir, file_name)
#     with open(full_file_path, "w"):
#         evaluation_df[name].unstack([-2,-1]).to_csv(full_file_path)

In [14]:

evaluation_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,types,top2_ratio,no_of_outliers,no_of_clusters,relative_position_std_less_than_.1,max_cluster_size,min_cluster_size,mean_cluster_size,skewness_cluster_size,kurtosis_cluster_size,median_cluster_size,inter_quartile_range_cluster_size,variance_cluster_size,standard_deviation_cluster_size,gini,token_entropy,position_entropy
cleaned,weighted,neighbour_size,eps,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
clean,weighted,4,0.5,clean_weighted_4,0.463768,3194,79,36,69,5,9.35443,4.20151,22.6288,6.0,3.0,84.9753,9.21821,-0.350559,3.37336,2.40829
clean,weighted,4,1.0,clean_weighted_4,0.463768,3096,91,37,69,5,9.1978,4.45823,25.8392,7.0,4.0,74.6716,8.64127,-0.333557,3.35729,2.37105
clean,weighted,4,1.5,clean_weighted_4,0.57971,2865,112,41,69,5,9.53571,3.92193,19.3813,6.5,5.0,79.7645,8.9311,-0.346174,3.45313,2.42775
clean,weighted,4,2.0,clean_weighted_4,0.239766,2656,121,40,171,5,10.5537,8.18234,78.2792,7.0,5.0,267.699,16.3615,-0.398209,3.62038,2.70439
clean,weighted,10,0.5,clean_weighted_10,0.95,3602,48,38,20,5,6.89583,2.91881,9.15301,6.0,2.0,10.223,3.19734,-0.189892,3.00382,1.85579
clean,weighted,10,1.0,clean_weighted_10,0.95,3459,65,45,20,5,7.29231,2.20431,5.52889,6.0,3.0,10.1163,3.18062,-0.203051,2.94588,1.93208
clean,weighted,10,1.5,clean_weighted_10,0.952381,3108,112,77,21,5,7.36607,2.215,5.40338,6.0,3.0,10.6306,3.26045,-0.204924,2.7615,1.98382
clean,weighted,10,2.0,clean_weighted_10,0.532847,2487,142,76,137,3,10.1831,6.70439,54.2164,6.0,5.0,190.988,13.8198,-0.385093,3.34099,2.62188
clean,weighted,30,0.5,clean_weighted_30,0.95,3508,67,57,20,5,6.34328,3.73315,15.726,5.0,1.0,7.38037,2.71668,-0.158174,2.84455,1.77385
clean,weighted,30,1.0,clean_weighted_30,0.730769,2572,147,100,52,3,9.2585,3.12856,13.0615,6.0,5.0,46.2615,6.80158,-0.313235,3.19601,2.32068


In [11]:
evaluation_df["top2_ratio"].unstack([-2,-1])

Unnamed: 0_level_0,neighbour_size,4,4,4,4,10,10,10,10,30,30,30,30
Unnamed: 0_level_1,eps,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0
cleaned,weighted,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
clean,notweighted,0.463768,0.463768,0.463768,0.194737,0.95,0.95,0.958333,0.100304,1.0,0.487179,0.234275,0.0120219
clean,weighted,0.463768,0.463768,0.57971,0.239766,0.95,0.95,0.952381,0.532847,0.95,0.730769,0.163978,0.0125428
notclean,notweighted,0.916667,0.916667,0.352941,0.239899,0.631579,0.866667,0.0546584,0.0149682,0.956522,0.0489251,0.0112873,0.0
notclean,weighted,0.916667,0.958333,0.930233,0.241983,0.95,0.909091,0.769231,0.0117688,0.769231,0.68,0.019037,0.0


In [15]:
display("no_of_clusters")
display(evaluation_df["no_of_clusters"].unstack([-2,-1]))
display("no_of_outliers")
display(evaluation_df["no_of_outliers"].unstack([-2,-1]))

'no_of_clusters'

Unnamed: 0_level_0,neighbour_size,4,4,4,4,10,10,10,10,30,30,30,30
Unnamed: 0_level_1,eps,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0
cleaned,weighted,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
clean,notweighted,79,82,91,110,33,57,115,141,67,153,124,14
clean,weighted,79,91,112,121,48,65,112,142,67,147,139,19
notclean,notweighted,48,59,93,110,25,71,98,30,94,126,8,1
notclean,weighted,55,69,98,107,35,87,145,34,87,153,14,1


'no_of_outliers'

Unnamed: 0_level_0,neighbour_size,4,4,4,4,10,10,10,10,30,30,30,30
Unnamed: 0_level_1,eps,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0
cleaned,weighted,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
clean,notweighted,3188,3172,3087,2818,3696,3548,3008,2251,3490,2334,876,129
clean,weighted,3194,3096,2865,2656,3602,3459,3108,2487,3508,2572,1181,208
notclean,notweighted,3526,3443,3143,2237,3766,3385,2135,517,3250,1147,71,6
notclean,weighted,3483,3372,3043,2173,3684,3271,2112,633,3316,1383,136,11


In [13]:
display(evaluation_df["max_cluster_size"].unstack([-2,-1]))

Unnamed: 0_level_0,neighbour_size,4,4,4,4,10,10,10,10,30,30,30,30
Unnamed: 0_level_1,eps,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0
cleaned,weighted,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
clean,notweighted,69,69,69,190,20,20,24,329,20,117,1097,3660
clean,weighted,69,69,69,171,20,20,21,137,20,52,744,3508
notclean,notweighted,24,24,68,396,19,30,805,3140,23,1349,3721,3927
notclean,weighted,24,24,43,343,20,22,117,2889,26,400,3572,3922


In [526]:
evaluation_df["token_entropy"].unstack([-2,-1])

Unnamed: 0_level_0,neighbour_size,4,4,4,4,10,10,10,10,30,30,30,30
Unnamed: 0_level_1,eps,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0
cleaned,weighted,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
clean,notweighted,3.37595,3.37532,3.40105,3.53415,3.06973,2.68719,2.82291,3.62986,2.85128,3.42777,4.69026,5.69034
clean,weighted,3.37336,3.35729,3.45313,3.62038,3.00382,2.94588,2.7615,3.34099,2.84455,3.19601,4.38008,5.64475
notclean,notweighted,2.8765,2.78421,2.83378,3.94387,3.0072,2.87451,4.3642,5.60036,2.66743,4.63325,5.75182,5.78588
notclean,weighted,2.82964,2.76041,2.80784,3.92621,2.78459,2.79908,3.46042,5.52011,2.64315,4.14139,5.71765,5.78584


In [527]:
evaluation_df["position_entropy"].unstack([-2,-1])

Unnamed: 0_level_0,neighbour_size,4,4,4,4,10,10,10,10,30,30,30,30
Unnamed: 0_level_1,eps,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0,0.5,1.0,1.5,2.0
cleaned,weighted,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2
clean,notweighted,2.41077,2.39194,2.37805,2.72264,1.85381,1.84158,2.0923,3.00568,1.82513,2.54256,4.56695,7.90224
clean,weighted,2.40829,2.37105,2.42775,2.70439,1.85579,1.93208,1.98382,2.62188,1.77385,2.32068,3.94538,7.75419
notclean,notweighted,2.11719,2.10509,2.25419,3.60074,1.74027,2.02787,4.32818,7.51269,1.96038,4.79141,7.94551,8.18022
notclean,weighted,2.09019,2.08247,2.28205,3.49394,1.85432,2.00703,2.90923,7.233,1.94769,3.55214,7.79724,8.17883


In [10]:
# g = sns.FacetGrid(evaluation_df.loc["clean"].reset_index(), col="types", col_wrap=2, sharex=True, sharey=True, aspect=1.6, height=4.5, despine=False, hue="eps")
# g = (g.map(plt.scatter,  "no_of_clusters", "token_entropy").add_legend())
# plt.yscale("log")