In [1]:
import sys,os
sys.path.append("../")

import pandas as pd
import numpy as np
import pickle
import itertools
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_mutual_info_score
from sklearn.metrics import pairwise_distances  
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import mutual_info_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import completeness_score
from sklearn.metrics import homogeneity_score
from sklearn.metrics import v_measure_score


from scipy.stats import entropy
import seaborn as sns

import matplotlib.pyplot as plt
from IPython.display import HTML
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

sns.set()
sns.set_style("darkgrid")

## Reading the change object and clustering.

In [2]:
# Article under analysis; every input path below is derived from this name.
article_name = "John_Logie_Baird"

# Directories holding the pre-computed inputs.
change_object_dir = "../data/change objects/"
content_dir = "../data/content/"
change_vector_dir = "../data/change_vector/"

# Change objects of the article.
change_object_file_name = f"{article_name}_vec.npz"
change_object_file = os.path.join(change_object_dir, f"{article_name}_change.h5")

# Token table: load it and keep only a token_id -> token string lookup.
filename = f"{article_name}.h5"
filepath = os.path.join(content_dir, filename)
with pd.HDFStore(filepath, 'r') as store:
    all_tokens = store.get("all_tokens")
token_string_df = all_tokens.set_index("token_id")["str"]
# Placeholder strings for the ids -1 and -2
# (presumably start/end-of-revision markers; verify against the extraction code).
token_string_df[-1] = "St@rt"
token_string_df[-2] = "$nd"

# Change vectors (one array per context size) for the article.
change_vec_filename = f"{article_name}.npz"
change_vector_file = os.path.join(change_vector_dir, change_vec_filename)

# Revision lengths; only the path is prepared here.
len_file = f"{article_name}_rev_len.h5"
len_file_path = os.path.join(content_dir, len_file)


# Load the change objects of the article.
# Fail fast when the file is missing: every later cell needs
# `change_object_dataframe`, so merely printing a message would either defer
# the failure to a NameError or silently reuse a stale kernel variable.
if not os.path.exists(change_object_file):
    raise FileNotFoundError(f"change object file does not exist: {change_object_file}")
with pd.HDFStore(change_object_file, 'r') as store:
    change_object_dataframe = store.get("data")
    
    


def _ids_to_strings(token_ids):
    """Translate a sequence of token ids into a tuple of token strings."""
    return tuple(token_string_df[np.array(token_ids)].tolist())


# Deleted / inserted tokens stay as tuples of strings.
for target, source in (("del_string_tokens", "del_tokens"),
                       ("ins_string_tokens", "ins_tokens")):
    change_object_dataframe[target] = change_object_dataframe[source].apply(_ids_to_strings)

# Left / right neighbouring tokens are flattened into one space-separated string.
for target, source in (("left_context", "left_token"),
                       ("right_context", "right_token")):
    change_object_dataframe[target] = (
        change_object_dataframe[source].apply(_ids_to_strings).str.join(" "))

# All edited tokens of a change object: inserted tokens followed by deleted ones.
change_object_dataframe["edit_string_tokens"] = (
    change_object_dataframe["ins_string_tokens"]
    + change_object_dataframe["del_string_tokens"])


# rev_len_df = pd.read_hdf(len_file_path, key = "rev_len")
# Context sizes whose "<size>_clean_not_weighted" array is loaded.
# Sizes 6 and 12 are intentionally skipped.
context_sizes = (2, 4, 8, 10, 15, 20, 25, 30)

vectors = {}
with open(change_vector_file, "rb") as file:
    arrays_dict = np.load(file)
    for context_size in context_sizes:
        vectors[context_size] = arrays_dict[f"{context_size}_clean_not_weighted"]

#### read annotations

In [3]:
# Manual annotations for the article; only the columns used below are kept.
annotation_dir = "../data/annotation/"
file_name = f"{article_name}_FULL.csv"
full_file_path = os.path.join(annotation_dir, file_name)

annotation_columns = ["revid_ctxt", "token_id", "rev_id",
                      "nationality", "birth_place", "Bulk"]
annotation_df = pd.read_csv(full_file_path)[annotation_columns]

### clustering


In [4]:
# Parameter grid: context size x DBSCAN eps x DBSCAN min_samples.
vector_names = list(vectors)
context_array = vector_names
eps_array = np.arange(0.25, 4.1, 0.25)
# An earlier, hand-picked eps grid was:
# [ 0.1, 0.2, 0.3,0.4, 0.5, 0.6, 0.7, 0.8, 1.0, 1.2, 1.3, 1.4, 1.5,1.6,1.8,2.0,2.2]
min_samples_array = [2, 5, 10, 20, 30, 50]

# Flat lists of the combinations, in the same nesting order as the MultiIndex below.
all_combinations = [(context, eps, min_samples)
                    for context in context_array
                    for eps in eps_array
                    for min_samples in min_samples_array]
dbscan_params = [(eps, min_samples)
                 for eps in eps_array
                 for min_samples in min_samples_array]

idx = pd.MultiIndex.from_product(
    [context_array, eps_array, min_samples_array],
    names=["context", "eps", "min_samples"],
)

# One column of cluster labels per combination (filled by the clustering cell).
cluster_df = pd.DataFrame(columns=idx)
# One row of evaluation scores per combination.
evaluation_df = pd.DataFrame(index=idx, columns=["gap_entropy"])



In [5]:
%%time 
for cluster_by in vector_names:
    distances = pairwise_distances(vectors[cluster_by])
    for eps, min_samples in dbscan_params:
        cluster_df[cluster_by,eps, min_samples] = DBSCAN(eps=eps, min_samples=min_samples, 
                                                         metric="precomputed").fit(distances).labels_


CPU times: user 2min 34s, sys: 57.4 s, total: 3min 31s
Wall time: 3min 34s


In [6]:
def weighted_token_entropy(dataframe, group_by):
    """Sum of per-cluster token entropies, each weighted by the cluster size.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain an ``edit_string_tokens`` column holding iterables of
        tokens, plus whatever ``group_by`` refers to.
    group_by : label or list of labels
        Grouping key passed straight to ``DataFrame.groupby``.

    Returns
    -------
    float
        ``sum(rows_in_cluster * entropy(token counts in cluster))``.
    """
    def _token_entropy(token_tuples):
        # Pool the tokens of every row in the cluster, then take the entropy
        # of their frequency distribution.
        pooled = list(itertools.chain.from_iterable(token_tuples.tolist()))
        return entropy(pd.Series(pooled).value_counts().values)

    grouped = dataframe.groupby(group_by)
    cluster_sizes = grouped.size()
    token_entropy_clusters = grouped["edit_string_tokens"].apply(_token_entropy)
    return (cluster_sizes * token_entropy_clusters).sum()

In [59]:
# NOTE(review): the saved execution count of this cell (59) is out of sequence;
# confirm the notebook still runs top to bottom on a fresh kernel.

# Align the cluster labels with the change objects (same row order), join them
# side by side and persist the result for later inspection.
cluster_df.index = change_object_dataframe.index
dbscan_results = pd.concat([change_object_dataframe, cluster_df], axis=1)
# `key` is passed by keyword: positional use is deprecated since pandas 2.1.
# PyTables warns here because the object-typed columns have to be pickled.
dbscan_results.to_hdf("../data/evaluation/" + article_name + "_clusters.h5", key="clusters")

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->axis0] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block1_items] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block2_values] [items->['ins_tokens', 'del_tokens', 'left_neigh_slice', 'right_neigh_slice', 'left_token', 'right_token', 'del_string_tokens', 'ins_string_tokens', 'left_context', 'right_context', 'edit_string_tokens']]

  return pytables.to_hdf(path_or_buf, key, self, **kwargs)


In [8]:
%%time
# entropy_series = pd.Series(index=all_combinations)

for context, eps, min_samples in all_combinations:
    evaluation_df.loc[(context, eps, min_samples),"gap_entropy"] = weighted_token_entropy(dbscan_results, (context, eps, min_samples))
# all_combinations_without_optimization[0]

CPU times: user 1min 46s, sys: 1.11 s, total: 1min 47s
Wall time: 1min 48s


#### Splitting change object to match annotations

In [9]:
# A change object may carry an insertion, a deletion, or both (a start position
# of -1 marks the missing part). Split it into one "gap" per part so the gaps
# can be matched against the token-level annotations.
change_objects_flat = change_object_dataframe.reset_index()
has_insert = change_object_dataframe["ins_start_pos"].values != -1
has_delete = change_object_dataframe["del_start_pos"].values != -1

# Inserted tokens: the "to" revision is used both as context and as revision id.
ins_array = change_objects_flat.loc[
    has_insert, ["to revision id", "ins_tokens", 'to revision id']].values
ins_cluster = cluster_df.loc[has_insert, :]

# Deleted tokens: the "from" revision is the context, the "to" revision the id.
del_array = change_objects_flat.loc[
    has_delete, ["from revision id", "del_tokens", 'to revision id']].values
del_cluster = cluster_df.loc[has_delete, :]

# Stack insert gaps on top of delete gaps and index them by (revid_ctxt, rev_id).
gap_array = np.concatenate([ins_array, del_array], axis=0)
gap_df = pd.DataFrame(
    gap_array, columns=["revid_ctxt", "token_id", "rev_id"]
).set_index(['revid_ctxt', 'rev_id'])

# Cluster labels stacked in the same order; `gap_cluster` keeps the original
# index while `gap_cluster_df` shares the index of `gap_df`.
gap_cluster = pd.concat([ins_cluster, del_cluster], axis=0)
gap_cluster_df = pd.concat([ins_cluster, del_cluster], axis=0)
gap_cluster_df.index = gap_df.index

In [10]:
def token_in_gap(ann, gap_df, gap_cluster_df):
    """Return the cluster labels of the first gap containing an annotated token.

    Parameters
    ----------
    ann : pandas.Series
        One annotation row with ``revid_ctxt``, ``rev_id`` and ``token_id``.
    gap_df : pandas.DataFrame
        Gaps with a ``token_id`` column holding the token ids of each gap.
    gap_cluster_df : pandas.DataFrame
        Cluster labels per gap, sharing the index of ``gap_df``.

    Returns
    -------
    pandas.Series
        Indexed by ``gap_cluster_df.columns``: the labels of the first matching
        gap, or ``-10`` everywhere when no candidate gap contains the token.

    NOTE(review): the lookup key is a two-element Series, which ``.loc`` treats
    as a list of labels rather than as one ``(revid_ctxt, rev_id)`` tuple;
    confirm this is the intended candidate selection.
    """
    revision_key = ann[['revid_ctxt', 'rev_id']]
    context_gap = gap_df.loc[revision_key]
    context_cluster = gap_cluster_df.loc[revision_key]

    contains_token = context_gap["token_id"].apply(
        lambda token_ids: ann["token_id"] in token_ids)
    matching_labels = context_cluster.loc[contains_token, :].values

    if matching_labels.size == 0:
        return pd.Series(-10, index=gap_cluster_df.columns)
    return pd.Series(matching_labels[0], index=gap_cluster_df.columns)

In [11]:
def weighted_entropy(dataframe, entropy_column, group_columns="cluster", ):
    """Mean over groups of ``group size * entropy of entropy_column``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data containing ``entropy_column`` and the grouping column(s).
    entropy_column : label
        Column whose value distribution is measured inside each group.
    group_columns : label or list of labels, default "cluster"
        Grouping key passed to ``DataFrame.groupby``.

    Returns
    -------
    float
        Average, across groups, of the size-scaled label entropy.

    NOTE(review): the products are averaged over the number of groups, whereas
    ``weighted_token_entropy`` sums them; confirm the difference is intended.
    """
    def _label_entropy(labels):
        return entropy(labels.value_counts().values)

    grouped = dataframe.groupby(group_columns)
    scaled_entropy = grouped.size() * grouped[entropy_column].apply(_label_entropy)
    return scaled_entropy.mean()

In [12]:
# For every annotated token, look up the cluster labels (one per parameter
# combination) of the gap that contains it.
al_combination_clusters_df = annotation_df.apply(
    lambda ann: token_in_gap(ann, gap_df, gap_cluster_df), axis=1)

In [13]:
# Annotation columns followed by the cluster labels found for each annotated token.
annotation_clusters = pd.concat(
    [annotation_df, al_combination_clusters_df], axis="columns")

In [14]:
# Binary ground truth: 1.0 where the annotation marks a nationality edit ("Y"), else 0.0.
# This overwrites the string column in `annotation_df`, so the cell cannot be
# re-run without reloading the annotations first.
is_nationality_edit = annotation_df["nationality"].str.strip().eq("Y").values
true_labels = np.where(is_nationality_edit, 1.0, 0.0)
annotation_df["nationality"] = true_labels
# birth_place annotations are not turned into labels (an earlier experiment used label 2).


# Calculating the entropy of the Annotated change objects

In [15]:
# Flat working copy of the gaps; `nationality` starts at 0 and is filled in later.
df2 = gap_df.reset_index()
df2['nationality'] = 0

# Annotations indexed by (revid_ctxt, rev_id), restricted to non-bulk rows and
# split by their nationality flag.
aci = annotation_clusters.set_index(['revid_ctxt', 'rev_id']).sort_index()
aci = aci.loc[aci['Bulk'] == 'N']
aci_y = aci.loc[aci['nationality'] == 'Y']
aci_n = aci.loc[aci['nationality'] == 'N']

counter = 0
def nat_val(row):
    """Score a gap against the non-bulk nationality annotations.

    Parameters
    ----------
    row : sequence
        ``(revid_ctxt, rev_id, token_ids)`` of one gap, read by position.

    Returns
    -------
    int
        ``+1`` when the gap contains tokens annotated 'Y' only, ``-1`` when it
        contains tokens annotated 'N' only, ``0`` when it contains both or none.

    Side effect: the module-level ``counter`` grows by the number of matched
    'N' annotations.
    """
    global counter
    # Unpack by position without `row[0]`: integer keys on a label-indexed
    # Series are deprecated, and the resulting KeyError would be swallowed by
    # the handlers below, silently scoring every gap as 0.
    revid_ctxt, rev_id, token_ids = tuple(row)[:3]
    val = 0

    try:
        annotated_y = aci_y.loc[(revid_ctxt, rev_id), ['token_id']]
        # np.asarray(...).sum() yields a scalar whether .loc returned a single
        # row (Series) or several rows (DataFrame).
        x = int(np.asarray(annotated_y.isin(token_ids)).sum())
        val = val + (1 if x > 0 else 0)
    except KeyError:
        # No 'Y' annotation exists for this (revid_ctxt, rev_id).
        pass

    try:
        annotated_n = aci_n.loc[(revid_ctxt, rev_id), ['token_id']]
        y = int(np.asarray(annotated_n.isin(token_ids)).sum())
        val = val - (1 if y > 0 else 0)
        counter += y
    except KeyError:
        # No 'N' annotation exists for this (revid_ctxt, rev_id).
        pass

    return val

counter = 0
def nat_val2(row):
    """Flag a gap that contains at least one non-bulk token annotated 'Y'.

    Parameters
    ----------
    row : sequence
        ``(revid_ctxt, rev_id, token_ids)`` of one gap, read by position.

    Returns
    -------
    int
        ``1`` when any token of the gap is annotated as a nationality edit,
        otherwise ``0``.
    """
    # Unpack by position without `row[0]`: integer keys on a label-indexed
    # Series are deprecated, and the resulting KeyError would be swallowed by
    # the handler below, silently labelling every gap 0.
    revid_ctxt, rev_id, token_ids = tuple(row)[:3]

    try:
        annotated_y = aci_y.loc[(revid_ctxt, rev_id), ['token_id']]
        # np.asarray(...).sum() yields a scalar whether .loc returned a single
        # row (Series) or several rows (DataFrame).
        x = int(np.asarray(annotated_y.isin(token_ids)).sum())
    except KeyError:
        # No 'Y' annotation exists for this (revid_ctxt, rev_id).
        return 0

    return 1 if x > 0 else 0

# Label every gap: `nationality_full` is the signed score (-1 / 0 / +1),
# `nationality` the binary flag.
gap_keys = df2[['revid_ctxt', 'rev_id', 'token_id']]
df2['nationality_full'] = gap_keys.apply(nat_val, axis=1)
df2['nationality'] = gap_keys.apply(nat_val2, axis=1)

# Labelled gaps side by side with their cluster labels for every parameter combination.
df3 = pd.concat([df2, gap_cluster_df.reset_index()], axis=1)

In [16]:
# Class balance of the gaps: 1 = the gap contains at least one non-bulk token
# annotated as a nationality edit, 0 = it does not.
df3['nationality'].value_counts()

0    6528
1     405
Name: nationality, dtype: int64

In [17]:
entropies = []
print("Without bulks")

# Score every clustering against the binary gap labels.
gap_labels = df3["nationality"]
for context, eps, min_samples in all_combinations:
    combination = (context, eps, min_samples)
    predicted = df3[combination]
    evaluation_df.loc[combination, "change_object_entropy"] = weighted_entropy(
        df3, entropy_column="nationality", group_columns=combination)
    evaluation_df.loc[combination, "change_object_completness"] = completeness_score(gap_labels, predicted)
    evaluation_df.loc[combination, "change_object_homegenity"] = homogeneity_score(gap_labels, predicted)
    evaluation_df.loc[combination, "change_object_vmeasure"] = v_measure_score(gap_labels, predicted)

Without bulks


# Calculating the entropy of the Annotated cases

In [18]:
# Same scores, computed per annotated token (non-bulk annotations only).
df4 = annotation_clusters.loc[annotation_clusters['Bulk'] == 'N']
token_labels = df4["nationality"]
for context, eps, min_samples in all_combinations:
    combination = (context, eps, min_samples)
    predicted = df4[combination]
    evaluation_df.loc[combination, "row_based_entropy"] = weighted_entropy(
        df4, entropy_column="nationality", group_columns=combination)
    evaluation_df.loc[combination, "row_based_completness"] = completeness_score(token_labels, predicted)
    evaluation_df.loc[combination, "row_based_homegenity"] = homogeneity_score(token_labels, predicted)
    evaluation_df.loc[combination, "row_based_vmeasure"] = v_measure_score(token_labels, predicted)

# Persist the full evaluation grid, parameters included as ordinary columns.
evaluation_df.reset_index().to_csv(f"../data/evaluation/{article_name}.csv", index=False)

In [19]:
# Correlation between the change-object scores and the gap entropy over the whole grid.
evaluation_df.loc[:, [
    'change_object_completness',
    'change_object_homegenity',
    "change_object_vmeasure",
    "gap_entropy",
]].astype(np.float64).corr()

Unnamed: 0,change_object_completness,change_object_homegenity,change_object_vmeasure,gap_entropy
change_object_completness,1.0,0.071451,0.196078,0.0001
change_object_homegenity,0.071451,1.0,0.646505,-0.903799
change_object_vmeasure,0.196078,0.646505,1.0,-0.343899
gap_entropy,0.0001,-0.903799,-0.343899,1.0


In [20]:
# The same correlation, restricted to the min_samples=2 slice of the grid.
by_min_samples = evaluation_df.reset_index().set_index(["min_samples", "eps", "context"])
_df = by_min_samples.loc[2].reset_index()
_df.loc[:, [
    'change_object_completness',
    'change_object_homegenity',
    "change_object_vmeasure",
    "gap_entropy",
]].astype(np.float64).corr()

Unnamed: 0,change_object_completness,change_object_homegenity,change_object_vmeasure,gap_entropy
change_object_completness,1.0,0.931672,0.994662,-0.873206
change_object_homegenity,0.931672,1.0,0.944028,-0.979712
change_object_vmeasure,0.994662,0.944028,1.0,-0.899518
gap_entropy,-0.873206,-0.979712,-0.899518,1.0


In [21]:
# Scores of the min_samples=2 slice, one line per context size, plotted against eps.
_df = evaluation_df.reset_index().set_index(["min_samples",
"eps", "context"]).loc[2].reset_index()

# Panels in row-major order on a 2x3 grid; "Legend" only reserves the top-right
# panel for the shared legend and is never plotted.
cols = [
    'change_object_completness', 'change_object_homegenity', "Legend",
    "change_object_vmeasure", "gap_entropy",
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(50,20))

# Completeness, homogeneity and V-measure lie in [0, 1]; gap_entropy keeps automatic limits.
axes[0,0].set_ylim([0,1])
axes[0,1].set_ylim([0,1])
axes[1,0].set_ylim([0,1])
fig.suptitle('V-Measure analysis of John_Logie_Baird for various context_length and eps with min_samples=2 ',
             fontsize=24 , fontweight="bold")

for i, column in enumerate(cols):
    if column == "Legend":
        continue
    _ax = axes[i // 3, i % 3]
    _ax.set_xlabel("eps")
    # Group by a scalar key (not a one-element list) so `name2` is the plain
    # context size on every pandas version instead of a 1-tuple.
    for name2, subdf2 in _df.groupby('context'):
        subdf2.plot(x='eps', y=column, ax=_ax, label=str(name2),
                    title=column.split("_")[-1], xticks=eps_array, legend=False)

# Every panel has the same lines, so the last one supplies the legend entries.
handles, labels = _ax.get_legend_handles_labels()
axes[0,2].legend(handles, labels, loc='upper left', title="context size",)

axes[0,2].axis('off')
axes[1,2].axis('off')

# savefig does not create missing directories.
os.makedirs("visualisation", exist_ok=True)
fig.savefig("visualisation/jlb-ours-co-2.png", dpi=600)


In [22]:
# Scores of the min_samples=5 slice, one line per context size, plotted against eps.
_df = evaluation_df.reset_index().set_index(["min_samples",
"eps", "context"]).loc[5].reset_index()

# Panels in row-major order on a 2x3 grid; "Legend" only reserves the top-right
# panel for the shared legend and is never plotted.
cols = [
    'change_object_completness', 'change_object_homegenity', "Legend",
    "change_object_vmeasure", "gap_entropy",
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(50,20))
# Completeness, homogeneity and V-measure lie in [0, 1]; gap_entropy keeps automatic limits.
axes[0,0].set_ylim([0,1])
axes[0,1].set_ylim([0,1])
axes[1,0].set_ylim([0,1])
fig.suptitle('V-Measure analysis of John_Logie_Baird for various context_length and eps with min_samples=5 ',
             fontsize=36, fontweight="bold")

for i, column in enumerate(cols):
    if column == "Legend":
        continue
    _ax = axes[i // 3, i % 3]
    _ax.set_xlabel("eps")
    # Group by a scalar key (not a one-element list) so `name2` is the plain
    # context size on every pandas version instead of a 1-tuple.
    for name2, subdf2 in _df.groupby('context'):
        subdf2.plot(x='eps', y=column, ax=_ax, label=str(name2),
                    title=column.split("_")[-1], xticks=eps_array, legend=False)

# Every panel has the same lines, so the last one supplies the legend entries.
handles, labels = _ax.get_legend_handles_labels()
axes[0,2].legend(handles, labels, loc='upper left', title="context size",)

axes[0,2].axis('off')
axes[1,2].axis('off')

# savefig does not create missing directories.
os.makedirs("visualisation", exist_ok=True)
fig.savefig("visualisation/jlb-ours-co-5.png", dpi=600)



In [23]:
# Scores of the min_samples=50 slice, one line per context size, plotted against eps.
_df = evaluation_df.reset_index().set_index(["min_samples",
"eps", "context"]).loc[50].reset_index()

# Panels in row-major order on a 2x3 grid; "Legend" only reserves the top-right
# panel for the shared legend and is never plotted.
cols = [
    'change_object_completness', 'change_object_homegenity', "Legend",
    "change_object_vmeasure", "gap_entropy",
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(50, 20))
fig.suptitle('V-Measure analysis of John_Logie_Baird for various context_length and eps with min_samples=50 ',
             fontsize=36, fontweight="bold")
# Completeness, homogeneity and V-measure lie in [0, 1]; gap_entropy keeps automatic limits.
axes[0,0].set_ylim([0,1])
axes[0,1].set_ylim([0,1])
axes[1,0].set_ylim([0,1])

for i, column in enumerate(cols):
    if column == "Legend":
        continue
    _ax = axes[i // 3, i % 3]
    _ax.set_xlabel("eps")
    # Group by a scalar key (not a one-element list) so `name2` is the plain
    # context size on every pandas version instead of a 1-tuple.
    for name2, subdf2 in _df.groupby('context'):
        subdf2.plot(x='eps', y=column, ax=_ax, label=str(name2),
                    title=column.split("_")[-1], xticks=eps_array, legend=False)

# Every panel has the same lines, so the last one supplies the legend entries.
handles, labels = _ax.get_legend_handles_labels()
axes[0,2].legend(handles, labels, loc='upper left', title="context size",)

axes[0,2].axis('off')
axes[1,2].axis('off')

# savefig does not create missing directories.
os.makedirs("visualisation", exist_ok=True)
fig.savefig("visualisation/jlb-ours-co-50.png", dpi=600)




In [24]:
# _df = evaluation_df.reset_index().set_index(["min_samples", 
# "eps", "context"]).loc[5].reset_index()

# cols = [
#     'row_based_completness', 'row_based_homegenity', 'row_based_vmeasure',

# ]

# fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(24,5))

# axes[0,0].set_ylim([0,1])
# axes[0,1].set_ylim([0,1])
# axes[1,0].set_ylim([0,1])
# for i, column in enumerate(cols):
#     if column in _df.columns:
#         _ax = axes[i]
#         _ax.xaxis.label.set_visible(False)

#         for name2, subdf2 in _df.groupby(['context']): 
#             subdf2.plot(x = 'eps', y=column, ax=_ax, label=str(name2), title=column,  xticks=eps_array)

# handles, labels = _ax.get_legend_handles_labels()
# axes[3].legend(handles, labels, loc='upper left')
# axes[3].axis('off')


In [25]:
# The 20 parameter combinations with the highest change-object V-measure (best last).
scores_by_min_samples = evaluation_df.reset_index().set_index(["min_samples", "eps", "context"])
scores_by_min_samples["change_object_vmeasure"].sort_values().tail(20)

min_samples  eps   context
10           0.75  20         0.138329
20           1.00  15         0.139846
             0.50  25         0.140066
             2.00  8          0.142369
50           2.25  10         0.145335
30           1.50  15         0.146098
50           1.25  30         0.146502
                   25         0.149093
10           0.50  25         0.151193
30           1.25  20         0.151276
             1.75  10         0.158588
50           1.50  20         0.161853
20           1.00  20         0.162006
             0.75  30         0.162820
30           2.00  10         0.168044
20           1.50  10         0.178537
50           1.00  30         0.181075
20           1.75  8          0.184929
                   10         0.190063
50           1.75  15         0.211642
Name: change_object_vmeasure, dtype: float64

In [26]:
# The 20 parameter combinations with the highest change-object completeness (best last).
scores_by_min_samples = evaluation_df.reset_index().set_index(["min_samples", "eps", "context"])
scores_by_min_samples["change_object_completness"].sort_values().tail(20)

min_samples  eps   context
20           1.00  20         0.107533
             0.75  30         0.108901
             0.50  25         0.110127
50           1.25  25         0.113593
30           1.75  10         0.114095
20           1.50  10         0.118733
             1.75  10         0.122981
50           1.50  20         0.123888
20           1.75  8          0.124093
50           1.75  10         0.129318
             1.50  8          0.138596
             1.00  30         0.139274
             1.75  8          0.140918
                   15         0.165547
             0.50  20         1.000000
             0.25  30         1.000000
                   15         1.000000
                   25         1.000000
                   20         1.000000
             0.50  15         1.000000
Name: change_object_completness, dtype: float64

In [27]:
# The 20 parameter combinations with the highest change-object homogeneity (best last).
scores_by_min_samples = evaluation_df.reset_index().set_index(["min_samples", "eps", "context"])
scores_by_min_samples["change_object_homegenity"].sort_values().tail(20)

min_samples  eps   context
2            2.25  4          0.744450
             2.50  2          0.746132
             1.75  8          0.748709
             1.00  4          0.751517
             0.50  4          0.751617
             0.25  4          0.751617
             0.75  4          0.751678
             1.25  4          0.752165
             2.00  4          0.753725
             1.75  4          0.754929
             1.50  4          0.755881
             2.25  2          0.758501
             2.00  2          0.758574
             0.50  2          0.759280
             0.25  2          0.759280
             0.75  2          0.759321
             1.00  2          0.759341
             1.25  2          0.759383
             1.50  2          0.759435
             1.75  2          0.759571
Name: change_object_homegenity, dtype: float64

In [28]:
# Ten best homogeneity scores within the min_samples=2 slice (best last).
scores_min_samples_2 = evaluation_df.reset_index().set_index(["min_samples", "context", "eps"]).loc[2]
scores_min_samples_2["change_object_homegenity"].sort_values().tail(10)

context  eps 
4        1.50    0.755881
2        2.25    0.758501
         2.00    0.758574
         0.50    0.759280
         0.25    0.759280
         0.75    0.759321
         1.00    0.759341
         1.25    0.759383
         1.50    0.759435
         1.75    0.759571
Name: change_object_homegenity, dtype: float64

In [29]:
# Ten best completeness scores within the min_samples=2 slice (best last).
scores_min_samples_2 = evaluation_df.reset_index().set_index(["min_samples", "context", "eps"]).loc[2]
scores_min_samples_2["change_object_completness"].sort_values().tail(10)

context  eps 
2        0.75    0.036372
         1.25    0.036373
         1.00    0.036424
         2.25    0.038025
         3.50    0.038813
         3.00    0.039097
         2.50    0.039783
         2.75    0.042229
         3.75    0.042699
         4.00    0.049454
Name: change_object_completness, dtype: float64

In [30]:
# Ten best V-measure scores within the min_samples=2 slice (best last).
scores_min_samples_2 = evaluation_df.reset_index().set_index(["min_samples", "context", "eps"]).loc[2]
scores_min_samples_2["change_object_vmeasure"].sort_values().tail(10)

context  eps 
2        0.75    0.069420
         1.25    0.069421
         1.00    0.069514
         3.50    0.071344
         2.25    0.072419
         3.00    0.073568
         2.50    0.075538
         3.75    0.077300
         2.75    0.079900
         4.00    0.087916
Name: change_object_vmeasure, dtype: float64

In [31]:
# All scores for eps=0.75, min_samples=2, context=15.
scores_by_eps = evaluation_df.reset_index().set_index(["eps", "min_samples", "context"])
scores_by_eps.loc[(0.75, 2, 15)]

gap_entropy                    17457.9
change_object_entropy         0.539539
change_object_completness    0.0319079
change_object_homegenity      0.677996
change_object_vmeasure       0.0609475
row_based_entropy              1.51017
row_based_completness         0.130169
row_based_homegenity          0.687536
row_based_vmeasure            0.218896
Name: (0.75, 2, 15), dtype: object

In [32]:
# All scores for eps=0.5, min_samples=10, context=2.
scores_by_eps = evaluation_df.reset_index().set_index(["eps", "min_samples", "context"])
scores_by_eps.loc[(0.5, 10, 2)]

gap_entropy                    26342.9
change_object_entropy          14.4742
change_object_completness    0.0577926
change_object_homegenity      0.474758
change_object_vmeasure        0.103042
row_based_entropy              9.40906
row_based_completness         0.127962
row_based_homegenity          0.435975
row_based_vmeasure            0.197853
Name: (0.5, 10, 2), dtype: object

### Example of best groups

In [33]:
# Widen text columns so the long left/right contexts below are not truncated.
pd.options.display.max_colwidth = 860

In [34]:
# Clustering with context=4, eps=2.25, min_samples=2: rank its clusters by the
# number of gaps they hold (largest first) and show the change objects of the
# third-ranked cluster.
cluster_key = (4, 2.25, 2,)
gaps_per_cluster = df3.groupby(cluster_key)["token_id"].size().sort_values(ascending=False)
chosen_cluster = gaps_per_cluster.index[2]
dbscan_results.groupby(cluster_key).get_group(chosen_cluster).reset_index()[
    ['left_context', 'del_string_tokens', 'ins_string_tokens', 'right_context']]



Unnamed: 0,left_context,del_string_tokens,ins_string_tokens,right_context
0,"resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = scotland , england | nationality =","(scottish,)","(brittish,)","| other _ names = | citizenship = united kingdom | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow"
1,"resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = scotland , england | nationality =","(brittish,)","(british,)","| other _ names = | citizenship = united kingdom | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow"
2,"resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = scotland , england | nationality =","(british,)","(scottish,)","| other _ names = | citizenship = united kingdom | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow"
3,"resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = scotland , england | nationality =","(scottish,)","(british,)","| other _ names = | citizenship = united kingdom | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow"
4,"resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = scotland , england | nationality =","(british,)","(scottish,)","| other _ names = | citizenship = united kingdom | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow"
5,"resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = scotland , england | nationality =","(scottish,)","(american,)","| other _ names = | citizenship = united kingdom | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow"
6,"resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = america , york | nationality =","(american,)","(scottish,)","| other _ names = | citizenship = america | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow <"
7,]] | resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = scotland | nationality =,"(scottish,)","(british,)","| other _ names = | citizenship = united kingdom | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow"
8,]] | resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = scotland | nationality =,"(british,)","(scottish,)","| other _ names = | citizenship = united kingdom | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow"
9,]] | resting _ place _ coordinates = <!-- {{ coord | lat | long | display = inline }} --> | monuments = | residence = scotland | nationality =,"(scottish,)","(maltese,)","| other _ names = | citizenship = united kingdom | education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow"


In [35]:
# Clustering with context=4, eps=2.25, min_samples=2: rank its clusters by the
# number of gaps they hold (largest first) and show the change objects of the
# second-ranked cluster.
cluster_key = (4, 2.25, 2)
gaps_per_cluster = df3.groupby(cluster_key)["token_id"].size().sort_values(ascending=False)
chosen_cluster = gaps_per_cluster.index[1]
dbscan_results.groupby(cluster_key).get_group(chosen_cluster).reset_index()[
    ['left_context', 'del_string_tokens', 'ins_string_tokens', 'right_context']]



Unnamed: 0,left_context,del_string_tokens,ins_string_tokens,right_context
0,St@rt ' ' ' john logie baird ' ' ' of scotland ( [[ university of glasgow ]] ) was the first to demonstrate [[ television ]],"((, which, see, ), in)","(,, a, device, he, presented, to)",the mid 1920s ( [[ 1926 ]] ? ) .
1,' ' john logie baird ' ' ' of scotland ( [[ university of glasgow ]] ) was the first to demonstrate [[ television ]] ( which see ) in the,"(mid, 1920s, ()","([[, royal, institute, ]], and, a, reporter, from, [[, the, times, ]], on, [[, january, 26, ]])",[[ 1926 ]] ? ) .
2,' ' of scotland ( [[ university of glasgow ]] ) was the first to demonstrate [[ television ]] ( which see ) in the mid 1920s ( [[ 1926 ]],"(?, ), .)",(),
3,St@rt ' ' ' john logie baird ' ' ' ( b . [[ august,"(14,)","(13,)","]] [[ 1888 ]] , d . [[ june 14 ]] [[ 1946 ]] ) of [[ scotland ]] ( [[ university of glasgow ]] ) was the first to demonstrate"
4,", a device he presented to the [[ royal institution ]] and a reporter from ' ' [[ the times ]] ' ' on [[ january 26 ]] [[ 1926 ]]",(),"(., altough, his, mechanical, system, was, soon, replaced, by, a, russian, inventor, who, used, an, electric, system, .)",
5,", a device he presented to the [[ royal institution ]] and a reporter from ' ' [[ the times ]] ' ' on [[ january 26 ]] [[ 1926 ]]",(),"(in, the, [[, soho, ]], district, of, [[, london, ]])",. altough his mechanical system was soon replaced by a russian inventor who used an electric system .
6,a device he presented to the [[ royal institution ]] and a reporter from ' ' [[ the times ]] ' ' on [[ january 26 ]] [[ 1926 ]] .,"(altough, his)","(the, [[, bbc, ]], made, broadcasts, for, some, years, using, the, baird, television, system, ,, in, the, [[, 1930s, ]], alternating, it, broadcasts, of, electronic, scanning, system, television, ,, until, discontinuing, the, baird, system, in, [[, 1937, ]], ., baird, ', s)",mechanical system was soon replaced by a russian inventor who used an electric system .
7,/ baird . html baird on digitalcentury . com ] * [ http : / / www . mztv . com / baird . html mechanical tv : baird television ],(),"(*, [, http, :, /, /, www, ., bbc, ., co, ., uk, /, history, /, historic, _, figures, /, baird, _, logie, ., shtml, baird, bio, on, bbc, site, ])",
8,"broadcasts using the baird television system , alternating these with broadcasts of electronic scanning system television signals during the [[ 1930s ]] , until it finally discontinued broadcasts of the baird","(system,)","(ya, mum)",in [[ 1937 ]] . baird ' s mechanical television system was replaced by the electronic television system described by [[ alan archibald campbell - swinton | a . a .
9,"alternating these with broadcasts of electronic scanning system television signals during the [[ 1930s ]] , until it finally discontinued broadcasts of the baird ya mum in [[ 1937 ]] .","(baird, ', s)","(ya, mums)",mechanical television system was replaced by the electronic television system described by [[ alan archibald campbell - swinton | a . a . campbell - swinton ]] and later developed by


In [36]:
# Context and edit tokens of the cluster ranked 4 (0-based, largest first)
# among the groups of label column (30, 0.5, 10).
# NOTE(review): the key looks like (context, eps, min_samples) — confirm.
(
    dbscan_results
    .groupby((30, 0.5, 10))
    .get_group(
        # Group sizes are counted on df3, then the label at rank 4 is picked.
        df3.groupby((30, 0.5, 10))["token_id"]
        .size()
        .sort_values(ascending=False)
        .index[4]
    )
    .reset_index()
    .loc[:, ["left_context", "del_string_tokens", "ins_string_tokens", "right_context"]]
)

Unnamed: 0,left_context,del_string_tokens,ins_string_tokens,right_context
0,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[,"(scotland,)","(united, kingdom)","| scottish ]] | citizenship = [[ united kingdom | british ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire"
1,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[ scotland |,"(scottish,)","(british,)","]] | citizenship = [[ united kingdom | british ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] ,"
2,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[,"(united, kingdom)","(scotland,)","| british ]] | citizenship = [[ united kingdom ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] ,"
3,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[,"(scotland,)","(united, kingdom)","| scottish ]] | citizenship = [[ united kingdom ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] ,"
4,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[ scotland |,"(scottish,)","(british,)","]] | citizenship = [[ united kingdom ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland"
5,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[,"(united, kingdom)","(scotland,)","| british ]] | citizenship = [[ united kingdom ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] ,"
6,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality =,(),"([[, scotland, |)","scottish | citizenship = united kingdom | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland ]] |"
7,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = scottish,(),"(]],)","| citizenship = united kingdom | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland ]] | death"
8,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality =,"(scottish,)","(british,)","| citizenship = united kingdom | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland ]] | death"
9,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality =,"(british,)","(scottish,)","| citizenship = united kingdom | birth _ date = 14 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland ]] | death"


In [37]:
# Context and edit tokens of the cluster ranked 1 (0-based, largest first)
# among the groups of label column (30, 0.5, 10).
# NOTE(review): the key looks like (context, eps, min_samples) — confirm.
(
    dbscan_results
    .groupby((30, 0.5, 10))
    .get_group(
        # Group sizes are counted on df3, then the label at rank 1 is picked.
        df3.groupby((30, 0.5, 10))["token_id"]
        .size()
        .sort_values(ascending=False)
        .index[1]
    )
    .reset_index()
    .loc[:, ["left_context", "del_string_tokens", "ins_string_tokens", "right_context"]]
)

Unnamed: 0,left_context,del_string_tokens,ins_string_tokens,right_context
0,[[ university of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he never graduated .,(),"(his, gay, with, dean, luu, from, newzealand)","= = television experiments = = [[ image : john logie baird , apparatus . jpg | thumb | 300px | john logie baird with his "" televisor "" , circa"
1,[[ university of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he never graduated .,"(his, gay, with, dean, luu, from, newzealand)",(),"= = television experiments = = [[ image : john logie baird , apparatus . jpg | thumb | 300px | john logie baird with his "" televisor "" , circa"
2,later became the [[ university of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he,"(never,)",(),"graduated . = = television experiments = = [[ image : john logie baird , apparatus . jpg | thumb | 300px | john logie baird with his "" televisor """
3,later became the [[ university of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he,(),"(never,)","graduated . = = television experiments = = [[ image : john logie baird , apparatus . jpg | thumb | 300px | john logie baird with his "" televisor """
4,of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he never graduated . = =,(),"(=, =)","television experiments = = [[ image : john logie baird , apparatus . jpg | thumb | 300px | john logie baird with his "" televisor "" , circa 1925 ."
5,[[ university of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he never graduated .,"(=, =, =, t, ., v, ., experiments, =, =, =, [[, image, :, john, logie, baird)","(hello,)",", apparatus . jpg | thumb | 300px | john logie baird with his "" televisor "" , circa 1925 . ]] [[ image : john logie baird , 1st image"
6,of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he never returned to graduate .,(),"(he, failed, 2nd, grade, .)","= = = television experiments = = = [[ image : john logie baird , apparatus . jpg | thumb | 300px | john logie baird with his "" televisor """
7,of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he never returned to graduate .,"(he, failed, 2nd, grade, .)",(),"= = = television experiments = = = [[ image : john logie baird , apparatus . jpg | thumb | 300px | john logie baird with his "" televisor """
8,of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he never returned to graduate .,"(=, =, =)","([[, special, :, contributions, /, 58, ., 161, ., 208, ., 79, |, 58, ., 161, ., 208, ., 79, ]], (, [[, user, talk, :, 58, ., 161, ., 208, ., 79, |, talk, ]], ), 04, :, 25, ,, 9, october, 2008, (, utc, ))","television experiments = = = [[ image : john logie baird , apparatus . jpg | thumb | john logie baird with his "" televisor "" , circa 1925 . ]]"
9,of strathclyde ]] ) ; and the [[ university of glasgow ]] . his degree course was interrupted by [[ world war i ]] and he never returned to graduate .,"([[, special, :, contributions, /, 58, ., 161, ., 208, ., 79, |, 58, ., 161, ., 208, ., 79, ]], (, [[, user, talk, :, 58, ., 161, ., 208, ., 79, |, talk, ]], ), 04, :, 25, ,, 9, october, 2008, (, utc, ))","(=, =, =)","television experiments04 : 25 , 9 october 2008 ( utc ) [[ image : john logie baird , apparatus . jpg | thumb | john logie baird with his "" televisor"


### Very tight cluster

In [38]:
# The "nationality" / "nationality_full" columns of df3 for the cluster ranked 4
# (0-based, largest first) under label column (30, 0.5, 10).
(
    df3
    .groupby((30, 0.5, 10))
    .get_group(
        df3.groupby((30, 0.5, 10))["token_id"]
        .size()
        .sort_values(ascending=False)
        .index[4]
    )
    .reset_index()
    .loc[:, ["nationality", "nationality_full"]]
)

Unnamed: 0,nationality,nationality_full
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
5,1,1
6,1,1
7,0,0
8,1,1
9,1,1


In [39]:
# Context and edit tokens of the cluster ranked 4 (0-based, largest first)
# under label column (30, 0.5, 10).
# NOTE(review): this is the same selection (same key, same rank) as an earlier
# cell in this notebook; it appears to be repeated here for the section on
# tight clusters — confirm whether the duplicate is still wanted.
(
    dbscan_results
    .groupby((30, 0.5, 10))
    .get_group(
        df3.groupby((30, 0.5, 10))["token_id"]
        .size()
        .sort_values(ascending=False)
        .index[4]
    )
    .reset_index()
    .loc[:, ["left_context", "del_string_tokens", "ins_string_tokens", "right_context"]]
)


Unnamed: 0,left_context,del_string_tokens,ins_string_tokens,right_context
0,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[,"(scotland,)","(united, kingdom)","| scottish ]] | citizenship = [[ united kingdom | british ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire"
1,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[ scotland |,"(scottish,)","(british,)","]] | citizenship = [[ united kingdom | british ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] ,"
2,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[,"(united, kingdom)","(scotland,)","| british ]] | citizenship = [[ united kingdom ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] ,"
3,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[,"(scotland,)","(united, kingdom)","| scottish ]] | citizenship = [[ united kingdom ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] ,"
4,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[ scotland |,"(scottish,)","(british,)","]] | citizenship = [[ united kingdom ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland"
5,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = [[,"(united, kingdom)","(scotland,)","| british ]] | citizenship = [[ united kingdom ]] | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] ,"
6,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality =,(),"([[, scotland, |)","scottish | citizenship = united kingdom | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland ]] |"
7,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality = scottish,(),"(]],)","| citizenship = united kingdom | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland ]] | death"
8,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality =,"(scottish,)","(british,)","| citizenship = united kingdom | birth _ date = 13 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland ]] | death"
9,St@rt {{ otherpeople | john baird }} {{ infobox engineer | image = | caption = | name = john logie baird | nationality =,"(british,)","(scottish,)","| citizenship = united kingdom | birth _ date = 14 august 1888 | birth _ place = [[ helensburgh ]] , [[ dunbartonshire ]] , [[ scotland ]] | death"


In [40]:
# Context and edit tokens of the cluster ranked 7 (0-based, largest first)
# among the groups of label column (15, 0.5, 2).
# NOTE(review): the key looks like (context, eps, min_samples) — confirm.
(
    dbscan_results
    .groupby((15, 0.5, 2))
    .get_group(
        df3.groupby((15, 0.5, 2))["token_id"]
        .size()
        .sort_values(ascending=False)
        .index[7]
    )
    .reset_index()
    .loc[:, ["left_context", "del_string_tokens", "ins_string_tokens", "right_context"]]
)

Unnamed: 0,left_context,del_string_tokens,ins_string_tokens,right_context
0,"| lat | long | display = inline }} --> | monuments = | residence = scotland , england | nationality = scottish | other _ names = | citizenship =","(united, kingdom)","(america,)","| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] , glasgow < br > [[ glasgow university ]] | occupation ="
1,"place = baird family grave in [[ helensburgh cemetery ]] | monuments = | residence = scotland , england | nationality = scottish | other _ names = | citizenship =","(united, kingdom)","(british,)","| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] ( now [[ university of strathclyde ]] ) , glasgow | occupation"
2,"place = baird family grave in [[ helensburgh cemetery ]] | monuments = | residence = scotland , england | nationality = scottish | other _ names = | citizenship =","(british,)","(scottish,)","| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] ( now [[ university of strathclyde ]] ) , glasgow | occupation"
3,place = baird family grave in [[ helensburgh cemetery ]] | monuments = | residence = scotland and england | nationality = scottish | other _ names = | citizenship =,"(british,)","(hscottish,)","| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] ( now [[ university of strathclyde ]] ) , glasgow | occupation"
4,place = baird family grave in [[ helensburgh cemetery ]] | monuments = | residence = scotland and england | nationality = scottish | other _ names = | citizenship =,"(hscottish,)","(british,)","| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] ( now [[ university of strathclyde ]] ) , glasgow | occupation"
5,_ place = baird family grave in [[ helensburgh cemetery ]] | monuments = | residence = scotland and england | nationality = scottish | other _ names = | citizenship,"(=, british)","(scottish,)","| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] ( now [[ university of strathclyde ]] ) , glasgow | occupation"
6,_ place = baird family grave in [[ helensburgh cemetery ]] | monuments = | residence = scotland and england | nationality = scottish | other _ names = | citizenship,"(scottish,)","(=, british)","| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] ( now [[ university of strathclyde ]] ) , glasgow | occupation"
7,place = baird family grave in [[ helensburgh cemetery ]] | monuments = | residence = scotland and england | nationality = scottish | other _ names = | citizenship =,"(british,)","(stupid,)","| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] ( now [[ university of strathclyde ]] ) , glasgow | occupation"
8,place = baird family grave in [[ helensburgh cemetery ]] | monuments = | residence = scotland and england | nationality = scottish | other _ names = | citizenship =,"(stupid,)","(british,)","| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] ( now [[ university of strathclyde ]] ) , glasgow | occupation"
9,place = baird family grave in [[ helensburgh cemetery ]] | monuments = | residence = scotland and england | nationality = scottish | other _ names = | citizenship =,"(british,)",(),"| education = [[ larchfield academy ]] , helensburgh | alma _ mater = [[ royal technical college ]] ( now [[ university of strathclyde ]] ) , glasgow | occupation"


In [41]:
# Context and edit tokens of the cluster ranked 10 (0-based, largest first)
# among the groups of label column (2, 0.5, 10).
# NOTE(review): the key looks like (context, eps, min_samples) — confirm.
(
    dbscan_results
    .groupby((2, 0.5, 10))
    .get_group(
        df3.groupby((2, 0.5, 10))["token_id"]
        .size()
        .sort_values(ascending=False)
        .index[10]
    )
    .reset_index()
    .loc[:, ["left_context", "del_string_tokens", "ins_string_tokens", "right_context"]]
)

Unnamed: 0,left_context,del_string_tokens,ins_string_tokens,right_context
0,| significant _ design = | significant _ advance = | significant _ awards = }} ' ' ' juan batista ( 2 january 1995 & ndash ; never ) was,"(an, [[, american, people, |, american, ]], [[)","(a, scottish)",engineer ]] and inventor of the world ' s first working [[ television ]] system . although baird ' s [[ electromechanical ]] system was eventually displaced by purely electronic systems
1,"significant _ design = | significant _ advance = | significant _ awards = }} john logie baird ( august 13 , 1888 – june 14 , 1946 ) was a","(scottish,)","(british,)",engineer and inventor of the world ' s first working television system . although baird ' s electromechanical system was eventually displaced by purely electronic systems ( such as those of
2,"significant _ design = | significant _ advance = | significant _ awards = }} john logie baird ( august 13 , 1888 – june 14 , 1946 ) was a","(british,)","(scottish,)",engineer and inventor of the world ' s first working television system . although baird ' s electromechanical system was eventually displaced by purely electronic systems ( such as those of
3,"_ advance = | significant _ awards = }} ' ' ' john logie baird ' ' ' ( august 13 , 1888 – june 14 , 1946 ) was a","(scottish,)","(british,)",engineer and inventor of the world ' s first working television system . although baird ' s electromechanical system was eventually displaced by purely electronic systems ( such as those of
4,"_ advance = | significant _ awards = }} ' ' ' john logie baird ' ' ' ( august 13 , 1888 – june 14 , 1946 ) was a","(british,)","(scottish,)",engineer and inventor of the world ' s first working television system . although baird ' s electromechanical system was eventually displaced by purely electronic systems ( such as those of
5,"_ advance = | significant _ awards = }} ' ' ' john logie baird ' ' ' ( august 13 , 1888 – june 14 , 1946 ) was a","(scottish,)","(british,)","engineer and inventor of the world ' s first working television system , also the world ' s first ever colour broadcast . although baird ' s electromechanical system was eventually"
6,"_ advance = | significant _ awards = }} ' ' ' john logie baird ' ' ' ( august 13 , 1888 – june 14 , 1946 ) was a","(british,)","(scottish,)","engineer and inventor of the world ' s first working television system , also the world ' s first ever colour broadcast . although baird ' s electromechanical system was eventually"
7,St@rt {{ otherpeople | john baird }} {{,"(infobox,)","(infobo, <!--, i, love, this, thanks, -->, x)",engineer | image = | caption = | name = john logie baird | nationality = [[ united kingdom | british ]] | citizenship = [[ united kingdom ]] | birth
8,St@rt {{ otherpeople | john baird }} {{,"(infobo, <!--, i, love, this, thanks, -->, x)","(infobox,)",engineer | image = | caption = | name = john logie baird | nationality = [[ united kingdom | british ]] | citizenship = [[ united kingdom ]] | birth
9,| significant _ advance = | significant _ awards = }} ' ' ' john logie baird ' ' ' ( 13 august 1888 – 14 june 1946 ) was a,"([[, scottish, ]])","([[, scotland, |, scottish, ]])","[[ engineer ]] and [[ inventor ]] of the world ' s first working [[ television ]] system , also the world ' s first fully electronic [[ colour television ]]"


In [42]:
# Context and edit tokens of the cluster at position 7 (0-based) when the group
# sizes of label column (2, 0.5, 10) are sorted from smallest to largest.
# NOTE(review): the neighbouring cells sort descending; the ascending order
# here selects from the small end — confirm this is intentional.
(
    dbscan_results
    .groupby((2, 0.5, 10))
    .get_group(
        df3.groupby((2, 0.5, 10))["token_id"]
        .size()
        .sort_values(ascending=True)
        .index[7]
    )
    .reset_index()
    .loc[:, ["left_context", "del_string_tokens", "ins_string_tokens", "right_context"]]
)

Unnamed: 0,left_context,del_string_tokens,ins_string_tokens,right_context
0,logie baird ' ' ' ( [[ august 14 ]] [[ 1888 ]] & ndash ; [[ june 14 ]] [[ 1946 ]] ) was a [[ scotland | scottish ]],(),"([[,)","engineer , whose lasting achievment was the invention of the [[ television ]] . baird was born in [[ helensburgh ]] , [[ scotland ]] and educated at [[ larchfield academy"
1,"advance = | significant _ awards = }} ' ' ' john logie baird ' ' ' ( august 13 , 1888 – june 14 , 1946 ) was a scottish",(),"(]],)","engineer and inventor of the world ' s first working television system , also the world ' s first ever colour broadcast . although baird ' s electromechanical system was eventually"
2,| significant _ awards = }} ' ' ' john logie baird ' ' ' ( 13 august 1888 – 14 june 1946 ) was a [[ scotland | scottish ]],(),"([[,)","engineer and inventor of the world ' s first working [[ television ]] system , also the world ' s first fully electronic [[ colour television ]] broadcast . although baird"
3,significant _ advance = | significant _ awards = }} ' ' ' john logie baird ' ' ' ( 13 august 1888 – 14 june 1946 ) was a scottish,"([, (, [, united, kingdom, |, british, ], ), ])","(]],)","[[ engineer ]] and [[ inventor ]] of the world ' s first working [[ television ]] system , also the world ' s first fully electronic [[ colour television ]]"
4,"may 2007 http : / / www . oxforddnb . com / view / article / 30540 , accessed 23 april 2010 . < / ref > was a [[ scottish","(people, |, scottish, ]], [[)",(),"engineer ]] and [[ inventor ]] of the world ' s first practical , publicly demonstrated [[ television ]] system , and also the world ' s first fully electronic [["
5,", may 2007 http : / / www . oxforddnb . com / view / article / 30540 , accessed 23 april 2010 . < / ref > was a scottish",(),"(]],)","engineer and inventor of the [[ invention of television | world ' s first practical , publicly demonstrated television system ]] , and also the world ' s first fully electronic"
6,"john logie baird ( 1888 - 1946 ) ] www . bbc . co . uk , accessed 2 june 2013 < / ref > was a [[ scotland | scottish","(]],)",(),"engineer and inventor of the [[ invention of television | world ' s first practical , publicly demonstrated television system ]] , and also the world ' s first fully electronic"
7,> [[ fellow of the royal society of edinburgh | frse ]] < / small > ( 14 august 1888 {{ spaced ndash }} 14 june 1946 ) was a scottish,(),"(scientist, ,)",engineer and the inventor of the [[ invention of television | world ' s first television ]] ; < ref > [ http : / / www . bbc . co
8,"english dictionary ]] ' ' – complete & unabridged 10th edition , 2009 < / ref > 14 august 1888 {{ spaced ndash }} 14 june 1946 ) was a scottish","(scientist,)","(nationality,)",", engineer , innovator and inventor of the [[ invention of television | world ' s first television ]] ; < ref > [ http : / / www . bbc"
9,"' ' – complete & unabridged 10th edition , 2009 < / ref > 14 august 1888 {{ spaced ndash }} 14 june 1946 ) was a british scientist of scottish","(nationality,)",(),", engineer , innovator and inventor of the [[ invention of television | world ' s first television ]] ; < ref > [ http : / / www . bbc"


# Identifying worst clusters

In [43]:
# The 20 parameter combinations with the lowest "change_object_vmeasure",
# indexed by (min_samples, eps, context) and sorted ascending.
(
    evaluation_df
    .reset_index()
    .set_index(["min_samples", "eps", "context"])["change_object_vmeasure"]
    .sort_values()
    .head(20)
)

min_samples  eps   context
50           0.25  15        -1.552455e-14
             0.50  20        -1.552455e-14
             0.25  20        -1.552455e-14
             0.50  15        -1.552455e-14
             0.25  30        -1.552455e-14
                   25        -1.552455e-14
             3.25  15         2.267821e-05
20           2.50  20         1.530253e-04
30           2.25  20         1.646425e-04
50           2.25  30         3.349052e-04
2            4.00  30         5.982194e-04
             3.75  30         5.982194e-04
30           4.00  30         6.002858e-04
10           3.75  30         6.002858e-04
5            4.00  30         6.002858e-04
10           4.00  30         6.002858e-04
20           4.00  30         6.002858e-04
5            3.75  30         6.002858e-04
50           2.50  30         6.735031e-04
             4.00  30         7.443046e-04
Name: change_object_vmeasure, dtype: float64

In [44]:
# The 20 parameter combinations with the lowest "change_object_completness"
# (column name spelled as stored in evaluation_df), indexed by
# (min_samples, eps, context) and sorted ascending.
(
    evaluation_df
    .reset_index()
    .set_index(["min_samples", "eps", "context"])["change_object_completness"]
    .sort_values()
    .head(20)
)

min_samples  eps   context
50           3.25  15         0.000025
30           2.25  20         0.000119
20           2.50  20         0.000137
50           2.25  30         0.000320
30           2.50  20         0.000651
50           2.50  20         0.000699
30           2.25  25         0.000840
50           2.50  30         0.000949
             2.00  25         0.001017
20           2.25  25         0.001102
50           2.25  20         0.001386
30           3.00  15         0.001998
50           4.00  4          0.002070
             3.50  10         0.002271
             3.00  15         0.002550
             2.75  15         0.002577
             2.25  25         0.002742
20           2.25  20         0.003172
50           3.00  10         0.003266
             2.75  20         0.003326
Name: change_object_completness, dtype: float64

In [45]:
# The 30 parameter combinations with the lowest "change_object_homegenity"
# (column name spelled as stored in evaluation_df), indexed by
# (min_samples, eps, context) and sorted ascending.
(
    evaluation_df
    .reset_index()
    .set_index(["min_samples", "eps", "context"])["change_object_homegenity"]
    .sort_values()
    .head(30)
)

min_samples  eps   context
50           0.25  20        -7.762274e-15
                   30        -7.762274e-15
                   25        -7.762274e-15
             0.50  20        -7.762274e-15
                   15        -7.762274e-15
             0.25  15        -7.762274e-15
             3.25  15         2.100030e-05
20           2.50  20         1.732260e-04
30           2.25  20         2.687416e-04
5            4.00  30         3.122233e-04
             3.75  30         3.122233e-04
10           4.00  30         3.122233e-04
30           4.00  30         3.122233e-04
20           4.00  30         3.122233e-04
10           3.75  30         3.122233e-04
2            4.00  30         3.122233e-04
             3.75  30         3.122233e-04
50           2.25  30         3.507959e-04
             4.00  30         3.903373e-04
5            3.25  30         5.075518e-04
             3.50  30         5.075518e-04
20           3.75  30         5.075518e-04
30           3.75  30      

In [46]:
# For eps == 1.75 only: the 10 (min_samples, context) combinations with the
# lowest "change_object_homegenity", sorted ascending.
(
    evaluation_df
    .reset_index()
    .set_index(["eps", "min_samples", "context"])
    .loc[1.75]["change_object_homegenity"]
    .sort_values()
    .head(10)
)

min_samples  context
50           4          0.006139
30           4          0.041260
5            30         0.046068
2            30         0.054359
50           2          0.054441
10           30         0.130518
50           10         0.131639
             8          0.132633
20           4          0.135490
30           2          0.146896
Name: change_object_homegenity, dtype: float64

In [47]:
# For eps == 2.0 only: the 10 (min_samples, context) combinations with the
# lowest "change_object_completness", sorted ascending.
(
    evaluation_df
    .reset_index()
    .set_index(["eps", "min_samples", "context"])
    .loc[2.0]["change_object_completness"]
    .sort_values()
    .head(10)
)

min_samples  context
50           25         0.001017
20           25         0.003882
50           30         0.004090
20           30         0.005280
5            30         0.008122
10           30         0.009666
5            25         0.010198
2            25         0.010800
             30         0.011563
50           2          0.013650
Name: change_object_completness, dtype: float64

In [48]:
# For eps == 2.0 only: the 10 (min_samples, context) combinations with the
# lowest "change_object_vmeasure", sorted ascending.
(
    evaluation_df
    .reset_index()
    .set_index(["eps", "min_samples", "context"])
    .loc[2.0]["change_object_vmeasure"]
    .sort_values()
    .head(10)
)

min_samples  context
50           25         0.001522
20           25         0.005712
50           30         0.005792
20           30         0.006465
5            30         0.009273
10           30         0.011326
2            30         0.014176
5            25         0.014542
2            25         0.016336
50           2          0.021839
Name: change_object_vmeasure, dtype: float64

In [49]:
# Context and edit tokens of the cluster ranked 1 (0-based, largest first)
# among the groups of label column (15, 3.25, 50).
# NOTE(review): this key seems to correspond to the min_samples=50, eps=3.25,
# context=15 row near the bottom of the v-measure ranking — confirm the
# (context, eps, min_samples) ordering of the key.
(
    dbscan_results
    .groupby((15, 3.25, 50))
    .get_group(
        df3.groupby((15, 3.25, 50))["token_id"]
        .size()
        .sort_values(ascending=False)
        .index[1]
    )
    .reset_index()
    .loc[:, ["left_context", "del_string_tokens", "ins_string_tokens", "right_context"]]
)

Unnamed: 0,left_context,del_string_tokens,ins_string_tokens,right_context
0,"30 line video signal - a primitive [[ video ]] recording device , dubbed phonovision [ http : / / www . tvdawn . com / tvimage . htm ] .","(he, televised)","(the, system, consisted, of, a, phonodisc, ,, which, was, a, 78rpm, record, that, could, play, a, 30, line, video, signal, ., his, other, developments, were, in, [[, optical, fiber, |, fibre, -, optics, ]], ,, radio, direction, finding, ,, [[, infrared, |, infrared, ]], night, viewing, and, [[, radar, ]], ., there, still, remains, ,, however, ,, questions, about, his, exact, contributions, to, the, developement, of, [[, radar, ]], ,, for, his, wartime, defense, projects, have, never, been, officially, acknowledged, by, government, ., baird, made, many, other, contributions, to, the, field, of, television, before, and, after, his, mechanical, system, fell, ...)","the first live transmission , of the [[ epsom derby ]] , in [[ 1931 ]] , and the following year he was the first to demonstrate ultra - short wave"
1,= see also = = * [[ logie award ]] s — [[ australia ]] n television * [[ university of strathclyde ]] = = external links = = * [,(),"(http, :, /, /, www, ., bairdtelevision, ., com, the, baird, television, website, ])",* [ http : / / www . mztv . com / baird . html mechanical tv : baird television ] * [ http : / / www . bbc .
2,john _ logie _ baird . htm john logie baird - gizmohighway technology guide ] * [ http : / / www . nbtv . org narrow bandwidth television association ],(),"(*, [)","http : / / www . tvdawn . com / tvimage . htm phonovision ] [[ category : 1888 births | baird , john logie ]] [[ category : 1946 deaths"
3,"systems took a backseat to electronic systems . he demonstrated a theatre television system at the [[ coliseum theatre | london coliseum ]] , [[ berlin ]] , [[ paris ]]",(),"(,,)","at the [[ coliseum theatre | london coliseum ]] , [[ berlin ]] , [[ paris ]] and [[ stockholm ]] . in 1939 he showed colour television using a cathode"
4,"ref > j . l . baird , [ http : / / www . bairdtelevision . com / 1932 . html television in 1932 ] . < / ref >",(),"(by, 1939, he, had, improved, his, theatre, projection, system, to, televise, a, boxing, match, on, a, screen, 15, by, 12, feet, (, 4, ., 6, by, 3, ., 7, m, ), .)","from 1929 - 1935 , the bbc broadcast television programs using the 30 - line baird system . in late 1936 the bbc began alternating baird 240 - line transmissions with"
5,"tiltman , ronald frank , ' ' baird of television ' ' . new york : arno press , 1974 . ( reprint of 1933 ed . ) isbn 0405060610 .",(),"(gfgg,)",= = external links = = * [ http : / / www . bairdtelevision . com the baird television website ] * [ http : / / www . digitalcentury
6,"tiltman , ronald frank , ' ' baird of television ' ' . new york : arno press , 1974 . ( reprint of 1933 ed . ) isbn 0405060610 .","(gfgg,)",(),= = external links = = * [ http : / / www . bairdtelevision . com the baird television website ] * [ http : / / www . digitalcentury
7,john _ logie _ baird . htm john logie baird - gizmohighway technology guide ] * [ http : / / www . nbtv . org narrow bandwidth television association ],(),"(*, [)","http : / / www . tvdawn . com / tvimage . htm phonovision ] {{ persondata | name = baird , logie john | alternative names = | short description"
8,http : / / www . nbtv . org narrow bandwidth television association ] * [ http : / / www . tvdawn . com / tvimage . htm phonovision ],(),"(*, [, http, :, /, /, lost, -, british, -, television, ., blogspot, ., com, /, blog, describing, many, missing, uk, television, programmes, ])","{{ persondata | name = baird , logie john | alternative names = | short description = [[ scottish people | scottish ]] [[ engineer ]] ; first person to demonstrate"
9,". * tiltman , ronald frank , ' ' baird of television ' ' . new york : arno press , 1974 . ( reprint of 1933 ed . ) isbn","(0405060610,)","(0, -, 405, -, 06061, -, 0)",. = = external links = = * [ http : / / www . bairdtelevision . com the baird television website ] * [ http : / / www .


In [50]:
# NOTE(review): disabled exploration cell, kept commented out. If re-enabled it
# would show the context/edit columns of the cluster at position 145 when the
# group sizes of label column (30, 1.75, 2) are sorted ascending. Delete this
# cell if it is no longer needed.
# dbscan_results.groupby((30,1.75,2)).get_group(df3.groupby((30,1.75,2))["token_id"].size().sort_values().index[145]).reset_index()[[ 'left_context',
#      'del_string_tokens',  'ins_string_tokens',   'right_context']]

In [51]:
# NOTE(review): disabled exploration cell, kept commented out. It refers to
# label column (30, 1.6, 2); the eps values visible in the evaluation outputs
# of this notebook are multiples of 0.25, so this key may not exist —
# presumably why the cell is disabled; confirm or delete.
# dbscan_results.groupby((30,1.6,2)).get_group(df3.groupby((30,1.6,2))["token_id"].size().sort_values().index[130]).reset_index()[[ 'left_context',
#      'del_string_tokens',  'ins_string_tokens',   'right_context']]

In [52]:
# Keep wide frames on a single block instead of wrapping columns across lines.
pd.options.display.expand_frame_repr = False
# Restore the default column width limit for displayed cell contents.
pd.reset_option("display.max_colwidth")

In [53]:
# Peek at the first 20 column labels: the change-object fields come first,
# followed by tuple-labelled columns such as (2, 0.25, 2).
dbscan_results.columns[0:20]

Index([     'ins_start_pos',        'ins_end_pos',         'left_neigh',
              'right_neigh',      'del_start_pos',        'del_end_pos',
               'ins_tokens',         'del_tokens',   'left_neigh_slice',
        'right_neigh_slice',         'left_token',        'right_token',
        'del_string_tokens',  'ins_string_tokens',       'left_context',
            'right_context', 'edit_string_tokens',         (2, 0.25, 2),
               (2, 0.25, 5),        (2, 0.25, 10)],
      dtype='object')

In [54]:
# "right_neigh" values of the first two rows of dbscan_results.
dbscan_results["right_neigh"].head(2)

from revision id  to revision id  timestamp            timegap            editor   
203693            203699          2002-09-08 14:05:32  194 days 22:14:17  3646    0    10
                                                                                  1    11
Name: right_neigh, dtype: int64