In [1]:
import pickle
import json
import copy
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from seededHierarchicalDensityClustering import SeededHierarchicalDensityClustering
from evaluateSHDC_wos import evaluate, print_metrics, print_metrics_bcubed

In [2]:
method = 'pool'
embedding_type = 'roberta' # roberta, glove300, fastText

dataset_name = 'wos46985'
base_dir = '../../data/WOS/'
data_file = base_dir+'Meta-data/Data.csv'

text_embedding_dir = '../data/'+dataset_name
text_embedding_file = text_embedding_dir+'/'+embedding_type+'-embedding-'+method+'.pkl'

seed_indices_file = text_embedding_dir+'/seed_indices.json'
remaining_indices_file = text_embedding_dir+'/remaining_indices.json'
data_indices_test_file = text_embedding_dir+'/data_indices_test.json'
data_indices_train_file = text_embedding_dir+'/data_indices_train.json'
    
seed_embeddings_file = text_embedding_dir+'/cluster_hierarchy_seed_embeddings-roberta-'+method+'.pkl'

topic_distance_threshold_file = '../model/'+dataset_name+'/topic_distance_threshold.pkl'
topic_seed_file ='../model/'+dataset_name+'/cluster_hierarchy_seed_embeddings-roberta-'+method+'.pkl'
model_parameters_file = '../model/'+dataset_name+'/model_parameters.pkl'

eval_indices_file = text_embedding_dir+'/eval_indices.json'
eval_indices_test_file = text_embedding_dir+'/eval_set_test_indices.json'

### Read Model Parameters

In [3]:
params = pickle.load(open(model_parameters_file, "rb"))

init_neighbors_perc = params['init_neighbors_perc']
main_topic_update_method = -1
main_topic_weight = 0
create_other_main = -1
create_other_sub = -1

### Read Learnt Topic Seeds & Thresholds

In [4]:
method = 'pool'
topic_seeds = pickle.load(open(topic_seed_file, "rb"))

In [5]:
topic_distance_threshold = pickle.load(open(topic_distance_threshold_file, "rb"))

### Read Text Embeddings

In [6]:
text_embeddings = pickle.load(open(text_embedding_file, "rb"))

with open(data_indices_test_file, "r") as infile:
    data_indices = json.load(infile)
        
X = text_embeddings[data_indices]
X.shape

(15320, 768)

### Create Eval Set

In [7]:
shdc = SeededHierarchicalDensityClustering(
    topic_seeds,
    init_neighbors_perc=init_neighbors_perc,
    main_topic_update_method=main_topic_update_method, 
    main_topic_weight=main_topic_weight,
    create_other_main=create_other_main, 
    create_other_sub=create_other_sub,
    verbose=False
)

In [8]:
shdc._create_dist_matrix(X, shdc.topic_seeds)
shdc.topic_distance_threshold = topic_distance_threshold
shdc._get_L1_topic_assignments(shdc.topic_distance_threshold)
shdc.fit_L2(X)

In [9]:
df = pd.read_csv(data_file).loc[data_indices].rename(
    columns={
        'Y':'level_1',
        'Y1':'level_2',
        'Y2':'level_3'
    }
)
df.level_1 = dataset_name
df.level_2 = df.level_2.astype(str)
df.level_3 = df.level_3.astype(str)

In [10]:
topics_points_hierarchical =  shdc.topics_points_hierarchical
topics_points_distances_hierarchical = shdc.topics_points_distances_hierarchical
sub_topics_points_distances = shdc.sub_topics_points_distances

In [11]:
data_indices = np.array(data_indices)

In [12]:
df['assigned_cluster'] = 'None'
df['assigned_level_1'] = 'None'
df['assigned_level_2'] = 'None'
df['assigned_level_3'] = 'None'
df['distance_assigned_level_2'] = np.nan
df['distance_assigned_level_3'] = np.nan
for topic_hierarchical in topics_points_hierarchical.keys():
    topic_hierarchical_indices = data_indices[topics_points_hierarchical[topic_hierarchical]]
    df.loc[topic_hierarchical_indices, ['assigned_level_1']] = dataset_name
    df.loc[topic_hierarchical_indices, ['assigned_level_2']]  = topic_hierarchical
    df.loc[topic_hierarchical_indices, ['assigned_level_3']]  = 'Other'
    df.loc[topic_hierarchical_indices, ['assigned_cluster']]  = topic_hierarchical
    df.loc[topic_hierarchical_indices, ['distance_assigned_level_2']] = topics_points_distances_hierarchical[topic_hierarchical]
    if topic_hierarchical in sub_topics_points_distances.keys():      # if the topic is actually a sub-topic
        df.loc[topic_hierarchical_indices, ['distance_assigned_level_3']] = sub_topics_points_distances[topic_hierarchical]
        main,sub = topic_hierarchical.split('/')
        df.loc[topic_hierarchical_indices, ['assigned_level_2']]  = main
        df.loc[topic_hierarchical_indices, ['assigned_level_3']]  = sub

df.loc[(df['level_1']!=dataset_name),'level_1'] = 'None'
df.loc[(df['level_1']!=dataset_name),'level_2'] = 'None'
df.loc[(df['level_1']!=dataset_name),'level_3'] = 'None'

In [14]:
keep_l2_pairs = df.groupby(['level_2','assigned_level_2']).size(
).reset_index(
).groupby('level_2').apply(
    lambda gdf: pd.concat([
        gdf[gdf[0]==np.sort(gdf[0])[-1]], 
        gdf[gdf[0]==np.sort(gdf[0])[int(len(gdf)/2)]],
        gdf[gdf[0]==np.sort(gdf[0])[0]]
    ])[['assigned_level_2']]
).reset_index()[['level_2','assigned_level_2']]

In [15]:
keep_indices = []
for i,row in keep_l2_pairs.iterrows():
    keep_indices.extend(
        df[(df.level_2==row.level_2)&(df.assigned_level_2==row.assigned_level_2)].index.values.tolist()
    )

In [16]:
keep_df = df.loc[keep_indices]

In [17]:
from sklearn.metrics import homogeneity_completeness_v_measure
from b3 import calc_b3

fscore_l2,precision_l2,recall_l2 = calc_b3(
    keep_df[keep_df.level_1==keep_df.assigned_level_1].assigned_level_2, keep_df[keep_df.level_1==keep_df.assigned_level_1].level_2
)
homogeneity_l2, completeness_l2, v_measure_l2 = homogeneity_completeness_v_measure(
    keep_df[keep_df.level_1==keep_df.assigned_level_1].level_2, keep_df[keep_df.level_1==keep_df.assigned_level_1].assigned_level_2
)
(precision_l2,recall_l2,fscore_l2),(homogeneity_l2, completeness_l2, v_measure_l2)

((0.7776010385097176, 0.5666420515098053, 0.655568105186841),
 (0.6028572936713468, 0.7239364348162436, 0.6578722080347638))

In [18]:
keep_l3_pairs = keep_df.groupby(['level_3','assigned_level_3']).size(
).reset_index(
).groupby('level_3').apply(
    lambda gdf: pd.concat([
        gdf[gdf[0]==np.sort(gdf[0])[-1]], 
        gdf[gdf[0]==np.sort(gdf[0])[int(len(gdf)/2)]],
        gdf[gdf[0]==np.sort(gdf[0])[0]]
    ])[['assigned_level_3']]
).reset_index()[['level_3','assigned_level_3']]

In [19]:
keep_indices = []
for i,row in keep_l3_pairs.iterrows():
    keep_indices.extend(
        keep_df[(keep_df.level_3==row.level_3)&(keep_df.assigned_level_3==row.assigned_level_3)].index.values.tolist()
    )
keep_keep_df = keep_df.loc[keep_indices]

In [20]:
from sklearn.metrics import homogeneity_completeness_v_measure
from b3 import calc_b3
fscore_l3,precision_l3,recall_l3 = calc_b3(
    keep_keep_df[keep_keep_df.level_1==keep_keep_df.assigned_level_1].assigned_level_3, keep_keep_df[keep_keep_df.level_1==keep_keep_df.assigned_level_1].level_3
)

homogeneity_l3, completeness_l3, v_measure_l3 = homogeneity_completeness_v_measure(
    keep_keep_df[keep_keep_df.level_1==keep_keep_df.assigned_level_1].level_3, keep_keep_df[keep_keep_df.level_1==keep_keep_df.assigned_level_1].assigned_level_3
)

(precision_l3,recall_l3,fscore_l3 ), (homogeneity_l3, completeness_l3, v_measure_l3)

((0.5221774154420233, 0.24140569272472548, 0.3301712658432999),
 (0.4467852792976546, 0.6343753474244771, 0.5243061202439362))