# Inter-Class Clustering Notebook

## Setting up and preparing the data for clustering

Start off with loading in the two datasets which we will use for this notebook
Using two different ones to test if the method works well across datasets

In [1]:
#Import all necessary packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sentence_transformers import SentenceTransformer, models
from tqdm.notebook import tqdm
from sklearn.decomposition import PCA
import umap.umap_ as umap
import seaborn as sns
import hdbscan
from math import *
from statistics import *

In [None]:
#Preparing dataset 1: Exam Questions and Subject type
#Loading in the questions dataset
questions = pd.read_csv("datasets/questions.csv")

#Check the class numbers
print(questions.Subject.value_counts())

#Rename columns
questions.rename(columns={'eng': 'text', 'Subject': 'labels'}, inplace = True)

#Recode labels into numbers (can probably be done with sklearn method?)
labels_mapping = {'Biology': 0, 'Chemistry' : 1, 'Maths' : 2, 'Physics': 3}
questions.labels = questions.labels.apply(lambda x: labels_mapping[x])

#Only preprocessing can be removing the new-line character (\n)
questions.text = questions.text.apply(lambda x: x.replace("\n", " "))

#Lets just have a look at what the longest question is (BERT has a limit of tokens afer which it will cut off)
lengths = questions.text.str.len()
argmax = np.where(lengths == lengths.max())[0]
print(questions.text.iloc[argmax].to_numpy().ravel().tolist())

#Rather weird sentences and reveals that the text data contains a lot of Latex commands for formatting the text
#TODO: come back to this and see if needs to be removed but might be a nice challenge for our method
print(questions)

In [7]:
#Preparing dataset 2: Research paper abstracts and topic types
#Loading in the abstracts dataset
abstracts = pd.read_csv("datasets/science.csv")

#Check data encoding
# print(abstracts.head())
#Need to change the encodings
#A way can be to first concat the binary indicators into a list/string of numbers which then is converted to the label
abstracts['labels'] = abstracts[abstracts.columns[3:]].apply(lambda x: "".join(x.astype(str)), axis = 1)

#Check class balance
print(abstracts.labels.value_counts())

#Seems like there are cases where entries belong to multiple classes, remove those and just keep single members
#Combine with mapping dict instead of making new list. A bit ugly, change later.. 0 = Physics, 1 = CS, 2 = Maths, 3 = Stats, 4 = Quantitative Biology, 5 = Quantitative Finance
labels_mapping = {"010000" : 0, "100000" : 1, "001000" : 2, "000100" : 3, "000010" : 4, "000001" : 5}

abstracts = abstracts[abstracts.labels.isin(list(labels_mapping.keys()))]
abstracts.reset_index(drop=True, inplace= True)
abstracts.labels = abstracts.labels.apply(lambda x: labels_mapping[x])

#Rename
abstracts.rename(columns={"ABSTRACT": "text1"}, inplace= True)
abstracts.rename(columns={"TITLE": "text"}, inplace= True)

#Only preprocessing can be removing the new-line character (\n)
abstracts.text = abstracts.text.apply(lambda x: x.replace("\n", " "))
abstracts.text1 = abstracts.text1.apply(lambda x: x.replace("\n", " "))

#Lets check again the longest abstract entries:
lengths_title = abstracts.text.str.len()
argmax = np.where(lengths_title == lengths_title.max())[0]

#Check length of longest title
print(abstracts.text.iloc[argmax].to_numpy().ravel().tolist())
print(len(abstracts.text.iloc[argmax].to_numpy().ravel().tolist()[0].split()))

lengths_abstract = abstracts.text1.str.len()
argmax = np.where(lengths_abstract == lengths_abstract.max())[0]

#Check length of longest title
print(abstracts.text1.iloc[argmax].to_numpy().ravel().tolist())
print(len(abstracts.text1.iloc[argmax].to_numpy().ravel().tolist()[0].split()))


#Finally lets keep only the columns we are interested in further
abstracts = abstracts[['text', 'text1', 'labels']]



010000    5120
100000    4910
001000    3610
100100    2285
000100    1636
001100     825
101000     682
000010     443
110000     437
011000     293
000001     209
101100     179
000110     105
010100      99
110100      36
100010      30
000101      24
111000      19
100001       9
011100       9
100110       5
000011       4
100101       2
001101       1
Name: labels, dtype: int64
['Reply to Hicks et al 2017, Reply to Morrison et al 2016 Refining the relevant population in forensic voice comparison, Reply to Hicks et al 2015 The importance of distinguishing info from evidence/observations when formulating propositions']
36
["  This article is dedicated to the late Giorgio Israel. R{é}sum{é}. The aim of this article is to propose on the one hand a brief history of modeling starting from the works of Fibonacci, Robert Malthus, Pierre Francis Verhulst and then Vito Volterra and, on the other hand, to present the main hypotheses of the very famous but very little known predator-prey mod

At this point both datasets should be good to go for getting the embeddings. 
Now we can get the embeddings of each sentence using some BERT sentence library and ideally store the objects as pickles which we can access later (so we don't have to do this each time)

In [9]:
#Helper functions for saving/loading pickle objects
def save_obj(obj, name ):
    with open('datasets/'+ name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    with open('datasets/' + name + '.pkl', 'rb') as f:
        return pickle.load(f)

In [6]:
#getting sentence embeddings for all the separated questions
def get_sentence_embeddings(dataset):
    
    word_embedding_model = models.Transformer('sentence-transformers/all-mpnet-base-v2', max_seq_length=384)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

    #Store the embeddings in a list
    sentence_embeddings = []

    #Go through the data and get the sentence embeddings
    for index in tqdm(range(len(dataset))):
        sentence = dataset.text[index]
        embedding = model.encode(sentence)
        sentence_embeddings.append(embedding)
    

    dataset['embeddings'] = sentence_embeddings
    
    return dataset

In [11]:
#lets try if it works

#TOo many samples, takes forever
#lets take a 1000 from each label
questions_subset = questions.groupby("labels").head(1000)
questions_subset.reset_index(drop=True, inplace = True)

questions_embeddings = get_sentence_embeddings(questions_subset)
save_obj(questions_embeddings, "questions_embeddings")

HBox(children=(FloatProgress(value=0.0, max=4000.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['embeddings'] = sentence_embeddings


In [28]:
#Getting embeddings as well for the abstract titles
abstracts_subset = abstracts.groupby("labels").head(500)
abstracts_subset.reset_index(drop = True, inplace = True)

abstract_embeddings = get_sentence_embeddings(abstracts_subset)
save_obj(abstract_embeddings, "abstract_embeddings")

HBox(children=(FloatProgress(value=0.0, max=2652.0), HTML(value='')))




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['embeddings'] = sentence_embeddings


In [86]:
#Loading in the saved dataset with emebeddings
questions_embeddings = load_obj("questions_embeddings")

#Reduce the data using PCA
pca = PCA(.50).fit(questions_embeddings.embeddings.to_list())
embeddings_pca_transformed = pca.transform(questions_embeddings.embeddings.to_list())

#Define new column for new pca reduced embeddings
questions_embeddings['pca_embeddings'] = ""
#Modify the original embeddings in the dataframe (FIX)
for index in range(len(questions_embeddings)):
    questions_embeddings.pca_embeddings[index] = embeddings_pca_transformed[index]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  questions_embeddings.pca_embeddings[index] = embeddings_pca_transformed[index]


In [10]:
#Get PCA for the abstract titles too (turn this into a function after)
abstract_embeddings = load_obj("abstract_embeddings")

#Reduce the data using PCA
pca = PCA(.75).fit(abstract_embeddings.embeddings.to_list())
embeddings_pca_transformed = pca.transform(abstract_embeddings.embeddings.to_list())

#Define new column for new pca reduced embeddings
abstract_embeddings['pca_embeddings'] = ""
#Modify the original embeddings in the dataframe
for index in range(len(abstract_embeddings)):
    abstract_embeddings.pca_embeddings[index] = embeddings_pca_transformed[index]

print(len(abstract_embeddings.pca_embeddings[0]))

                                                   text  \
0           Reconstructing Subject-Specific Effect Maps   
1                    Rotation Invariance Neural Network   
2     Spherical polyharmonics and Poisson kernels fo...   
3     A finite element approximation for the stochas...   
4     On maximizing the fundamental frequency of the...   
...                                                 ...   
2647  Asymmetric Connectedness of Fears in the U.S. ...   
2648  Beta-rhythm oscillations and synchronization t...   
2649  A metric model for the functional architecture...   
2650  Discovering the effect of nonlocal payoff calc...   
2651                Complex Valued Risk Diversification   

                                                  text1  labels  \
0       Predictive models allow subject-specific inf...       1   
1       Rotation invariance and translation invarian...       1   
2       We introduce and develop the notion of spher...       2   
3       The stochastic 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  abstract_embeddings.pca_embeddings[index] = embeddings_pca_transformed[index]


133


In [17]:
#Running the UMAP dimensionality reduction across all different parameters
def umap_reduce(dataset):

    #Define a range of parameters to iterate through
    n_neighbors = [5, 10, 15, 20, 30, 50]
    min_dist = [0.2, 0.4, 0.6, 0.8, 1]
    n_components = [2, 3, 4]
    metrics = ['euclidean', 'cosine']

    #Need to think of a way to make a list of dict, where the index of the dict inside the list relates to the label of the data
    #Then within each one of those smaller dicts, each key will be a cluster and the value its frequency
    dict_list = [dict() for x in range(len(dataset.labels.unique()))]

    #Iterate over each parameter
    for neighbors in tqdm(n_neighbors):
        for dist in tqdm(min_dist):
            for component in tqdm(n_components):
                for metric in tqdm(metrics):

                    #Set up the UMAP reducer, set a random state so reproducible
                    reducer = umap.UMAP(random_state= 0, n_neighbors = neighbors, min_dist=dist, n_components = component, metric=metric, verbose = 0)

                    #reduce the PCA-transformed embeddigns
                    umap_embeddings = reducer.fit_transform(dataset.pca_embeddings.to_list())

                    #Create a new temporary column in dataset to store UMAP embeddings
                    dataset['umap_embeddings'] = ""
                    
                    for index in range(len(dataset)):
                        dataset['umap_embeddings'][index] = umap_embeddings[index]
                    
                    #Store parameters so can send over for plotting and checking if any lead to really bad results
                    parameters = [neighbors, dist, component, metric]
                    #Now time to do the clustering of the UMAP projections
                    clustered_dataset = hdb_clustering(dataset, parameters)

                    #Next, analyze the clusters from this parameter run
                    iteration_results = analyze_clusters(clustered_dataset)

                    #Now need to go through iteration results.
                    #Iterate over the list of dicts with index
                    for key, value in iteration_results.items():

                        #THIS METHOD RELIES ON THE LABEL OF THE DATA TO START AT 0
                        #Now for each key go through the found clusters and check if they exist in overall dict (key relates to the index in the dict_list)
                        #Add if not
                        for cluster in value:
                            if tuple(cluster) in dict_list[key]:
                                dict_list[key][tuple(cluster)] += 1
                            else:
                                dict_list[key][tuple(cluster)] = 1

    #At this point should have our list of dicts ready
    #Lets save it since this was such a time consuming process
    save_obj(dict_list, "dict_list")


In [21]:
#Write a function which takes in a dict_list and spits out the final CSV/Dataframe with the clusters ordered and merged
def order_clusters(dict_list):

    #Create dataframe where to store everything
    all_clusters = pd.DataFrame()

    #Input should be a dict_list where each index of the dict refers to the label of the data
    for idx, dictionary in enumerate(dict_list):
        #Order the dictionary
        ordered_dict = sorted(dictionary.items(), key = lambda x: x[1], reverse= True)
        
        #Filter out all entries that have less than X entries
        filtered_dicts = []

        for cluster in ordered_dict:
            if cluster[1] >= 2 and len(cluster[0]) >=2:
                filtered_dicts.append(cluster)

        #Put into a label DF
        label_df = pd.DataFrame(filtered_dicts, columns = [f'label_{idx}', f'freq_{idx}'])

        #Concat label df into all clusters
        all_clusters = pd.concat([all_clusters, label_df], axis=1)
    
    all_clusters.to_csv("datasets/clusters.csv")

In [18]:
#Calculating the optimum min size
def get_optimum_min_size(dataset):

    #Store scores
    scores = []

    #Iterate over range of min sizes
    for min_size in range(50):
        if min_size > 1:
            #set up the clusterer
            clusterer = hdbscan.HDBSCAN(min_cluster_size = min_size)

            #TODO: Here double check if you get same results if you actually separate into the two lists, like you do for the plotting
            clusterer.fit(dataset.umap_embeddings.to_list())

            #Attach the probabilities to the dataset so can compute proportions
            dataset['cluster_probabilities'] = clusterer.probabilities_

            #Compute score
            score = len(dataset.loc[dataset.cluster_probabilities < 0.05])/len(dataset)
            score_tuple = (min_size, score)
            scores.append(score_tuple)
            

    #Now we return the minimum size
    return(min(scores, key = lambda t: t[1])[0])


#Running the HDBSCAN clustering
def hdb_clustering(dataset, parameters):

    #First need to find the optimum min_size
    optimum_size = get_optimum_min_size(dataset)

    #Set up clusterer with optimum size
    clusterer = hdbscan.HDBSCAN(min_cluster_size=optimum_size, min_samples=1)

    #Form clusters
    clusterer.fit(dataset.umap_embeddings.to_list())

    #Code for plotting the clusters (uncomment to check if it works)
    fig = plt.figure()
    #set up colour palette
    color_palette = sns.color_palette('deep', clusterer.labels_.max()+1)
    cluster_colors = [color_palette[x] if x >= 0 else (0.5, 0.5, 0.5) for x in clusterer.labels_]
    cluster_member_colors = [sns.desaturate(x, p) for x, p in zip(cluster_colors, clusterer.probabilities_)]
    plt.scatter([e[0] for e in dataset.umap_embeddings.to_list()], [e[1] for e in dataset.umap_embeddings.to_list()], s = 50, linewidth = 0, c = cluster_member_colors, alpha = 0.25)
    plt.title(f"{parameters}")
    plt.show()

    #Attach the results back to the dataset and remove some entries
    dataset['cluster_labels'] = clusterer.labels_
    dataset['cluster_probabilities'] = clusterer.probabilities_

    #Here remove -1 cluster and probabilities less than 0.8 (???)
    dataset = dataset.loc[dataset.cluster_labels != -1]
    dataset = dataset.loc[dataset.cluster_probabilities > 0.8]

    dataset.reset_index(drop = True, inplace = True)


    return dataset

In [19]:
#Going through the found clusters and getting most frequent label and associated sentences
def analyze_clusters(dataset):

    #Neet to set up a dict with the keys as the labels from dataset
    keys = list(set(dataset.labels))

    labels_dict = dict([(key, []) for key in keys])

    #Get list of unique clusters
    unique_clusters = dataset.cluster_labels.unique()

    #iterate over each cluster number
    for cluster_number in unique_clusters:

        #Subset based on cluster
        cluster_subset = dataset.loc[dataset.cluster_labels == cluster_number]

        #Find most common label in subset
        max_label = mode(cluster_subset.labels)

        #Subset again to just look at most common label
        label_subset = cluster_subset.loc[cluster_subset.labels == max_label]

        #Append to dict for that label
        labels_dict[max_label].append(label_subset.text.values.tolist())
    
    return labels_dict
        

In [None]:
#Run the UMAP reduction and clustering pipeline
umap_reduce(abstract_embeddings)

#Load and get some clusters
dict_list = load_obj("dict_list")

order_clusters(dict_list)

In [None]:
#Seems like a pretty non-trivial problem actually...
#Try and merge the clusters
def merge_clusters(dict_list):

    
    for idx, dictionary in enumerate(dict_list):
        
        #FOr each dictionary we want a list of clusters on which we can merge afterwards
        list_of_clusters = []

        #Order the dictionary (and to get it in a tuple format)
        ordered_dict = sorted(dictionary.items(), key = lambda x: x[1], reverse= True)
        
        #Go over each cluster
        for cluster in ordered_dict:
            #Just looking at ones which have some value to us
            if cluster[1] >= 2 and len(cluster[0]) >=2:

                #Add them to the list of clusters for this dict
                list_of_clusters.append(list(cluster[0]))



dict_list = load_obj("dict_list")

merge_clusters(dict_list)
