In [1]:
import pickle
import pandas as pd
import plotly.offline as pyo
import plotly.express as px
import dimensionality_reduction
import json 
import random
import fast_hdbscan

# Read the pickled embeddings object; it is used below as the lookup for `snippet` values.
# NOTE(review): pickle.load can execute arbitrary code — only load exports from a trusted source.
with open('../glanos-data/big_consulting_export-sbert.pickle', 'rb') as pickle_file:
    embeddings = pickle.load(pickle_file)

json_file_path = "../glanos-data/big_consulting_export.json"

# Parse the JSON export straight from the open file handle.
with open(json_file_path, 'r') as json_file:
    contents = json.load(json_file)

# One DataFrame row per entry of the export's 'items' list.
items = contents['items']
df = pd.DataFrame(items)

# Attach the embedding looked up by snippet; snippets without an entry become NaN.
df["embedding"] = df["snippet"].map(embeddings)

# NOTE(review): dropna() removes rows with a missing value in ANY column, not only
# 'embedding' — confirm that is intended.
df = df.dropna()
len(df)

13504

In [2]:
sample_size = 50
df = df.sample(n=sample_size, random_state=42)
# df.to_csv(f'big_consulting_export_clustered-sample-50.tsv', sep="\t", index=True)

# random.sample() requires a sequence: passing the dict view directly raises
# TypeError on Python >= 3.11 (it was deprecated in 3.9), so materialise the
# items first. A dedicated seeded generator keeps this draw reproducible, like
# the seeded df.sample() call above, without touching the global random state.
sample_dict = dict(random.Random(42).sample(list(embeddings.items()), sample_size))

sample_sent = "conducted a survey about sustainable supply chains with 525 large corporations across Argentina, Brazil, Canada, Mexico and the United States"
company_names = [
    'Louis Dreyfus Company',
    'Lenovo Group Limited',
    'Marubeni Corp.',
    'LUKOIL PJSC',
    'Midea Group Company Limited',
    'Medtronic PLC Holding',
    'Medipal Holding Corp.',
    'Mitsubishi Corp.',
    'Nestlè SA',
    'Oil & Natural gas Corp.',
    'Pegatron Corp.',
    'POSCO Group',
    'PACCAR Inc.',
    'Pemex',
    "People's Insurance Company of China",
    'Quanta Computer Inc.',
    'Plains GP Holding LP',
    'Apple', 'Microsoft', 'Aldi', 'Tesco', 'Mercadona', 'BeReal',
]

# One probe sentence per company: "<company> <sample_sent>".
sample_sents = [company + " " + sample_sent for company in company_names]

# Persist the sampled snippet -> embedding pairs.
with open('companies.pkl', 'wb') as f:
    pickle.dump(sample_dict, f)

In [3]:
import hdbscan
import pandas as pd
from hdbscan import flat
import numpy as np


def calculate_cluster(df, cols=["x", "y", "z"], n_clusters=None, *args, **kwargs) -> tuple:
    """
    Cluster the rows of ``df`` with HDBSCAN and attach the result to ``df``.

    :param df: DataFrame holding the feature columns. It is modified in place:
        the columns ``cluster_id`` (label as string) and ``cluster_prob`` are
        added or overwritten.
    :param cols: column labels used as features (passed to ``df[cols]``). The
        default list is only read, never mutated.
    :param n_clusters: if ``None``, a plain ``hdbscan.HDBSCAN`` is fitted;
        otherwise ``hdbscan.flat.HDBSCAN_flat`` is asked for this many clusters.
    :param args: extra positional arguments for ``hdbscan.HDBSCAN``; only used
        when ``n_clusters`` is ``None``.
    :param kwargs: extra keyword arguments for HDBSCAN. In the flat branch they
        are forwarded as well, with ``cluster_selection_method`` defaulting to
        ``'leaf'`` unless the caller overrides it.
    :return: tuple ``(df, clusterer)`` - the same DataFrame object that was
        passed in and the fitted clusterer.
    """
    print("cluster: ", cols, args, kwargs)

    # Both branches fit on the same plain ndarray.
    features = df[cols].to_numpy()

    if n_clusters is None:
        clusterer = hdbscan.HDBSCAN(*args, **kwargs)
        clusterer.fit(features)
    else:
        # Previously caller kwargs were silently dropped here; 'leaf' stays the
        # default so existing calls behave exactly as before.
        flat_kwargs = {"cluster_selection_method": "leaf", **kwargs}
        clusterer = flat.HDBSCAN_flat(features, n_clusters=n_clusters, **flat_kwargs)

    df["cluster_id"] = clusterer.labels_
    df["cluster_id"] = df["cluster_id"].astype(str)
    df["cluster_prob"] = clusterer.probabilities_

    # Number of distinct labels; this count includes the noise label "-1" if present.
    print("amount clusters: ", len(df["cluster_id"].unique()))
    return df, clusterer
# Echo the current `df` as this cell's output.
df

In [4]:
# import clustering
n_clusters = 33
df_split = pd.DataFrame(df['embedding'].to_list())

# Concatenate the split columns with the 'snippets' column
df_result = pd.concat([df['snippet'].reset_index(drop=True), df_split], axis=1)

# df_clustered = calculate_cluster(df[:10], cols="vector", n_clusters=n_clusters)
df_clustered, clusterer = calculate_cluster(df_result, cols=range(0, 383), n_clusters=n_clusters)

df_clustered = pd.concat([df_clustered['cluster_id'], df.reset_index(drop=True)], axis=1) # .reset_index(drop=True)
df_clustered

In [None]:
# Run the project-local reduction helper on the clustered rows, using the
# 'embedding' column with the cosine metric.
# NOTE(review): presumably this adds the x/y/z coordinates read by the plotting
# cell — confirm against dimensionality_reduction.
df_dim_3d = dimensionality_reduction.reduce_dimensionality_of_project_data(
    df_clustered,
    metric="cosine",
    vector_column_name="embedding",
)
df_dim_3d

In [None]:
import json

def _json_scalar(value):
    """Return numpy scalars as plain Python values; pass everything else through."""
    item = getattr(value, "item", None)
    return item() if callable(item) else value


def save_as_json(df, path="big_consulting_export-clustered.json"):
    """
    Write the points of ``df`` to a JSON file, grouped by cluster.

    The file has the shape
    ``{"clusters": [{"name": <cluster_id>, "data": [{"id", "tooltip", "x", "y", "z"}, ...]}, ...]}``.

    :param df: DataFrame with the columns ``cluster_id``, ``id``, ``tooltip``,
        ``x``, ``y`` and ``z``.
    :param path: output file path; defaults to the previously hard-coded
        ``big_consulting_export-clustered.json`` in the working directory.
    :return: None
    """
    fields = ["id", "tooltip", "x", "y", "z"]
    clusters = []

    # Group the data by cluster_id
    for cluster_id, group in df.groupby("cluster_id"):
        points = [
            {
                "id": _json_scalar(record["id"]),
                "tooltip": _json_scalar(record["tooltip"]),
                # float() guards against numpy float32/int64 coordinates, which
                # json.dump cannot serialise.
                "x": float(record["x"]),
                "y": float(record["y"]),
                "z": float(record["z"]),
            }
            for record in group[fields].to_dict("records")
        ]
        clusters.append({"name": _json_scalar(cluster_id), "data": points})

    with open(path, "w") as outfile:
        json.dump({"clusters": clusters}, outfile)
# Shell escape: print the current working directory.
!pwd

In [5]:
pyo.init_notebook_mode()

# Build a derived frame instead of aliasing df_dim_3d, so the source frame is
# not mutated and this cell can be re-run safely.
# The tooltips contain real newline characters; the old pattern r"/n" never
# matched them. regex=False makes the replacement literal on every pandas version.
filtered_df = df_dim_3d.assign(
    tooltip=df_dim_3d["tooltip"].str.replace("\n", "<br>", regex=False)
)
# filtered_df = filtered_df[filtered_df['cluster_id'] != "31"]
# filtered_df = filtered_df[filtered_df['cluster_id'] != "-1"]


fig = px.scatter_3d(filtered_df,
                    x="x", y="y", z="z",
                    hover_data=['snippet', "tooltip"],
                    color="cluster_id",
                    width=1200, height=1200
                    )
fig.show()

# Strip only the final extension. split('.')[0] returned '' for a path that
# starts with '../', so the figures were written as '.svg' / '.html'.
output_stem = json_file_path.rsplit('.', 1)[0]
fig.write_image(f"{output_stem}.svg")
fig.write_html(f"{output_stem}.html", include_plotlyjs=True, full_html=True)


In [6]:
from scipy.spatial.distance import cdist

# https://notebook.community/scikit-learn-contrib/hdbscan/notebooks/Looking%20at%20cluster%20consistency
class RankedPoints:
    """Rank the points of each cluster by their distance to the cluster's
    representative point (weighted centroid or medoid of the fitted clusterer)."""

    def __init__(self, points, clusterer, metric='euclidean', selection_method='centroid'):
        """ Rank points in a cluster based on their distance to the cluster centroid/medoid

        Parameters
        ----------

        points : array-like of shape (n_samples, n_features), and must be the same data
                 passed into HDBSCAN

        clusterer : Instance of HDBSCAN that has been fit to data

        metric: string or callable, optional (default='euclidean')
            The metric to use when calculating distance between points in a cluster and
            the cluster centroid/medoid. If metric is a string or callable, it must be one of
            the options allowed by scipy.spatial.distance.cdist for its metric parameter.

        selection_method: string, optional (default='centroid')
            Method to use to find the weighted cluster center. Allowed options are 'centroid'
            and 'medoid'.

        Raises
        ------

        ValueError
            If selection_method is not 'centroid' or 'medoid', or if 'centroid' is combined
            with a metric other than 'euclidean'.
        """
        allowed_methods = ['centroid', 'medoid']
        if selection_method not in allowed_methods:
            raise ValueError(f'Selection method must be one of {allowed_methods}')

        if selection_method == 'centroid' and metric != 'euclidean':
            raise ValueError(f'Metric must be euclidian when using selection_method centroid. '
                             f'Current metric is {metric}')

        self.clusterer = clusterer
        self.metric = metric
        self.selection_method = selection_method

        # Work on a plain array so a DataFrame input is taken positionally instead of
        # being re-aligned (and NaN-filled) against the new string column names.
        points = np.asarray(points)
        self._embedding_cols = [str(i) for i in range(points.shape[1])]
        self.embedding_df = pd.DataFrame(points, columns=self._embedding_cols)
        self.embedding_df['cluster'] = clusterer.labels_

    def calculate_all_distances_to_center(self):
        """For each cluster calculate the distance from each point to the centroid/medoid.

        The result is stored in the 'dist_to_rep_point' column of self.embedding_df.
        Calling the method again recomputes that column instead of adding duplicates.
        """
        per_cluster = [
            self.calculate_distances_for_cluster(label)
            for label in np.unique(self.embedding_df['cluster'])
        ]
        # Concatenate once rather than growing a frame inside the loop.
        if per_cluster:
            all_distances = pd.concat(per_cluster)
        else:
            all_distances = pd.DataFrame(columns=['dist_to_rep_point'], dtype=float)

        # Drop a column left by a previous call, then join on the index; every point
        # belongs to exactly one label, so the row order of embedding_df is kept.
        base = self.embedding_df.drop(columns=['dist_to_rep_point'], errors='ignore')
        self.embedding_df = base.join(all_distances)

    def calculate_distances_for_cluster(self, cluster_id):
        """For a given cluster_id calculate the distance from each point to the centroid/medoid.

        Parameters
        ----------

        cluster_id : int
            The id of the cluster to compute the distances for. If the cluster id is -1 which
            corresponds to the noise point cluster, then this will return a distance of NaN.

        Returns
        -------

        df : A pandas DataFrame containing the distances from each point to the cluster centroid/medoid.
             The index of the dataframe corresponds to the index in the original data.

        Raises
        ------

        ValueError
            If no point carries the given cluster_id.
        """
        cluster_of_interest = self.embedding_df[self.embedding_df['cluster'] == cluster_id].copy()

        if cluster_of_interest.empty:
            raise ValueError(f'Cluster id {cluster_id} not found')

        # Don't calculate distances for the noise cluster
        if cluster_id == -1:
            return pd.DataFrame(np.nan, columns=['dist_to_rep_point'], index=cluster_of_interest.index)

        # selection_method was validated in __init__, so it is one of the two options.
        if self.selection_method == 'centroid':
            rep_point = self.clusterer.weighted_cluster_centroid(cluster_id)
        else:
            rep_point = self.clusterer.weighted_cluster_medoid(cluster_id)

        # cdist expects 2-D inputs: one representative row against all cluster members.
        dists = cdist(
            rep_point.reshape((1, len(self._embedding_cols))),
            cluster_of_interest[self._embedding_cols].values,
            metric=self.metric,
        )
        return pd.DataFrame(dists[0], columns=['dist_to_rep_point'], index=cluster_of_interest.index)

    def rank_cluster_points_by_distance(self, cluster_id):
        """For a given cluster return a pandas dataframe of points ranked
           by distance to the cluster centroid/medoid (closest first).

        Raises ValueError if no point carries the given cluster_id.
        """
        cluster_of_interest = self.embedding_df[self.embedding_df['cluster'] == cluster_id].copy()

        if cluster_of_interest.empty:
            raise ValueError(f'Cluster id {cluster_id} not found')

        # Compute distances on demand when calculate_all_distances_to_center() was not run.
        if 'dist_to_rep_point' not in cluster_of_interest.columns:
            distance_df = self.calculate_distances_for_cluster(cluster_id)
            cluster_of_interest = cluster_of_interest.merge(distance_df, left_index=True, right_index=True)

        return cluster_of_interest.sort_values('dist_to_rep_point')

    def get_closest_samples_for_cluster(self, cluster_id, n_samples=5):
        """Get the N closest points to the cluster centroid/medoid"""
        return self.rank_cluster_points_by_distance(cluster_id).head(n_samples)

    def get_furthest_samples_for_cluster(self, cluster_id, n_samples=5):
        """Get the N points furthest away from the cluster centroid/medoid"""
        return self.rank_cluster_points_by_distance(cluster_id).tail(n_samples)


In [7]:
# Use the same feature columns that were handed to calculate_cluster in the clustering cell.
# NOTE(review): range(0, 383) covers columns 0..382 — confirm no embedding dimension is
# being dropped; this range must stay identical to the one used for clustering.
embedding_columns = list(range(0, 383))
points = df_result[embedding_columns].to_numpy()

# Rank every point by its distance to the medoid of its cluster.
examples = RankedPoints(
    points,
    clusterer,
    metric='euclidean',
    selection_method='medoid',
)
examples.calculate_all_distances_to_center()

In [49]:
emb_df = examples.embedding_df # [examples.embedding_df['cluster'] != -1]

# Put the distance column (re-indexed 0..n-1) in front of the reduced frame.
distance_column = emb_df['dist_to_rep_point'].reset_index(drop=True)
emb_df_result = pd.concat([distance_column, df_dim_3d], axis="columns")

# emb_df_result_main_cols = emb_df_result.iloc[:, [0, -2, -1]]
# emb_df_result_main_cols.to_csv(f'big_consulting_export_clustered-hdbscan-{n_clusters}-leaf.csv', index=True)

# final_score = distance / score where a distance exists; rows without a distance
# (NaN, i.e. the noise label) fall back to the plain score.
distance_ratio = emb_df_result["dist_to_rep_point"] / emb_df_result["score"]
has_distance = emb_df_result["dist_to_rep_point"].notna()
emb_df_result["final_score"] = distance_ratio.where(has_distance, emb_df_result["score"])

emb_df_result.to_csv(f'big_consulting_export_clustered-hdbscan-{n_clusters}-leaf.csv', index=True)
emb_df_result

Unnamed: 0,dist_to_rep_point,cluster_id,tooltip,score,snippet,id,embedding,x,y,z,final_score
0,,-1,(2023-04-21) Strategy|Company Info\n\nat KPMG ...,1.0,at KPMG where he focused on complex financial ...,ID0,"[0.06154659017920494, 0.08247778564691544, -0....",3.828489,0.478022,5.029356,1.000000
1,,-1,(2023-04-21) Leadership\n\nLatentView has been...,1.0,LatentView has been recognized as an industry ...,ID1,"[-0.017490660771727562, -0.036421071738004684,...",3.319351,2.188711,12.027831,1.000000
2,,-1,"(2023-04-21) \n\nIn his last role, Prashant wa...",0.8,"In his last role, Prashant was playing the rol...",ID2,"[0.07539796829223633, -0.025188380852341652, -...",2.926881,13.724097,5.736260,0.800000
3,,-1,(2023-04-21) \n\nWe're proud of the end result...,0.8,We're proud of the end result of this implemen...,ID3,"[-0.030860286206007004, 0.047984544187784195, ...",0.304015,3.750470,12.124339,0.800000
4,1.295331,31,(2023-04-21) Market Share Growth\n\nWipro cons...,1.0,Wipro consolidates presence in foods with acqu...,ID4,"[0.08279228955507278, 0.027778802439570427, -0...",3.036174,-3.929799,13.274285,1.295331
...,...,...,...,...,...,...,...,...,...,...,...
13499,,-1,(2023-03-29) \n\nProfessional services firmAcc...,0.8,Professional services firmAccenture has flagge...,ID13499,"[-0.005730913951992989, -0.0424995943903923, -...",8.403257,9.505851,2.837073,0.800000
13500,1.248475,31,(2023-03-29) \n\nAccenture stated that it had ...,0.8,Accenture stated that it had put aside $1.5 bi...,ID13500,"[0.03951822593808174, 0.005480567459017038, 0....",3.673986,8.014185,-0.528267,1.560593
13501,,-1,"(2023-03-29) \n\nMeanwhile, Accenture last wee...",0.6,"Meanwhile, Accenture last week closed two deals",ID13501,"[0.010188382118940353, -0.0003712352190632373,...",7.694917,7.426668,7.825679,0.600000
13502,,-1,(2023-03-29) \n\nS4 Capital was forced to dela...,0.8,S4 Capital was forced to delay its results twi...,ID13502,"[-0.037712085992097855, 0.016464950516819954, ...",2.621881,5.651892,1.409518,0.800000
