In [2]:
# Import / install relevant Python packages
import numpy as np
import pandas as pd  
import datetime as dt
import matplotlib.pyplot as plt

from sklearn.cluster import (KMeans, MiniBatchKMeans, AgglomerativeClustering, DBSCAN)
from sklearn.neighbors import NearestNeighbors

In [5]:
# Base feature column names. The first entry, 'mom1m', is also the first
# element of the momentum list below.
feature_names_list = [
    'mom1m', 'absacc', 'acc', 'aeavol', 'age', 'agr', 'baspread',
    'beta', 'betasq', 'bm', 'bm_ia', 'cash', 'cashdebt', 'cashpr', 'cfp', 'cfp_ia',
    'chatoia', 'chcsho', 'chempia', 'chinv', 'chmom', 'chpmia', 'chtx', 'cinvest',
    'convind', 'currat', 'depr', 'divi', 'divo', 'dolvol', 'dy', 'ear', 'egr', 'ep',
    'gma', 'herf', 'hire', 'idiovol', 'ill', 'indmom', 'invest', 'lev', 'lgr', 'maxret',
    'ms', 'mve_ia', 'mvel1', 'nincr', 'operprof', 'pchcapx_ia', 'pchcurrat', 'pchdepr',
    'pchgm_pchsale', 'pchquick', 'pchsale_pchrect', 'pctacc', 'pricedelay',
    'ps', 'quick', 'rd', 'retvol', 'roaq', 'roeq', 'roic', 'rsup', 'salecash',
    'salerec', 'securedind', 'sgr', 'sin', 'sp', 'std_dolvol', 'std_turn',
    'tang', 'tb', 'turn', 'zerotrade',
]

# Momentum column names 'mom1m' ... 'mom48m'.
mom_names_list = [f"mom{lag}m" for lag in range(1, 49)]

# All features: the base list plus the momentum columns from 'mom2m' onwards
# ('mom1m' is skipped because the base list already contains it).
feature_names_list_full = [*feature_names_list, *mom_names_list[1:]]

# One PCA component column per input feature: 'PCA 1', 'PCA 2', ...
PCA_components_number = len(feature_names_list_full)
PCA_components_column_names = [f'PCA {k}' for k in range(1, PCA_components_number + 1)]
PCA_components_tresholds_column_names = [f'n_to_reach_{pct}' for pct in (90, 95, 99)]
Original_df_columns_names = ['DATE', 'permno', 'sic2']
PCA_table_full_columns_names = [
    *Original_df_columns_names,
    *PCA_components_column_names,
    *PCA_components_tresholds_column_names,
]

# One label column per alpha value 10, 20, ..., 90.
Agglomerative_column_names = [f'cluster alpha={alpha}' for alpha in range(10, 100, 10)]

In [3]:
# Load the precomputed PCA table. The clustering cell below reads its 'DATE',
# 'permno', 'sic2', 'PCA <k>' and 'n_to_reach_99' columns.
# NOTE(review): absolute local path — the notebook only runs where this drive layout exists.
df_PCA = pd.read_csv('D:/USB Drive/Data/DachengXiu/df_PCA_done.csv')

In [4]:
# Agglomerative (hierarchical) clustering, run separately for every DATE in df_PCA.
# For each date the stocks are clustered on their leading PCA components, once per
# alpha in 10, 20, ..., 90; the labels go into the 'cluster alpha=<alpha>' columns.
output_columns = Original_df_columns_names + Agglomerative_column_names
dates = sorted(df_PCA['DATE'].unique())

# Labelled per-date frames are collected here and concatenated once after the loop:
# calling pd.concat inside the loop re-copies all accumulated rows on every iteration,
# and concatenating onto an empty DataFrame is deprecated in recent pandas.
agglomerative_windows = []
for date in dates:
    print("DATE ",date)
    # .copy() gives an independent frame, so the label columns added below are not
    # written to a slice of df_PCA (which raises SettingWithCopyWarning).
    df_Agglomerative_window = df_PCA[df_PCA['DATE'] == date].sort_values('permno').copy()

    # Names of the PCA components needed to explain 99% of the variance on this date.
    # int() keeps range() working even if the column was parsed as float.
    number_of_PCA_components_for_99_variance_explainability = int(
        df_Agglomerative_window['n_to_reach_99'].min()
    )
    PCA_components_for_99_variance_explainability_column_names = [
        f'PCA {k}' for k in range(1, number_of_PCA_components_for_99_variance_explainability + 1)
    ]
    # Feature matrix shared by the neighbour search and every clustering fit below.
    pca_features = df_Agglomerative_window[PCA_components_for_99_variance_explainability_column_names]

    # Nearest-neighbour distances. The query points are the training points themselves,
    # so column 0 of `distances` is normally each point's zero distance to itself and
    # column 1 is the distance to its nearest other point; hence n_neighbors = 1 + 1.
    n_neighbors = 1 + 1
    # The l1 norm is the distance metric; per the original notes it gives more
    # meaningful clustering results for high-dimensional data.
    nn_model = NearestNeighbors(n_neighbors=n_neighbors, metric='l1')
    nn_model_fited = nn_model.fit(pca_features)
    distances, indices = nn_model_fited.kneighbors(pca_features)
    nearest_neighbor_distances = distances[:, 1]

    for alpha in range(10, 100, 10):
        # The distance threshold (epsilon) is the alpha-th percentile of the
        # nearest-neighbour distances, for alpha = 10, 20, ..., 90.
        distance_of_alpha_percentile = np.percentile(nearest_neighbor_distances, alpha)

        # n_clusters=None: the number of clusters is determined by distance_threshold.
        hca_model = AgglomerativeClustering(linkage='average',
                                            distance_threshold=distance_of_alpha_percentile, metric='l1',
                                            n_clusters=None)
        hca_model.fit(pca_features)

        # Store the cluster labels for this alpha.
        df_Agglomerative_window[f'cluster alpha={alpha}'] = hca_model.labels_

    # Keep only the identifier and label columns; the PCA columns are not needed downstream.
    agglomerative_windows.append(df_Agglomerative_window[output_columns])

if agglomerative_windows:
    df_Agglomerative = pd.concat(agglomerative_windows, axis=0)
else:
    # No dates at all: fall back to an empty frame with the expected columns.
    df_Agglomerative = pd.DataFrame(columns=output_columns)


DATE  19791231
DATE  19800131
DATE  19800229
DATE  19800331
DATE  19800430
DATE  19800530
DATE  19800630
DATE  19800731
DATE  19800829
DATE  19800930
DATE  19801031
DATE  19801128
DATE  19801231
DATE  19810130
DATE  19810227
DATE  19810331
DATE  19810430
DATE  19810529
DATE  19810630
DATE  19810731
DATE  19810831
DATE  19810930
DATE  19811030
DATE  19811130
DATE  19811231
DATE  19820129
DATE  19820226
DATE  19820331
DATE  19820430
DATE  19820528
DATE  19820630
DATE  19820730
DATE  19820831
DATE  19820930
DATE  19821029
DATE  19821130
DATE  19821231
DATE  19830131
DATE  19830228
DATE  19830331
DATE  19830429
DATE  19830531
DATE  19830630
DATE  19830729
DATE  19830831
DATE  19830930
DATE  19831031
DATE  19831130
DATE  19831230
DATE  19840131
DATE  19840229
DATE  19840330
DATE  19840430
DATE  19840531
DATE  19840629
DATE  19840731
DATE  19840831
DATE  19840928
DATE  19841031
DATE  19841130
DATE  19841231
DATE  19850131
DATE  19850228
DATE  19850329
DATE  19850430
DATE  19850531
DATE  1985

In [5]:
# Uncomment to persist the clustering results; the Analysis section reloads this same file.
#df_Agglomerative.to_csv('D:/USB Drive/Data/DachengXiu/df_Agglomerative.csv')

Analysis:

In [6]:
# Reload the clustering results from disk. This rebinds df_Agglomerative, replacing
# any frame computed earlier in the session.
# NOTE(review): the cell that writes this file is commented out, so the CSV must
# already exist — confirm it matches the current clustering code.
df_Agglomerative = pd.read_csv('D:/USB Drive/Data/DachengXiu/df_Agglomerative.csv')

In [7]:
# Keep rows whose DATE lies strictly between 19891231 and 20210101.
# NOTE(review): despite the name df_1990_2000, the upper bound admits dates through
# the end of 2020 (presumably DATE is an integer YYYYMMDD) — confirm the name/filter
# mismatch is intended; the cells below divide by 372, i.e. 31 years x 12 months.
df_1990_2000 = df_Agglomerative[(df_Agglomerative['DATE'] > 19891231 ) & (df_Agglomerative['DATE'] < 20210101)]

In [14]:
# Average number of clusters per date for alpha=30 (paper value: 188).
# A cluster is a (DATE, label) group holding more than one stock; the total over
# all dates is divided by the hard-coded 372.
df_1990_2000.groupby(['DATE', 'cluster alpha=30'])['permno'].count().gt(1).sum() / 372
# Recorded result: 219.69623655913978

219.69623655913978

In [15]:
# Average number of stocks per date (paper value: 3157).
df_1990_2000.groupby('DATE')['permno'].count().mean()
# Recorded result: 3286.5779569892475

3286.5779569892475

In [91]:
# Average number of stocks per date that sit in a cluster of more than one stock,
# for alpha=30 (paper value: 769 (24.58)); total over all dates divided by 372.
(
    df_1990_2000
    .groupby(['DATE', 'cluster alpha=30'])['permno']
    .count()
    .loc[lambda sizes: sizes > 1]
    .sum()
    / 372
)
# Recorded result: 798.1451612903226

798.1451612903226

In [92]:
# Average number of outliers per date for alpha=30 (paper value: 2388 (75.42)).
# An outlier is a stock that is alone in its (DATE, label) group; since each such
# group has size 1, summing the sizes counts the stocks. Total divided by 372.
(
    df_1990_2000
    .groupby(['DATE', 'cluster alpha=30'])['permno']
    .count()
    .loc[lambda sizes: sizes == 1]
    .sum()
    / 372
)
# Recorded result: 2488.4327956989246

2488.4327956989246

In [219]:
# Average size of the biggest cluster per date for alpha=30 (paper value: 107 (13.72)).
# Sizes of all (DATE, label) groups with more than one stock, as a one-column frame.
df = (
    df_1990_2000
    .groupby(['DATE', 'cluster alpha=30'])['permno']
    .count()
    .loc[lambda sizes: sizes > 1]
    .to_frame('number_in_cluster')
)
# Largest cluster size on each date, summed over dates and divided by 372.
mean1st = df.groupby('DATE').max().sum() / 372
mean1st
# Recorded result: 101.811828

number_in_cluster    101.811828
dtype: float64

In [221]:
# Average size of the second-biggest cluster per date for alpha=30 (paper value: 47 (6.09)).
# Sizes of all (DATE, label) groups with more than one stock, as a one-column frame.
df = (
    df_1990_2000
    .groupby(['DATE', 'cluster alpha=30'])['permno']
    .count()
    .loc[lambda sizes: sizes > 1]
    .to_frame('number_in_cluster')
)
# Pick the second-largest size on each date directly (rows sorted descending, then
# the row at position 1 of every date). The earlier `head(2).mean()*2 - mean1st`
# form was only right when every date had at least two clusters, and it silently
# depended on `mean1st` from another cell. Dates without a second cluster add 0.
second_biggest_per_date = (
    df.sort_values(['DATE', 'number_in_cluster'], ascending=False)
      .groupby('DATE')
      .nth(1)
)
# Sum over dates divided by the hard-coded 372, matching the other statistics.
mean2nd = second_biggest_per_date.sum() / 372
mean2nd
# Recorded result: 41.865591

number_in_cluster    41.865591
dtype: float64

In [222]:
# Average size of the third-biggest cluster per date for alpha=30 (paper value: 29 (3.74)).
# Sizes of all (DATE, label) groups with more than one stock, as a one-column frame.
df = (
    df_1990_2000
    .groupby(['DATE', 'cluster alpha=30'])['permno']
    .count()
    .loc[lambda sizes: sizes > 1]
    .to_frame('number_in_cluster')
)
# Pick the third-largest size on each date directly (rows sorted descending, then
# the row at position 2 of every date). The earlier `head(3).mean()*3 - mean1st - mean2nd`
# form was only right when every date had at least three clusters, and it silently
# depended on `mean1st` and `mean2nd` from other cells. Dates without a third
# cluster add 0.
third_biggest_per_date = (
    df.sort_values(['DATE', 'number_in_cluster'], ascending=False)
      .groupby('DATE')
      .nth(2)
)
# Sum over dates divided by the hard-coded 372, matching the other statistics.
mean3rd = third_biggest_per_date.sum() / 372
mean3rd
# Recorded result: 24.22043

number_in_cluster    24.22043
dtype: float64

In [8]:
# Load the table that compares this notebook's clustering statistics with the
# paper's values (its contents are displayed, transposed, in the next cell).
# NOTE(review): absolute local path; 'Poredjenje' presumably means 'comparison' — confirm.
df_clustering_results_comparation_with_paper_results = pd.read_csv('D:/USB Drive/Data/DachengXiu/Poredjenje.csv')

In [10]:
# Display the comparison table transposed (rows and columns swapped).
df_clustering_results_comparation_with_paper_results.T

Unnamed: 0,0,1,2,3,4,5,6,7
Unnamed: 0,,Number of clusters,Number of stocks in total,Number of stocks in clusters,Number of outliers 1900,Number of stocks in the biggest cluster,Number of stocks in the second biggest cluster,Number of stocks in the third biggest cluster
k-means,paper,129,3157,1257,1900,63,51,45
k=500,moji,218,3286,1990,1296,67,56,50
DBSCAN,paper,2,3157,376,2781,336,32,6
alpha=10,moji,1.54,3286,429,2847,420,19,9
Agglomerative,paper,188,3157,769,2388,107,47,29
alpha=30,moji,220,3286,798,2488,102,42,24
