In [None]:
import seaborn as sns
from tqdm import tqdm

import matplotlib.pyplot as plt
from clustering import *
from utils import plot

In [None]:
# Log returns of the 60m dataset, read from the cleaned parquet file.
# Its columns are later matched against the ticker symbols.
df_60m = pd.read_parquet("data/clean/60m/data_clean_60m_log_ret.parquet")

In [None]:
# Display the loaded log-return frame for a quick visual check
df_60m

In [None]:
# Read the ticker table from disk and order it alphabetically by symbol
tickers = (
    pd.read_csv('tickers/s&p.csv')
    .sort_values(by="Symbol")
    .reset_index(drop=True)
)

# Keep only the tickers that have a column in df_60m, then renumber the rows
tickers = tickers[tickers["Symbol"].isin(df_60m.columns)].reset_index(drop=True)
tickers

In [None]:
# Put the return columns in the same (alphabetical) order as the ticker table
df_60m = df_60m.loc[:, tickers["Symbol"].tolist()]

In [None]:
# Rolling clustering of the tickers on the 60m dataset

# Window length T is three times the number of assets N
N = len(df_60m.columns)
T = N * 3

# Number of rolling windows of length T that fit in the sample
lag_max = len(df_60m) - T

# NOTE(review): the meaning of the two False flags is defined by clustering.RolledCluster
liste = RolledCluster(df_60m, T, "louvain_cluster/asset", False, False, lag_max)

In [None]:
# Materialise the lazy result of RolledCluster. dask.compute returns a tuple, so the
# nested call yields ((result,),) and [0][0] unwraps it back to the list of clusterings.
# NOTE(review): a single dask.compute(liste)[0] may be enough — confirm against what
# RolledCluster actually returns before simplifying.
liste_cluster = dask.compute(dask.compute(liste))[0][0]
#liste_cluster

In [None]:
# Number of clusters per rolling window: largest cluster label plus one
# (this treats the labels of a window as running from 0 upwards)
number_of_clusters = [element["Cluster"].max() + 1 for element in tqdm(liste_cluster)]

# Create dataframe for plotting reasons: one row per window, stamped with the last
# len(number_of_clusters) timestamps of df_60m. This replaces a hard-coded start date
# that only matched one particular dataset / window length.
# NOTE(review): assumes df_60m.index is sorted chronologically.
window_times = df_60m.index[len(df_60m) - len(number_of_clusters):]
df_plot_clusters = pd.DataFrame({"Clusters": number_of_clusters, "Datetime": window_times})
df_plot_clusters

In [None]:
# Number of clusters over time, for every window; saved as "Plot_number_clusters_assets"
plot(
    df_plot_clusters,
    x="Datetime",
    y="Clusters",
    hue=None,
    title='Number of clusters per period',
    x_label=None,
    y_label=None,
    save_name="Plot_number_clusters_assets",
)

In [None]:
# Get position of drop: index of the first window whose cluster count is no longer
# above MAX_CLUSTERS. The search is bounded, so an all-high (or empty) series gives a
# clear error instead of running off the end of the list.
MAX_CLUSTERS = 10
cut = next(
    (position for position, count in enumerate(number_of_clusters) if not count > MAX_CLUSTERS),
    None,
)
if cut is None:
    raise ValueError(f"no window has {MAX_CLUSTERS} clusters or fewer; cannot locate the drop")

#This gives time from which we have a reasonable amount of clusters
df_60m.index[cut+T]

In [None]:
# Same plot restricted to the windows from `cut` onwards; not saved to disk
plot(
    df_plot_clusters[cut:],
    x="Datetime",
    y="Clusters",
    hue=None,
    title='Number of clusters per period',
    x_label=None,
    y_label=None,
    save_name=None,
)

In [None]:
# ARI values returned by RolledARI for the rolled clusterings, with a leading 1
# inserted as the value at time 0 for plotting
ARI = [1, *RolledARI(liste_cluster)]

# Attach the ARI series as a new column of the plotting frame
df_plot_clusters = df_plot_clusters.assign(ARI=ARI)

df_plot_clusters

In [None]:
# ARI over time, for every window; saved as "Plot_ARI_assets"
plot(
    df_plot_clusters,
    x="Datetime",
    y="ARI",
    hue=None,
    title='ARI measure with respect to time',
    x_label=None,
    y_label=None,
    save_name="Plot_ARI_assets",
)

In [None]:
# Keep only the clusterings from position `cut` onwards
new_liste = liste_cluster[cut:]

# Relabelled version of those clusterings, as produced by map_clusters.
# NOTE(review): presumably this makes cluster ids comparable from one window to the
# next — confirm in the clustering module.
relabeled = map_clusters(new_liste)

In [None]:
#relabeled

In [None]:
def composition_matrix(tickers, liste_cluster, axis=0, threshold=1):
    """Build the cluster-by-sector composition matrix, expressed in percent.

    For every clustering in ``liste_cluster`` and every cluster label in it, the
    tickers whose symbol belongs to that cluster are counted per sector. The counts
    are summed over all clusterings and then converted to percentages.

    Parameters
    ----------
    tickers : pandas.DataFrame
        Table with at least the columns ``"Symbol"`` and ``"Sector"``.
    liste_cluster : list of pandas.DataFrame
        One frame per clustering, indexed by ticker symbol, with a ``"Cluster"``
        column. Labels are used directly as row labels of the result, so they must
        be integers in ``0 .. max label``.
    axis : int, default 0
        ``0`` turns every row (cluster) into percentages of its row total;
        any other value turns every column (sector) into percentages of its
        column total.
    threshold : float, default 1
        Rows in which no percentage reaches ``threshold`` are removed.

    Returns
    -------
    pandas.DataFrame
        Rows are cluster labels, columns are sectors (in order of first appearance
        in ``tickers``), values are percentages rounded to one decimal.

    Raises
    ------
    ValueError
        If ``liste_cluster`` is empty.
    """
    if len(liste_cluster) == 0:
        raise ValueError("liste_cluster must contain at least one clustering")

    symbols = tickers["Symbol"]
    sector_of_ticker = tickers["Sector"]
    sectors = sector_of_ticker.unique()

    # One row per label up to the largest label seen in any clustering
    nb_clusters_all = max(element["Cluster"].max() for element in liste_cluster) + 1

    df = pd.DataFrame(np.zeros((nb_clusters_all, len(sectors))), index=range(nb_clusters_all),
                      columns=sectors)

    for clustering in tqdm(liste_cluster):
        for label, members in clustering.groupby("Cluster"):
            # Sector breakdown of the tickers whose symbol is in this cluster
            sector_counts = sector_of_ticker[symbols.isin(members.index)].value_counts()
            if sector_counts.empty:
                continue
            # Single .loc update: chained df[sector][label] = ... is not guaranteed
            # to write back into df
            df.loc[label, sector_counts.index] += sector_counts.to_numpy()

    # Create percentages by line (axis == 0) or by column (otherwise)
    if axis == 0:
        df = df.div(df.sum(axis=1), axis=0) * 100
    else:
        df = df / df.sum(axis=0) * 100

    # Take out lines with all values under threshold for visibility
    visible = df.where(df >= threshold, 0).sum(axis=1) != 0
    df = df.loc[visible]

    # Round numbers to 1 decimal
    return df.round(decimals=1)

In [None]:
# Composition matrix of the relabelled clusterings. axis=1 expresses every sector
# column as percentages of its column total; clusters with no value of at least 2
# are left out.
df = composition_matrix(tickers, relabeled, axis=1, threshold=2)
df

In [None]:
# Annotated heatmap of the composition matrix, exported as SVG
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df, annot=True, cmap="YlGnBu", ax=ax)
fig.savefig("plots/composition_matrix.svg", format="svg")