In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from data_clean.loadData import getTickers
from sklearn.cluster import affinity_propagation, SpectralClustering, KMeans
from sklearn.cluster import AffinityPropagation as AP, affinity_propagation
import networkx as nx
from networkx.algorithms import community
from data_clean.newClean import get_data, get_corr_from_year
import random
import scipy as sp
import backtesting as bt

import yfinance as yf

In [None]:
# Fetch S&P 500 index (^GSPC) price history from Yahoo Finance for the study window.
snp500_df = yf.download('^GSPC', start='2005-01-01', end='2021-12-31')

In [None]:
# Show the downloaded index data as the cell output.
snp500_df

In [None]:
# Load the project data set. `min_year` / `max_year` are used by later cells as the
# bounds of the per-year loop; `yearly_dfs` is presumably one DataFrame per year
# (TODO confirm against data_clean.newClean.get_data).
yearly_dfs, min_year, max_year = get_data()

In [None]:
# Correlation matrix for 2019: fetched in labelled form first so the column labels
# can be kept, then converted to a plain NumPy array for the numeric work below.
corr = get_corr_from_year(2019, yearly_dfs, min_year, to_numpy=False)
# Stored as `tickers`; the previous name `getTickers` overwrote the function of the
# same name imported from data_clean.loadData, making it uncallable afterwards.
tickers = corr.columns
corr = corr.to_numpy()

In [None]:
# Plot the output of the project's baseline backtest for 2012.
# NOTE(review): pct_returns=True presumably selects percentage returns — confirm in backtesting.py.
bt.baseline_backtest(2012, yearly_dfs, pct_returns=True).plot()

In [None]:
# get absolute value of correlation matrix
abscorr = np.abs(corr)
# show the matrix dimensions as the cell output
abscorr.shape

In [None]:
# Sanity check: total number of NaN entries in the absolute correlation matrix.
np.isnan(abscorr).sum()

In [None]:
def get_random_corr(A):
    """Return one uniformly drawn entry of the 2-D array ``A`` that is not exactly 1.

    Entries equal to 1 (e.g. the self-correlations on the diagonal of a
    correlation matrix) are rejected and a new position is drawn.

    Parameters
    ----------
    A : numpy.ndarray
        Two-dimensional array to sample from.

    Returns
    -------
    scalar
        A value ``A[i, j]`` with ``A[i, j] != 1``.

    Raises
    ------
    ValueError
        If ``A`` is empty (raised by ``random.randint``) or if every entry of
        ``A`` equals 1, in which case no valid sample exists.
    """
    x, y = A.shape
    # The "is there anything to sample?" scan is O(size of A), so it is only
    # run once, and only after the first rejected draw.
    verified_samplable = False
    # Iterative rejection sampling: the previous recursive form hit Python's
    # recursion limit when most (or all) entries were equal to 1.
    while True:
        rand_x = random.randint(0, x - 1)
        rand_y = random.randint(0, y - 1)
        rand_corr = A[rand_x, rand_y]
        if rand_corr != 1:
            return rand_corr
        if not verified_samplable:
            if not np.any(A != 1):
                raise ValueError("A contains no entries different from 1 to sample from")
            verified_samplable = True

def fill_diag_with_random_sample(A):
    """Return a copy of ``A`` with its main diagonal replaced by random samples.

    Each diagonal entry is overwritten with a value drawn by
    ``get_random_corr`` from the matrix being filled, so the diagonal no
    longer holds the constant self-correlation.

    The input is left untouched: the work is done on a copy. Previously the
    caller's array was modified in place, which silently changed matrices that
    later notebook cells reused.

    Parameters
    ----------
    A : numpy.ndarray
        Two-dimensional matrix (typically an absolute correlation matrix).

    Returns
    -------
    numpy.ndarray
        New array of the same shape with a randomised main diagonal.
    """
    filled = np.array(A, copy=True)
    # min(shape) keeps the loop on the main diagonal even for non-square input.
    for i in range(min(filled.shape)):
        filled[i, i] = get_random_corr(filled)
    return filled

In [None]:
# Weighted graph whose adjacency matrix is the absolute correlation matrix.
# from_numpy_array replaces from_numpy_matrix, which was removed in NetworkX 3.0.
G = nx.from_numpy_array(abscorr)

In [None]:
# Laplacian matrix of G, cast to float.
# NOTE(review): `L` is not referenced by any later cell in the visible notebook.
L = nx.laplacian_matrix(G).astype(float)

In [None]:
def create_affinity_from_abscorr(corr_matrix):
    """Turn a correlation matrix into a non-positive similarity matrix.

    The absolute correlations are taken (into a new array), the diagonal is
    zeroed, and every entry ``a`` is mapped to ``-(1 - a)``. Off-diagonal
    values therefore lie in ``[-1, 0]`` for correlations in ``[-1, 1]`` and
    the diagonal is ``-1``.
    """
    similarity = np.abs(corr_matrix)
    np.fill_diagonal(similarity, 0)
    distance = 1 - similarity
    return -distance

In [None]:
# Pairwise similarity matrix derived from the absolute correlations.
A = create_affinity_from_abscorr(abscorr)

# A already contains pairwise similarities, so it has to be declared as a
# precomputed affinity. With the default affinity='euclidean', scikit-learn
# treats every row of A as a feature vector and clusters on the negative
# squared distances between rows instead of on A itself.
clustering = AP(affinity='precomputed', verbose=True).fit(A)
cluster_centers = clustering.cluster_centers_indices_
labels = clustering.labels_

In [None]:
# Index permutation that places nodes with the same cluster label next to each other.
label_order = np.argsort(labels)

In [None]:
# Show the distinct cluster labels that were assigned.
np.unique(labels)

In [None]:
# Side-by-side heatmaps of the absolute correlation matrix before and after
# reordering rows/columns by cluster label. The diagonal is replaced by random
# samples first (presumably so the self-correlations do not dominate the plot).
diag_filled_corr = fill_diag_with_random_sample(abscorr)

# reorder rows to match label order
abscorr_rowsort = diag_filled_corr[label_order, :]
# reorder columns to match label order
abscorr_sorted = abscorr_rowsort[:, label_order]
# only referenced by the commented-out seaborn heatmap calls below
mean = np.mean(abscorr_sorted)

plt.figure(figsize=(20, 30))

plt.subplot(1,2,1)
# sns.heatmap(diag_filled_corr, cmap='hot', vmin=0, vmax=1, center=mean)
plt.imshow(diag_filled_corr, cmap='hot')
plt.title('Correlation Matrix before sorting')

plt.subplot(1,2,2)
# sns.heatmap(abscorr_sorted, cmap='hot', vmin=0, vmax=1, center=mean)
plt.imshow(abscorr_sorted, cmap='hot')
plt.title('Correlation Matrix after sorting')

In [None]:
# Year shown in the single-year histogram. It is defined here so the cell runs on
# a fresh kernel; before, `year` was only bound by cells further down the notebook.
year = 2019
corr = get_corr_from_year(year, yearly_dfs, min_year, to_numpy=True)

# Histogram of all absolute correlations, with the diagonal replaced by random
# samples so the constant self-correlations do not add a spike at 1.
abscorr = np.abs(corr)
diag_filled_corr = fill_diag_with_random_sample(abscorr)
plt.hist(diag_filled_corr.flatten(), bins=100)
# plt.yscale('log')
plt.title(f"Correlation Histogram {year}")
plt.xlabel("Correlation coefficient")
plt.ylabel("Frequency")

In [None]:
# One absolute-correlation histogram per year, laid out on a 4x5 grid
# (so at most 20 years fit). After the loop `year`, `corr`, `abscorr` and
# `diag_filled_corr` hold the values for `max_year`; later cells read them.
plt.figure(figsize=(20, 20))

for i, year in enumerate(range(min_year, max_year + 1)):
    plt.subplot(4, 5, i+1)
    corr = get_corr_from_year(year, yearly_dfs, min_year, to_numpy=True)

    abscorr = np.abs(corr)
    # replace the diagonal with random samples before plotting
    diag_filled_corr = fill_diag_with_random_sample(abscorr)
    plt.hist(diag_filled_corr.flatten(), bins=100)
    # plt.yscale('log')
    plt.title(f"Correlation Histogram {year}")
    plt.xlabel("Correlation coefficient")
    plt.ylabel("Frequency")

plt.show()


In [None]:
# Thresholded "asset graph": only pairs with |correlation| >= cutoff keep an edge.
asset_graph = np.abs(corr)
cutoff = 0.3

# set all values on the diagonal to 0 (no self-loops)
np.fill_diagonal(asset_graph, 0)

# set all correlations below the cutoff to 0 so they produce no edge
asset_graph[asset_graph < cutoff] = 0

# get graph from the thresholded matrix; from_numpy_array replaces
# from_numpy_matrix, which was removed in NetworkX 3.0
asset_graph = nx.from_numpy_array(asset_graph)

In [None]:
# Full weighted graph built from the current absolute correlation matrix.
# from_numpy_array replaces from_numpy_matrix, which was removed in NetworkX 3.0.
G = nx.from_numpy_array(abscorr)

In [None]:
# Report the size of the full correlation graph.
print(f"number of nodes: {G.number_of_nodes()} \nNumber of edges: {G.number_of_edges()}")

In [None]:
# get number of connected components of the full correlation graph
print(f"number of components: {nx.number_connected_components(G)}")

In [None]:
# get largest component (the node set with the most members)
largest_cc = max(nx.connected_components(G), key=len)

# get subgraph of largest component (a view of G restricted to those nodes)
largest_cc_subgraph = G.subgraph(largest_cc)

In [None]:
# Louvain community detection on the thresholded asset graph.
# The top-level `community` module imported here provides best_partition/modularity
# (presumably the python-louvain package); it is aliased so it does not clash with
# networkx.algorithms.community, which the first cell imports as `community`.
import community as community_louvain
# node -> community id
partition = community_louvain.best_partition(asset_graph)
modularity = community_louvain.modularity(partition, asset_graph)
# community id of each node, in node order; used for colouring and later sorting
values = [partition.get(node) for node in asset_graph.nodes()]
plt.figure(figsize=(10,10))
nx.draw_spring(asset_graph, cmap = plt.get_cmap('jet'), node_color = values, node_size=30, with_labels=False)
print(modularity)
print("Total number of Communities=", len(set(partition.values())))

In [None]:
# Reorder the diag-filled correlation matrix by Louvain community and compare it
# with the unsorted matrix. `diag_filled_corr` is whatever an earlier cell left
# behind (the last year of the histogram loop when run top to bottom).
label_order = np.argsort(values)

# reorder rows to match label order
abscorr_rowsort = diag_filled_corr[label_order, :]
# reorder columns to match label order
abscorr_sorted = abscorr_rowsort[:, label_order]

plt.figure(figsize=(20,20))

plt.subplot(1,2,1)
# sns.heatmap(diag_filled_corr, cmap='hot', vmin=0, vmax=1)
plt.imshow(diag_filled_corr, cmap='hot')
plt.title('Correlation Matrix before sorting')

plt.subplot(1,2,2)
# sns.heatmap(abscorr_sorted, cmap='hot', vmin=0, vmax=1)
plt.imshow(abscorr_sorted, cmap='hot')
plt.title('Correlation Matrix after sorting')

# Plotting of cumulative degree distributions

In [None]:
# Rebuild the absolute correlation matrix and the full weighted graph from `corr`.
abscorr = np.abs(corr)
# from_numpy_array replaces from_numpy_matrix, which was removed in NetworkX 3.0.
G = nx.from_numpy_array(abscorr)

In [None]:
# Thresholded "asset graph": only pairs with |correlation| >= cutoff keep an edge.
asset_graph = np.abs(corr)
cutoff = 0.3

# set all values on the diagonal to 0 (no self-loops)
np.fill_diagonal(asset_graph, 0)

# set all correlations below the cutoff to 0 so they produce no edge
asset_graph[asset_graph < cutoff] = 0

# get graph from the thresholded matrix; from_numpy_array replaces
# from_numpy_matrix, which was removed in NetworkX 3.0
asset_graph = nx.from_numpy_array(asset_graph)

In [None]:
# Spanning tree of the full correlation graph, using the edge weights of G.
# NOTE(review): G's weights are absolute correlations, so a *minimum* spanning tree
# keeps the weakest links. Asset trees are usually built on a distance such as
# sqrt(2 * (1 - corr)), i.e. a maximum spanning tree on correlations — confirm intent.
asset_tree = nx.minimum_spanning_tree(G, weight='weight', algorithm='prim')

In [None]:
import collections

# Weighted degree of every node in G, truncated to an int and sorted high to low.
degree_sequence = sorted([int(d) for n, d in G.degree(weight="weight")], reverse=True)  # degree sequence
# Number of nodes per (truncated) degree value; keys keep the descending order.
degreeCount = collections.Counter(degree_sequence)
deg, cnt = zip(*degreeCount.items())
# Flip to ascending degree; cnt[i] is then the number of nodes with degree >= deg[i].
deg = np.array(deg)[::-1]
cnt = np.cumsum(cnt)[::-1]

# NOTE(review): the y-axis shows node counts, not probabilities, although the
# label reads "p(k>=x))" (which also has an unbalanced parenthesis).
plt.plot(deg, cnt, 'b-', marker='o')
plt.title(f"Cumulative degree distribution")
plt.ylabel("p(k>=x))")
plt.xlabel("x")
# plt.xscale('log')
# plt.yscale('log')

In [None]:
import collections

# Cumulative degree distributions of the three graphs on log-log axes, one panel each.
graphs = [G, asset_graph, asset_tree]
names = ['Correlation Graph', 'Asset Graph', 'Asset Tree']
fig = plt.figure(figsize=(15, 5))

for i, graph in enumerate(graphs, 1):
    plt.subplot(1, 3, i)
    # The first graph (full correlation graph) uses the weighted degree;
    # the other two use the plain edge count per node.
    if i == 1:
        degree_sequence = sorted([int(d) for n, d in graph.degree(weight="weight")], reverse=True)  # degree sequence
    else:
        degree_sequence = sorted([int(d) for n, d in graph.degree()], reverse=True)  # degree sequence
    # Number of nodes per degree value; keys keep the descending order.
    degreeCount = collections.Counter(degree_sequence)
    deg, cnt = zip(*degreeCount.items())
    # Flip to ascending degree; cnt[j] is then the number of nodes with degree >= deg[j].
    deg = np.array(deg)[::-1]
    cnt = np.cumsum(cnt)[::-1]

    # NOTE(review): the y-axis shows node counts, not probabilities, although the
    # label reads "p(k>=x))" (which also has an unbalanced parenthesis).
    plt.plot(deg, cnt, 'b-', marker='o')
    plt.title(f"Cumulative degree distribution - {names[i-1]}")
    plt.ylabel("p(k>=x))")
    plt.xlabel("x")
    plt.xscale('log')
    plt.yscale('log')
# tight_layout
plt.tight_layout()
plt.show()

In [None]:
def basic_eda(graph, max_edges=500_000):
    """Print a short structural summary of an undirected NetworkX graph.

    Reports node and edge counts, number of connected components, number of
    self-loops, density, diameter, average clustering coefficient and average
    shortest path length.

    Parameters
    ----------
    graph : networkx.Graph
        Undirected graph to summarise.
    max_edges : int, optional
        The average clustering coefficient and average shortest path length
        are only computed when the graph has fewer edges than this; otherwise
        they are printed as ``N/A``. Defaults to 500_000.

    Notes
    -----
    For a disconnected graph the diameter is that of the largest connected
    component and the average shortest path length is reported as ``N/A``.
    An empty graph only gets its (zero) node and edge counts printed, because
    the remaining statistics are undefined for it.
    """
    n_nodes = graph.number_of_nodes()
    n_edges = graph.number_of_edges()
    print(f"# Nodes: {n_nodes}")
    print(f"# Edges: {n_edges}")

    if n_nodes == 0:
        # nx.is_connected raises on the null graph, so stop before calling it.
        print("Graph is empty; remaining statistics are undefined.")
        return

    print(f"Connected components: {nx.number_connected_components(graph)}")
    print(f"Self loops: {nx.number_of_selfloops(graph)}")
    print(f"Density: {nx.density(graph):.3%}")

    # Connectivity needs a traversal of the graph; compute it once and reuse it.
    connected = nx.is_connected(graph)

    if connected:
        print(f"Diameter: {nx.diameter(graph)}")
    else:
        # The diameter is infinite for a disconnected graph, so fall back to
        # the largest connected component.
        largest_cc_subgraph = graph.subgraph(max(nx.connected_components(graph), key=len))
        print(f"Diameter: {nx.diameter(largest_cc_subgraph)}")

    if n_edges < max_edges:
        print(f"Average clustering coefficient: {nx.average_clustering(graph):.3f}")

        if connected:
            print(f"Average shortest path length: {nx.average_shortest_path_length(graph):.3f}")
        else:
            print("Average shortest path length: N/A")
    else:
        print("Average clustering coefficient: N/A")
        print("Average shortest path length: N/A")

In [None]:
# Print the structural summary for each of the three graphs, separated by blank lines.
for graph, name in zip(graphs, names):
    print(f"{name}:")
    basic_eda(graph)
    print()

In [None]:
# Structural summary of the thresholded asset graph on its own.
basic_eda(asset_graph)

In [None]:
# Community detection on the asset tree via greedy modularity maximisation
# (no spectral method is involved, despite what this cell was previously labelled).
# NOTE(review): `k` is not used by any visible cell.
k = 3
clusters = community.greedy_modularity_communities(asset_tree, weight='weight')

In [None]:
# Flatten the communities into one node ordering so that members of the same
# community end up adjacent when the matrix is reindexed.
label_order = []
for cluster in clusters:
    label_order.extend(cluster)

In [None]:
# Side-by-side heatmaps of the absolute correlation matrix before and after
# reordering rows/columns by the greedy-modularity communities of the asset tree.
diag_filled_corr = fill_diag_with_random_sample(abscorr)

# reorder rows to match label order
abscorr_rowsort = diag_filled_corr[label_order, :]
# reorder columns to match label order
abscorr_sorted = abscorr_rowsort[:, label_order]
# only referenced by the commented-out seaborn heatmap calls below
mean = np.mean(abscorr_sorted)

plt.figure(figsize=(20, 30))

plt.subplot(1,2,1)
# sns.heatmap(diag_filled_corr, cmap='hot', vmin=0, vmax=1, center=mean)
plt.imshow(diag_filled_corr, cmap='hot')
plt.title('Correlation Matrix before sorting')

plt.subplot(1,2,2)
# sns.heatmap(abscorr_sorted, cmap='hot', vmin=0, vmax=1, center=mean)
plt.imshow(abscorr_sorted, cmap='hot')
plt.title('Correlation Matrix after sorting')

In [None]:
import backtesting as bt

In [None]:
# Compare the baseline backtest with the cluster-based backtest for one year.
# NOTE(review): `clusters` comes from the graph built on whatever `corr` held when
# the graph cells ran — the last year of the histogram loop if the notebook is run
# top to bottom — which need not be `year`. Confirm this is intended (possible
# look-ahead if that year is later than the backtest year).
year = 2019
baseline = bt.baseline_backtest(year, yearly_dfs, pct_returns=True)
cluster_tickers = bt.cluster_backtest(year, yearly_dfs, clusters, pct_returns=True)

plt.figure(figsize=(10, 5))
plt.plot(baseline, label='Baseline')
plt.plot(cluster_tickers, label='Cluster Tickers')
plt.xlabel('Date')
plt.ylabel('Portfolio Return (%)')
plt.legend()