In [2]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import numpy as np
import numpy.linalg as la

In [3]:
df = pd.read_csv("data/sp500_close.csv")
df.set_index("Date", inplace=True)
df

Unnamed: 0_level_0,A,AAPL,ABBV,ABNB,ABT,ACGL,ACN,ADBE,ADI,ADM,...,WTW,WY,WYNN,XEL,XOM,XYL,YUM,ZBH,ZBRA,ZTS
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2022-10-31 04:00:00,138.35,153.34,146.40,106.91,98.94,57.50,283.90,318.50,142.62,96.98,...,218.21,30.93,63.90,65.11,110.81,102.43,118.25,113.35,283.22,150.78
2022-11-01 04:00:00,140.89,150.65,146.91,109.05,99.31,55.44,281.47,316.02,144.70,97.53,...,221.20,30.39,67.86,65.90,111.91,105.54,118.15,111.84,238.30,152.72
2022-11-02 04:00:00,135.27,145.03,144.52,94.41,98.04,56.22,272.45,301.22,141.24,95.80,...,220.13,29.66,67.13,65.31,109.61,104.27,117.37,107.69,236.03,147.36
2022-11-03 04:00:00,134.46,138.88,144.42,92.02,96.45,56.53,256.88,285.93,138.02,96.28,...,217.96,29.39,66.48,65.15,111.10,107.17,119.50,105.42,227.32,131.14
2022-11-04 04:00:00,136.08,138.38,145.28,96.09,98.07,56.86,261.16,285.75,144.29,95.19,...,221.21,30.28,70.81,65.55,112.31,107.21,121.78,104.85,230.56,133.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-22 05:00:00,139.57,193.60,154.94,140.80,109.03,73.84,354.45,598.75,197.10,71.46,...,240.09,34.36,90.42,62.06,101.91,112.64,130.00,120.56,269.41,194.98
2023-12-26 05:00:00,139.81,193.05,154.62,138.72,109.23,73.51,353.43,598.26,198.87,72.39,...,239.09,34.63,91.00,62.25,102.14,113.62,130.26,121.42,275.50,195.50
2023-12-27 05:00:00,139.82,193.15,154.88,136.55,109.79,73.54,353.82,596.08,199.35,72.17,...,239.30,34.76,90.63,61.54,101.66,114.38,130.84,121.13,275.79,196.90
2023-12-28 05:00:00,139.77,193.58,154.75,137.00,110.40,74.06,351.59,595.52,200.24,72.27,...,239.32,35.10,91.76,61.89,100.19,114.32,130.52,121.63,275.35,197.16


In [4]:
# Visualize correlation matrix
corr = df.corr()
px.imshow(corr)

In [80]:
def graph_from_adj_matrix(adj_matrix, node_list, threshold=0.85):
    G = nx.Graph()
    G.add_nodes_from(node_list)
    for i in range(len(node_list)):
        for j in range(i+1, len(node_list)):
            if abs(adj_matrix[i, j]) > threshold:
                G.add_edge(node_list[i], node_list[j], weight=adj_matrix[i, j])

    return G

In [82]:
# Create graph representation such that only correlations above 0.85 are edges
G = graph_from_adj_matrix(corr.values, corr.columns, threshold=0.85)
len(G.nodes), len(G.edges)

(489, 5198)

In [83]:
def draw_graph(G):
    # G = nx.random_geometric_graph(200, 0.125)
    pos = nx.spring_layout(G)
    # pos = nx.bipartite_layout(G, G.nodes)
    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.append(x0)
        edge_x.append(x1)
        edge_x.append(None)
        edge_y.append(y0)
        edge_y.append(y1)
        edge_y.append(None)

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.5, color='#888'),
        hoverinfo='none',
        mode='lines')

    node_x = []
    node_y = []
    for node in G.nodes():
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)

    node_text = []
    for node in G.nodes():
        node_text.append(node)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers',
        hoverinfo='text',
        hovertext=node_text,
        marker=dict(
            showscale=True,
            # colorscale options
            #'Greys' | 'YlGnBu' | 'Greens' | 'YlOrRd' | 'Bluered' | 'RdBu' |
            #'Reds' | 'Blues' | 'Picnic' | 'Rainbow' | 'Portland' | 'Jet' |
            #'Hot' | 'Blackbody' | 'Earth' | 'Electric' | 'Viridis' |
            colorscale='YlGnBu',
            reversescale=True,
            color=[],
            size=10,
            colorbar=dict(
                thickness=15,
                title=dict(
                text='Node Connections',
                side='right'
                ),
                xanchor='left',
            ),
            line_width=2))


    node_adjacencies = []
    node_text = []
    for node, adjacencies in enumerate(G.adjacency()):
        node_adjacencies.append(len(adjacencies[1]))
        node_text.append('# of connections: '+str(len(adjacencies[1])))

    node_trace.marker.color = node_adjacencies
    node_trace.text = node_text

    fig = go.Figure(data=[edge_trace, node_trace],
                layout=go.Layout(
                    title=dict(
                        text="<br>Network graph made with Python",
                        font=dict(
                            size=16
                        )
                    ),
                    showlegend=False,
                    hovermode='closest',
                    margin=dict(b=20,l=5,r=5,t=40),
                    annotations=[ dict(
                        text="Python code: <a href='https://plotly.com/python/network-graphs/'> https://plotly.com/python/network-graphs/</a>",
                        showarrow=False,
                        xref="paper", yref="paper",
                        x=0.005, y=-0.002 ) ],
                    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False))
                    )
    fig.show(renderer="browser")

In [91]:
draw_graph(G)

In [85]:
adj_matrix = nx.adjacency_matrix(G).todense()
adj_matrix, len(adj_matrix)

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 489)

In [77]:
def loss(m1, m2):
    return la.norm(m1-m2, ord='fro')

def SVD(adj_matrix, c):
    c_rows = int(np.ceil(c*adj_matrix.shape[0]))
    L, M, R = np.linalg.svd(adj_matrix)
    A = np.dot(L[:,:c_rows] * M[:c_rows], R[:c_rows, :])

    return A

def ColibriS(adj_matrix, c, epsilon=0.5):
    A = adj_matrix
    c_rows = int(np.ceil(c*A.shape[0]))

    A_A = A**2
    p_x = A_A.sum(axis=0)/A_A.sum()
    I = np.random.choice(A.shape[1], size=(c_rows,), p=p_x)
    L = A[:, I[0]].reshape((A.shape[0], 1))
    M = (1/(L.transpose() @ L)).reshape((1,1))

    for k in range(1, c_rows):
        res = A[:, I[k]] - L @ M @ L.transpose() @ A[:, I[k]]
        if la.norm(res) < epsilon*la.norm(A[:, I[k]]):
            continue
        else:
            delta = la.norm(res) ** 2
            y = (M @ L.transpose() @ A[:, I[k]]).reshape((M.shape[0],1))
            M_lu = M + y @ y.transpose() / (delta ** 2)
            M_u = np.concatenate((M_lu, -y/delta), axis=1)
            M_d = np.concatenate((-y.transpose()/delta, np.array(1/delta).reshape((1,1))), axis=1)
            M = np.concatenate((M_u, M_d), axis=0)
            L = np.concatenate((L, A[:, I[k]].reshape(A.shape[0], 1)), axis=1)

    R = L.transpose() @ A
    A_reconstruct = L @ M @ R
    adj_matrix_reconstruct = A_reconstruct

    return L, M, R, adj_matrix_reconstruct

In [66]:
L, M, R, adj_matrix_reconstruct = ColibriS(adj_matrix, 0.3)
L.shape, M.shape, R.shape, adj_matrix_reconstruct.shape

((489, 141), (141, 141), (141, 489), (489, 489))

In [88]:
G_reconstruct = graph_from_adj_matrix(adj_matrix_reconstruct, corr.columns, threshold=0.85)
draw_graph(G_reconstruct)
loss(adj_matrix_reconstruct, adj_matrix)

np.float64(94.1850963538142)

In [89]:
svd_reconstruct = SVD(adj_matrix, 0.3)

In [109]:
G_svd_reconstruct = graph_from_adj_matrix(svd_reconstruct, corr.columns, threshold=0.85)
draw_graph(G_svd_reconstruct)
loss(svd_reconstruct, adj_matrix)

np.float64(15.604808668527973)

In [98]:
# Get info on CME
values = corr["CME"].sort_values(ascending=False)
values[values > 0.85]

CME           1.000000
CBOE          0.924663
Unnamed: 0    0.909319
ANET          0.901423
VST           0.900421
PKG           0.898362
INTU          0.897718
TRGP          0.895749
DELL          0.890486
JBL           0.881609
LLY           0.880396
ADBE          0.876718
MPC           0.875080
AJG           0.866904
SNPS          0.866770
FICO          0.866612
BR            0.864702
ROP           0.862101
BKNG          0.858433
MAR           0.857070
WELL          0.856180
PCAR          0.854024
ETN           0.851838
CDNS          0.850402
Name: CME, dtype: float64

In [99]:
# Create function to compare same stock across different graphs
def compare_stock(ticker, graphs):
    ticker_info = []
    idx = corr.columns.get_loc(ticker)
    for graph in graphs:
        ticker_info.append(len(graph.edges(ticker)))

    return ticker_info

In [102]:
compare_stock("CME", [G, G_reconstruct, G_svd_reconstruct])

[38, 3, 20]

In [105]:
t_info = [compare_stock(ticker, [G, G_reconstruct, G_svd_reconstruct]) for ticker in corr.columns]
t_info = pd.DataFrame(t_info, columns=["Original", "ColibriS", "SVD"], index=corr.columns)

In [108]:
t_info.to_csv("data/colibri_svd_edge_count.csv")
t_info

Unnamed: 0,Original,ColibriS,SVD
A,24,1,16
AAPL,54,20,45
ABBV,0,0,0
ABNB,4,0,0
ABT,0,0,0
...,...,...,...
XYL,0,0,0
YUM,1,0,0
ZBH,2,0,0
ZBRA,4,0,0


In [107]:
sum(t_info["Original"]), sum(t_info["ColibriS"]), sum(t_info["SVD"])

(10396, 1028, 7006)