In [104]:
import os
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from signet.cluster import Cluster
import glob

In [105]:
# Path to the 'Yearly' directory
data_root = "Data/CRSP/Yearly"

In [106]:
def load_gzipped_data(data_root):
    """
    Loads all .csv.gz daily files from Yearly/2000 through Yearly/2015.
    Extracts the date from the filename and adds it as a column.
    """
    all_files = []
    for year in range(2000, 2016):
        pattern = os.path.join(data_root, str(year), "*.csv.gz")
        files = sorted(glob.glob(pattern))
        
        for f in files:
            try:
                df_day = pd.read_csv(f, compression='gzip')
                date_str = os.path.basename(f).replace(".csv.gz", "")
                df_day["date"] = pd.to_datetime(date_str, format="%Y%m%d")
                all_files.append(df_day)
            except Exception as e:
                print(f"Skipping {f}: {e}")

    df = pd.concat(all_files, ignore_index=True)
    return df

In [81]:
# def compute_lookback_return_matrix(df, lookback=20):
#     """
#     Computes log-returns over a given lookback window and returns a
#     (tickers × dates) matrix.
#     """
#     df = df.copy()
#     df = df.sort_values(by=['ticker', 'date'])
#     df['log_adjclose'] = np.log(df['prevAdjClose'])
    
#     df['log_return'] = df.groupby('ticker')['log_adjclose'].transform(
#         lambda x: x - x.shift(lookback)
#     )
    
#     df = df.dropna(subset=['log_return']) ##CHECK THIS LATER
#     return df.pivot(index='ticker', columns='date', values='log_return')

##maybe do it this way instead


In [107]:
def build_signed_graph(R, threshold=0.3):
    """
    R: (n_assets x n_times) return matrix
    threshold: correlation cutoff for edge inclusion

    Returns: (Ap, An) positive and negative adjacency matrices
    """
    corr = R.T.corr().fillna(0)
    np.fill_diagonal(corr.values, 0)  # remove self loops

    
    Ap = csr_matrix(corr.values * (corr.values >= threshold))
    An = csr_matrix(-corr.values * (corr.values <= -threshold))

    return Ap, An


In [108]:
def sponge_clustering(Ap, An, k=5, tau_p=1, tau_n=1):
    """
    Perform SPONGE clustering on signed graph defined by (Ap, An)
    """
    clusterer = Cluster((Ap, An))
    labels = clusterer.SPONGE(k=k, tau_p=tau_p, tau_n=tau_n)
    return labels


In [84]:
# def run_sponge_pipeline(data_root, k=5, lookback=20):
#     df = load_data(data_root)
    
#     print(f"Computing log-returns with lookback = {lookback} days...")
#     R = compute_lookback_return_matrix(df, lookback=lookback)
    
#     print("Constructing signed graph...")
#     Ap, An = construct_signed_graph(R)
    
#     print("Running SPONGE clustering...")
#     labels = sponge_clustering(Ap, An, k=k)
    
#     return pd.DataFrame({'ticker': R.index, 'cluster': labels})


In [87]:
# if __name__ == "__main__":
#     data_root = "Data/SYearly"
#     lookback = 20
#     num_clusters = 5
#     threshold = 0.3

#     print("Loading data...")
#     df = load_gzipped_data(data_root)

#     print(f"Computing log-returns with lookback = {lookback} days...")
#     R = compute_lookback_return_matrix(df, lookback=lookback)

#     print("Building signed graph from correlation matrix...")
#     Ap, An = build_signed_graph(R, threshold=threshold)

#     print(f"Running SPONGE clustering into {num_clusters} clusters...")
#     labels = sponge_clustering(Ap, An, k=num_clusters)

#     tickers = R.index.tolist()
#     clustered = pd.DataFrame({'ticker': tickers, 'cluster': labels})
#     print(clustered.head())



Loading data...
Computing log-returns with lookback = 20 days...


  result = getattr(ufunc, method)(*inputs, **kwargs)


Building signed graph from correlation matrix...
Running SPONGE clustering into 5 clusters...
  ticker  cluster
0     AA        0
1    AAA        3
2    AAC        4
3    AAG        1
4    AAI        3


In [109]:
df = load_gzipped_data("Data/CRSP/Yearly")


In [102]:
def compute_lookback_return_matrix(df, window_dates):
    df_window = df[df['date'].isin(window_dates)].copy()
    df_window = df_window[df_window['prevAdjClose'] > 0]  # filter non-positive prices
    
    df_window = df_window.sort_values(['ticker', 'date'])
    df_window['log_adjclose'] = np.log(df_window['prevAdjClose'])
    # For each ticker, shift the log_adjclose by `lookback` days ##CHECK THIS PART OF THE CODE
    df_window['log_adjclose_shifted'] = df_window.groupby('ticker')['log_adjclose'].shift(lookback)

    #Calculate the window log return explicitly over the lookback window
    df_window['log_return'] = df_window['log_adjclose'] - df_window['log_adjclose_shifted']
    df_window = df_window.dropna(subset=['log_return'])
    
    R = df_window.pivot(index='ticker', columns='date', values='log_return')
    return R


In [98]:
def build_signed_graph(R, threshold=0.3):
    corr = R.T.corr().fillna(0)
    np.fill_diagonal(corr.values, 0)
    Ap = csr_matrix(corr.values * (corr.values >= threshold))
    An = csr_matrix(-corr.values * (corr.values <= -threshold))
    return Ap, An

In [99]:
def sponge_clustering(Ap, An, k=5, tau_p=1, tau_n=1):
    clusterer = Cluster((Ap, An))
    labels = clusterer.SPONGE(k=k, tau_p=tau_p, tau_n=tau_n)
    return labels

In [110]:
def run_rolling_sponge_clustering(df, lookback=20, threshold=0.3, k=5):
    df = df.sort_values('date')
    unique_dates = df['date'].drop_duplicates().sort_values().reset_index(drop=True)

    results = []

    for i in range(lookback, len(unique_dates)):
        window_dates = unique_dates[i - lookback:i]
        R = compute_lookback_return_matrix(df, window_dates)
        
        if R.shape[1] < 2:
            # Not enough data to compute correlation
            continue

        Ap, An = build_signed_graph(R, threshold=threshold)
        labels = sponge_clustering(Ap, An, k=k)

        results.append({
            'date': unique_dates[i],
            'tickers': R.index.tolist(),
            'labels': labels
        })
        print(f"Processed clustering for window ending {unique_dates[i].date()}")

        print({
            'date': unique_dates[i],
            'tickers': R.index.tolist(),
            'labels': labels
        })

    return results

In [111]:
#df = your_loaded_dataframe_with_all_data
results = run_rolling_sponge_clustering(df, lookback=20, threshold=0.3, k=5)

Processed clustering for window ending 2000-02-01
{'date': Timestamp('2000-02-01 00:00:00'), 'tickers': ['AA', 'AAC', 'AAG', 'AAIR', 'AAM', 'AAP', 'AAS', 'AAT', 'AB', 'ABERF', 'ABF', 'ABG', 'ABK', 'ABM', 'ABMD', 'ABN', 'ABP', 'ABR', 'ABRI', 'ABS', 'ABT', 'ABX', 'ABY', 'AC', 'ACEE', 'ACF', 'ACG', 'ACI', 'ACK', 'ACL', 'ACLNF', 'ACN', 'ACO', 'ACP', 'ACR', 'ACRG', 'ACS', 'ACTN', 'ACV', 'ACX', 'ACXM', 'AD', 'ADC', 'ADECY', 'ADF', 'ADG', 'ADI', 'ADM', 'ADMS', 'ADP', 'ADPT', 'ADV', 'ADX', 'AEC', 'AED', 'AEE', 'AEG', 'AEM', 'AEOS', 'AEP', 'AER', 'AES', 'AET', 'AF', 'AFC', 'AFF', 'AFG', 'AFL', 'AFM', 'AFP', 'AFS', 'AG', 'AGA', 'AGC', 'AGE', 'AGI', 'AGL', 'AGM', 'AGN', 'AGR', 'AGS', 'AGTX', 'AGU', 'AGX', 'AGY', 'AH', 'AHC', 'AHG', 'AHI', 'AHMH', 'AHO', 'AHP', 'AHR', 'AIB', 'AIC', 'AIF', 'AIG', 'AIN', 'AIR', 'AIV', 'AIZ', 'AJG', 'AJL', 'AJX', 'AK', 'AKO', 'AKR', 'AKS', 'AL', 'ALA', 'ALB', 'ALC', 'ALEX', 'ALG', 'ALK', 'ALL', 'ALLC', 'ALLE', 'ALLY', 'ALM', 'ALN', 'ALO', 'ALR', 'ALRE', 'ALS', 'ALU',

KeyboardInterrupt: 

In [112]:
results

NameError: name 'results' is not defined