## Libraries used

In [2]:
import pandas as pd
import numpy as np
import polars as pl
import json
import tarfile
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from typing import Tuple
import os
import itertools
from numpy import linalg as LA
import numpy as np
from numpy.linalg import eig
import networkx as nx
from sklearn.preprocessing import MinMaxScaler
import community
from matplotlib.patches import Patch
from clustering_script import *

## Compute covariance matrices

**Compute the covariance matrices with the Hayashi–Yoshida estimator, using the functions in utils, on the dates with the highest responses.**

### Compute covariance matrix for demo

In [8]:
# Demo run on a single date/time string (2010-05-06, window starting 14:45).
# Arguments, in order: source data folder, list of "YYYY-MM-DD-HH:MM" strings,
# and the folder that receives the covariance matrices
# (argument roles inferred from the call and its printed output -- TODO confirm
# against compute_covariance_matrices).
compute_covariance_matrices(
    "processed/final_yearly/",
    ["2010-05-06-14:45"],
    "output_cov_matrices",
)

['2010-05-06-14:45']
Processing date: 2010-05-06 with start time: 14:45
Covariance matrices saved for all dates in output_cov_matrices.


### Compute covariance matrix for all dates

In [None]:
# One "YYYY-MM-DD-HH:MM" entry per selected date (eleven dates in 2010).
highest_responses_strings = [
    '2010-01-27-14:00',
    '2010-02-03-13:00',
    '2010-03-08-12:30',
    '2010-04-05-10:00',
    '2010-06-11-13:15',
    '2010-07-23-12:45',
    '2010-08-20-12:45',
    '2010-09-23-14:30',
    '2010-10-28-13:00',
    '2010-11-26-13:15',
    '2010-12-31-12:00',
]

# NOTE(review): this call writes to "output_cov_matrices3", whereas the
# clustering cells in this notebook read from "output_cov_matrices2" --
# confirm which folder is intended before re-running everything.
compute_covariance_matrices(
    "data_clean",
    highest_responses_strings,
    "output_cov_matrices3",
)

## Create and plot the different clusters for each period and each date

**Steps performed after creating the covariance files:**

- Iterate over each file and each date
- Read the corresponding covariance matrix
- Clip the matrix using the eigenvalue-clipping method discussed in class
- Normalize the clipped matrix so that it can be used for Louvain clustering
- Save the resulting clusters

In [11]:
def main(cov_matrix_dir='output_cov_matrices', output_dir='cluster_plots_crash'):
    """Cluster every per-period covariance matrix and plot the cluster make-up.

    For each of the ``before``/``during``/``after`` parquet files in
    ``cov_matrix_dir`` and each date they contain, the covariance matrix is
    eigenvalue-clipped, its diagonal is zeroed, it is min-max scaled and then
    passed to ``perform_louvain_clustering``. The resulting clusters are
    appended as one JSON line per date to
    ``<output_dir>/<period>_clusters.jsonl`` and a plot is saved to
    ``<output_dir>/<period>/<period>_<date>.png``.

    Failures are reported with ``print`` and skipped (best effort): a file
    that cannot be loaded skips its period, a date that raises skips that date.

    Parameters:
        cov_matrix_dir (str): Directory containing ``before.parquet``,
            ``during.parquet`` and ``after.parquet``. The default is the
            folder this cell originally hard-coded.
        output_dir (str): Directory receiving the JSONL files and the plots.
            The default is the folder this cell originally hard-coded.

    Returns:
        None

    Note:
        ``load_and_reshape_covariance``, ``eigenvalue_clipping``,
        ``perform_louvain_clustering``, ``plot_cluster_domains``,
        ``asset_names``, ``asset_to_domain`` and ``domain_colors`` are not
        defined in this cell; presumably they come from
        ``from clustering_script import *`` -- TODO confirm.
    """
    # Map each period to the parquet file holding its covariance matrices
    periods_files = {
        period: f'{cov_matrix_dir}/{period}.parquet'
        for period in ('before', 'during', 'after')
    }

    # Root directory for the JSONL files and the per-period plot folders
    os.makedirs(output_dir, exist_ok=True)

    # One JSONL file of clusters per period
    cluster_output_files = {
        period: os.path.join(output_dir, f'{period}_clusters.jsonl')
        for period in periods_files
    }

    # Truncate the cluster files so results from an earlier run are not mixed
    # with the lines appended below
    for cluster_file in cluster_output_files.values():
        with open(cluster_file, 'w'):
            pass

    for period, file_path in periods_files.items():
        print(f'\nProcessing Period: {period.capitalize()} | File: {file_path}')

        # Load all covariance matrices for the period; skip the period on failure
        try:
            cov_matrices_dict = load_and_reshape_covariance(file_path)
        except Exception as e:
            print(f'Error loading file {file_path}: {e}')
            continue

        for date, cov_matrix in cov_matrices_dict.items():
            print(f'  Processing Date: {date}')

            try:
                # Step 1: Apply eigenvalue clipping; convert to an ndarray first
                # so the in-place diagonal fill works on any array-like result
                clipped_cov_matrix = np.asarray(eigenvalue_clipping(cov_matrix))
                # Step 2: Fill diagonal with zeros
                np.fill_diagonal(clipped_cov_matrix, 0)
                # Step 3: Normalize the covariance matrix
                # NOTE(review): MinMaxScaler rescales each column on its own,
                # so the output is generally not symmetric and the zeroed
                # diagonal stays zero only where 0 is the column minimum --
                # confirm this is what perform_louvain_clustering expects.
                adj_matrix = MinMaxScaler().fit_transform(clipped_cov_matrix)
                # Step 4: Perform Louvain clustering
                partition_optimal = perform_louvain_clustering(adj_matrix)

                # Group asset names by the cluster id assigned to their node
                clusters_optimal = {}
                for node, cluster_id in partition_optimal.items():
                    if node < len(asset_names):
                        asset_name = asset_names[node]
                    else:
                        # Placeholder for nodes without a known asset name
                        asset_name = f'Asset_{node}'
                    clusters_optimal.setdefault(cluster_id, []).append(asset_name)

                print(f'    Optimal Clustering: {clusters_optimal}')

                # Step 5: Append the clusters of this date as one JSON line
                cluster_data = {
                    'date': date,
                    'clusters': clusters_optimal
                }
                with open(cluster_output_files[period], 'a') as f:
                    f.write(json.dumps(cluster_data) + '\n')

                # Step 6: Count, per cluster, how many assets belong to each domain
                cluster_domains = {}
                for cluster_id, assets in clusters_optimal.items():
                    domain_count = {domain: 0 for domain in domain_colors.keys()}
                    for asset in assets:
                        domain = asset_to_domain.get(asset, 'Unknown')
                        # The fallback 'Unknown' (or any domain missing from
                        # domain_colors) has no pre-initialised counter, so
                        # a plain `+= 1` would raise KeyError here.
                        domain_count[domain] = domain_count.get(domain, 0) + 1
                    cluster_domains[cluster_id] = domain_count

                # Step 7: Build the plot path inside a per-period sub-folder
                period_dir = os.path.join(output_dir, period)
                os.makedirs(period_dir, exist_ok=True)
                save_path = os.path.join(period_dir, f'{period}_{date}.png')

                # Step 8: Plot and save the cluster domains
                plot_cluster_domains(
                    cluster_domains=cluster_domains,
                    clusters_optimal=clusters_optimal,
                    domain_colors=domain_colors,
                    period=period,
                    date=date,
                    save_path=save_path
                )

                print(f'    Plot saved to: {save_path}')

            except Exception as e:
                # Best effort: report the failing date and carry on with the next
                print(f'    Error processing date {date}: {e}')
                continue

    print('\nAll clustering and plotting completed.')

if __name__ == "__main__":
    main()



Processing Period: Before | File: output_cov_matrices/before.parquet
  Processing Date: 2010-05-06
    Optimal Clustering: {4: ['AAPL', 'AMGN', 'BA', 'CAT', 'CSCO', 'MMM', 'TRV'], 0: ['AXP', 'HD', 'WMT'], 2: ['CVX', 'MRK', 'VZ'], 5: ['DOW', 'KO', 'UTX', 'XOM'], 3: ['IBM', 'PG', 'V'], 1: ['INTC', 'JPM']}
    Plot saved to: cluster_plots_crash\before\before_2010-05-06.png

Processing Period: During | File: output_cov_matrices/during.parquet
  Processing Date: 2010-05-06
    Optimal Clustering: {5: ['AAPL', 'HD', 'JPM'], 1: ['AMGN', 'DOW', 'IBM', 'KO'], 2: ['AXP', 'BA', 'MRK'], 0: ['CAT', 'V', 'VZ'], 3: ['CSCO', 'WMT', 'XOM'], 6: ['CVX', 'MMM', 'TRV'], 4: ['INTC', 'PG', 'UTX']}
    Plot saved to: cluster_plots_crash\during\during_2010-05-06.png

Processing Period: After | File: output_cov_matrices/after.parquet
  Processing Date: 2010-05-06
    Optimal Clustering: {1: ['AAPL', 'AMGN', 'AXP', 'BA', 'CAT', 'CSCO', 'IBM', 'INTC', 'KO'], 0: ['CVX', 'JPM', 'MMM', 'MRK', 'PG', 'TRV', 'V', 'VZ'

In [12]:
def main(cov_matrix_dir='output_cov_matrices2', output_dir='cluster_plots_all'):
    """Cluster every per-period covariance matrix and plot the cluster make-up.

    For each of the ``before``/``during``/``after`` parquet files in
    ``cov_matrix_dir`` and each date they contain, the covariance matrix is
    eigenvalue-clipped, its diagonal is zeroed, it is min-max scaled and then
    passed to ``perform_louvain_clustering``. The resulting clusters are
    appended as one JSON line per date to
    ``<output_dir>/<period>_clusters.jsonl`` and a plot is saved to
    ``<output_dir>/<period>/<period>_<date>.png``.

    Failures are reported with ``print`` and skipped (best effort): a file
    that cannot be loaded skips its period, a date that raises skips that date.

    Parameters:
        cov_matrix_dir (str): Directory containing ``before.parquet``,
            ``during.parquet`` and ``after.parquet``. The default is the
            folder this cell originally hard-coded.
        output_dir (str): Directory receiving the JSONL files and the plots.
            The default is the folder this cell originally hard-coded.

    Returns:
        None

    Note:
        ``load_and_reshape_covariance``, ``eigenvalue_clipping``,
        ``perform_louvain_clustering``, ``plot_cluster_domains``,
        ``asset_names``, ``asset_to_domain`` and ``domain_colors`` are not
        defined in this cell; presumably they come from
        ``from clustering_script import *`` -- TODO confirm.
    """
    # Map each period to the parquet file holding its covariance matrices
    periods_files = {
        period: f'{cov_matrix_dir}/{period}.parquet'
        for period in ('before', 'during', 'after')
    }

    # Root directory for the JSONL files and the per-period plot folders
    os.makedirs(output_dir, exist_ok=True)

    # One JSONL file of clusters per period
    cluster_output_files = {
        period: os.path.join(output_dir, f'{period}_clusters.jsonl')
        for period in periods_files
    }

    # Truncate the cluster files so results from an earlier run are not mixed
    # with the lines appended below
    for cluster_file in cluster_output_files.values():
        with open(cluster_file, 'w'):
            pass

    for period, file_path in periods_files.items():
        print(f'\nProcessing Period: {period.capitalize()} | File: {file_path}')

        # Load all covariance matrices for the period; skip the period on failure
        try:
            cov_matrices_dict = load_and_reshape_covariance(file_path)
        except Exception as e:
            print(f'Error loading file {file_path}: {e}')
            continue

        for date, cov_matrix in cov_matrices_dict.items():
            print(f'  Processing Date: {date}')

            try:
                # Step 1: Apply eigenvalue clipping; convert to an ndarray first
                # so the in-place diagonal fill works on any array-like result
                clipped_cov_matrix = np.asarray(eigenvalue_clipping(cov_matrix))
                # Step 2: Fill diagonal with zeros
                np.fill_diagonal(clipped_cov_matrix, 0)
                # Step 3: Normalize the covariance matrix
                # NOTE(review): MinMaxScaler rescales each column on its own,
                # so the output is generally not symmetric and the zeroed
                # diagonal stays zero only where 0 is the column minimum --
                # confirm this is what perform_louvain_clustering expects.
                adj_matrix = MinMaxScaler().fit_transform(clipped_cov_matrix)
                # Step 4: Perform Louvain clustering
                partition_optimal = perform_louvain_clustering(adj_matrix)

                # Group asset names by the cluster id assigned to their node
                clusters_optimal = {}
                for node, cluster_id in partition_optimal.items():
                    if node < len(asset_names):
                        asset_name = asset_names[node]
                    else:
                        # Placeholder for nodes without a known asset name
                        asset_name = f'Asset_{node}'
                    clusters_optimal.setdefault(cluster_id, []).append(asset_name)

                print(f'    Optimal Clustering: {clusters_optimal}')

                # Step 5: Append the clusters of this date as one JSON line
                cluster_data = {
                    'date': date,
                    'clusters': clusters_optimal
                }
                with open(cluster_output_files[period], 'a') as f:
                    f.write(json.dumps(cluster_data) + '\n')

                # Step 6: Count, per cluster, how many assets belong to each domain
                cluster_domains = {}
                for cluster_id, assets in clusters_optimal.items():
                    domain_count = {domain: 0 for domain in domain_colors.keys()}
                    for asset in assets:
                        domain = asset_to_domain.get(asset, 'Unknown')
                        # The fallback 'Unknown' (or any domain missing from
                        # domain_colors) has no pre-initialised counter, so
                        # a plain `+= 1` would raise KeyError here.
                        domain_count[domain] = domain_count.get(domain, 0) + 1
                    cluster_domains[cluster_id] = domain_count

                # Step 7: Build the plot path inside a per-period sub-folder
                period_dir = os.path.join(output_dir, period)
                os.makedirs(period_dir, exist_ok=True)
                save_path = os.path.join(period_dir, f'{period}_{date}.png')

                # Step 8: Plot and save the cluster domains
                plot_cluster_domains(
                    cluster_domains=cluster_domains,
                    clusters_optimal=clusters_optimal,
                    domain_colors=domain_colors,
                    period=period,
                    date=date,
                    save_path=save_path
                )

                print(f'    Plot saved to: {save_path}')

            except Exception as e:
                # Best effort: report the failing date and carry on with the next
                print(f'    Error processing date {date}: {e}')
                continue

    print('\nAll clustering and plotting completed.')

if __name__ == "__main__":
    main()



Processing Period: Before | File: output_cov_matrices2/before.parquet
  Processing Date: 2010-11-26
    Optimal Clustering: {0: ['AAPL', 'IBM'], 1: ['AMGN', 'AXP', 'CSCO', 'INTC'], 3: ['BA', 'VZ'], 7: ['CAT', 'UTX'], 6: ['CVX', 'MRK'], 4: ['DOW', 'KO'], 8: ['HD', 'V'], 2: ['JPM', 'MMM'], 5: ['PG', 'TRV', 'WMT'], 9: ['XOM']}
    Plot saved to: cluster_plots_all\before\before_2010-11-26.png
  Processing Date: 2010-01-27
    Optimal Clustering: {0: ['AAPL', 'CAT', 'TRV'], 3: ['AMGN', 'BA', 'CVX', 'MRK', 'PG'], 2: ['AXP', 'V'], 4: ['CSCO', 'IBM'], 1: ['DOW', 'HD'], 6: ['INTC', 'MMM'], 7: ['JPM', 'KO'], 5: ['UTX', 'VZ', 'WMT', 'XOM']}
    Plot saved to: cluster_plots_all\before\before_2010-01-27.png
  Processing Date: 2010-04-05
    Optimal Clustering: {0: ['AAPL', 'AMGN', 'AXP', 'BA', 'IBM', 'JPM', 'MMM', 'PG', 'TRV'], 2: ['CAT'], 1: ['CSCO', 'HD', 'INTC', 'MRK', 'UTX', 'V', 'VZ', 'WMT', 'XOM'], 3: ['CVX', 'DOW', 'KO']}
    Plot saved to: cluster_plots_all\before\before_2010-04-05.png
  P

In [3]:
def _build_adjacency_matrix(cov_matrix):
    """Clip one covariance matrix, zero its diagonal and min-max scale it."""
    # Convert first so the in-place diagonal fill works on any array-like result
    clipped = np.asarray(eigenvalue_clipping(cov_matrix))
    np.fill_diagonal(clipped, 0)
    # NOTE(review): MinMaxScaler rescales each column on its own, so the
    # output is generally not symmetric and the zeroed diagonal stays zero
    # only where 0 is the column minimum -- confirm this is what
    # perform_louvain_clustering expects.
    return MinMaxScaler().fit_transform(clipped)


def _group_assets_by_cluster(partition):
    """Invert a node -> cluster id mapping into cluster id -> list of asset names."""
    clusters = {}
    for node, cluster_id in partition.items():
        if node < len(asset_names):
            asset_name = asset_names[node]
        else:
            # Placeholder for nodes without a known asset name
            asset_name = f'Asset_{node}'
        clusters.setdefault(cluster_id, []).append(asset_name)
    return clusters


def _count_domains_per_cluster(clusters):
    """Count, for each cluster, how many of its assets belong to each domain."""
    cluster_domains = {}
    for cluster_id, assets in clusters.items():
        domain_count = {domain: 0 for domain in domain_colors.keys()}
        for asset in assets:
            domain = asset_to_domain.get(asset, 'Unknown')
            # The fallback 'Unknown' (or any domain missing from domain_colors)
            # has no pre-initialised counter, so a plain `+= 1` would raise
            # KeyError here.
            domain_count[domain] = domain_count.get(domain, 0) + 1
        cluster_domains[cluster_id] = domain_count
    return cluster_domains


def process_covariance_matrices(cov_matrix_path, output_folder):
    """
    Processes covariance matrices, applies clustering, and generates plots.

    For each of the ``before``/``during``/``after`` parquet files and each date
    they contain, the clusters are appended as one JSON line to
    ``<output_folder>/<period>_clusters.jsonl`` and a plot is saved to
    ``<output_folder>/<period>/<period>_<date>.png``. Failures are printed and
    skipped: a file that cannot be loaded skips its period, a date that raises
    skips that date.

    Parameters:
        cov_matrix_path (str): Path to the directory containing covariance matrix files.
        output_folder (str): Path to the output directory for saving results.

    Returns:
        None

    Note:
        ``load_and_reshape_covariance``, ``eigenvalue_clipping``,
        ``perform_louvain_clustering``, ``plot_cluster_domains``,
        ``asset_names``, ``asset_to_domain`` and ``domain_colors`` are not
        defined in this cell; presumably they come from
        ``from clustering_script import *`` -- TODO confirm.

        NOTE(review): the outputs recorded in this notebook show different
        clusters for the same file and date across runs, so the clustering
        appears to be unseeded -- consider fixing a seed inside
        perform_louvain_clustering if reproducibility matters.
    """
    # Map each period to the parquet file holding its covariance matrices
    periods_files = {
        period: f'{cov_matrix_path}/{period}.parquet'
        for period in ('before', 'during', 'after')
    }

    # Root directory for the JSONL files and the per-period plot folders
    os.makedirs(output_folder, exist_ok=True)

    # One JSONL file of clusters per period
    cluster_output_files = {
        period: os.path.join(output_folder, f'{period}_clusters.jsonl')
        for period in periods_files
    }

    # Truncate the cluster files so results from an earlier run are not mixed
    # with the lines appended below
    for cluster_file in cluster_output_files.values():
        with open(cluster_file, 'w'):
            pass

    for period, file_path in periods_files.items():
        print(f'\nProcessing Period: {period.capitalize()} | File: {file_path}')

        # Load all covariance matrices for the period; skip the period on failure
        try:
            cov_matrices_dict = load_and_reshape_covariance(file_path)
        except Exception as e:
            print(f'Error loading file {file_path}: {e}')
            continue

        for date, cov_matrix in cov_matrices_dict.items():
            print(f'  Processing Date: {date}')

            try:
                # Steps 1-3: clip, zero the diagonal, normalize
                adj_matrix = _build_adjacency_matrix(cov_matrix)
                # Step 4: Perform Louvain clustering
                partition_optimal = perform_louvain_clustering(adj_matrix)
                clusters_optimal = _group_assets_by_cluster(partition_optimal)

                print(f'    Optimal Clustering: {clusters_optimal}')

                # Step 5: Append the clusters of this date as one JSON line
                cluster_data = {
                    'date': date,
                    'clusters': clusters_optimal
                }
                with open(cluster_output_files[period], 'a') as f:
                    f.write(json.dumps(cluster_data) + '\n')

                # Step 6: Count domains inside each cluster
                cluster_domains = _count_domains_per_cluster(clusters_optimal)

                # Step 7: Build the plot path inside a per-period sub-folder
                period_dir = os.path.join(output_folder, period)
                os.makedirs(period_dir, exist_ok=True)
                save_path = os.path.join(period_dir, f'{period}_{date}.png')

                # Step 8: Plot and save the cluster domains
                plot_cluster_domains(
                    cluster_domains=cluster_domains,
                    clusters_optimal=clusters_optimal,
                    domain_colors=domain_colors,
                    period=period,
                    date=date,
                    save_path=save_path
                )

                print(f'    Plot saved to: {save_path}')

            except Exception as e:
                # Best effort: report the failing date and carry on with the next
                print(f'    Error processing date {date}: {e}')
                continue

    print('\nAll clustering and plotting completed.')

process_covariance_matrices('output_cov_matrices2', "test")


Processing Period: Before | File: output_cov_matrices2/before.parquet
  Processing Date: 2010-11-26
    Optimal Clustering: {0: ['AAPL', 'AMGN', 'CSCO', 'INTC'], 1: ['AXP', 'IBM'], 3: ['BA', 'JPM'], 7: ['CAT', 'V'], 6: ['CVX', 'UTX'], 8: ['DOW', 'HD'], 9: ['KO', 'MRK'], 2: ['MMM', 'VZ'], 5: ['PG', 'TRV', 'WMT'], 4: ['XOM']}
    Plot saved to: test\before\before_2010-11-26.png
  Processing Date: 2010-01-27
    Optimal Clustering: {0: ['AAPL', 'JPM', 'TRV'], 4: ['AMGN', 'CAT', 'CVX', 'IBM', 'PG'], 2: ['AXP', 'HD'], 3: ['BA', 'MMM'], 1: ['CSCO', 'V'], 6: ['DOW', 'KO'], 7: ['INTC', 'MRK'], 5: ['UTX', 'VZ', 'WMT', 'XOM']}
    Plot saved to: test\before\before_2010-01-27.png
  Processing Date: 2010-04-05
    Optimal Clustering: {0: ['AAPL', 'BA', 'IBM', 'XOM'], 2: ['AMGN', 'AXP', 'CAT', 'CSCO', 'DOW', 'HD', 'JPM', 'KO', 'MMM', 'PG', 'TRV'], 1: ['CVX', 'INTC', 'MRK', 'UTX', 'V', 'VZ', 'WMT']}
    Plot saved to: test\before\before_2010-04-05.png
  Processing Date: 2010-08-20
    Optimal Clust