## Libraries used

In [8]:
import pandas as pd
import numpy as np
import polars as pl
import tarfile
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from typing import Tuple
import os
import itertools
from numpy import linalg as LA
import numpy as np
from numpy.linalg import eig
import networkx as nx
from sklearn.preprocessing import MinMaxScaler
import community
from matplotlib.patches import Patch
from utils_clustering import *

## Compute covariance matrices

**Compute the covariance matrices with the Hayashi–Yoshida estimator, using the functions in utils, on the dates with the highest responses.**

In [9]:
# Event timestamps ('YYYY-MM-DD-HH:MM') with the highest responses, passed to
# compute_covariance_matrices further down. Note that the list has no 2010-05 entry.
highest_responses_strings = np.array([
    '2010-01-27-14:00', '2010-02-03-13:00', '2010-03-08-12:30',
    '2010-04-05-10:00', '2010-06-11-13:15', '2010-07-23-12:45',
    '2010-08-20-12:45', '2010-09-23-14:30', '2010-10-28-13:00',
    '2010-11-26-13:15', '2010-12-31-12:00',
])

# Tickers in the order used to translate a graph node index into an asset name
# (the clustering code looks up asset_names[node]).
asset_names = (
    'AAPL AMGN AXP BA CAT CSCO CVX DOW HD IBM '
    'INTC JPM KO MMM MRK PG TRV UTX V VZ '
    'WMT XOM'
).split()

# Ticker -> domain lookup, written grouped by domain and flattened into a dict.
asset_to_domain = {
    ticker: domain
    for domain, tickers in (
        ('Technology', ('AAPL', 'IBM', 'INTC', 'MSFT')),
        ('Healthcare', ('AMGN', 'JNJ', 'MMM', 'MRK', 'PFE', 'UNH')),
        ('Finance', ('AXP', 'GS', 'JPM', 'TRV')),
        ('Industrials', ('BA', 'CAT', 'DOW', 'RTX', 'UTX')),
        ('Telecommunications', ('CSCO', 'VZ')),
        ('Energy', ('CVX', 'XOM')),
        ('Retail', ('HD',)),
        ('Consumer Staples', ('KO', 'WBA')),
        ('Consumer Discretionary', ('MCD', 'NKE', 'PG', 'V', 'WMT')),
    )
    for ticker in tickers
}

# Hex colour assigned to each domain; the key order also fixes the order of the
# per-cluster domain counters built from it.
domain_colors = dict([
    ('Technology', '#1f77b4'),
    ('Healthcare', '#ff7f0e'),
    ('Finance', '#2ca02c'),
    ('Industrials', '#d62728'),
    ('Telecommunications', '#9467bd'),
    ('Energy', '#8c564b'),
    ('Retail', '#e377c2'),
    ('Consumer Staples', '#7f7f7f'),
    ('Consumer Discretionary', '#bcbd22'),
])


### Compute covariance matrix for demo

In [None]:
# Demo run: a single event timestamp ("2010-05-06-14:45").
# The three positional arguments are presumably the input data directory, the
# list of event timestamps and the output directory — confirm against the
# helper's definition. The demo clustering paths further down read
# 'output_cov_matrices/{before,during,after}.parquet'.
compute_covariance_matrices("data_clean",
                            ["2010-05-06-14:45"],
                            "output_cov_matrices")

### Compute covariance matrix for all dates

In [10]:
# Full run over every timestamp in highest_responses_strings.
# NOTE(review): this call targets "output_cov_matrices3", whereas the clustering
# cells below read their non-demo inputs from 'output_cov_matrices2/...' —
# confirm which directory is intended, otherwise a fresh "Restart & Run All"
# would cluster files that this cell did not produce.
compute_covariance_matrices("data_clean",
                            highest_responses_strings,
                            "output_cov_matrices3")

Processing date: 2010-01-27 with start time: 14:00
Processing date: 2010-02-03 with start time: 13:00
Processing date: 2010-03-08 with start time: 12:30
Processing date: 2010-04-05 with start time: 10:00


## Create and plot the different clusters for each period and each date

**Steps performed after creating the covariance files:**

- Iterate over each file and each date
- Read the corresponding covariance matrix
- Clip the matrix using the eigenvalue-clipping method discussed in class
- Normalize the matrix so that it can be used for Louvain clustering
- Save the resulting clusters

In [7]:
def main():
    """Cluster the assets for every period and date, and save one plot per date.

    For each period file under ``output_cov_matrices2`` the covariance matrices
    are loaded with ``load_and_reshape_covariance``. Every matrix is then
    eigenvalue-clipped, has its diagonal set to zero, is min-max scaled and is
    passed to ``perform_louvain_clustering``. The clusters are summarised as
    per-domain asset counts and plotted with ``plot_cluster_domains`` to
    ``cluster_plots2/<period>/<period>_<date>.png``.

    Failures are best-effort: a file that cannot be loaded, or a date that
    cannot be processed, is reported with ``print`` and skipped.

    Returns:
        None
    """
    # Label and colour used for assets that have no entry in asset_to_domain
    unknown_domain = 'Unknown'
    fallback_color = '#cccccc'

    # Period name -> parquet file holding that period's covariance matrices
    periods_files = {
        'before': 'output_cov_matrices2/before.parquet',
        'during': 'output_cov_matrices2/during.parquet',
        'after': 'output_cov_matrices2/after.parquet'
    }

    # Create a directory to save all plots
    output_dir = 'cluster_plots2'
    os.makedirs(output_dir, exist_ok=True)

    # Iterate over each period and its corresponding file
    for period, file_path in periods_files.items():
        print(f'\nProcessing Period: {period.capitalize()} | File: {file_path}')

        # Load all covariance matrices for the period
        try:
            cov_matrices_dict = load_and_reshape_covariance(file_path)
        except Exception as e:
            print(f'Error loading file {file_path}: {e}')
            continue

        # Iterate over each date and its covariance matrix
        for date, cov_matrix in cov_matrices_dict.items():
            print(f'  Processing Date: {date}')

            try:
                # Step 1: Apply eigenvalue clipping
                clipped_cov_matrix = eigenvalue_clipping(cov_matrix)
                # Step 2: Fill diagonal with zeros
                np.fill_diagonal(clipped_cov_matrix, 0)
                clipped_cov_matrix = np.asarray(clipped_cov_matrix)
                # Step 3: Normalize the covariance matrix
                # NOTE(review): MinMaxScaler rescales every column independently,
                # so the result is in general not symmetric and the zeroed
                # diagonal can become non-zero when a column has negative
                # entries — confirm this is the intended adjacency matrix.
                scaler = MinMaxScaler()
                adj_matrix = scaler.fit_transform(clipped_cov_matrix)
                # Step 4: Perform Louvain clustering
                partition_optimal = perform_louvain_clustering(adj_matrix)

                # Step 5: Group asset names by cluster id
                clusters_optimal = {}
                for node, cluster_id in partition_optimal.items():
                    # Ensure node index is within the range of asset_names
                    if node < len(asset_names):
                        asset_name = asset_names[node]
                    else:
                        asset_name = f'Asset_{node}'  # Placeholder for undefined assets
                    clusters_optimal.setdefault(cluster_id, []).append(asset_name)

                # Output the optimal clustering results
                print(f'    Optimal Clustering: {clusters_optimal}')

                # Step 6: Count the assets of each domain inside every cluster.
                # Assets without a known domain (e.g. the 'Asset_<n>' placeholders)
                # are counted under 'Unknown'. Previously that label was missing
                # from the counter, so `+= 1` raised KeyError and the whole date
                # was skipped by the except clause below.
                asset_domains = {
                    asset: asset_to_domain.get(asset, unknown_domain)
                    for assets in clusters_optimal.values()
                    for asset in assets
                }
                # Same colours as domain_colors, extended only when needed so the
                # output is unchanged whenever every asset has a known domain.
                plot_colors = dict(domain_colors)
                for domain in asset_domains.values():
                    plot_colors.setdefault(domain, fallback_color)

                cluster_domains = {}
                for cluster_id, assets in clusters_optimal.items():
                    # Every cluster gets the same set of domain keys
                    domain_count = dict.fromkeys(plot_colors, 0)
                    for asset in assets:
                        domain_count[asset_domains[asset]] += 1
                    cluster_domains[cluster_id] = domain_count

                # Step 7: Define the save path for the plot
                # Create a directory for the period if it doesn't exist
                period_dir = os.path.join(output_dir, period)
                os.makedirs(period_dir, exist_ok=True)

                # Define the filename based on the date
                save_filename = f'{period}_{date}.png'
                save_path = os.path.join(period_dir, save_filename)

                # Step 8: Plot and save the cluster domains
                plot_cluster_domains(
                    cluster_domains=cluster_domains,
                    clusters_optimal=clusters_optimal,
                    domain_colors=plot_colors,
                    period=period,
                    date=date,
                    save_path=save_path
                )

                print(f'    Plot saved to: {save_path}')

            except Exception as e:
                # Best effort: report the failure and move on to the next date
                print(f'    Error processing date {date}: {e}')
                continue

    print('\nAll clustering and plotting completed.')

if __name__ == "__main__":
    main()


Processing Period: Before | File: output_cov_matrices2/before.parquet
  Processing Date: 2010-11-26
0
1
2
3
4
    Optimal Clustering: {0: ['AAPL', 'AMGN', 'CSCO', 'DOW', 'INTC', 'KO'], 1: ['AXP', 'IBM'], 2: ['BA', 'UTX'], 3: ['CAT', 'VZ'], 5: ['CVX', 'MRK'], 6: ['HD', 'TRV'], 7: ['JPM', 'PG', 'WMT'], 8: ['MMM', 'V'], 4: ['XOM']}
    Plot saved to: cluster_plots2\before\before_2010-11-26.png
  Processing Date: 2010-01-27
0
1
2
3
4
    Optimal Clustering: {0: ['AAPL', 'KO'], 2: ['AMGN', 'AXP', 'BA', 'CSCO', 'PG'], 3: ['CAT', 'IBM'], 4: ['CVX', 'JPM', 'TRV'], 5: ['DOW', 'INTC'], 6: ['HD', 'MRK'], 1: ['MMM', 'V'], 7: ['UTX', 'VZ', 'WMT', 'XOM']}
    Plot saved to: cluster_plots2\before\before_2010-01-27.png
  Processing Date: 2010-04-05
0
1
2
3
4
    Optimal Clustering: {0: ['AAPL', 'IBM', 'XOM'], 3: ['AMGN', 'AXP', 'BA'], 2: ['CAT', 'CSCO', 'CVX', 'DOW', 'HD', 'JPM', 'KO', 'MMM', 'PG', 'TRV'], 1: ['INTC', 'MRK', 'UTX', 'V', 'VZ', 'WMT']}
    Plot saved to: cluster_plots2\before\before_20

In [None]:
def main(demo=True):
    """Cluster the assets for every period and date, and save one plot per date.

    Each covariance matrix loaded with ``load_and_reshape_covariance`` is
    eigenvalue-clipped, has its diagonal set to zero, is min-max scaled and is
    passed to ``perform_louvain_clustering``. The clusters are summarised as
    per-domain asset counts and plotted with ``plot_cluster_domains`` to
    ``cluster_plots2/<period>/<period>_<date>.png``.

    Failures are best-effort: a file that cannot be loaded, or a date that
    cannot be processed, is reported with ``print`` and skipped.

    Args:
        demo: When true, read the period files from ``output_cov_matrices``;
            otherwise read them from ``output_cov_matrices2``.

    Returns:
        None
    """
    # Label and colour used for assets that have no entry in asset_to_domain
    unknown_domain = 'Unknown'
    fallback_color = '#cccccc'

    # Define the periods and their corresponding file paths
    demo_periods_files = {
        'before': 'output_cov_matrices/before.parquet',
        'during': 'output_cov_matrices/during.parquet',
        'after': 'output_cov_matrices/after.parquet'
    }

    periods_files = {
        'before': 'output_cov_matrices2/before.parquet',
        'during': 'output_cov_matrices2/during.parquet',
        'after': 'output_cov_matrices2/after.parquet'
    }

    # Select the appropriate file paths based on the demo flag
    selected_files = demo_periods_files if demo else periods_files

    # Create a directory to save all plots (shared by demo and full runs)
    output_dir = 'cluster_plots2'
    os.makedirs(output_dir, exist_ok=True)

    # Iterate over each period and its corresponding file
    for period, file_path in selected_files.items():
        print(f'\nProcessing Period: {period.capitalize()} | File: {file_path}')

        # Load all covariance matrices for the period
        try:
            cov_matrices_dict = load_and_reshape_covariance(file_path)
        except Exception as e:
            print(f'Error loading file {file_path}: {e}')
            continue

        # Iterate over each date and its covariance matrix
        for date, cov_matrix in cov_matrices_dict.items():
            print(f'  Processing Date: {date}')

            try:
                # Step 1: Apply eigenvalue clipping
                clipped_cov_matrix = eigenvalue_clipping(cov_matrix)
                # Step 2: Fill diagonal with zeros
                np.fill_diagonal(clipped_cov_matrix, 0)
                clipped_cov_matrix = np.asarray(clipped_cov_matrix)
                # Step 3: Normalize the covariance matrix
                # NOTE(review): MinMaxScaler rescales every column independently,
                # so the result is in general not symmetric and the zeroed
                # diagonal can become non-zero when a column has negative
                # entries — confirm this is the intended adjacency matrix.
                scaler = MinMaxScaler()
                adj_matrix = scaler.fit_transform(clipped_cov_matrix)
                # Step 4: Perform Louvain clustering
                partition_optimal = perform_louvain_clustering(adj_matrix)

                # Step 5: Group asset names by cluster id
                clusters_optimal = {}
                for node, cluster_id in partition_optimal.items():
                    # Ensure node index is within the range of asset_names
                    if node < len(asset_names):
                        asset_name = asset_names[node]
                    else:
                        asset_name = f'Asset_{node}'  # Placeholder for undefined assets
                    clusters_optimal.setdefault(cluster_id, []).append(asset_name)

                # Output the optimal clustering results
                print(f'    Optimal Clustering: {clusters_optimal}')

                # Step 6: Count the assets of each domain inside every cluster.
                # Assets without a known domain (e.g. the 'Asset_<n>' placeholders)
                # are counted under 'Unknown'. Previously that label was missing
                # from the counter, so `+= 1` raised KeyError and the whole date
                # was skipped by the except clause below.
                asset_domains = {
                    asset: asset_to_domain.get(asset, unknown_domain)
                    for assets in clusters_optimal.values()
                    for asset in assets
                }
                # Same colours as domain_colors, extended only when needed so the
                # output is unchanged whenever every asset has a known domain.
                plot_colors = dict(domain_colors)
                for domain in asset_domains.values():
                    plot_colors.setdefault(domain, fallback_color)

                cluster_domains = {}
                for cluster_id, assets in clusters_optimal.items():
                    # Every cluster gets the same set of domain keys
                    domain_count = dict.fromkeys(plot_colors, 0)
                    for asset in assets:
                        domain_count[asset_domains[asset]] += 1
                    cluster_domains[cluster_id] = domain_count

                # Step 7: Define the save path for the plot
                # Create a directory for the period if it doesn't exist
                period_dir = os.path.join(output_dir, period)
                os.makedirs(period_dir, exist_ok=True)

                # Define the filename based on the date
                save_filename = f'{period}_{date}.png'
                save_path = os.path.join(period_dir, save_filename)

                # Step 8: Plot and save the cluster domains
                plot_cluster_domains(
                    cluster_domains=cluster_domains,
                    clusters_optimal=clusters_optimal,
                    domain_colors=plot_colors,
                    period=period,
                    date=date,
                    save_path=save_path
                )

                print(f'    Plot saved to: {save_path}')

            except Exception as e:
                # Best effort: report the failure and move on to the next date
                print(f'    Error processing date {date}: {e}')
                continue

    print('\nAll clustering and plotting completed.')

if __name__ == "__main__":
    main(demo=True)  # Change demo=True or demo=False as needed
