In [2]:
import numpy as np
import pandas as pd
from scipy import linalg

# Function to calculate covariance matrix
def calculate_cov_matrix(data):
    """
    Calculate the covariance matrix handling missing values
    """
    # Convert to pandas DataFrame to handle missing values
    df = pd.DataFrame(data)
    # Calculate covariance using pandas method which handles missing values
    return df.cov().values

# Higham's method for nearest PSD matrix
def nearest_PSD(A, max_iter=100, tol=1e-8):
    """
    Find the nearest positive semi-definite matrix using Higham's method
    """
    n = A.shape[0]
    X = A.copy()
    Y = A.copy()
    delta_S = np.zeros_like(A)
    
    for k in range(max_iter):
        # Project onto symmetric matrices
        R = Y - delta_S
        X = (R + R.T) / 2
        
        # Get eigendecomposition
        eigvals, eigvecs = linalg.eigh(X)
        
        # Project onto PSD matrices
        D_plus = np.maximum(eigvals, 0)
        Y = eigvecs @ np.diag(D_plus) @ eigvecs.T
        
        # Update delta S
        delta_S = Y - R
        
        # Check convergence
        norm_X = np.linalg.norm(X, 'fro')
        norm_Y = np.linalg.norm(Y, 'fro')
        diff = abs(norm_X - norm_Y) / norm_X
        
        if diff < tol:
            break
    
    return Y

# Rebonato and Jäckel method
def near_PSD_Rebonato(A):
    """
    Find the nearest positive semi-definite matrix using Rebonato and Jäckel method
    """
    # Get eigendecomposition
    eigvals, eigvecs = linalg.eigh(A)
    
    # Replace negative eigenvalues with small positive values
    lambda_plus = np.maximum(eigvals, 1e-10)
    
    # Reconstruct matrix
    return eigvecs @ np.diag(lambda_plus) @ eigvecs.T

# Calculate covariance with overlapping data only
def calculate_overlapping_cov(data):
    """
    Calculate covariance matrix using only overlapping data
    """
    df = pd.DataFrame(data)
    # Use pandas' built-in covariance calculation which handles missing values
    return df.cov().values

def analyze_data(file_path='problem2.csv'):
    """
    Main analysis function
    """
    try:
        # Read the CSV file
        df = pd.read_csv(file_path)
        
        # A. Calculate covariance matrix
        cov_matrix = calculate_cov_matrix(df)
        print("A. Covariance Matrix:")
        print(np.round(cov_matrix, 6))
        print("\n")

        # B. Check positive semi-definiteness
        # Ensure matrix is symmetric and real before eigenvalue calculation
        cov_matrix = (cov_matrix + cov_matrix.T) / 2  # Ensure symmetry
        eigenvals = linalg.eigvalsh(cov_matrix)
        is_psd = np.all(eigenvals >= -1e-10)  # allowing for numerical errors
        print("B. Eigenvalues:")
        print(np.round(eigenvals, 6))
        print(f"Is PSD: {is_psd}")
        print("\n")

        # C. If not PSD, find nearest PSD matrix
        if not is_psd:
            higham_psd = nearest_PSD(cov_matrix)
            rebonato_psd = near_PSD_Rebonato(cov_matrix)
            
            print("C. Nearest PSD Matrix (Higham):")
            print(np.round(higham_psd, 6))
            print("\nNearest PSD Matrix (Rebonato-Jäckel):")
            print(np.round(rebonato_psd, 6))
            print("\n")
            
            # Verify the results are PSD
            higham_eigenvals = linalg.eigvalsh(higham_psd)
            rebonato_eigenvals = linalg.eigvalsh(rebonato_psd)
            
            print("Verification - Minimum eigenvalues:")
            print(f"Higham method: {np.round(np.min(higham_eigenvals), 6)}")
            print(f"Rebonato method: {np.round(np.min(rebonato_eigenvals), 6)}")
            print("\n")

        # D. Calculate overlapping covariance matrix
        overlapping_cov = calculate_overlapping_cov(df)
        print("D. Overlapping Covariance Matrix:")
        print(np.round(overlapping_cov, 6))
        print("\n")

        # E. Compare the matrices
        if not is_psd:
            print("E. Comparison of Methods:")
            print("\nFrobenius norm differences:")
            print(f"Original vs Higham: {np.round(np.linalg.norm(cov_matrix - higham_psd, 'fro'), 6)}")
            print(f"Original vs Rebonato: {np.round(np.linalg.norm(cov_matrix - rebonato_psd, 'fro'), 6)}")
            print(f"Original vs Overlapping: {np.round(np.linalg.norm(cov_matrix - overlapping_cov, 'fro'), 6)}")
            print("\nMaximum absolute differences:")
            print(f"Original vs Higham: {np.round(np.max(np.abs(cov_matrix - higham_psd)), 6)}")
            print(f"Original vs Rebonato: {np.round(np.max(np.abs(cov_matrix - rebonato_psd)), 6)}")
            print(f"Original vs Overlapping: {np.round(np.max(np.abs(cov_matrix - overlapping_cov)), 6)}")
            
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

if __name__ == "__main__":
    analyze_data()

A. Covariance Matrix:
[[1.470484 1.454214 0.877269 1.903226 1.444361]
 [1.454214 1.252078 0.539548 1.621918 1.237877]
 [0.877269 0.539548 1.272425 1.171959 1.091912]
 [1.903226 1.621918 1.171959 1.814469 1.589729]
 [1.444361 1.237877 1.091912 1.589729 1.396186]]


B. Eigenvalues:
[-0.310243 -0.133232  0.027978  0.834434  6.786706]
Is PSD: False


C. Nearest PSD Matrix (Higham):
[[1.615133 1.44196  0.897144 1.780426 1.433794]
 [1.44196  1.346968 0.585086 1.554552 1.211409]
 [0.897144 0.585086 1.298916 1.115956 1.076692]
 [1.780426 1.554552 1.115956 1.983165 1.621373]
 [1.433794 1.211409 1.076692 1.621373 1.404936]]

Nearest PSD Matrix (Rebonato-Jäckel):
[[1.615133 1.44196  0.897144 1.780426 1.433794]
 [1.44196  1.346968 0.585086 1.554552 1.211409]
 [0.897144 0.585086 1.298916 1.115956 1.076692]
 [1.780426 1.554552 1.115956 1.983165 1.621373]
 [1.433794 1.211409 1.076692 1.621373 1.404936]]


Verification - Minimum eigenvalues:
Higham method: 0.0
Rebonato method: 0.0


D. Overlapping Cov