### Datasources

- YahooFinance
- WRDS - CRSP
- Bloomberg
- Datastream
- Xiu, Goyal, French
- Dacheng Xiu (Data with features for stocks.)

Past returns for as many as possible assets. -> Or Dacheng Xiu datasets with features as well.

Cluster: **EPFL Scitas** -> Free accounts

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import zipfile as zp
import wrds
import time
import pickle
from sklearn.decomposition import PCA

## Modelling

In [2]:
all_data_ret_only = pd.read_pickle('all_data_greater_50.pkl')

In [54]:
def create_total_r2(actual_returns, pred_returns):
    denom = np.nansum(actual_returns**2)
    nom = np.nansum((actual_returns - pred_returns)**2)
    r_2 = 1 - nom / denom
    return r_2

In [4]:
def calculate_beta_factor_prediction(start_date, end_date, no_factors=3):
    all_data_ret = all_data_ret_only.loc[(all_data_ret_only['date'] >= start_date) & (all_data_ret_only['date'] <= end_date)]
    all_data_ret_pivoted = all_data_ret.pivot(index='date', columns='permno', values='ret')
    #Drop the columns which have at least missing values equal to half of the number of days in the estimation window

    all_data_ret_pivoted.dropna(axis=1, thresh=len(all_data_ret_pivoted)/2, inplace=True)
    all_data_ret_pivoted.index = pd.to_datetime(all_data_ret_pivoted.index, format='%Y%m%d')
    col_names_in_sample = all_data_ret_pivoted.columns

    all_data_out_sample = all_data_ret_only.loc[all_data_ret_only['date'] > end_date ]
    all_data_out_sample = all_data_out_sample[all_data_out_sample['permno'].isin(col_names_in_sample)]
    all_data_out_sample_pivoted = all_data_out_sample.pivot(index='date', columns='permno', values='ret')
    out_sample_col_names = all_data_out_sample_pivoted.columns
    #Estimate in sample covariance matrix based on out of sample stocks
    cov_matrix =all_data_ret_pivoted[out_sample_col_names].cov()
    # Apply eignevalue decomposition to the covariance matrix
    eigenvalues, eigenvectors = np.linalg.eig(cov_matrix)
    # Get the real part of the first three eigenvectors
    eigenvectors = eigenvectors.real
    # Get the first three eigenvectors
    beta = eigenvectors[:,0:no_factors]
    print(f"Finished Eigendecomposition")
    out_sample_factors = np.linalg.inv(beta.T @ beta) @ beta.T @ all_data_out_sample_pivoted.fillna(0).values.T

    #In sample factors
    in_sample_factors = np.linalg.inv(beta.T @ beta) @ beta.T @ all_data_ret_pivoted[out_sample_col_names].fillna(0).values.T

    # predicted_returns_out_sample = (beta @ out_sample_factors)
    # predicted_returns_in_sample = (beta @ in_sample_factors)

    return beta, in_sample_factors, out_sample_factors, all_data_out_sample_pivoted, all_data_ret_pivoted[out_sample_col_names]

In [5]:
def calculate_out_sample_r2(all_data_out_sample_pivoted, beta, out_sample_factors):
    predicted_returns_out_sample = beta @ out_sample_factors  
    actual_returns = []
    pred_returns = []
    #Loop over each row in the all_data_out_sample_pivoted dataframe and append only the non-nan values to a list, then convert the list to a numpy array
    for i in range(all_data_out_sample_pivoted.T.shape[1]):
        temp_df = all_data_out_sample_pivoted.iloc[:,i]
        temp_predicted = predicted_returns_out_sample[i]
        temp_predicted = temp_predicted[~np.isnan(temp_df)]
        temp_df = temp_df[~np.isnan(temp_df)]
        actual_returns.append(temp_df.values)
        pred_returns.append(temp_predicted)
    actual_returns = np.concatenate(actual_returns, axis=0)
    pred_returns = np.concatenate(pred_returns, axis=0)
    return create_total_r2(actual_returns, pred_returns)

In [6]:
def calculate_predictive_r_2(all_data_out_sample_pivoted, in_sample_factors, beta):
    lambda_array = np.mean(in_sample_factors, axis=1)
    # Broadcast lambda_df to the shape of all_data_out_sample_pivoted
    lambda_broadcasted = np.tile(lambda_array,(all_data_out_sample_pivoted.shape[0],1))
    predicted_returns_out_sample = beta @ lambda_broadcasted.T
    actual_returns = []
    pred_returns = []
    #Loop over each row in the all_data_out_sample_pivoted dataframe and append only the non-nan values to a list, then convert the list to a numpy array
    for i in range(all_data_out_sample_pivoted.T.shape[1]):
        temp_df = all_data_out_sample_pivoted.iloc[:,i]
        temp_predicted = predicted_returns_out_sample[i]
        temp_predicted = temp_predicted[~np.isnan(temp_df)]
        temp_df = temp_df[~np.isnan(temp_df)]
        actual_returns.append(temp_df.values)
        pred_returns.append(temp_predicted)
    actual_returns = np.concatenate(actual_returns, axis=0)
    pred_returns = np.concatenate(pred_returns, axis=0)
    return create_total_r2(actual_returns, pred_returns)

# Static PCA

## Estimation Window: 5 years

### 3 Factors

In [22]:
beta, in_sample_factors, out_sample_factors, all_data_out_sample_pivoted, all_data_ret_pivoted = calculate_beta_factor_prediction(20050101, 20091231, no_factors=3)
#in all_data_out_sample_pivoted convert date to datetime
all_data_out_sample_pivoted.index = pd.to_datetime(all_data_out_sample_pivoted.index, format='%Y%m%d')
# Join together all_data_ret_pivoted and all_data_out_sample_pivoted on the date and the permno
total_sample = pd.concat([all_data_ret_pivoted, all_data_out_sample_pivoted])
total_factors =np.concatenate([in_sample_factors, out_sample_factors], axis=1)
print( f"Total sample R^2: {round(calculate_out_sample_r2(total_sample, beta, total_factors),3)}") 
print(f"Out of sample R^2: {round(calculate_out_sample_r2(all_data_out_sample_pivoted, beta, out_sample_factors),3)}")
print(f"Predictive R^2: {round(calculate_predictive_r_2(all_data_out_sample_pivoted, in_sample_factors, beta),3)}")

Total sample R^2: 0.212
Out of sample R^2: 0.096
Predictive R^2: 0.004


### 5 Factors

In [23]:
beta, in_sample_factors, out_sample_factors, all_data_out_sample_pivoted, all_data_ret_pivoted = calculate_beta_factor_prediction(20050101, 20091231, no_factors=5)

#in all_data_out_sample_pivoted convert date to datetime
all_data_out_sample_pivoted.index = pd.to_datetime(all_data_out_sample_pivoted.index, format='%Y%m%d')
# Join together all_data_ret_pivoted and all_data_out_sample_pivoted on the date and the permno
total_sample = pd.concat([all_data_ret_pivoted, all_data_out_sample_pivoted])
total_factors =np.concatenate([in_sample_factors, out_sample_factors], axis=1)
print( f"Total sample R^2: {round(calculate_out_sample_r2(total_sample, beta, total_factors),3)}")
print(f"Out of sample R^2: {round(calculate_out_sample_r2(all_data_out_sample_pivoted, beta, out_sample_factors),3)}")
print(f"Predictive R^2: {round(calculate_predictive_r_2(all_data_out_sample_pivoted, in_sample_factors, beta),3)}")


Finished Eigendecomposition
Total sample R^2: 0.261
Out of sample R^2: 0.098
Predictive R^2: 0.003


## Estimation Window 10 years

## 3 Factors

In [25]:
beta, in_sample_factors, out_sample_factors, all_data_out_sample_pivoted, all_data_ret_pivoted = calculate_beta_factor_prediction(20000101, 20091231, no_factors=3)#in all_data_out_sample_pivoted convert date to datetime
all_data_out_sample_pivoted.index = pd.to_datetime(all_data_out_sample_pivoted.index, format='%Y%m%d')
# Join together all_data_ret_pivoted and all_data_out_sample_pivoted on the date and the permno
total_sample = pd.concat([all_data_ret_pivoted, all_data_out_sample_pivoted])
total_factors =np.concatenate([in_sample_factors, out_sample_factors], axis=1)
print( f"Total sample R^2: {round(calculate_out_sample_r2(total_sample, beta, total_factors),3)}")
print(f"Out of sample R^2: {round(calculate_out_sample_r2(all_data_out_sample_pivoted, beta, out_sample_factors),3)}")
print(f"Predictive R^2: {round(calculate_predictive_r_2(all_data_out_sample_pivoted, in_sample_factors, beta),3)}")

Finished Eigendecomposition
Total sample R^2: 0.236
Out of sample R^2: 0.115
Predictive R^2: 0.006


## 5 Factors

In [24]:
beta, in_sample_factors, out_sample_factors, all_data_out_sample_pivoted, all_data_ret_pivoted = calculate_beta_factor_prediction(20000101, 20091231, no_factors=5)
#in all_data_out_sample_pivoted convert date to datetime
all_data_out_sample_pivoted.index = pd.to_datetime(all_data_out_sample_pivoted.index, format='%Y%m%d')
# Join together all_data_ret_pivoted and all_data_out_sample_pivoted on the date and the permno
total_sample = pd.concat([all_data_ret_pivoted, all_data_out_sample_pivoted])
total_factors =np.concatenate([in_sample_factors, out_sample_factors], axis=1)
print( f"Total sample R^2: {round(calculate_out_sample_r2(total_sample, beta, total_factors),3)}")
print(f"Out of sample R^2: {round(calculate_out_sample_r2(all_data_out_sample_pivoted, beta, out_sample_factors),3)}")
print(f"Predictive R^2: {round(calculate_predictive_r_2(all_data_out_sample_pivoted, in_sample_factors, beta),3)}")


Finished Eigendecomposition
Total sample R^2: 0.27
Out of sample R^2: 0.117
Predictive R^2: 0.006


# PCA with Rolling Beta

Where Rolling Beta Means we roll over the estimation window for the betas each time 

In [70]:
def dynamic_covariance_estimation_and_eigen_decomposition(start_date, end_date, out_sample_col_names, out_sample_data, no_factors=3):
    all_data_ret = all_data_ret_only.loc[(all_data_ret_only['date'] >= start_date) & (all_data_ret_only['date'] < end_date)]
    # Get index of stock only which have non-Nan in the last month
    all_data_ret = all_data_ret[all_data_ret['permno'].isin(out_sample_col_names)]
    all_data_ret_pivoted = all_data_ret.pivot(index='date', columns='permno', values='ret')
    all_data_ret_pivoted.index = pd.to_datetime(all_data_ret_pivoted.index, format='%Y%m%d')
    #Estimate in sample covariance matrix based on out of sample stocks
    cov_matrix =all_data_ret_pivoted.cov()
    print(f"Shape of Covariance matrix is: {cov_matrix.shape}")
    print(f"Number of Nan values in the covariance matrix: {np.sum(np.sum(np.isnan(cov_matrix)))}")
    # Apply eignevalue decomposition to the covariance matrix
    eigenvectors = np.linalg.eig(cov_matrix)[1]
    print(f"Finished Eigendecomposition")
    # Get the real part of the first three eigenvectors
    eigenvectors = eigenvectors.real
    # Get the first three eigenvectors
    beta = eigenvectors[:,0:no_factors]
    out_sample_beta = beta
    print(f"{beta.shape}")
    print(out_sample_data[out_sample_data.index == end_date].fillna(0).values.T.shape)
    # out_sample_beta = beta[data_index]
    #Only calculate for betas which are in the out of sample data index
    # multipl = (np.linalg.inv(beta.T @ beta) @ beta.T)[:,out_sample_index]
    multipl = (np.linalg.inv(beta.T @ beta) @ beta.T)
    out_sample_factor = multipl @ out_sample_data[out_sample_data.index == end_date].fillna(0).values.T
    print( f"Finished for date: {end_date}")
    print('----------------------------------------------------')
    # Create a dictionary with the date as the key and the out of sample factors as the value
    return out_sample_beta, out_sample_factor

In [2]:
# for i in out_sample_dates:
#     print(f"For end date: {i}")
#     out_sample_beta, out_sample_factor = naive_dynamic_covariance_estimation_and_eigen_decomposition(start_date, i, out_sample_col_names, all_data_out_sample_pivoted, no_factors=3)
#     betas_out_samp.append(out_sample_beta)
#     out_sample_factors.append(out_sample_factor)
# Parallelize this loop

# betas_out_samp, out_sample_factors = zip(*Parallel(n_jobs=12)(delayed(naive_dynamic_covariance_estimation_and_eigen_decomposition)(start_date, i, out_sample_col_names, all_data_out_sample_pivoted, no_factors=3) for i in out_sample_dates))

In [68]:


def calculate_beta_factor_prediction(start_date, end_date, betas_out_sample, out_sample_factors, no_factors=3):
    all_data_ret = all_data_ret_only.loc[(all_data_ret_only['date'] >= start_date) & (all_data_ret_only['date'] <= end_date)]
    all_data_ret_pivoted = all_data_ret.pivot(index='date', columns='permno', values='ret')
    #Drop the columns which have at least missing values equal to half of the number of days in the estimation window

    all_data_ret_pivoted.dropna(axis=1, thresh=(len(all_data_ret_pivoted)/2 + 3), inplace=True)

    all_data_ret_pivoted.index = pd.to_datetime(all_data_ret_pivoted.index, format='%Y%m%d')
    col_names_in_sample = all_data_ret_pivoted.columns

    print(f" In sample column number: {len(col_names_in_sample)}")

    all_data_out_sample = all_data_ret_only.loc[all_data_ret_only['date'] > end_date ]
    all_data_out_sample = all_data_out_sample[all_data_out_sample['permno'].isin(col_names_in_sample)]
    all_data_out_sample_pivoted = all_data_out_sample.pivot(index='date', columns='permno', values='ret')
    out_sample_col_names = all_data_out_sample_pivoted.columns
    out_sample_dates = all_data_out_sample.date.unique()
    # print(out_sample_dates)
    print(f" Out of Sample column number: {len(out_sample_col_names)}")
    # out_sample_index = np.where(np.isin(col_names_in_sample, out_sample_col_names))[0]
    #Estimate in sample covariance matrix based on out of sample stocks
    all_data_in_sample = all_data_ret_pivoted[out_sample_col_names]
    in_samp_cov_matrix =all_data_in_sample.cov()
    # in_samp_cov_matrix =all_data_ret_pivoted.cov()

    # Apply eignevalue decomposition to the covariance matrix
    eigenvectors_in_samp = np.linalg.eig(in_samp_cov_matrix)[1]
    # Get the real part of the first three eigenvectors
    eigenvectors_in_samp = eigenvectors_in_samp.real
    # Get the first three eigenvectors
    beta_in_samp = eigenvectors_in_samp[:,0:no_factors]
    print(f"Finished Eigendecomposition")
    in_sample_factors = np.linalg.inv(beta_in_samp.T @ beta_in_samp) @ beta_in_samp.T @ all_data_ret_pivoted[out_sample_col_names].fillna(0).values.T
    betas_out_sample = []
    out_sample_factors = []
    for i in out_sample_dates:
        print(f"For end date: {i}")
        out_sample_beta, out_sample_factor = dynamic_covariance_estimation_and_eigen_decomposition(start_date, i, out_sample_col_names, all_data_out_sample_pivoted, no_factors)
        betas_out_sample.append(out_sample_beta)
        out_sample_factors.append(out_sample_factor)
    
    

    return beta_in_samp, in_sample_factors, out_sample_factors, betas_out_sample, all_data_out_sample_pivoted, all_data_in_sample 

In [7]:
def calculate_total_sample_r2_rolling(beta_in_samp, beta_out_sample, in_sample_factors, out_sample_factors,
                                       all_data_in_sample, all_data_out_sample_pivoted):
    predicted_returns_out_sample = beta_in_samp @ in_sample_factors 
    in_sample_errors = all_data_in_sample - predicted_returns_out_sample.T
    all_betas = np.array(beta_out_sample)
    all_factors =np.array(out_sample_factors)
    # Multiply together the betas and the factors to get the predicted returns
    out_sample_predicted_returns = np.concatenate((all_betas @ all_factors), axis=1)
    out_sample_errors = all_data_out_sample_pivoted - out_sample_predicted_returns.T
    in_sampl_denom = np.sum(np.sum(all_data_in_sample**2))
    out_sampl_denom = np.sum(np.sum(all_data_out_sample_pivoted**2))
    in_sampl_nom = np.sum(np.sum(in_sample_errors**2))
    out_sampl_nom = np.sum(np.sum(out_sample_errors**2))
    total_denoms = in_sampl_denom + out_sampl_denom
    totaLnom = in_sampl_nom + out_sampl_nom
    total_r_2 = 1 - totaLnom / total_denoms
    out_sample_r_2 = 1 - out_sampl_nom / out_sampl_denom


    return total_r_2, out_sample_r_2

In [8]:
def calculate_predicted_r2_rolling(beta_in_samp, beta_out_sample, in_sample_factors, out_sample_factors,
                                       all_data_in_sample, all_data_out_sample_pivoted):
    lambda_array = np.mean(in_sample_factors, axis=1)
    # Broadcast lambda_df to the shape of all_data_out_sample_pivoted
    lambda_broadcasted = np.tile(lambda_array,(all_data_out_sample_pivoted.shape[0],1))
    # Get the index of values in col_names_in_sample which can also be found in out_sample_col_names
    index = np.where(np.isin(all_data_in_sample.columns, all_data_out_sample_pivoted.columns))[0]
    # Use this index to get the in sample betas for the out of sample stocks
    predicted_returns_out_sample = beta_in_samp[:, index] @ lambda_broadcasted.T
    out_sampl_denom = np.sum(np.sum(all_data_out_sample_pivoted**2))
    out_sampl_nom = np.sum(np.sum((all_data_out_sample_pivoted - predicted_returns_out_sample.T)**2))
    predictive_r_2 = 1 - out_sampl_nom / out_sampl_denom

    return predictive_r_2

### 5 Years

3 Factors

In [23]:
betas_out_sample = []
out_sample_factors = []
beta_in_samp, in_sample_factors, out_sample_factors, betas_out_samp, all_data_out_sample_pivoted, all_data_ret_pivoted = calculate_beta_factor_prediction(20050101, 20091231,betas_out_sample, out_sample_factors, no_factors=3)

 In sample column number: 4817
 Out of Sample column number: 4084
Finished Eigendecomposition
For end date: 20100129
Shape of Covariance matrix is: (4084, 4084)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(4084, 3)
(4084, 1)
Finished for date: 20100129
----------------------------------------------------
For end date: 20100226
Shape of Covariance matrix is: (4084, 4084)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(4084, 3)
(4084, 1)
Finished for date: 20100226
----------------------------------------------------
For end date: 20100331
Shape of Covariance matrix is: (4084, 4084)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(4084, 3)
(4084, 1)
Finished for date: 20100331
----------------------------------------------------
For end date: 20100430
Shape of Covariance matrix is: (4084, 4084)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(4084, 3)
(4084, 1)
Finishe

In [28]:
actual_betas = out_sample_factors
actual_factors = betas_out_samp
betas_list = [list(i.values()) for i in actual_betas]
factors_list = [list(i.values()) for i in actual_factors]
# with open(f"pca_rolling_os_betas.pkl", 'wb') as f:
#     pickle.dump(betas_list, f)
# with open(f"pca_rolling_os_factors.pkl", 'wb') as f:
#     pickle.dump(factors_list, f) 

In [44]:
# with open('pca_rolling_os_betas.pkl', 'rb') as f:
#     rolling_beta_os = pickle.load(f)

# with open('pca_rolling_os_factors.pkl', 'rb') as f:
#     rolling_factors_os = pickle.load(f)

In [119]:
calculate_total_sample_r2_rolling(beta_in_samp, rolling_beta_os, in_sample_factors, rolling_factors_os,
                                       all_data_in_sample, all_data_out_sample_pivoted)

(0.21076055329498045, 0.11852124619261306)

5 Factors

In [13]:
betas_out_sample_5 = []
out_sample_factors_5 = []
beta_in_samp, in_sample_factors, out_sample_factors, betas_out_samp, all_data_out_sample_pivoted, all_data_ret_pivoted = calculate_beta_factor_prediction(20050101, 20091231,betas_out_sample_5, out_sample_factors_5, no_factors=5)

 In sample column number: 4817
 Out of Sample column number: 4084
Finished Eigendecomposition
For end date: 20100129
Shape of Covariance matrix is: (4084, 4084)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(4084, 3)
(4084, 1)
Finished for date: 20100129
----------------------------------------------------
For end date: 20100226
Shape of Covariance matrix is: (4084, 4084)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(4084, 3)
(4084, 1)
Finished for date: 20100226
----------------------------------------------------
For end date: 20100331
Shape of Covariance matrix is: (4084, 4084)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(4084, 3)
(4084, 1)
Finished for date: 20100331
----------------------------------------------------
For end date: 20100430
Shape of Covariance matrix is: (4084, 4084)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(4084, 3)
(4084, 1)
Finishe

In [14]:
# with open(f"pca_rolling_os_betas_5_fact.pkl", 'wb') as f:
#     pickle.dump(betas_out_samp, f)
# with open(f"pca_rolling_os_5_factors.pkl", 'wb') as f:
#     pickle.dump(out_sample_factors, f) 

In [57]:
calculate_total_sample_r2_rolling(beta_in_samp, betas_out_samp, in_sample_factors, out_sample_factors,
                                       all_data_ret_pivoted, all_data_out_sample_pivoted)

(0.24552086166418774, 0.11852124619261306)

In [60]:
np.mean(in_sample_factors, axis=1).shape

(5,)

In [None]:
lambda_array = np.mean(in_sample_factors, axis=1)
# Broadcast lambda_df to the shape of all_data_out_sample_pivoted
lambda_broadcasted = np.tile(lambda_array,(all_data_out_sample_pivoted.shape[0],1))
# Get the index of values in col_names_in_sample which can also be found in out_sample_col_names
index = np.where(np.isin(all_data_in_sample.columns, all_data_out_sample_pivoted.columns))[0]
# Use this index to get the in sample betas for the out of sample stocks
predicted_returns_out_sample = beta_in_samp[:, index] @ lambda_broadcasted.T
out_sampl_denom = np.sum(np.sum(all_data_out_sample_pivoted**2))
out_sampl_nom = np.sum(np.sum((all_data_out_sample_pivoted - predicted_returns_out_sample.T)**2))
predictive_r_2 = 1 - out_sampl_nom / out_sampl_denom

In [None]:
calculate_predicted_r2_rolling(beta_in_samp, betas_out_samp, in_sample_factors, out_sample_factors,
                                       all_data_ret_pivoted, all_data_out_sample_pivoted)

In [14]:
def calculate_in_sample_data_and_beta(start_date, end_date, no_factors=3):
    all_data_ret = all_data_ret_only.loc[(all_data_ret_only['date'] >= start_date) & (all_data_ret_only['date'] <= end_date)]
    all_data_ret_pivoted = all_data_ret.pivot(index='date', columns='permno', values='ret')
    #Drop the columns which have at least missing values equal to half of the number of days in the estimation window

    all_data_ret_pivoted.dropna(axis=1, thresh=(len(all_data_ret_pivoted)/2 + 3), inplace=True)

    all_data_ret_pivoted.index = pd.to_datetime(all_data_ret_pivoted.index, format='%Y%m%d')
    col_names_in_sample = all_data_ret_pivoted.columns

    print(f" In sample column number: {len(col_names_in_sample)}")

    all_data_out_sample = all_data_ret_only.loc[all_data_ret_only['date'] > end_date ]
    all_data_out_sample = all_data_out_sample[all_data_out_sample['permno'].isin(col_names_in_sample)]
    all_data_out_sample_pivoted = all_data_out_sample.pivot(index='date', columns='permno', values='ret')
    out_sample_col_names = all_data_out_sample_pivoted.columns
    out_sample_dates = all_data_out_sample.date.unique()
    # print(out_sample_dates)
    print(f" Out of Sample column number: {len(out_sample_col_names)}")
    # out_sample_index = np.where(np.isin(col_names_in_sample, out_sample_col_names))[0]
    #Estimate in sample covariance matrix based on out of sample stocks
    all_data_in_sample = all_data_ret_pivoted[out_sample_col_names]
    in_samp_cov_matrix =all_data_in_sample.cov()
    # in_samp_cov_matrix =all_data_ret_pivoted.cov()

    # Apply eignevalue decomposition to the covariance matrix
    eigenvectors_in_samp = np.linalg.eig(in_samp_cov_matrix)[1]
    # Get the real part of the first three eigenvectors
    eigenvectors_in_samp = eigenvectors_in_samp.real
    # Get the first three eigenvectors
    beta_in_samp = eigenvectors_in_samp[:,0:no_factors]
    print(f"Finished Eigendecomposition")
    in_sample_factors = np.linalg.inv(beta_in_samp.T @ beta_in_samp) @ beta_in_samp.T @ all_data_ret_pivoted[out_sample_col_names].fillna(0).values.T

    return beta_in_samp, in_sample_factors, all_data_in_sample, all_data_out_sample_pivoted

Estimation period 10 yeaars

In [10]:
betas_out_sample = []
out_sample_factors = []
beta_in_samp, in_sample_factors, out_sample_factors, betas_out_samp, all_data_out_sample_pivoted, all_data_ret_pivoted = calculate_beta_factor_prediction(20000101, 20091231,betas_out_sample, out_sample_factors, no_factors=3)

 In sample column number: 5087
 Out of Sample column number: 3551
Finished Eigendecomposition
For end date: 20100129
Shape of Covariance matrix is: (3551, 3551)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(3551, 3)
(3551, 1)
Finished for date: 20100129
----------------------------------------------------
For end date: 20100226
Shape of Covariance matrix is: (3551, 3551)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(3551, 3)
(3551, 1)
Finished for date: 20100226
----------------------------------------------------
For end date: 20100331
Shape of Covariance matrix is: (3551, 3551)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(3551, 3)
(3551, 1)
Finished for date: 20100331
----------------------------------------------------
For end date: 20100430
Shape of Covariance matrix is: (3551, 3551)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(3551, 3)
(3551, 1)
Finishe

In [11]:
# with open(f"pca_rolling_os_betas_10_years_fact.pkl", 'wb') as f:
#     pickle.dump(betas_out_samp, f)
# with open(f"pca_rolling_os_10_years.pkl", 'wb') as f:
#     pickle.dump(out_sample_factors, f) 

In [14]:
calculate_total_sample_r2_rolling(beta_in_samp, betas_out_samp, in_sample_factors, out_sample_factors,
                                       all_data_ret_pivoted, all_data_out_sample_pivoted)

(0.21847214968729345, 0.13040790419601755)

In [11]:
with open('pca_rolling_os_betas_10_years_fact.pkl', 'rb') as f:
    betas_out_samp = pickle.load(f)

with open('pca_rolling_os_10_years.pkl', 'rb') as f:
    out_sample_factors = pickle.load(f)

In [15]:
beta_in_samp, in_sample_factors, all_data_in_sample, all_data_out_sample_pivoted = calculate_in_sample_data_and_beta(20000101, 20091231, no_factors=3)

 In sample column number: 5087
 Out of Sample column number: 3551
Finished Eigendecomposition


In [17]:
calculate_total_sample_r2_rolling(beta_in_samp, betas_out_samp, in_sample_factors, out_sample_factors,
                                       all_data_in_sample, all_data_out_sample_pivoted)

(0.21847214968729345, 0.13040790419601755)

In [52]:
def calculate_rolling_pred_r_2(betas_out_sample, in_sample_factors, all_data_out_sample_pivoted):
    lambda_array = np.mean(in_sample_factors, axis=1)
    beta_out_samp_arr = np.array(betas_out_sample)
    predicted_returns_out_sample = np.concatenate([beta_out_samp_arr[i] @ lambda_array for i in range(beta_out_samp_arr.shape[0])])
    all_data_out_sample_pivoted_stacked = np.concatenate([all_data_out_sample_pivoted.iloc[:,i].values for i in range(all_data_out_sample_pivoted.shape[1])])
    pred_r_2 = create_total_r2(all_data_out_sample_pivoted_stacked, predicted_returns_out_sample)
    return pred_r_2

In [55]:
calculate_rolling_pred_r_2(betas_out_samp, in_sample_factors, all_data_out_sample_pivoted)

-0.007865154938315744

In [71]:
betas_out_sample = []
out_sample_factors = []
beta_in_samp, in_sample_factors, out_sample_factors, betas_out_samp, all_data_out_sample_pivoted, all_data_ret_pivoted = calculate_beta_factor_prediction(20000101, 20091231,betas_out_sample, out_sample_factors, no_factors=5)

 In sample column number: 5087
 Out of Sample column number: 3551
Finished Eigendecomposition
For end date: 20100129
Shape of Covariance matrix is: (3551, 3551)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(3551, 5)
(3551, 1)
Finished for date: 20100129
----------------------------------------------------
For end date: 20100226
Shape of Covariance matrix is: (3551, 3551)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(3551, 5)
(3551, 1)
Finished for date: 20100226
----------------------------------------------------
For end date: 20100331
Shape of Covariance matrix is: (3551, 3551)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(3551, 5)
(3551, 1)
Finished for date: 20100331
----------------------------------------------------
For end date: 20100430
Shape of Covariance matrix is: (3551, 3551)
Number of Nan values in the covariance matrix: 0
Finished Eigendecomposition
(3551, 5)
(3551, 1)
Finishe

In [72]:
# with open('pca_rolling_os_betas_10_years_5_factors.pkl', 'wb') as f:
#     pickle.dump(betas_out_samp, f)

# with open('pca_rolling_os_factors_10_years_5_factors.pkl', 'wb') as f:
#     pickle.dump(out_sample_factors, f)

In [65]:
with open('pca_rolling_os_betas_10_years_5_factors.pkl', 'rb') as f:
    betas_out_samp = pickle.load(f)

with open('pca_rolling_os_factors_10_years_5_factors.pkl', 'rb') as f:
    out_sample_factors = pickle.load(f)

In [73]:
beta_in_samp, in_sample_factors, all_data_in_sample, all_data_out_sample_pivoted = calculate_in_sample_data_and_beta(20000101, 20091231, no_factors=5)

 In sample column number: 5087
 Out of Sample column number: 3551
Finished Eigendecomposition


In [74]:
calculate_total_sample_r2_rolling(beta_in_samp, betas_out_samp, in_sample_factors, out_sample_factors,
                                       all_data_in_sample, all_data_out_sample_pivoted)

(0.2516261847496356, 0.13553206032206644)

In [75]:
calculate_rolling_pred_r_2(betas_out_samp, in_sample_factors, all_data_out_sample_pivoted)

-0.00800565967117084

In [80]:
pca_cols = list(all_data_out_sample_pivoted.columns)

Save PCA cols

In [81]:
with open('pca_cols_os.pkl', 'wb') as f:
    pickle.dump(pca_cols, f)