<a href="https://colab.research.google.com/github/antoinebachand/Kushnir_Method_2012/blob/main/PCA_Factor_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Dimension Reduction
# Author: Antoine Bachand

In [135]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb 

from sklearn.metrics import r2_score
from sklearn import linear_model

In [136]:
# Import the Adj close
df = pd.read_excel('Adj_data.xlsx', header=0, index_col=False, keep_default_na=True)

# Create the returns series
returns = (np.log(df).diff()).dropna()

In [137]:
# First lets see the correlation of the data
returns.corr()

Unnamed: 0,INTC,MSFT,IBM,MCD,WMT,DIS,^DJI
INTC,1.0,0.531709,0.356435,0.187476,0.22398,0.214851,0.407465
MSFT,0.531709,1.0,0.328527,0.224868,0.27906,0.261736,0.45713
IBM,0.356435,0.328527,1.0,0.204807,0.201436,0.195959,0.477094
MCD,0.187476,0.224868,0.204807,1.0,0.264647,0.256571,0.479196
WMT,0.22398,0.27906,0.201436,0.264647,1.0,0.281248,0.486493
DIS,0.214851,0.261736,0.195959,0.256571,0.281248,1.0,0.513085
^DJI,0.407465,0.45713,0.477094,0.479196,0.486493,0.513085,1.0


#Factor Model

In [227]:
# Create the regression function ( The for loop applies the regression to all stocks)

def beta():
  Beta = []
  r_square = []
  error = pd.DataFrame({})  # empty df
  stock_list = ['INTC', 'MSFT', 'IBM', 'MCD', 'WMT', 'DIS']
  for i in stock_list:
    # Model (Beta)
    reg = np.polyfit(returns['^DJI'], returns[i],deg = 1)
    Beta.append(reg[0]) # Adds only the Beta
    # R2
    r2 = r2_score(returns[i], returns['^DJI'])
    r_square.append(r2)
    # Error 
    prediction = (returns['^DJI']*reg[0])+reg[1] # prediction from the beta and intercept
    e = pred - returns[i]
    error[i] = e  # Adds the error columns to the df

  table = pd.DataFrame({"Beta": Beta, "r_square": r_square})
  table.index = [stock_list]
  display(table)
  display(error.corr()) # Cov of the error df
  
beta()


Unnamed: 0,Beta,r_square
INTC,1.12086,0.162
MSFT,1.109366,0.204974
IBM,1.071722,0.226386
MCD,0.858034,0.223161
WMT,1.019169,0.236591
DIS,1.028865,0.263037


Unnamed: 0,INTC,MSFT,IBM,MCD,WMT,DIS
INTC,1.0,0.425303,0.20183,-0.009573,0.032225,0.007372
MSFT,0.425303,1.0,0.141368,0.008222,0.07311,0.035849
IBM,0.20183,0.141368,1.0,-0.026336,-0.038485,-0.063292
MCD,-0.009573,0.008222,-0.026336,1.0,0.04963,0.022796
WMT,0.032225,0.07311,-0.038485,0.04963,1.0,0.045021
DIS,0.007372,0.035849,-0.063292,0.022796,0.045021,1.0


# PCA

In [237]:
#Data preparation (For this section we dont need the DOW)
df = returns.drop(['^DJI'],axis=1)

# Covariance Matrix
cov = np.cov(df.T)

In [238]:
# Principal component
eig_vals,eig_vecs = np.linalg.eig(cov)
# Creat clean df
data = {'INTC': eig_vecs[0], 'MSFT': eig_vecs[1], 'IBM':eig_vecs[2],'MCD':eig_vecs[3],'WMT':eig_vecs[4],'DIS':eig_vecs[5]}
dfe = pd.DataFrame(data)
dfe.index = ['Eigen 1', 'Eigen 2', 'Eigen 3', 'Eigen 4', 'Eigen 5', 'Eigen 6']

df_2PCA = dfe.drop(['Eigen 3', 'Eigen 4', 'Eigen 5', 'Eigen 6']) # Print just the first two vectors
display(df_2PCA)



Unnamed: 0,INTC,MSFT,IBM,MCD,WMT,DIS
Eigen 1,-0.636874,-0.529765,-0.371244,-0.193455,-0.276607,-0.248963
Eigen 2,-0.512041,-0.103423,0.067493,0.350862,0.593009,0.497794


In [None]:
# Percentage of variance  
tot = sum(eig_vals)
var_exp = [(i/tot)*100 for i in sorted(eig_vals, reverse = True)]
cum_var_exp = np.cumsum(var_exp)

def _plot_cum_variance():
    plt.plot([1,2,3,4,5,6],cum_var_exp, 'k')
    plt.title('Cumulative Variance of Eigenvalue')
    plt.xlabel('Eigenvalue index')
    plt.ylabel('Percentage of variance explained')
    plt.gcf().set_dpi(300)
    plt.show()
_plot_cum_variance()
print(cum_var_exp)

In [245]:
# Now lets do the model with Sklearn 
from sklearn.decomposition import PCA

X = df

pca = PCA(n_components=2)
pca.fit(X)
print(pca.explained_variance_ratio_) # We get the same variance we add earlier

y = pca.transform(X)
print(y)

[0.43520862 0.15650998]
[[ 0.00928374 -0.00395553]
 [-0.01308261 -0.00768956]
 [-0.05408258  0.01878536]
 ...
 [ 0.02045732  0.00395969]
 [ 0.00812687 -0.02796403]
 [ 0.02446952  0.00305815]]
