In [64]:
import numpy as np
import pandas as pd

import warnings
# Silence every warning for the rest of the session so library notices do not
# clutter cell output. NOTE(review): this also hides deprecation warnings.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt

import seaborn as sns
# Apply seaborn's default theme to matplotlib figures created from here on.
sns.set()

# Ask the inline backend to render matplotlib figures as SVG.
%config InlineBackend.figure_format = 'svg'

In [65]:
# Load the closing-price table from CSV (path is relative to the working directory).
close_prices_path = '../data/close_prices.csv'
df_close = pd.read_csv(close_prices_path)

In [66]:
from sklearn.decomposition import PCA

In [67]:
# Number of principal components the PCA estimator is asked to keep.
n_pca_components = 10
pca_close = PCA(n_components=n_pca_components)

In [68]:
# Show the column labels of the loaded price table.
df_close.columns

Index(['date', 'AXP', 'BA', 'CAT', 'CSCO', 'CVX', 'DD', 'DIS', 'GE', 'GS',
       'HD', 'IBM', 'INTC', 'JNJ', 'JPM', 'KO', 'MCD', 'MMM', 'MRK', 'MSFT',
       'NKE', 'PFE', 'PG', 'T', 'TRV', 'UNH', 'UTX', 'V', 'VZ', 'WMT', 'XOM'],
      dtype='object')

In [69]:
# Ticker columns used as PCA features, in the order they are selected from the
# price table. Written as whitespace-separated text and split into a list.
companies_name = (
    'AXP BA CAT CSCO CVX DD DIS GE GS HD '
    'IBM INTC JNJ JPM KO MCD MMM MRK MSFT NKE '
    'PFE PG T TRV UNH UTX V VZ WMT XOM'
).split()

In [70]:
# Feature matrix: all rows, only the ticker columns listed in companies_name.
X = df_close.loc[:, companies_name]

In [71]:
# Fit the PCA on the price matrix; the call is the cell's last expression, so
# its return value is displayed.
# NOTE(review): X holds raw prices with no scaling step visible here, so
# tickers with larger price variance will dominate the components; confirm
# that this is intended.
pca_close.fit(X)

PCA(copy=True, iterated_power='auto', n_components=10, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [72]:
# Fraction of total variance captured by each fitted component, in component order.
pca_close.explained_variance_ratio_

array([0.73897118, 0.11007169, 0.04995088, 0.0287492 , 0.02215448,
       0.01931577, 0.00674853, 0.00614091, 0.00320594, 0.00305611])

In [73]:
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly
import plotly.graph_objs as go

# Enable plotly's offline notebook rendering before any iplot call.
# NOTE(review): connected=True presumably loads plotly.js from the network
# instead of embedding it in the notebook; confirm against the plotly docs.
init_notebook_mode(connected=True)

In [74]:
# Cumulative share of variance explained by the first k principal components.
# Both axes cover k = 0..n, where n is the number of fitted components: the
# curve starts at 0 (no components) and ends at the total for all n. The range
# has to include n itself, otherwise the n-component total is dropped and the
# x and y lists end up with different lengths.
variance_ratio = pca_close.explained_variance_ratio_
x_plot = list(range(len(variance_ratio) + 1))
y_plot = [0.0] + np.cumsum(variance_ratio).tolist()

trace = go.Scatter(
    x=x_plot,
    y=y_plot,
    name='pca_close.explained_variance_ratio_'
)

# Horizontal reference line at 90% explained variance.
threshold = go.Scatter(
    x=x_plot,
    y=[0.9] * len(x_plot),
    name='Threshold'
)

data = [trace, threshold]

# Title and axis labels so the figure can be read on its own.
layout = {
    'title': 'How components affect variance',
    'xaxis': {'title': 'Number of components'},
    'yaxis': {'title': 'Cumulative explained variance ratio'},
}

fig = go.Figure(data=data, layout=layout)
iplot(fig, show_link=False)

In [75]:
# Project the price matrix onto the fitted principal components
# (one column of X_t per component).
X_t = pca_close.transform(X)

In [76]:
# Load the index table from CSV (path is relative to the working directory).
djia_index_path = '../data/djia_index.csv'
df_idx = pd.read_csv(djia_index_path)

In [77]:
# Index values to compare against the PCA scores: the '^DJI' column, all rows.
Y = df_idx.loc[:, '^DJI']

In [78]:
# Scores along the first principal component: column 0 of the transformed data.
X_first_component = X_t[:, 0]

In [79]:
# 2x2 correlation matrix between the first-component scores and the index
# values; the off-diagonal entry is the correlation between the two.
# NOTE(review): this pairs rows by position and never compares dates, so it
# assumes both CSVs cover the same dates in the same order; confirm.
np.corrcoef(X_first_component, Y)


array([[1.        , 0.90965222],
       [0.90965222, 1.        ]])

In [80]:
# Loadings of the first principal component, one weight per input column.
pca_close.components_[0]

array([ 1.61383840e-02,  1.20644923e-01, -5.16612711e-02,  5.04842369e-02,
       -1.25859933e-01,  1.14089567e-01,  2.33906290e-01, -6.20513749e-03,
        2.51227032e-01,  2.88996029e-01, -2.64998795e-01,  9.31320168e-02,
        9.13948403e-02,  4.69879340e-02,  2.90549417e-02, -2.61068828e-02,
        3.29615584e-01,  7.13897133e-02,  7.62295699e-02,  2.11888868e-01,
        2.30922941e-02,  7.77316954e-02, -7.20594590e-03,  1.89479745e-01,
        3.21564017e-01,  5.36834873e-02,  5.79683946e-01,  1.09122230e-04,
        8.71614334e-02, -4.29421420e-02])

In [81]:
# Position of the largest weight within the first principal component.
pca_close.components_[0].argmax()

26

In [82]:
# Largest weight in the first principal component. Taken as the row maximum
# rather than a hard-coded position, so the value stays correct if the data
# or the fit changes.
pca_close.components_[0].max()

0.579683945747361

In [83]:
# Ticker with the largest weight in the first principal component. The index
# is computed from the fitted loadings rather than hard-coded; companies_name
# is in the same order as the columns the PCA was fitted on.
companies_name[int(np.argmax(pca_close.components_[0, :]))]

'V'

In [84]:
# Spell out which company the 'V' ticker stands for.
print("V is Visa :-)")

V is Visa :-)
