# Analyse S&P500 Stock Prices Using Unsupervised Machine Learning

In [None]:
import sys
# Add the directory two levels above the notebook to the import path. Later
# cells import the `data_loading` and `ml_models` packages, which presumably
# live there -- TODO confirm against the repository layout.
sys.path.append('../../')

In [None]:
import os

# Prefer the QUANDL_API_KEY environment variable so that a real key never has
# to be saved inside the notebook; fall back to the placeholder when the
# variable is not set (replace the placeholder by hand in that case).
QUANDL_API_KEY = os.environ.get('QUANDL_API_KEY', 'YOUR-QUANDL-API-KEY-HERE')

## Load S&P500 Stock Symbols

In [None]:
import pandas as pd

# Load the S&P500 constituents table. Only its 'Symbol' and 'Sector' columns
# are used below.
sp_file = '../../data/SP500_companies.csv'

sp_df = pd.read_csv(sp_file)

# Sort the distinct sectors so that each sector gets the same marker on every
# run: iterating a bare set of strings is not order-stable across interpreter
# sessions. key=str keeps the sort well-defined even if the column holds a
# missing (NaN) entry next to the sector names.
sectors = sorted(set(sp_df['Sector'].values.tolist()), key=str)

nb_sectors = len(sectors)

# One matplotlib marker per sector. Indexing `markers` below raises an
# IndexError if the file ever lists more sectors than there are markers.
markers = ["+", "*", "o", "d", "<", "2", "v", "h", "p", "s", "1", ">", "3", "4"]

sector_marker_map = {sector: markers[n] for n, sector in enumerate(sectors)}

print(sector_marker_map)

# Pair the two columns positionally, so the mapping does not depend on the
# DataFrame's index labels.
symbol_sector_map = dict(zip(sp_df['Symbol'], sp_df['Sector']))

## Load S&P500 Stock Prices

In [None]:
%%time

import datetime
import matplotlib.pyplot as plt

from data_loading.stocks_data_loaders import get_stock_attribute_data
from data_loading.stocks_data_loaders import get_stock_data

# Build one 'WIKI/<symbol>' ticker per S&P500 symbol.
quandl_tickers = ['WIKI/{}'.format(symbol) for symbol in sp_df['Symbol']]

# Date range passed to the loader, as strings.
start_date = '2017-1-1'
end_date = '2017-4-30'

stocks_df = get_stock_data(quandl_tickers, start_date, end_date, QUANDL_API_KEY)

prices_df = get_stock_attribute_data(stocks_df=stocks_df, attribute='Adj. Close')

# Reduce every column label to the bare stock symbol: keep the text before
# the first space, then take the part that follows the '/'.
renaming_map = {
    column: column.partition(' ')[0].split('/')[1]
    for column in prices_df.columns
}
prices_df = prices_df.rename(columns=renaming_map)

# Discard the columns that contain no price at all.
prices_df = prices_df.dropna(axis=1, how='all')

# Symbols of the stocks that are left, in column order.
symbols = prices_df.columns.tolist()

## Compute Stock Returns

In [None]:
from data_loading.stocks_data_loaders import compute_stock_returns,compute_dataframe_mean_std

# Derive the returns table from the price table.
returns_df = compute_stock_returns(prices_df)

# Fill each missing return with the mean of the column it belongs to.
column_means = returns_df.mean()
returns_df = returns_df.fillna(column_means)

## Plot Prices And Returns

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Two panels side by side: all price series on the left, the returns of two
# sample stocks on the right.
fig, axs = plt.subplots(1, 2, figsize=(10, 3))
price_ax, return_ax = axs

price_ax.plot(prices_df.values)
price_ax.set_title('Adj Close price S&P500 stocks')
price_ax.set_xlabel('Time (days)')
price_ax.set_ylabel('Price (dollars)')

# Pick the stocks at positions 0 and 10 of the symbol list as examples.
sampled_tickers = [symbols[position] for position in (0, 10)]
for ticker in sampled_tickers:
    return_ax.plot(returns_df[ticker].values, label=ticker)
return_ax.set_title('Stocks returns')
return_ax.set_xlabel('Time (days)')
return_ax.set_ylabel('Return (percentage)')
return_ax.legend()

plt.show()

## Stock Returns PCA Analysis

In [None]:
import numpy as np

from sklearn.cluster import KMeans

from sklearn.decomposition import PCA

pca = PCA()

# returns_df has one column per stock, so after the transpose each row of
# `samples` is one stock and each column one time step of its returns.
samples = returns_df.values.T

nb_samples = samples.shape[0]

pca.fit(samples)

# PCA cannot yield more components than min(n_samples, n_features); clamp K
# so the message below never claims more eigenvectors than actually exist.
K = min(100, len(pca.explained_variance_ratio_))

explained = pca.explained_variance_ratio_[:K].sum()

print("variance explained by {} first eigenvectors {}%".format(K, 100*explained))

# Project every stock onto the first two principal components.
# `pca.components_` stores one principal axis per ROW, so slicing its first
# two columns (as `components_[:, 0:2]` would) does not select PC-1/PC-2.
# `transform` centres the data and applies the axes correctly.
samples_2d = pca.transform(samples)[:, :2]

# Cluster the stocks in the 2-D PCA plane.
kmeans = KMeans(n_clusters=5)

kmeans.fit(samples_2d)

# One colour per k-means cluster label.
colormap = ["r", "g", "b", "c", "y", "k", "m"]

fig = plt.figure(figsize=(15, 12))

# Marker encodes the stock's sector, colour encodes its k-means cluster.
# NOTE(review): only every second stock is drawn (step of 2), presumably to
# limit label clutter -- confirm this is intended.
for n in range(0, nb_samples, 2):
    x = samples_2d[n, 0]
    y = samples_2d[n, 1]
    symbol = symbols[n]
    plt.scatter(x, y, marker=sector_marker_map[symbol_sector_map[symbol]], color=colormap[kmeans.labels_[n]])
    plt.text(x, y, symbol, fontsize=8)
plt.xlabel("PC-1")
plt.ylabel("PC-2")
plt.show()

In [None]:
fig, axs = plt.subplots(1, 2, figsize=(10, 3))

# Left panel: share of the variance captured by each principal component.
axs[0].bar(np.arange(pca.explained_variance_ratio_.shape[0]), 100*pca.explained_variance_ratio_)
axs[0].set_ylabel('Percentage of explained variance')
axs[0].set_xlabel('PC Index')

# Right panel: the first two principal components drawn along the feature
# (time) axis. `pca.components_` holds one component per ROW, so take the
# first two rows and transpose them into two curves; slicing the first two
# columns would instead plot two time steps across all components.
lineObjects = axs[1].plot(pca.components_[0:2].T)
axs[1].legend(lineObjects, ('PC-1', 'PC-2'))
axs[1].set_xlabel("time")
axs[1].set_ylabel("return")
plt.show()

## Stock Returns Neural Embedding Analysis

In [None]:
import tensorflow as tf

from ml_models.neural_networks import create_autoencoder

# Build the encoder / autoencoder pair. The input width is the number of
# columns of `samples`; the last encoding layer has size 2, which presumably
# is the width of the embedding plotted afterwards -- TODO confirm against
# create_autoencoder.
encoder, autoencoder = create_autoencoder(input_dim=samples.shape[1],
                                          encoding_layers_sizes=[150, 50, 2],
                                          layers_activation='relu',
                                          embedding_activation='tanh',
                                          output_activation='linear')

optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

# Pass the configured optimizer object. Passing the string 'adam' builds a
# fresh default Adam and silently ignores the learning rate chosen above.
autoencoder.compile(optimizer=optimizer, loss='mean_squared_error')

# Train the network to reproduce its own input (the target equals the input).
training = autoencoder.fit(samples, samples, epochs=1000, batch_size=16, shuffle=True)

In [None]:
# Encode every stock, then cluster the resulting points into 5 groups.
samples_2d = encoder.predict(samples)

kmeans = KMeans(n_clusters=5)
kmeans.fit(samples_2d)

fig = plt.figure(figsize=(15, 12))

# One point per stock: the marker shows its sector, the colour its cluster,
# and the symbol is written slightly offset from the point.
for n in range(nb_samples):
    x, y = samples_2d[n, 0], samples_2d[n, 1]
    symbol = symbols[n]
    sector = symbol_sector_map[symbol]
    cluster = kmeans.labels_[n]
    plt.scatter(x, y, marker=sector_marker_map[sector], color=colormap[cluster])
    plt.text(x+0.003, y+0.003, symbol, fontsize=8)

plt.title('S&P 500 return trajectories auto-encoder embedding')
plt.xlabel("X1")
plt.ylabel("X2")
plt.show()