In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import pandas_datareader as pdr
import seaborn as sns
import matplotlib.pyplot as plt
import bs4 as bs
import requests
from IPython.display import clear_output
from scipy.stats import mstats
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import RandomizedSearchCV, validation_curve, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier
import pickle
import os
from sklearn.model_selection import GridSearchCV
sns.set()



# Obtain the list of S&P 100 companies from Wikipedia.
# The timeout and explicit status check make a failed request surface here
# instead of hanging or silently parsing an error page.
resp = requests.get("https://en.wikipedia.org/wiki/S%26P_100", timeout=30)
resp.raise_for_status()
convert_soup = bs.BeautifulSoup(resp.text, 'lxml')
table = convert_soup.find('table', {'class': 'wikitable sortable'})
if table is None:
    raise ValueError("Could not find the 'wikitable sortable' table on the S&P 100 Wikipedia page")

tickers = []

# The first <td> of every row after the header row holds the ticker symbol.
for rows in table.find_all('tr')[1:]:
    cells = rows.find_all('td')
    if not cells:
        # Rows without data cells carry no ticker; skip them.
        continue
    ticker = cells[0].text.strip()
    tickers.append(ticker)

all_data = pd.DataFrame()
test_data = pd.DataFrame()
no_data = []   # tickers whose download failed
frames = []    # one price frame per successfully downloaded ticker

# Extract data from Yahoo Finance
for i in tickers:
    try:
        print(i)
        test_data = pdr.get_data_yahoo(i, start=dt.datetime(1990, 1, 1), end=dt.date.today())
        test_data['symbol'] = i
        frames.append(test_data)
    except Exception:
        # Best-effort download: remember the ticker and carry on. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        no_data.append(i)

    clear_output(wait=True)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# growing a frame inside the loop re-copies all earlier rows on every iteration.
if frames:
    all_data = pd.concat(frames)

XOM


In [5]:
def _rolling_mean_by_symbol(frame, column, window):
    """Return the `window`-row rolling mean of `column`, computed within each 'symbol' group.

    The result has the same index as `frame`, so it can be assigned straight
    back as a new column.
    """
    return frame.groupby('symbol')[column].transform(
        lambda series: series.rolling(window=window).mean()
    )


# Row-over-row percentage change of Close within each symbol
all_data['return'] = all_data.groupby('symbol')['Close'].pct_change()

# Price moving averages; the ratio is the 15-row average divided by the 5-row one
all_data['SMA_5'] = _rolling_mean_by_symbol(all_data, 'Close', 5)
all_data['SMA_15'] = _rolling_mean_by_symbol(all_data, 'Close', 15)
all_data['SMA_ratio'] = all_data['SMA_15'].div(all_data['SMA_5'])

# Volume moving averages; here the ratio is the 5-row average divided by the 15-row one
all_data['SMA5_Volume'] = _rolling_mean_by_symbol(all_data, 'Volume', 5)
all_data['SMA15_Volume'] = _rolling_mean_by_symbol(all_data, 'Volume', 15)
all_data['SMA_Volume_Ratio'] = all_data['SMA5_Volume'].div(all_data['SMA15_Volume'])



In [2]:
# Close price six rows ahead, looked up within each symbol
all_data['Close_Shifted'] = all_data.groupby('symbol')['Close'].shift(-6)

# Percentage move from a row's Open to its Close_Shifted ...
all_data['Target'] = (all_data['Close_Shifted'] - all_data['Open']) / all_data['Open'] * 100
# ... then each row takes the value of the *next row of the same symbol*.
# Shifting per symbol (rather than over the whole stacked frame) guarantees the
# last row of one ticker can never pick up the first row of the next ticker.
all_data['Target'] = all_data.groupby('symbol')['Target'].shift(-1)

# 1 when the target move is positive, 0 otherwise. Rows with a NaN target also
# get 0 here, but they are removed by the dropna() below.
all_data['Target_Direction'] = np.where(all_data['Target'] > 0, 1, 0)

# Drop every row that has a NaN in any column (rolling warm-up rows, rows too
# close to the end of a symbol's history to have a target, ...).
all_data = all_data.dropna().copy()

In [3]:
# Feature columns whose extremes are clipped below
Target_variables = ['SMA_ratio','ATR_5','ATR_15','ATR_Ratio',
                       'ADX_5','ADX_15','SMA_Volume_Ratio','Stochastic_5','Stochastic_15','Stochastic_Ratio',
                      'RSI_5','RSI_15','RSI_ratio','MACD']

# Check every column up front. Without this, a missing column surfaces as a bare
# KeyError on the first absent name (possibly after earlier columns were already
# modified), which gives no hint that the feature columns simply have not been
# computed yet in this kernel session.
missing_variables = [variable for variable in Target_variables if variable not in all_data.columns]
if missing_variables:
    raise KeyError(
        "Cannot winsorize: all_data is missing the feature columns "
        + str(missing_variables)
        + "; compute them before running this cell."
    )

# Winsorize with limits [0.1, 0.1]: the lowest and highest 10% of each column
# are replaced by the nearest remaining value.
for variable in Target_variables:
    all_data.loc[:, variable] = mstats.winsorize(all_data.loc[:, variable], limits=[0.1, 0.1])

KeyError: 'SMA_ratio'

In [7]:
# Pull out the (symbol, return) pairs and expose the row index as a 'Date' column
returns = (
    all_data.loc[:, ['symbol', 'return']]
    .assign(Date=all_data.index)
)

# Reshape to wide form: one row per Date, one column per symbol, values = return
transposed = returns.pivot(
    index='Date',
    columns='symbol',
    values='return',
)

transposed

symbol,AAPL,ABBV,ABT,ACN,ADBE,AIG,AMGN,AMT,AMZN,AVGO,...,UNH,UNP,UPS,USB,V,VZ,WBA,WFC,WMT,XOM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1990-01-02,,,,,,,,,,,...,,,,,,,,,,
1990-01-03,0.006711,,0.003610,,0.049383,-0.001185,0.024271,,,,...,-0.020409,-0.014107,,0.011364,,-0.002193,-0.013055,-0.044199,0.000000,-0.010000
1990-01-04,0.003336,,-0.001799,,0.047059,-0.014235,0.018958,,,,...,-0.010416,-0.004769,,-0.044944,,-0.038462,-0.034391,-0.011561,-0.005305,-0.010101
1990-01-05,0.003322,,-0.010811,,0.022472,-0.030084,0.013953,,,,...,-0.042105,0.000000,,-0.011765,,-0.022857,-0.024658,-0.029240,-0.010667,-0.005102
1990-01-08,0.006622,,0.000000,,0.010989,-0.006203,-0.002293,,,,...,-0.021979,0.004792,,-0.035714,,0.008187,0.019663,0.030120,0.013477,0.015385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-23,0.003644,0.007876,0.001223,-0.000941,0.010000,0.023521,0.010202,-0.008924,0.000184,0.014652,...,0.002530,0.011322,0.017991,0.004280,-0.006148,-0.001706,0.001389,0.006871,-0.002217,0.000492
2021-12-27,0.022975,0.009918,0.016528,0.029803,0.014150,0.005834,0.006167,0.025185,-0.008178,0.014260,...,0.008317,0.009404,0.013761,0.007636,0.004663,0.000000,0.004952,0.008478,0.009105,0.014258
2021-12-28,-0.005767,-0.000149,-0.006998,-0.000144,-0.014402,-0.004042,0.002665,0.009553,0.005844,-0.007831,...,0.006887,0.005162,-0.001999,0.000176,0.001930,0.000759,0.013799,-0.007177,0.014351,-0.003232
2021-12-29,0.000502,0.007218,0.005126,0.000361,-0.000123,0.001941,0.008106,0.001352,-0.008555,0.005396,...,0.005249,0.007222,-0.000373,-0.002643,0.000550,-0.006829,0.015944,0.000620,-0.000490,-0.008753


In [10]:
# Clustering groups the rows of its input, so flip the table to one company per
# row and one date per column. dropna() runs first and removes every date on
# which at least one company has no return.
X = transposed.dropna().T

X
# Disabled experiment: within-cluster sum of squares for K-means with 1 to 50 clusters.
# NOTE(review): if re-enabled, sum_of_sq[1:50] holds the values for k = 2..50 while
# the x-axis below is range(1, 50), so the curve is drawn one cluster count too low.
# sum_of_sq = np.zeros([50, 1])
# for k in range(1, 51):
#     sum_of_sq[k-1] = KMeans(n_clusters=k).fit(X).inertia_

# plt.plot(range(1, 50), sum_of_sq[1:50])
# plt.title("Elbow Method") 
# plt.xlabel("Number of Cluster") 
# plt.ylabel("Within-cluster Sum of Squares")

# pd.DataFrame(sum_of_sq, columns = ['Difference in SS'], index = range(1,51)).diff()

Date,2019-03-21,2019-03-22,2019-03-25,2019-03-26,2019-03-27,2019-03-28,2019-03-29,2019-04-01,2019-04-02,2019-04-03,...,2021-12-16,2021-12-17,2021-12-20,2021-12-21,2021-12-22,2021-12-23,2021-12-27,2021-12-28,2021-12-29,2021-12-30
symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAPL,0.036830,-0.020708,-0.012091,-0.010332,0.008994,0.001326,0.006518,0.006791,0.014537,0.006855,...,-0.039264,-0.006502,-0.008122,0.019087,0.015319,0.003644,0.022975,-0.005767,0.000502,-0.006578
ABBV,0.010071,-0.018218,-0.003134,0.015721,-0.010773,0.000876,0.007879,0.002358,0.028349,0.000120,...,0.011902,-0.017074,0.010268,-0.006954,0.016160,0.007876,0.009918,-0.000149,0.007218,0.004211
ABT,0.001501,-0.026105,0.003335,0.009843,-0.003418,0.009018,0.006294,-0.003503,-0.000502,-0.001507,...,0.003910,0.007495,-0.015535,0.002149,0.027501,0.001223,0.016528,-0.006998,0.005126,-0.001346
ACN,0.011848,-0.017832,-0.000968,0.011267,-0.002815,0.051961,0.005139,0.001704,-0.005388,0.010378,...,0.067413,-0.010884,-0.026398,0.018430,0.027489,-0.000941,0.029803,-0.000144,0.000361,-0.003828
ADBE,0.017094,-0.016996,0.009935,0.012659,-0.012237,0.004536,0.011232,0.021314,-0.003013,0.000553,...,-0.101915,-0.016693,-0.012342,0.014097,0.011587,0.010000,0.014150,-0.014402,-0.000123,0.002178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VZ,0.010751,0.025219,0.005355,0.009154,0.004123,-0.029566,0.000846,-0.000676,-0.009477,0.005809,...,0.043521,0.007962,-0.001505,-0.005839,-0.000189,-0.001706,0.000000,0.000759,-0.006829,-0.002101
WBA,0.012900,-0.018711,-0.011537,0.003728,0.005006,0.004499,0.012158,0.003477,-0.128052,-0.009393,...,-0.008055,-0.008323,-0.000409,0.023141,0.009007,0.001389,0.004952,0.013799,0.015944,-0.004976
WFC,-0.010714,-0.031087,-0.004761,0.019343,-0.004897,0.006561,-0.015685,0.010141,-0.012293,0.013483,...,0.027789,-0.046720,-0.022941,0.020491,0.004601,0.006871,0.008478,-0.007177,0.000620,-0.007224
WMT,0.004258,-0.007874,-0.001119,0.001528,-0.011290,-0.000823,0.004118,0.002973,-0.008996,0.002579,...,-0.016670,-0.032022,0.003243,0.003017,0.001289,-0.002217,0.009105,0.014351,-0.000490,0.003223


In [None]:
# Fixed seed: GaussianMixture is randomly initialised, so without it the cluster
# assignments (and the saved CSV) change from run to run.
GMM_RANDOM_STATE = 42

# One row per company, one column per date; dates on which any company has no
# return are dropped. Built once and reused for both fit and predict.
cluster_features = transposed.dropna().transpose()

#Get 17 clusters
gmm = GaussianMixture(n_components=17, random_state=GMM_RANDOM_STATE)
gmm.fit(cluster_features)

#Predict for each company
clusters = gmm.predict(cluster_features)
# Label each prediction with the row label of the matrix it was predicted from,
# so cluster ids and company names cannot drift out of alignment.
clusters_df = pd.DataFrame({'Cluster': clusters,
                            'Companies': cluster_features.index})

#Sort by cluster, then by company so the row order is deterministic
clusters_df = clusters_df.sort_values(['Cluster', 'Companies']).reset_index(drop=True)

#Save as csv, then read it back so clusters_df matches what is on disk
clusters_df.to_csv("clusters.csv")
clusters_df = pd.read_csv("clusters.csv", index_col=0)