In [1]:
!pwd

/Users/ashish1610dhiman/gatech_projects/time_series_practice/notebooks/stocks_clustering


In [2]:
import sys
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import yfinance as yf
import datetime

from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, TimeSeriesResampler

from sklearn.metrics import adjusted_mutual_info_score as ami

from tqdm.notebook import tqdm

sys.version

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


'3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:55:37) \n[Clang 14.0.6 ]'

In [3]:
%load_ext autoreload 
%autoreload 2

In [4]:
SEED = 77

### Preprocess data

In [5]:
df_ticker = pd.read_csv("../../data/sp500_ticker_sector.csv")
stock_prices = pd.read_csv("../../data/sp500_stock_prices.xlsx", header=[0, 1], low_memory=False)

In [6]:
stock_prices.head()

Unnamed: 0_level_0,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,Adj Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,AAL,AAP,AAPL,ABBV,ABC,ABT,ACGL,ACN,ADBE,...,WYNN,XEL,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
0,37.941978,51.07991,148.292755,24.603207,46.979839,79.608658,38.606461,19.496668,77.739204,72.339996,...,2228900,2534900,10220400,1825800,605900,2283466,936579,411800,2298700,1784200
1,37.231033,51.051483,146.338089,23.910097,46.095711,78.93103,38.615059,19.379999,76.42662,71.980003,...,1695100,3107200,18502400,1469800,1369900,4418651,2223873,420300,5326000,3112100
2,36.651051,50.255592,146.235245,23.912344,45.867554,79.362244,38.176548,19.423332,75.875359,70.529999,...,1975800,4749600,16670700,1988400,1333200,5004401,1835563,527500,9086500,3977200
3,37.137489,50.227158,149.377655,24.247646,47.721359,80.946335,38.486088,19.536667,77.467957,71.110001,...,1472000,2833400,13590700,959800,1038600,4554134,1505860,467800,2759900,2481800
4,38.25069,50.843033,150.686996,25.179295,48.220463,81.131142,39.27713,19.876667,78.649269,72.919998,...,1676600,2516800,15487500,1409500,821800,4258268,1449004,324400,1831500,3121300


In [7]:
stock_prices.columns.get_level_values(0).unique()

Index(['Adj Close', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')

In [8]:
close_prices = stock_prices["Adj Close"]
close_prices.index = pd.to_datetime(stock_prices[(     'Date', 'Unnamed: 0_level_1')].values)

close_prices.head()

KeyError: ('Date', 'Unnamed: 0_level_1')

In [None]:
close_prices.info()

In [None]:
close_prices.shape, close_prices.dropna(axis=1).shape, close_prices.dropna(axis=1).isna().sum().sum()

In [None]:
close_prices = close_prices.dropna(axis=1)
close_prices.shape

In [None]:
close_prices.index = pd.to_datetime(close_prices.index)

In [None]:
close_prices = close_prices["2018":]
close_prices.shape

In [None]:
close_prices.to_csv("../../data/sp500_close_prices.csv")

#### Sample plots

In [None]:
close_prices[list(close_prices.columns)[:10]].plot()

In [None]:
close_prices[list(close_prices.columns)[200:210]].plot()

#### Sectors available

In [None]:
df_ticker.head()

In [None]:
df_ticker["GICS Sector"].value_counts(),df_ticker["GICS Sector"].nunique()

### Time Series Clustering
#### Preprocess time series data

In [None]:
close_prices.shape

In [None]:
#we have 479 time series
X_train = TimeSeriesScalerMeanVariance().fit_transform(close_prices.T)
X_train.shape

In [None]:
get_sector = lambda x: df_ticker[df_ticker.Symbol==x]["GICS Sector"].values[0]

In [None]:
get_sector("MMM")

In [None]:
y_train = np.array([get_sector(tick) for tick in close_prices.columns])
y_train.shape

In [None]:
pd.DataFrame(X_train[:20,:,0].T).plot()

#### Plot all sectors together

In [None]:
import matplotlib
matplotlib.rcParams['figure.figsize'] = [12, 5]

In [None]:
for sector in set(y_train):
    stocks_with_sector = y_train==sector
    df_subset = close_prices.loc[:,stocks_with_sector]
    sector_cols = df_subset.columns
    scaled_subset = pd.DataFrame(X_train[stocks_with_sector,:,0].T,\
                            columns = sector_cols)
    scaled_subset.index = df_subset.index
    random_columns = np.random.choice(sector_cols,size = min(10,len(scaled_subset)))
    fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True)
    df_subset[random_columns].plot(ax=axes[0], title =f"{sector} | original")
    scaled_subset[random_columns].plot(ax=axes[1], title =f"{sector} | scaled")

### Time series clustering

In [None]:
### dummy data
from tslearn.datasets import CachedDatasets
X_train1, y_train1, X_test1, y_test1 = CachedDatasets().load_dataset("Trace")
X_train1 = X_train1[y_train1 < 4]  # Keep first 3 classes
X_train1.shape, y_train1.shape

In [None]:
X_train1.shape

In [None]:
X_train.shape

In [None]:
def ad_clustering(X_train,y_train,k, seed = SEED):
    model = TimeSeriesKMeans(n_clusters=k,
                          n_init=2,
                          metric="dtw",
                          verbose=False,
                          max_iter_barycenter=10,tol=1e-4,
                          random_state=seed,n_jobs=-1)
    y_pred = model.fit_predict(X_train)
    ami1 = ami(labels_true = y_train, labels_pred = y_pred)
    return (model,y_pred,ami1)

In [None]:
X_train[:10,1000:,].shape

In [None]:
result_dict={}
for k in tqdm(range(5,20)):
    result_dict[k] = ad_clustering(X_train[:10,1000:,],y_train[:10],k)
    print (f"For k = {k}, ami = {ami:.5%}")