In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from tqdm import tqdm

from datetime import datetime
import yfinance as yf
import matplotlib.pyplot as plt
import scipy.stats as stats

In [None]:
from utils import *
from data_handler import DataHandler
from feature_engineering import FeatureEngineering

In [None]:
# Project helpers used throughout the notebook:
#  - data_handler lists the S&P 500 constituents (get_snp) and fetches per-ticker data (get_data)
#  - feature_handler builds the per-ticker movement features used for labelling
data_handler = DataHandler()
feature_handler = FeatureEngineering()

In [None]:
# Three companies list two share classes each -- (GOOG, GOOGL), (FOX, FOXA), (NWS, NWSA) --
# so the count printed here is expected to be 503 rather than 500.
print("Total S&P 500 Companies: ", len(data_handler))

In [None]:
# Load the S&P 500 constituents table (its "Symbol" column is used below) and show it
snp = data_handler.get_snp()
display(snp)

In [None]:
# Some companies are listed under two share classes that behave alike,
# so the second class of each pair is dropped before building the ticker list.
duplicate_share_classes = ["GOOG", "FOX", "NWS"]
is_duplicate_class = snp["Symbol"].isin(duplicate_share_classes)
snp = snp.loc[~is_duplicate_class]

# Unique symbols, in order of first appearance
tickers = pd.unique(snp["Symbol"]).tolist()
print("Total Unique Tickers: ", len(tickers))

# STORING AND PREPARING AVERAGE RETURN DATA FOR FUTURE USE

In [None]:
# For each stock in our S&P 500 list, fetch the data and prepare the features
# that will be used to label the stock at each timestamp.

feature_frames = []
for ticker in tqdm(list(tickers)):  # one pass over every selected ticker
    try:
        # Download the raw data for this ticker
        df = data_handler.get_data(ticker)
        # A ticker that returned no rows is left out of the experiment
        if len(df) == 0:
            print(f"Skipping Ticker: {ticker} due to :: no data download")
            continue
        # Fill the dates missing between start and end date;
        # missing days take the values of the last observed date
        df = feature_handler.fill_missing_date(df.copy())
    except Exception as e:
        # Best effort: report the failure and move on to the next ticker
        print(f"Skipping Ticker: {ticker} due to :: {e}")
        continue

    # Rate of price change between the current price and the price 2 years ahead
    price_diff = feature_handler.get_price_diff(df.copy())
    # Mean and std of that rate of change over rolling windows (currently 7 and 60 days)
    price_diff_rolling = feature_handler.add_rolling_average(price_diff.copy())

    # Columns used to label the stock at each point (price diff plus near-window changes)
    feature = feature_handler.get_feature(price_diff_rolling)
    # Tag every row with the ticker it belongs to
    feature["Ticker"] = ticker
    feature_frames.append(feature)

# One combined dataframe for all tickers (reused by the later steps)
all_feature = pd.concat(feature_frames)

display(all_feature)


In [None]:
# Persist the combined per-ticker feature table through the project's save_data helper
save_data(all_feature, "data_snp500_movement_v2.csv")

In [None]:
# Total number of feature rows across all tickers
len(all_feature)

In [None]:
# Number of distinct tickers that produced features (skipped tickers are not counted)
all_feature["Ticker"].nunique()

## Assigning a label to each movement of every stock (labels range from 1 to 12)

In [None]:
# Build the labelled movement data with get_data_label's default arguments
# (each stock movement gets one of 12 labels)
temp_all, df = get_data_label()
# Length of each stock sequence
lens = get_length_of_tickers(temp_all)
# Store the complete data as a NumPy array for easier operations
full_array = get_array(temp_all)
# Compute the similarity matrix between the stock sequences;
# all_tickers lists the tickers in the order of their position in that matrix
scores, all_tickers = get_similarity_score(df, full_array, lens)

In [None]:
save_data(scores, 'all_data_snp500_similarity_score_train.npy') # similarity score between each pair of stocks
save_data(all_tickers, "all_data_snp500_similarity_tickers_v2.pickle") # all tickers in the order of their position in the similarity array 

# Clustering of Stocks based upon the identified sequence Labels

In [None]:
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster

# Turn the saved pairwise similarity scores into distances and build a Ward linkage tree
similarity_matrix = load_data("all_data_snp500_similarity_score_train.npy")
distance_matrix = 1 - similarity_matrix
# NOTE(review): SciPy's linkage() interprets a 2-D array as observation vectors (one row per
# stock), not as a precomputed square distance matrix; a precomputed matrix has to be passed
# in condensed form. Left unchanged here because the distance cut-off used for the flat
# clusters depends on the current behaviour -- confirm which interpretation is intended.
linked = linkage(distance_matrix, 'ward')

plt.figure(figsize=(10, 7))
# The plot data is bound to its own name: assigning it to `dendrogram` would replace the
# imported SciPy function with a dict and break any later call to dendrogram().
dendrogram_data = dendrogram(linked,
                             orientation='top',
                             labels=[f"{i}" for i in range(similarity_matrix.shape[0])],
                             distance_sort='ascending',
                             show_leaf_counts=True)

plt.title('Dendrogram')
plt.xlabel('Ticker index')
plt.ylabel('Distance')
plt.show()


# Clustering Stocks into multiple buckets
### Stocks in the same bucket are the closest to each other in terms of similarity

In [None]:
# Clustering stocks into multiple buckets:
# the linkage tree is cut at a fixed distance so that stocks falling into the
# same bucket are the ones closest to each other in terms of similarity.
cutoff_distance = 2
cluster_labels = fcluster(linked, t=cutoff_distance, criterion='distance')

print("Unique Cluster labels:", len(set(cluster_labels)))

# Tickers in the same order as the rows of the similarity matrix
ticker_name = load_data("all_data_snp500_similarity_tickers_v2.pickle")

# Pair every cluster id with the ticker stored at the same position
positions = range(len(cluster_labels))
df_cluster_label = pd.DataFrame.from_dict({
    "Ticker": [ticker_name[pos] for pos in positions],
    "Cluster": [cluster_labels[pos] for pos in positions],
})
display(df_cluster_label)
save_data(df_cluster_label, "stock_cluster_v2.csv")

## Beginning the Evaluation Phase
* Unlike the data used for clustering, which was prior to 1st Jan 2021, we evaluate on the following 6 months of data, starting from 1st Jan 2021

In [None]:
# Label every stock movement (12 possible labels) over the 2015-01-01 .. 2024-12-01 range,
# using the evaluation label index
temp_all, df = get_data_label(
    lab_idx=label_index_eval,
    start_date=datetime(2015, 1, 1),
    end_date=datetime(2024, 12, 1),
)

# Keep only the evaluation window: strictly after 1 Jan 2021 and strictly before 1 Jul 2021
eval_start, eval_end = datetime(2021, 1, 1), datetime(2021, 7, 1)
in_window_labels = (temp_all["current_date"] > eval_start) & (temp_all["current_date"] < eval_end)
temp_all = temp_all.loc[in_window_labels]
in_window_prices = (df["current_date"] > eval_start) & (df["current_date"] < eval_end)
df = df.loc[in_window_prices]

# Length of each stock sequence inside the window
lens = get_length_of_tickers(temp_all)
# Complete windowed data as a NumPy array for easier operations
full_array = get_array(temp_all)
# Similarity matrix between the stock sequences of the evaluation window
scores, all_tickers = get_similarity_score(df, full_array, lens)

In [None]:
# create a dataframe of similar stocks that are derived from historical data
df_same_cluster_ticker = df_cluster_label.groupby("Cluster")["Ticker"].apply(list).reset_index()
display(df_same_cluster_ticker)

In [None]:
matching_percentage = []

# Score every cluster by how well the label sequences of its members agree
# with each other during the evaluation window.
for row in tqdm(df_same_cluster_ticker.itertuples(), total=len(df_same_cluster_ticker)):
    # Cell-local names are used on purpose: assigning to `tickers` / `temp` here would
    # overwrite the notebook-wide ticker list and other cells' data with per-cluster scratch values.
    cluster_tickers = row.Ticker
    n_members = len(cluster_tickers)

    # A cluster with a single member has nothing to compare against: score it as 1
    if n_members == 1:
        matching_percentage.append(1)
        continue

    cluster_rows = temp_all.loc[temp_all["Ticker"].isin(cluster_tickers)]
    cluster_array = get_array(cluster_rows)

    # Pairwise agreement between the members' sequences; the array is walked in
    # consecutive chunks of `lens` entries, one chunk per sequence.
    similarity = np.zeros((n_members, n_members))
    for i in range(0, cluster_array.shape[0], lens):
        base = cluster_array[i:i+lens]
        for j in range(0, cluster_array.shape[0], lens):
            target = cluster_array[j:j+lens]
            matches, total = 0, 0
            for base_label, target_label in zip(base, target):
                # Positions where either sequence holds a negative value are not compared
                if base_label < 0 or target_label < 0:
                    continue
                if base_label == target_label:
                    matches += 1
                total += 1
            # The +1 keeps the division defined when no position was comparable
            # (it also makes every score slightly smaller than matches/total)
            similarity[i//lens, j//lens] = matches/(total+1)

    # Cluster quality = mean of the off-diagonal similarities
    val = sum_except_diagonal(similarity)
    norm = n_members**2 - n_members
    matching_percentage.append(val/norm)


In [None]:
# Attach each cluster's evaluation score and its number of members, then show the table
df_same_cluster_ticker["Average Percentage Match"] = matching_percentage
df_same_cluster_ticker["len"] = df_same_cluster_ticker["Ticker"].map(len)
display(df_same_cluster_ticker)

In [None]:
# Score scaled by cluster size, so that larger clusters count more in the weighted average
match_score = df_same_cluster_ticker["Average Percentage Match"]
cluster_size = df_same_cluster_ticker["len"]
df_same_cluster_ticker["weighted match"] = match_score * cluster_size

In [None]:
# Plain mean over clusters
mean_match = df_same_cluster_ticker["Average Percentage Match"].mean()
print("Average Percentage Match in Test Period: ", mean_match)

# Mean weighted by the number of tickers in each cluster
weighted_total = df_same_cluster_ticker["weighted match"].sum()
ticker_total = df_same_cluster_ticker["len"].sum()
print("Weighted Average Percentage Match in Test Period: ", weighted_total/ticker_total)

In [None]:
# Reload the full constituents table and map every symbol to its GICS sub-industry
data_handler = DataHandler()
snp = data_handler.get_snp()
snp_industry = snp.set_index("Symbol")["GICS Sub-Industry"].to_dict()

# Multi-member clusters only, best evaluation score first. The explicit copy makes the
# columns added below land on an independent frame rather than on a derived selection.
temp = (
    df_same_cluster_ticker
    .loc[df_same_cluster_ticker["len"] > 1]
    .sort_values(by=["Average Percentage Match"], ascending=False)
    [["Cluster", "Ticker", "Average Percentage Match", "len"]]
    .copy()
)

# For every cluster, count how many of its members fall into each sub-industry
industry_counter = []
for row in temp.itertuples():
    # Iterate the members directly: assigning them to `tickers` would overwrite the
    # notebook-wide ticker list with the last cluster's members.
    counter = {}
    for member in row.Ticker:
        sub_industry = snp_industry[member]
        counter[sub_industry] = counter.get(sub_industry, 0) + 1
    industry_counter.append(counter)

temp["Sub-Industry Counter"] = industry_counter
# Number of distinct sub-industries represented in the cluster
temp["Different Sub-Industries"] = temp["Sub-Industry Counter"].apply(len)

In [None]:
# Persist the per-cluster evaluation table (with its sub-industry breakdown) via save_data
save_data(temp, 'clustered_stocks_final.csv')

In [None]:
# Plotting Sample 
for i, row in enumerate(temp.itertuples()):
  cluster_ticker = row.Ticker
  tt = df.loc[df["Ticker"].isin(cluster_ticker)]
  fig = px.line(tt, x='current_date', y='price_diff', color='Ticker', title='Rate Of Change')
  fig.show()

  if i==9:
    break