In [35]:
import pandas as pd
import numpy as np
import plotly.express as px
from pathlib import Path
from tslearn.clustering import TimeSeriesKMeans
from sklearn.preprocessing import StandardScaler
import umap
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN


In [18]:
# Location of the pre-processed CSV inputs, relative to the notebook.
DATA_ROOT = Path("data")
PROCESSED_DATA = DATA_ROOT / "processed"

# Each CSV is read with its first column as a parsed datetime index.
trends, prices, volumes = (
    pd.read_csv(PROCESSED_DATA / file_name, index_col=0, parse_dates=True)
    for file_name in ("trends.csv", "prices.csv", "volumes.csv")
)

# Display (rows, columns) of the three frames as a sanity check.
tuple(frame.shape for frame in (trends, prices, volumes))

((1781, 1859), (1781, 92), (1781, 92))

In [34]:
# How many column labels appear in both `trends` and `prices`.
trends.columns.intersection(prices.columns).size

66

In [19]:
def normalize_df(df: pd.DataFrame) -> pd.DataFrame:
    """Scale every column of ``df`` so that its values sum to 1.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose columns are normalised independently of each other.

    Returns
    -------
    pd.DataFrame
        A new frame with the same index and columns as ``df``; the input is
        not modified.

    Notes
    -----
    A column whose sum is exactly 0 is returned unscaled (divided by 1), so
    an all-zero column stays all-zero instead of turning into NaN/inf.
    """
    column_totals = df.sum(axis=0)
    # Dividing by a zero total would inject NaN/inf into the result, so those
    # columns are divided by 1 instead.
    safe_totals = column_totals.where(column_totals != 0, 1)
    return df / safe_totals

# Normalise each frame column-wise with `normalize_df`.
trends, prices, volumes = (
    normalize_df(frame) for frame in (trends, prices, volumes)
)

# Put trends and volumes side by side, then transpose so that each row of
# `unified` is one series and each column is one entry of the original index.
unified = pd.concat(objs=[trends, volumes], axis="columns").transpose()
unified.shape

(1951, 1781)

In [30]:
# (rows, columns) of the normalised price frame.
(len(prices.index), len(prices.columns))

(1781, 92)

In [29]:
# Keep every row whose index label occurs more than once in `unified`
# (keep=False marks all occurrences, not just the repeats).
duplicates = unified[unified.index.duplicated(keep=False)]
duplicates.index

Index(['ISRG', 'RSX', 'Russell2000', 'Google', 'V',
       'US_Consumer_Discretionary', 'BAC', 'ISRA', 'SYK', 'NOW',
       ...
       'MA', 'SLB', 'AAPL', 'HydrogenCCS', 'SAP', 'ASML', 'JNJ', 'CRM',
       'CleanWater', 'BLK'],
      dtype='object', length=132)

In [31]:
# Rows in `unified` minus its number of distinct index labels, i.e. how many
# rows exist only because a label is repeated. Computed from the data rather
# than from numbers copied out of other cells' outputs.
len(unified.index) - unified.index.nunique(dropna=False)

66

In [26]:
# Occurrences of each index label in `unified`, most frequent first.
unified.index.value_counts(sort=True, ascending=False)

GS                                  2
SAP                                 2
NVO                                 2
PANW                                2
SHE                                 2
                                   ..
Azure price                         1
buy Workday shares                  1
Galaxy Buds sales in Afghanistan    1
electric vehicle                    1
SupportIsrael                       1
Name: count, Length: 1885, dtype: int64

In [20]:
# Seeded UMAP embedding of the rows of `unified`.
reducer = umap.UMAP(random_state=0, n_neighbors=10, min_dist=0.1)
X_embedded = reducer.fit_transform(unified)

# Seeded 5-cluster time-series k-means on the same rows, Euclidean metric.
ts_kmeans = TimeSeriesKMeans(random_state=0, n_clusters=5, metric="euclidean")
labels = ts_kmeans.fit_predict(unified)

# Number of rows assigned to each cluster label.
count = np.unique(labels, return_counts=True)
cluster_ids, cluster_sizes = count
pd.Series(cluster_sizes, index=cluster_ids).sort_index()

  warn(


0       3
1    1798
2      13
3       2
4     135
dtype: int64

In [23]:
# Scatter the 2-D UMAP embedding, one point per row of `unified`, coloured by
# the cluster label from `ts_kmeans`. The title names the Euclidean metric
# because that is what the clustering cell configures.
fig = px.scatter(
    x=X_embedded[:, 0],
    y=X_embedded[:, 1],
    # Cast to str so the cluster ids are passed as text rather than numbers.
    color=labels.astype(str),
    hover_name=[str(name) for name in unified.index],
    title="Euclidean Time Series K-Means Clustering (UMAP visualization)",
    width=1200,
    height=1200,
)
fig.update_traces(marker=dict(size=10, line=dict(width=1, color='DarkSlateGrey')))
fig.show()