#### Import libraries

In [1]:
import numpy as np
import plotly.express as px
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

#### Load data and check dimensions

In [2]:
# Training split: embeddings and their cluster labels from the "224" files.
X_train = np.load('data/224_train_embeddings.npy')
train_clusters = np.load('data/224_train_clusters.npy')

# Evaluation split, read from the "241_1h" files.
# NOTE(review): these files are also named "*_train_*" even though they are
# used as the test set in this notebook -- confirm that this is intended.
test_clusters = np.load("data/241_1h_train_clusters.npy")
X_test = np.load("data/241_1h_train_embeddings.npy")

In [3]:
# Class balance of the training labels: returns the distinct cluster ids
# together with the number of samples carrying each id.
np.unique(train_clusters, return_counts=True)

(array([0, 1, 2, 3, 4]), array([  257,   284,  2928, 25432,   691]))

In [4]:
# Sanity check: print the train and test embedding shapes on separate lines
# so their dimensions can be compared at a glance.
print(X_train.shape, X_test.shape, sep='\n')

(29592, 192)
(9043, 192)


#### Plot data using PCA

In [5]:
# Fit PCA on the TRAIN embeddings only, then project the test embeddings with
# the same components so both figures share one coordinate system.
# random_state only has an effect when scikit-learn selects a randomized SVD
# solver; pinning it keeps the projection reproducible across re-runs.
pca = PCA(n_components=2, random_state=0)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# One fixed colour per cluster id, shared by the train and test figures, so a
# given cluster is drawn in the same colour in both plots.
_palette = px.colors.qualitative.Plotly
_cluster_names = np.union1d(train_clusters, test_clusters).astype(str).tolist()
_cluster_colors = {
    name: _palette[i % len(_palette)] for i, name in enumerate(_cluster_names)
}


def _scatter_by_cluster(points, clusters, title):
    """Scatter the first two columns of ``points`` coloured by cluster label.

    The labels are cast to strings because Plotly Express treats numeric
    ``color`` values as continuous and then ignores discrete colour settings.

    Args:
        points: 2-D array; column 0 is plotted on x (PC1), column 1 on y (PC2).
        clusters: 1-D array of cluster labels, one per row of ``points``.
        title: Figure title.

    Returns:
        The Plotly figure (not yet shown).
    """
    return px.scatter(
        x=points[:, 0],
        y=points[:, 1],
        color=np.asarray(clusters).astype(str),
        color_discrete_map=_cluster_colors,
        category_orders={'color': _cluster_names},
        labels={'x':'PC1', 'y':'PC2', 'color':'Cluster'},
        title=title,
    )


# plot the train data colored by the true clusters
fig = _scatter_by_cluster(X_train_pca, train_clusters, 'Train data colored by cluster')
fig.show()

# plot the test data colored by the true clusters
fig = _scatter_by_cluster(X_test_pca, test_clusters, 'Test data colored by cluster')
fig.show()

In [6]:
# Time-series view of the test data: first PCA component against the sample
# index, coloured by the true cluster.
# NOTE(review): the x axis is just the row index (np.arange); labelling it
# "Time" assumes the rows are stored in chronological order -- confirm.
#
# Labels are cast to strings because Plotly Express treats numeric `color`
# values as continuous, which would ignore the discrete palette. The colour
# map is built from the union of train and test labels so each cluster id
# keeps one fixed colour regardless of which ids occur in the test data.
_ts_palette = px.colors.qualitative.Plotly
_ts_cluster_names = np.union1d(train_clusters, test_clusters).astype(str).tolist()
fig = px.scatter(
    x=np.arange(X_test_pca.shape[0]),
    y=X_test_pca[:, 0],
    color=np.asarray(test_clusters).astype(str),
    color_discrete_map={
        name: _ts_palette[i % len(_ts_palette)]
        for i, name in enumerate(_ts_cluster_names)
    },
    category_orders={'color': _ts_cluster_names},
    labels={'x':'Time', 'y':'PC1', 'color':'Cluster'},
    title='Test data colored by cluster',
    )
# Optional: uncomment to shrink the markers if the points overlap too much.
#fig.update_traces(marker=dict(size=2))
fig.show()


#### Train KNN classifier on train data

In [10]:
# 1-nearest-neighbour classifier under cosine distance: fit on the training
# embeddings and their cluster labels, then label every test embedding.
knn = KNeighborsClassifier(metric='cosine', n_neighbors=1).fit(X_train, train_clusters)
preds = knn.predict(X_test)

# Rows are the true test clusters, columns are the predicted clusters.
# NOTE(review): in the recorded output most of the mass is off the diagonal
# (e.g. true 3 -> predicted 2, true 0/1/2 -> predicted 3). This may mean the
# cluster ids are not aligned between the two recordings -- confirm before
# reading this as classifier accuracy.
confusion_matrix(y_true=test_clusters, y_pred=preds)

array([[   3,    6,    8, 1176,   22],
       [   0,    1,    3, 1061,   18],
       [   2,    2,   18, 3343,   11],
       [ 112,    0, 2809,  211,   13],
       [  77,    0,   12,   44,   91]])