# Time Series AutoEncoder Clustering



### Loading libraries and Tensorboard

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torch.utils.tensorboard import SummaryWriter

from sklearn.cluster import KMeans
from sklearn.metrics.cluster import normalized_mutual_info_score

from tqdm import tqdm

from collections import OrderedDict

In [None]:
%load_ext tensorboard

In [None]:
%tensorboard --logdir runs

### Downloading and using HAR Dataset


HAR: Human Activity Recognition

In [None]:
import os
import requests

# Source URL of the UCI HAR archive and the local file name it is stored under.
DATASET_URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/00240/UCI%20HAR%20Dataset.zip"
DATASET_ZIP = "dataset.zip"

# Skip the download when the archive is already on disk so that re-running the
# notebook does not fetch it again.
if not os.path.exists(DATASET_ZIP):
  response = requests.get(DATASET_URL, timeout=60)
  # Fail loudly on an HTTP error instead of saving the error body as a ".zip".
  response.raise_for_status()
  # Write under a temporary name and rename afterwards, so an interrupted write
  # never leaves a truncated "dataset.zip" that the existence check would trust.
  partial_path = DATASET_ZIP + ".part"
  with open(partial_path, "wb") as f:
    f.write(response.content)
  os.replace(partial_path, DATASET_ZIP)

In [None]:
from zipfile import ZipFile

# Unpack every member of the downloaded archive into the current working directory.
with ZipFile("dataset.zip") as archive:
  archive.extractall()

In [None]:
# !cat "UCI HAR Dataset/train/subject_train.txt"

In [None]:
# Names of the nine inertial-signal files, in the order they are stored in `train_x_vec`.
SIGNAL_NAMES = [
  "body_acc_x", "body_acc_y", "body_acc_z",
  "body_gyro_x", "body_gyro_y", "body_gyro_z",
  "total_acc_x", "total_acc_y", "total_acc_z",
]
TRAIN_DIR = "UCI HAR Dataset/train"
WINDOW_SIZE = 128  # number of readings per window


def _read_signal_windows(path, window_size=WINDOW_SIZE):
  """Read a whitespace-separated file of floats and cut it into fixed-size windows.

  Args:
    path: text file containing whitespace-separated float values.
    window_size: number of consecutive values per window.

  Returns:
    A list of windows, each a list of `window_size` floats, in file order.

  Raises:
    ValueError: if the number of values is not a multiple of `window_size`.
  """
  with open(path, "r") as f:
    # str.split() with no argument splits on any whitespace run, so padding
    # spaces and newlines never produce empty tokens.
    values = [float(token) for token in f.read().split()]
  if len(values) % window_size != 0:
    raise ValueError(
      "{}: {} values is not a multiple of the window size {}".format(path, len(values), window_size)
    )
  return [values[start:start + window_size] for start in range(0, len(values), window_size)]


# Signal name -> list of windows; the window count comes from the file itself
# rather than from a hard-coded constant.
train_x_vec = OrderedDict()
for signal_name in SIGNAL_NAMES:
  train_x_vec[signal_name] = _read_signal_windows(
    f"{TRAIN_DIR}/Inertial Signals/{signal_name}_train.txt"
  )

# Labels: one integer per line.
with open(f"{TRAIN_DIR}/y_train.txt", "r") as f:
  train_y = [int(token) for token in f.read().split()]

In [None]:
# TODO: remember to normalize the inputs to improve learning

In [None]:
# Number of windows, taken from the loaded data instead of a hard-coded count.
n_windows = len(next(iter(train_x_vec.values())))
assert all(len(windows) == n_windows for windows in train_x_vec.values()), \
  "signals have differing window counts"

# One flat float32 vector per window: the signals' readings for that window,
# concatenated in the key order of `train_x_vec`.
outer = [
  np.array([windows[i] for windows in train_x_vec.values()], dtype=np.float32).reshape(-1)
  for i in range(n_windows)
]

outer[1]

array([0.00109375, 0.00455008, 0.00287917, ..., 0.0935352 , 0.08903516,
       0.09061235], dtype=float32)

### Dataset and DataLoader

In [None]:
class HARDataset(Dataset):
  """Map-style dataset pairing each input vector with its integer label.

  Args:
    train_x_inp: indexable collection of per-sample feature vectors.
    train_y_inp: optional sequence of integer labels, one per sample. When
      omitted, the notebook-level `train_y` list is used, which is what this
      class always did before the argument existed.

  Raises:
    ValueError: if `train_y_inp` is given and its length differs from
      `train_x_inp`.
  """

  def __init__(self,train_x_inp,train_y_inp=None):
    self.x = train_x_inp
    if train_y_inp is None:
      # Backward-compatible default: fall back to the global label list.
      labels = train_y
    else:
      labels = train_y_inp
      if len(labels) != len(train_x_inp):
        raise ValueError(
          "got {} samples but {} labels".format(len(train_x_inp), len(labels))
        )
    self.y = torch.as_tensor(labels,dtype=torch.int32)

  def __len__(self):
    """Return the number of samples."""
    return len(self.x)

  def __getitem__(self,index):
    """Return `(features as a float32 tensor, label as an int32 tensor element)`."""
    return torch.tensor(self.x[index],dtype=torch.float32), self.y[index]

In [None]:
# Wrap the flattened windows in a dataset and serve them in shuffled mini-batches of 512.
har = HARDataset(train_x_inp=outer)
har_gen = DataLoader(dataset=har, batch_size=512, shuffle=True)

### Neural Network Model

In [None]:
class HARNet(nn.Module):
  """Autoencoder with one small 1-D conv stack per input signal.

  The input is a flat vector of 9 * 128 values. Each 128-value slice goes
  through its own two-layer Conv1d stack; the flattened results are
  concatenated and compressed by linear layers to a 32-value latent code.
  `decode` mirrors this with linear layers followed by per-signal
  ConvTranspose1d stacks.

  The per-signal layers are kept in `nn.ModuleList`s. A plain Python list does
  not register its modules, which would leave them out of `parameters()` (so
  the optimizer never updates them) and out of `state_dict()` (so checkpoints
  do not store them). A checkpoint that lacks the `convl*` / `convtl*` keys
  therefore fails strict loading into this class.

  `encode`, `decode` and `cluster` move their input to `self.device` and
  return CPU tensors.
  """

  N_SIGNALS = 9   # number of input signals
  WINDOW = 128    # readings per signal
  # Length 128 -> 113 after the kernel-16 conv -> 106 after the kernel-8 conv,
  # with 32 output channels.
  FLAT_PER_SIGNAL = 32 * 106

  def __init__(self):
    super(HARNet,self).__init__()

    self.device = "cuda" if torch.cuda.is_available() else "cpu"

    # Encoder: an independent conv stack for each signal.
    self.convl1 = nn.ModuleList()
    self.convl2 = nn.ModuleList()
    for i in range(self.N_SIGNALS):
      self.convl1.append(nn.Conv1d(1,16,16))
      self.convl2.append(nn.Conv1d(16,32,8))

    self.fc1 = nn.Linear(self.N_SIGNALS*self.FLAT_PER_SIGNAL,512)
    self.fc2 = nn.Linear(512,128)
    self.middle = nn.Linear(128,32)
    self.fct1 = nn.Linear(32,128)
    self.fct2 = nn.Linear(128,512)
    self.fct3 = nn.Linear(512,self.N_SIGNALS*self.FLAT_PER_SIGNAL)

    self.clstr = nn.Linear(32,32)

    # Decoder: an independent transposed-conv stack for each signal.
    self.convtl1 = nn.ModuleList()
    self.convtl2 = nn.ModuleList()
    for i in range(self.N_SIGNALS):
      self.convtl1.append(nn.ConvTranspose1d(32,16,8))
      self.convtl2.append(nn.ConvTranspose1d(16,1,16))

    # Moves every registered submodule, including the ModuleLists above.
    self.to(self.device)

    # Created last so that it sees all registered parameters.
    self.optimizer = optim.Adam(self.parameters())

  def encode(self,x):
    """Encode a batch of flat 1152-value inputs into 32-value latent codes (CPU tensor)."""
    x = x.reshape(-1,1,self.N_SIGNALS*self.WINDOW)
    x = x.to(self.device)

    flats = []
    for i in range(self.N_SIGNALS):
      signal = x[:,:,i*self.WINDOW:(i+1)*self.WINDOW]
      hidden = F.relu(self.convl1[i](signal))
      hidden = F.relu(self.convl2[i](hidden))
      flats.append(torch.flatten(hidden,1))

    concat = torch.cat(flats,1)

    l1 = F.relu(self.fc1(concat))
    l2 = F.relu(self.fc2(l1))
    latent = F.relu(self.middle(l2))

    return latent.cpu()

  def decode(self, latent):
    """Reconstruct flat 1152-value inputs from 32-value latent codes (CPU tensor)."""
    latent = latent.to(self.device)
    lt1 = F.relu(self.fct1(latent))
    lt2 = F.relu(self.fct2(lt1))
    lt3 = self.fct3(lt2)

    signals = []
    for i in range(self.N_SIGNALS):
      chunk = lt3[:,i*self.FLAT_PER_SIGNAL:(i+1)*self.FLAT_PER_SIGNAL]
      # Restore the (batch, 32 channels, steps) layout the encoder flattened.
      chunk = chunk.reshape(chunk.shape[0],32,-1)
      hidden = F.relu(self.convtl1[i](chunk))
      signals.append(torch.flatten(self.convtl2[i](hidden),1))

    return torch.cat(signals,1).cpu()

  def cluster(self,x):
    """Apply the 32->32 clustering layer with ReLU; returns a CPU tensor.

    The input is moved to `self.device` first, because `encode` returns CPU
    tensors while this layer lives on the model's device.
    """
    out = F.relu(self.clstr(x.to(self.device)))
    return out.cpu()

  def forward(self, x):
    """Return the latent code of `x` (same as `encode`)."""
    return self.encode(x)

### Train

In [None]:
# Training configuration.
SEED = 0                   # makes weight init, batch order and KMeans repeatable
N_EPOCHS = 60
KMEANS_WARMUP_EPOCHS = 5   # epochs trained on reconstruction loss only
N_CLUSTERS = 6
KMEANS_LOSS_WEIGHT = 0.04

torch.manual_seed(SEED)

model = HARNet()
model.train()

writer = SummaryWriter()

count = 0
try:
  for i in tqdm(range(N_EPOCHS)):
    for batch_x,batch_y in har_gen:
      model.optimizer.zero_grad()
      en_train = model(batch_x)
      # en_train = model.cluster(enc)
      yhat = model.decode(en_train)
      rloss = F.mse_loss(yhat,batch_x)
      writer.add_scalar("Loss/ReconstructionLoss",rloss.item(),count)

      if i >= KMEANS_WARMUP_EPOCHS:
        # Cluster a detached copy of the batch's latent codes; KMeans itself
        # is not part of the autograd graph.
        AEkmeans = KMeans(N_CLUSTERS,random_state=SEED).fit(en_train.detach().numpy())

        writer.add_scalar("Scores/NMI",normalized_mutual_info_score(AEkmeans.labels_,batch_y.numpy()),count)

        # `labels_` already holds each sample's cluster from the fit, so no
        # separate predict() call on the same data is needed.
        centers = AEkmeans.cluster_centers_[AEkmeans.labels_]

        # Pull each latent code towards its assigned cluster centre.
        kloss = F.mse_loss(torch.tensor(centers,dtype=torch.float32),en_train)
        writer.add_scalar("Loss/KmeansLoss",kloss.item(),count)
        loss = rloss + KMEANS_LOSS_WEIGHT*kloss
      else:
        loss = rloss

      writer.add_scalar("Loss/TotalLoss",loss.item(),count)

      loss.backward()
      model.optimizer.step()
      count+=1
finally:
  # Flush pending events to disk even if training is interrupted.
  writer.close()

100%|██████████| 60/60 [59:11<00:00, 59.20s/it]


### Eval

In [None]:
# Encode the whole dataset in its original order. Inference runs in batches and
# under no_grad, so no autograd graph is built.
har_test_gen = DataLoader(har,batch_size=512,shuffle=False)
model.eval()
encodes = []   # one 1-D latent vector per sample
labels = []    # one integer label per sample
with torch.no_grad():
  for x,y in tqdm(har_test_gen):
    labels.extend(y.numpy())
    encodes.extend(model(x).numpy())


100%|██████████| 7352/7352 [00:56<00:00, 130.12it/s]


#### Normalized mutual information (NMI)

In [None]:
# Cluster the latent codes; a fixed random_state makes the score repeatable.
test_AEKmeans = KMeans(6,random_state=0).fit(np.array(encodes))

# The NMI score needs 1-D label arrays, so flatten the collected labels
# explicitly whatever form each entry takes (tensor, array or scalar).
true_labels = np.concatenate([np.asarray(lbl).reshape(-1) for lbl in labels])
normalized_mutual_info_score(test_AEKmeans.labels_,true_labels)

  
  


0.5023219823757259

### Saving and Loading the model
Comment or uncomment the code below to disable or enable saving and loading.

In [None]:
# torch.save({
#             'epoch': 60,
#             'model_state_dict': model.state_dict(),
#             'optimizer_state_dict': model.optimizer.state_dict(),
#             'loss': loss,
#             }, "/content/drive/MyDrive/University/Internship/Codes/checkpoints/chkpnt1")

In [None]:
CHECKPOINT_PATH = "/content/drive/MyDrive/University/Internship/Codes/checkpoints/chkpnt1"

model2 = HARNet()
# map_location lets a checkpoint saved on a GPU machine load on a CPU-only one.
chkpnt = torch.load(CHECKPOINT_PATH,map_location=model2.device)
# Loading stays strict (the default): a checkpoint missing any layer fails here
# instead of silently leaving that layer randomly initialised.
model2.load_state_dict(chkpnt["model_state_dict"])
# model2 = model

<All keys matched successfully>

### T-SNE Visualization

In [None]:
# Collect raw inputs, labels and latent codes for the whole dataset in its
# original order. Inference runs in batches and under no_grad, so no autograd
# graph is built.
har_test_gen = DataLoader(har,batch_size=512,shuffle=False)
model2.eval()
encodes = []   # one 1-D latent vector per sample
labels = []    # one integer label per sample
xs = []        # one 1-D raw input vector per sample
with torch.no_grad():
  for x,y in tqdm(har_test_gen):
    xs.extend(x.numpy())
    labels.extend(y.numpy())
    encodes.extend(model2(x).numpy())

100%|██████████| 7352/7352 [01:10<00:00, 104.07it/s]


In [None]:
# Activity id -> activity name.
ACTIVITY_NAMES = {
  1: "WALKING",
  2: "WALKING_UPSTAIRS",
  3: "WALKING_DOWNSTAIRS",
  4: "SITTING",
  5: "STANDING",
  6: "LAYING",
}

# One activity name per sample, used to colour the plots; ids that are not in
# the table are skipped.
colors = [
  ACTIVITY_NAMES[label_id]
  for label_id in np.array(labels).reshape(-1).tolist()
  if label_id in ACTIVITY_NAMES
]

In [None]:
from sklearn.manifold import TSNE
import plotly.express as px

# 2-D t-SNE embedding of the raw (un-encoded) inputs. t-SNE is stochastic, so
# the seed is fixed to make the figure repeatable between runs.
tsne = TSNE(random_state=0)
proj_main_data = tsne.fit_transform(np.asarray(xs))



In [None]:
# 2-D t-SNE embedding of the latent codes. This uses its own estimator
# (`tsne2`) rather than re-fitting the one created for the raw data; the seed
# is fixed so the figure is repeatable.
tsne2 = TSNE(random_state=0)
proj_encoded_data = tsne2.fit_transform(np.asarray(encodes))



#### Main data with real labels without encoding

In [None]:
# The colours are activity names, so the legend is titled accordingly.
px.scatter(
  proj_main_data, x=0, y=1, color=colors,
  labels={'color': 'activity'},
  title="t-SNE of raw data, colored by true activity",
)

#### Main data with Kmeans Labels without encoding

In [None]:
# Display name of each KMeans cluster index (0..5).
CLUSTER_NAMES = ["one","two","three","four","five","six"]

# Baseline: cluster the raw inputs directly; a fixed random_state makes the
# score and the plot repeatable.
test_AEKmeans = KMeans(6,random_state=0).fit(np.array(xs))
print(normalized_mutual_info_score(test_AEKmeans.labels_,np.array(labels).reshape(-1)))

label_kmeans = [CLUSTER_NAMES[cluster_id] for cluster_id in test_AEKmeans.labels_]

# The colours are KMeans clusters, so the legend is titled accordingly.
px.scatter(
  proj_main_data, x=0, y=1, color=label_kmeans,
  labels={'color': 'cluster'},
  title="t-SNE of raw data, colored by KMeans cluster",
)

0.4293510457339367


#### Encoded data with real labels

In [None]:
# The colours are activity names, so the legend is titled accordingly.
px.scatter(
  proj_encoded_data, x=0, y=1, color=colors,
  labels={'color': 'activity'},
  title="t-SNE of encoded data, colored by true activity",
)

#### Encoded data with Kmeans labels

In [None]:
# Display name of each KMeans cluster index (0..5).
CLUSTER_NAMES = ["one","two","three","four","five","six"]

# Cluster the latent codes; a fixed random_state makes the plot repeatable.
test_AEKmeans = KMeans(6,random_state=0).fit(np.array(encodes))

label_kmeans = [CLUSTER_NAMES[cluster_id] for cluster_id in test_AEKmeans.labels_]

# The colours are KMeans clusters, so the legend is titled accordingly.
px.scatter(
  proj_encoded_data, x=0, y=1, color=label_kmeans,
  labels={'color': 'cluster'},
  title="t-SNE of encoded data, colored by KMeans cluster",
)

In [None]:
from sklearn.manifold import MDS

# MDS starts from a random configuration, so both estimators are seeded to
# make the embeddings repeatable between runs.
mds = MDS(random_state=0)
mds2 = MDS(random_state=0)

mds_main_data = mds.fit_transform(np.asarray(xs))
mds_encoded_data = mds2.fit_transform(np.asarray(encodes))

In [None]:
# The colours are activity names, so the legend is titled accordingly.
px.scatter(
  mds_main_data, x=0, y=1, color=colors,
  labels={'color': 'activity'},
  title="MDS of raw data, colored by true activity",
)

In [None]:
# The colours are activity names, so the legend is titled accordingly.
px.scatter(
  mds_encoded_data, x=0, y=1, color=colors,
  labels={'color': 'activity'},
  title="MDS of encoded data, colored by true activity",
)

In [None]:
# Contingency table: cluster index -> sample count per activity, where list
# position k holds the count for activity id k + 1.
dist_over_labels = OrderedDict()
for cluster_id in range(len(test_AEKmeans.cluster_centers_)):
  dist_over_labels[int(cluster_id)] = [0]*6

# Encode and assign clusters batch by batch, under no_grad, instead of one
# sample at a time with autograd enabled.
har_test_gen = DataLoader(har,batch_size=512,shuffle=False)
with torch.no_grad():
  for x,y in tqdm(har_test_gen):
    batch_clusters = test_AEKmeans.predict(model2(x).numpy())
    for cluster_id, activity_id in zip(batch_clusters, y.tolist()):
      dist_over_labels[int(cluster_id)][int(activity_id)-1] += 1

dist_over_labels



100%|██████████| 7352/7352 [01:18<00:00, 94.12it/s]


OrderedDict([(0, [0, 0, 0, 8, 0, 695]),
             (1, [431, 428, 65, 34, 73, 55]),
             (2, [109, 42, 4, 1112, 1301, 0]),
             (3, [77, 221, 519, 0, 0, 0]),
             (4, [609, 380, 398, 0, 0, 1]),
             (5, [0, 2, 0, 132, 0, 656])])

In [None]:
# Print one line per cluster: its index followed by its per-activity counts.
for cluster_id in dist_over_labels:
  print(cluster_id, dist_over_labels[cluster_id])

0 [0, 0, 0, 8, 0, 695]
1 [431, 428, 65, 34, 73, 55]
2 [109, 42, 4, 1112, 1301, 0]
3 [77, 221, 519, 0, 0, 0]
4 [609, 380, 398, 0, 0, 1]
5 [0, 2, 0, 132, 0, 656]


In [None]:
# Reshape the per-cluster counts into one column per activity for plotting.
activity_columns = [
  "WALKING", "WALKING_UPSTAIRS", "WALKING_DOWNSTAIRS",
  "SITTING", "STANDING", "LAYING",
]

plot_dict = {"clusters": list(range(len(dist_over_labels)))}
for position, activity in enumerate(activity_columns):
  # Entry `position` of every cluster's count list belongs to this activity.
  plot_dict[activity] = [counts[position] for counts in dist_over_labels.values()]

plot_dict

{'clusters': [0, 1, 2, 3, 4, 5],
 'WALKING': [0, 431, 109, 77, 609, 0],
 'WALKING_UPSTAIRS': [0, 428, 42, 221, 380, 2],
 'WALKING_DOWNSTAIRS': [0, 65, 4, 519, 398, 0],
 'SITTING': [8, 34, 1112, 0, 0, 132],
 'STANDING': [0, 73, 1301, 0, 0, 0],
 'LAYING': [695, 55, 0, 0, 1, 656]}

In [None]:
# Stacked bar chart: one bar per cluster, one segment per activity.
activity_series = [
  "WALKING",
  "WALKING_UPSTAIRS",
  "WALKING_DOWNSTAIRS",
  "SITTING",
  "STANDING",
  "LAYING",
]
df = pd.DataFrame(data=plot_dict)
px.bar(df, x="clusters", y=activity_series)