In [None]:
import pandas as pd
import numpy as np
from tqdm. auto import tqdm
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.nn. functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from matplotlib.ticker import MaxNLocator
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from multiprocessing import cpu_count
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
#from pytorch_lightning.metrics.functional import accuracy
from sklearn.metrics import classification_report, confusion_matrix
import os
from torch.utils.tensorboard import SummaryWriter
from torch.nn.utils.rnn import pad_sequence
import warnings
warnings.filterwarnings("ignore")

from multiprocessing import cpu_count
import torchmetrics
from torchmetrics.functional import accuracy

In [None]:
# Load the per-event time-series table; the path is relative to the notebook's working directory.
X_train = pd.read_csv("../Dataset/timeseries.csv")

In [None]:
# Preview the first rows of the raw feature table.
X_train.head()

In [None]:
columns_to_encode = ['activity', 'up_event', 'down_event', 'text_change']

# One-hot encode every categorical column in a single call. The dummy columns
# are named "<column>_<value>", appended after the untouched columns in the
# order listed above, and the source columns are dropped -- the same layout the
# per-column concat/drop loop produced, without copying the whole frame once
# per column and without leaving loop temporaries in the notebook namespace.
X_train = pd.get_dummies(X_train, columns=columns_to_encode)

In [None]:
# Load the per-essay score table.
# NOTE(review): this is an absolute Kaggle input path, whereas the feature table
# is read from a relative "../Dataset/" path -- confirm both resolve in the
# environment this notebook is meant to run in.
y_train = pd.read_csv('/kaggle/input/scores-time-series/train_scores.csv')

In [None]:
# Preview the first rows of the score table.
y_train.head()

In [None]:
# Turn each distinct score value into an integer class index; the fitted
# encoder is kept so the number of classes can be read from it later.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(y_train["score"])

In [None]:
# Attach the integer class index produced by the label encoder as a new column.
y_train["label"]= encoded_labels

In [None]:
# Drop the raw score now that it is represented by the "label" column.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution is a no-op rather than a KeyError.
y_train = y_train.drop(columns="score", errors="ignore")

In [None]:
# Model inputs are every column from the third one onward; the first two
# columns are skipped (presumably identifier columns -- TODO confirm against
# the CSV schema).
FEATURE_COLUMNS = list(X_train.columns[2:])
FEATURE_COLUMNS

In [None]:
# Largest number of rows sharing one id; every sequence is padded to this length.
maxi = X_train["id"].value_counts().max()
maxi

In [None]:
# Resolve id -> label once instead of boolean-masking y_train for every group.
# keep="first" mirrors the original behaviour of taking the first matching row.
# An id that has no row in y_train raises KeyError at lookup time.
label_by_id = y_train.drop_duplicates("id", keep="first").set_index("id")["label"]

sequences = []
# `essay_id` (not `id`) so the builtin id() is not shadowed at notebook scope.
for essay_id, group in X_train.groupby("id"):
  sequence_features = group[FEATURE_COLUMNS]
  # Number of all-zero rows needed to reach the longest sequence length
  pad_rows = maxi - sequence_features.shape[0]

  # Zero-filled frame with the same columns, appended after the real events
  zeros_df = pd.DataFrame(0, index=range(pad_rows), columns=sequence_features.columns)
  sequence_features = pd.concat([sequence_features, zeros_df], ignore_index=True)

  sequences.append((sequence_features, label_by_id.loc[essay_id]))

In [None]:
# Number of (padded sequence, label) pairs that were built, one per id group.
len(sequences)

In [None]:
# Hold out 20% of the sequences. A fixed random_state makes the split (and so
# every metric reported below) reproducible across kernel restarts.
train_sequences, test_sequences = train_test_split(sequences, test_size=0.2, random_state=42)

In [None]:
class EssayDataset(Dataset):
  """Map-style dataset over ``(sequence, label)`` pairs.

  Each ``sequence`` must expose ``to_numpy`` (the notebook passes pandas
  DataFrames) and each ``label`` must be accepted by ``torch.tensor``.
  """

  def __init__(self, sequences):
    self.sequences = sequences

  def __len__(self):
    return len(self.sequences)

  def __getitem__(self, idx):
    """Return ``{"sequence": float32 tensor, "label": int64 tensor}`` for item ``idx``."""
    sequence, label = self.sequences[idx]
    # Force a float32 array before handing it to torch. A frame that mixes
    # bool one-hot columns (the pandas >= 2.0 get_dummies default) with numeric
    # columns otherwise converts to an object array, which torch rejects.
    features = sequence.to_numpy(dtype=np.float32)
    return dict(
      # torch.tensor copies, so the tensor never aliases the DataFrame's buffer.
      sequence=torch.tensor(features, dtype=torch.float32),
      label=torch.tensor(label).long()
    )

In [None]:
def get_default_device():
    """Return the CUDA device when CUDA is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)

def to_device(data, device):
    """Move ``data`` to ``device``.

    A list or tuple is processed element by element (recursively) and comes
    back as a list; anything else is moved with ``.to(device, non_blocking=True)``.
    """
    if not isinstance(data, (list, tuple)):
        return data.to(device, non_blocking=True)

    moved = []
    for item in data:
        moved.append(to_device(item, device))
    return moved

class DeviceDataLoader():
    """Iterate a wrapped loader of dict batches, moving tensor values to a device."""

    def __init__(self, dl, device):
        self.dl = dl
        self.device = device

    def __iter__(self):
        """Yield each batch as a new dict; tensors are moved, other values pass through."""
        for batch in self.dl:
            moved = {}
            for key, value in batch.items():
                if isinstance(value, torch.Tensor):
                    moved[key] = value.to(self.device)
                else:
                    moved[key] = value
            yield moved

    def __len__(self):
        """Number of batches in the wrapped loader."""
        return len(self.dl)



In [None]:
# Resolve the torch device once (CUDA when available, otherwise CPU) and display it.
device = get_default_device()
device

In [None]:
class EssayDataModule(pl.LightningDataModule):
  """Lightning data module serving the train and held-out sequence lists.

  The validation and test loaders both read the same held-out dataset.
  """

  def __init__(self, train_sequences, test_sequences, batch_size):
    super().__init__()
    self.batch_size = batch_size
    self.train_sequences = train_sequences
    self.test_sequences = test_sequences

  def setup(self, stage=None):
    """Wrap both sequence lists in EssayDataset objects."""
    self.train_dataset = EssayDataset(self.train_sequences)
    self.test_dataset = EssayDataset(self.test_sequences)

  def _build_loader(self, dataset, shuffle):
    """Create a DataLoader with the shared batch size and one worker per CPU."""
    return DataLoader(
      dataset,
      batch_size=self.batch_size,
      shuffle=shuffle,
      num_workers=cpu_count()
    )

  def train_dataloader(self):
    return self._build_loader(self.train_dataset, shuffle=True)

  def val_dataloader(self):
    return self._build_loader(self.test_dataset, shuffle=False)

  def test_dataloader(self):
    return self._build_loader(self.test_dataset, shuffle=False)

In [None]:
# Training hyperparameters
N_EPOCHS = 50
BATCH_SIZE = 8

# Data module serving the train / held-out splits in batches of BATCH_SIZE
data_module = EssayDataModule(
  train_sequences=train_sequences,
  test_sequences=test_sequences,
  batch_size=BATCH_SIZE
)

In [None]:
class TimeSeriesTransformer(nn.Module):
    """Transformer-encoder classifier over a batch of feature sequences.

    Each time step is linearly projected to ``d_model``, the sequence is run
    through a stack of encoder layers, the outputs are averaged over the
    sequence axis, and a linear head maps the average to ``num_classes`` logits.
    """

    def __init__(self, input_size, num_classes, d_model=8, nhead=2, num_layers=2):
        super().__init__()
        self.embedding = nn.Linear(input_size, d_model)
        # Template layer handed to the encoder stack; it stays registered on
        # this module under the name "transformer".
        self.transformer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer, num_layers=num_layers)
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Map a 3-D input with the batch axis first to per-sample class logits."""
        embedded = self.embedding(x)
        # Swap the first two axes so the sequence axis leads, as the encoder
        # (constructed without batch_first) expects.
        sequence_first = embedded.transpose(0, 1)
        encoded = self.transformer_encoder(sequence_first)
        # Average over the sequence axis to get one vector per sample
        pooled = encoded.mean(dim=0)
        return self.fc(pooled)

In [None]:
class TimeSeriesClassifier(pl.LightningModule):
    """Lightning wrapper that trains a TimeSeriesTransformer with cross-entropy.

    Batches are dicts with a ``'sequence'`` tensor and a ``'label'`` tensor.
    Every step logs ``<stage>_loss`` and ``<stage>_accuracy`` where ``<stage>``
    is ``train``, ``val`` or ``test``.
    """

    def __init__(self, input_size, num_classes, d_model=8, nhead=2, num_layers=2, learning_rate=1e-3):
        super().__init__()
        self.model = TimeSeriesTransformer(input_size, num_classes, d_model, nhead, num_layers)
        self.criterion = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate
        self.num_classes = num_classes
        self.num_layers = num_layers

    def forward(self, x, labels=None):
        """Return ``(loss, logits)``; ``loss`` is 0 when no labels are given."""
        output = self.model(x)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output

    def _shared_step(self, batch, stage):
        """Run one batch, log loss/accuracy under the ``stage`` prefix, return both."""
        sequences, labels = batch['sequence'], batch['label']
        loss, outputs = self(sequences, labels)
        predictions = torch.argmax(outputs, dim=1)
        step_accuracy = accuracy(predictions, labels, task='multiclass', num_classes=self.num_classes)

        self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)
        self.log(f"{stage}_accuracy", step_accuracy, prog_bar=True, logger=True)
        return {"loss": loss, "accuracy": step_accuracy}

    def training_step(self, batch, batch_idx):
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, "val")

    def test_step(self, batch, batch_idx):
        # Lightning looks this hook up by the name `test_step`; under the old
        # name `testing_step` it was never found by `trainer.test(...)`.
        return self._shared_step(batch, "test")

    # Backward-compatible alias for code that called the old method name directly.
    testing_step = test_step

    def configure_optimizers(self):
        """Adam over all parameters with the configured learning rate."""
        optimizer = optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

In [None]:
# Build the classifier: one input per feature column and one output class per
# distinct value seen by the label encoder, then move it to the resolved device.
model = TimeSeriesClassifier(
  input_size=len(FEATURE_COLUMNS),
  num_classes=len(label_encoder.classes_)
).to(device)

In [None]:
# Load the TensorBoard notebook extension and open it on the directory the
# TensorBoardLogger below writes to.
%load_ext tensorboard
%tensorboard --logdir ./lightning_logs

In [None]:
# Keep only the single best checkpoint, ranked by the lowest logged "val_loss".
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

# Write TensorBoard event files under lightning_logs/surface.
logger = TensorBoardLogger("lightning_logs", name="surface")

trainer = pl.Trainer(
  logger=logger,
  callbacks=[checkpoint_callback],
  max_epochs=N_EPOCHS,
  devices=1,
  # Use the GPU only when CUDA is actually present; a hard-coded 'gpu' makes
  # Trainer construction fail on CPU-only machines.
  accelerator="gpu" if torch.cuda.is_available() else "cpu"
)

In [None]:
# Train the model using the loaders provided by the data module.
trainer.fit(model, data_module)