# EDA and model experimentation with Haberman survival Dataset

In [2]:
import hydra
import os
import pandas as pd
import torch
import torch.nn as nn 
import torchmetrics
import pytorch_lightning as pl
import wandb

from dotenv import load_dotenv
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn import functional as F
from sklearn.metrics import accuracy_score

In [3]:
load_dotenv()

True

In [40]:
# Load config file via Hydra
with initialize(config_path="../configs"):
    cfg = compose(config_name="config.yaml")
    print(cfg)

{'processing': 'batch_size:16', 'training': {'train_bs': 16, 'val_bs': 8, 'lr': 0.01, 'max_epoch': 10, 'log_every_n_steps': 2}}


In [4]:
WANDB_API_KEY = os.getenv("WANDB_API_KEY")

In [5]:
wandb.login(key=WANDB_API_KEY)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33manthonyckleung[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Anthony/.netrc


True

In [6]:
wandb_logger = WandbLogger(project="MLOps Basics")

In [7]:
# Seed everything
pl.seed_everything(42)

Global seed set to 42


42

In [8]:
torch.cuda.device_count()

0

In [9]:
df = pd.read_csv('../data/haberman.csv')

In [10]:
df.head()

Unnamed: 0,age,year,n_auxillary_nodes,status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [11]:
df.dtypes

age                  int64
year                 int64
n_auxillary_nodes    int64
status               int64
dtype: object

In [12]:
df['status'].nunique()

2

In [13]:
class Config:
    file_path = '../data/haberman.csv'
    lr = 1e-5
    max_len = 64
    train_bs = 16
    val_bs = 8
    trainval_pcent = 0.80
    num_workers = 8

In [14]:
class HabermanData(Dataset):
    def __init__(self, data_df):
        data_df = data_df.reset_index(drop=True)
        self.features = data_df[['age','year', 'n_auxillary_nodes']].values
        self.target = data_df['status'].values
        
    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        features = torch.tensor(self.features[idx], dtype=torch.float)
        targets = torch.tensor(self.target[idx], dtype = torch.long)

        return {"features": features,
                "targets": targets}

In [15]:
class HabermanDataModule(pl.LightningDataModule):
    def __init__(self, data_path, train_bs, val_bs):
        super().__init__()
        self.data = pd.read_csv(data_path)
        self.train_bs = train_bs 
        self.val_bs = val_bs
        
    def setup(self, stage=None):
        data = self.data.sample(frac=1).reset_index(drop=True)
        data['status'] = data['status'] - 1

        dataset = HabermanData(data)

        # Split data into train, validation, and testing
        trainval_size = int(Config.trainval_pcent * len(data))
        train_size = int(trainval_size*0.8)

        val_size = trainval_size - train_size
        test_size = len(data) - trainval_size

        trainval_set, test_set = random_split(dataset, [trainval_size, test_size])
        train_set, val_set = random_split(trainval_set, [train_size, val_size])
        
        if (stage == 'fit') or (stage is None):
            self.training_set = train_set
            self.validation_set = val_set
        
        if stage == 'test':
            self.testing_set = test_set
 
    def train_dataloader(self):
        train_loader = DataLoader(
            self.training_set,
            batch_size = self.train_bs,
            shuffle=False,
            num_workers = 0
        )
        return train_loader

    def val_dataloader(self):
        val_loader = DataLoader(
            self.validation_set,
            batch_size = self.val_bs,
            shuffle=False,
            num_workers = 0
        )
        return val_loader

    def test_dataloader(self):
        test_loader = DataLoader(
            self.testing_set,
            batch_size = self.val_bs,
            shuffle=False,
            num_workers = 0
        )
        return test_loader
        

In [16]:
class HabermanANN(pl.LightningModule):
    def __init__(self, lr=1e-2):
        super(HabermanANN, self).__init__()
        self.num_classes = 2
        self.save_hyperparameters()
        self.fc1 = nn.Linear(3, 10)
        self.fc2 = nn.Linear(10, 2)

        self.ann = nn.Sequential(
                nn.Linear(3, 10),
                nn.ReLU(),
                nn.Linear(10, 2),
        )
        self.train_accuracy_metric = torchmetrics.Accuracy()
        self.val_accuracy_metric = torchmetrics.Accuracy()
        self.f1_metric = torchmetrics.F1(num_classes=self.num_classes)
        self.precision_macro_metric = torchmetrics.Precision(
            average="macro", num_classes=self.num_classes
        )
        self.recall_macro_metric = torchmetrics.Recall(
            average="macro", num_classes=self.num_classes
        )
        self.precision_micro_metric = torchmetrics.Precision(average="micro")
        self.recall_micro_metric = torchmetrics.Recall(average="micro")
        

    def forward(self, x):
        x = self.ann(x)
        return x


    def training_step(self, batch, batch_idx):
        logits = self.forward(batch["features"])
        loss = self.loss_func(logits, batch['targets'])
        train_acc = self.train_accuracy_metric(logits, batch["targets"])
        self.log("train/loss", loss, prog_bar=True, on_epoch=True)
        self.log("train/acc", train_acc, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch["features"])
        loss = self.loss_func(logits, batch['targets'])
        _, preds = torch.max(logits, dim=1)
        val_acc = accuracy_score(preds.cpu(), batch["targets"].cpu())
        val_acc = torch.tensor(val_acc)

        # Metrics
        valid_acc = self.val_accuracy_metric(preds, batch["targets"])
        precision_macro = self.precision_macro_metric(preds, batch["targets"])
        recall_macro = self.recall_macro_metric(preds, batch["targets"])
        precision_micro = self.precision_micro_metric(preds, batch["targets"])
        recall_micro = self.recall_micro_metric(preds, batch["targets"])
        f1 = self.f1_metric(preds, batch["targets"])

        self.log("val/loss", loss, prog_bar=True, on_step=True)
        self.log("val/acc", val_acc, prog_bar=True)
        self.log("val/precision_macro", precision_macro, prog_bar=True)
        self.log("val/recall_macro",recall_macro, prog_bar=True)
        self.log("val/precision_micro", precision_micro, prog_bar=True)
        self.log("val/recall_micro", recall_micro, prog_bar=True)
        self.log("val/f1", f1, prog_bar=True)
        return {"targets": batch["targets"], "logits": logits}

    def test_step(self, batch, batch_idx):
        logits = self.forward(batch["features"])
        loss = self.loss_func(logits, batch['targets'])
        _, preds = torch.max(logits, dim=1)
        test_acc = accuracy_score(preds.cpu(), batch["targets"].cpu())
        self.log("test_loss", loss)
        self.log("test_acc", test_acc) 
        

    def loss_func(self, pred, target):
        return F.cross_entropy(pred, target)

    def validation_epoch_end(self, outputs):
        labels = torch.cat([x["targets"] for x in outputs])
        logits = torch.cat([x["logits"] for x in outputs])
        preds = torch.argmax(logits, 1)

        ## There are multiple ways to track the metrics
        # 1. Confusion matrix plotting using inbuilt W&B method
        self.logger.experiment.log(
            {
                "conf": wandb.plot.confusion_matrix(
                    probs=logits.numpy(), y_true=labels.numpy()
                )
            }
        )

        # 2. Confusion Matrix plotting using scikit-learn method
        # wandb.log({"cm": wandb.sklearn.plot_confusion_matrix(labels.numpy(), preds)})

        # 3. Confusion Matric plotting using Seaborn
        # data = confusion_matrix(labels.numpy(), preds.numpy())
        # df_cm = pd.DataFrame(data, columns=np.unique(labels), index=np.unique(labels))
        # df_cm.index.name = "Actual"
        # df_cm.columns.name = "Predicted"
        # plt.figure(figsize=(7, 4))
        # plot = sns.heatmap(
        #     df_cm, cmap="Blues", annot=True, annot_kws={"size": 16}
        # )  # font size
        # self.logger.experiment.log({"Confusion Matrix": wandb.Image(plot)})

        # self.logger.experiment.log(
        #     {"roc": wandb.plot.roc_curve(labels.numpy(), logits.numpy())}
        # )

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams["lr"])

In [17]:
class SamplesVisualisationLogger(pl.Callback):
    def __init__(self, datamodule):
        super().__init__()
        self.datamodule = datamodule

    def on_validation_end(self, trainer, pl_module):
        # can be done on complete dataset also
        val_batch = next(iter(self.datamodule.val_dataloader()))
        input = val_batch["features"]

        # get the predictions
        outputs = pl_module(input)
        preds = torch.argmax(outputs, 1)
        labels = val_batch["targets"]

        # predicted and labelled data
        df = pd.DataFrame(data=input, columns=['age', 'year', 'n_aux_nodes'])
        df['Label'] = labels.numpy()
        df['Predicted'] = preds.numpy()

        # wrongly predicted data
        wrong_df = df[df["Label"] != df["Predicted"]]

        # Logging wrongly predicted dataframe as a table
        trainer.logger.experiment.log(
            {
                "examples": wandb.Table(dataframe=wrong_df, allow_mixed_types=True),
                "global_step": trainer.global_step,
            }
        )

# Training

In [18]:
# Callback
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath = "./models",
    monitor = "val/loss",
    mode = "min"
)

In [43]:
# @hydra.main(config_path="../configs", config_name="config")
def train(cfg):
    print(OmegaConf.to_yaml(cfg))
    data = HabermanDataModule('../data/haberman.csv', cfg.training.train_bs, cfg.training.val_bs)
    model = HabermanANN()

    trainer = pl.Trainer(
        # gpus=(1 if torch.cuda.is_available() else 0),
        max_epochs=10,
        fast_dev_run=False,
        accelerator='cpu',
        log_every_n_steps = 5,
        logger=wandb_logger,
        callbacks = [checkpoint_callback, SamplesVisualisationLogger(data)]
    )
    trainer.fit(model, data)

    # Perform evaluation
    trainer.test(model, data)

In [44]:
train(cfg)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


processing: batch_size:16
training:
  train_bs: 16
  val_bs: 8
  lr: 0.01
  max_epoch: 10
  log_every_n_steps: 2



[34m[1mwandb[0m: wandb version 0.12.9 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade



  | Name                   | Type       | Params
------------------------------------------------------
0 | fc1                    | Linear     | 40    
1 | fc2                    | Linear     | 22    
2 | ann                    | Sequential | 62    
3 | train_accuracy_metric  | Accuracy   | 0     
4 | val_accuracy_metric    | Accuracy   | 0     
5 | f1_metric              | F1         | 0     
6 | precision_macro_metric | Precision  | 0     
7 | recall_macro_metric    | Recall     | 0     
8 | precision_micro_metric | Precision  | 0     
9 | recall_micro_metric    | Recall     | 0     
------------------------------------------------------
124       Trainable params
0         Non-trainable params
124       Total params
0.000     Total estimated model params size (MB)


                                                                      

Global seed set to 42


Epoch 9: 100%|██████████| 20/20 [00:01<00:00, 19.77it/s, loss=0.474, v_num=vqrg, train/loss_step=0.257, train/acc_step=1.000, val/loss_step=0.176, val/loss_epoch=0.727, val/acc=0.612, val/precision_macro=0.471, val/recall_macro=0.565, val/precision_micro=0.612, val/recall_micro=0.612, val/f1=0.612, train/loss_epoch=0.513, train/acc_epoch=0.764]  
Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.7903226017951965, 'test_loss': 0.5061657428741455}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 8/8 [00:00<00:00, 347.83it/s]


  rank_zero_warn(


Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.6935483813285828, 'test_loss': 0.5761809349060059}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 8/8 [00:00<00:00, 250.02it/s]


[{'test_loss': 0.5761809349060059, 'test_acc': 0.6935483813285828}]

In [248]:
class HabermanPredictor:
    def __init__(self, model_path):
        self.model_path = model_path 
        self.model = HabermanANN.load_from_checkpoint(model_path)
        self.model.eval()
        self.model.freeze()
        self.processor = HabermanDataModule(Config.file_path)
        self.softmax = torch.nn.Softmax(dim=0)
        self.labels = ["survived", "did not survived"]

    def predict(self, input):
        logits = self.model(
            torch.tensor([input], dtype=torch.float)
        )
        scores = self.softmax(logits[0]).tolist()
        predictions = []
        for score, label in zip(scores, self.labels):
            predictions.append({'label': label, 'score': score})
        return predictions


In [249]:
predictor = HabermanPredictor("./models/epoch=8-step=116.ckpt")

In [250]:
predictor.predict([34, 60, 0])

[{'label': 'survived', 'score': 0.8074608445167542},
 {'label': 'did not survived', 'score': 0.19253917038440704}]