# EDA and model experimentation with Haberman survival Dataset

In [2]:
import os
import pandas as pd
import torch
import torch.nn as nn 
import torchmetrics
import pytorch_lightning as pl
import wandb

from dotenv import load_dotenv
from pytorch_lightning.loggers import WandbLogger
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn import functional as F
from sklearn.metrics import accuracy_score

In [2]:
load_dotenv()

True

In [3]:
WANDB_API_KEY = os.getenv("WANDB_API_KEY")

In [4]:
wandb.login(key=WANDB_API_KEY)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33manthonyckleung[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Anthony/.netrc


True

In [3]:
wandb_logger = WandbLogger(project="MLOps Basics")

In [4]:
# Seed everything
pl.seed_everything(42)

Global seed set to 42


42

In [5]:
torch.cuda.device_count()

0

In [6]:
df = pd.read_csv('../data/haberman.csv')

In [7]:
df.head()

Unnamed: 0,age,year,n_auxillary_nodes,status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [8]:
df.dtypes

age                  int64
year                 int64
n_auxillary_nodes    int64
status               int64
dtype: object

In [9]:
df['status'].nunique()

2

In [10]:
class Config:
    file_path = '../data/haberman.csv'
    lr = 1e-5
    max_len = 64
    train_bs = 16
    val_bs = 8
    trainval_pcent = 0.80
    num_workers = 8

In [11]:
class HabermanData(Dataset):
    def __init__(self, data_df):
        data_df = data_df.reset_index(drop=True)
        self.features = data_df[['age','year', 'n_auxillary_nodes']].values
        self.target = data_df['status'].values
        
    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        features = torch.tensor(self.features[idx], dtype=torch.float)
        targets = torch.tensor(self.target[idx], dtype = torch.long)

        return {"features": features,
                "targets": targets}

In [12]:
class HabermanDataModule(pl.LightningDataModule):
    def __init__(self, data_path):
        super().__init__()

        self.data = pd.read_csv(data_path)
        
    def setup(self, stage=None):
        data = self.data.sample(frac=1).reset_index(drop=True)
        data['status'] = data['status'] - 1

        dataset = HabermanData(data)

        # Split data into train, validation, and testing
        trainval_size = int(Config.trainval_pcent * len(data))
        train_size = int(trainval_size*0.8)

        val_size = trainval_size - train_size
        test_size = len(data) - trainval_size

        trainval_set, test_set = random_split(dataset, [trainval_size, test_size])
        train_set, val_set = random_split(trainval_set, [train_size, val_size])
        
        if (stage == 'fit') or (stage is None):
            self.training_set = train_set
            self.validation_set = val_set
        
        if stage == 'test':
            self.testing_set = test_set
 
    def train_dataloader(self):
        train_loader = DataLoader(
            self.training_set,
            batch_size = Config.train_bs,
            shuffle=False,
            num_workers = 0
        )
        return train_loader

    def val_dataloader(self):
        val_loader = DataLoader(
            self.training_set,
            batch_size = Config.val_bs,
            shuffle=False,
            num_workers = 0
        )
        return val_loader

    def test_dataloader(self):
        test_loader = DataLoader(
            self.testing_set,
            batch_size = Config.val_bs,
            shuffle=False,
            num_workers = 0
        )
        return test_loader
        

In [26]:
class HabermanANN(pl.LightningModule):
    def __init__(self, lr=1e-2):
        super(HabermanANN, self).__init__()
        self.num_classes = 2
        self.train_accuracy_metric = torchmetrics.Accuracy()
        self.val_accuracy_metric = torchmetrics.Accuracy()
        self.f1_metric = torchmetrics.F1(num_classes=self.num_classes)
        self.precision_macro_metric = torchmetrics.Precision(
            average="macro", num_classes=self.num_classes
        )
        self.recall_macro_metric = torchmetrics.Recall(
            average="macro", num_classes=self.num_classes
        )
        self.precision_micro_metric = torchmetrics.Precision(average="micro")
        self.recall_micro_metric = torchmetrics.Recall(average="micro")
        self.save_hyperparameters()
        self.fc1 = nn.Linear(3, 10)
        self.fc2 = nn.Linear(10, 2)

        self.ann = nn.Sequential(
                nn.Linear(3, 10),
                nn.ReLU(),
                nn.Linear(10, 2),
        )
        # self.dropout = nn.Dropout(0.4)
        
        # self.W = nn.Linear(self.distilbert.config.hidden_size, 2)
        

    def forward(self, x):
        x = self.ann(x)
        return x


    def training_step(self, batch, batch_idx):
        logits = self.forward(batch["features"])
        loss = self.loss_func(logits, batch['targets'])
        train_acc = self.train_accuracy_metric(logits, batch["targets"])
        self.log("train/loss", loss, prog_bar=True, on_epoch=True)
        self.log("train/acc", train_acc, prog_bar=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch["features"])
        loss = self.loss_func(logits, batch['targets'])
        _, preds = torch.max(logits, dim=1)
        val_acc = accuracy_score(preds.cpu(), batch["targets"].cpu())
        val_acc = torch.tensor(val_acc)

        # Metrics
        valid_acc = self.val_accuracy_metric(preds, batch["targets"])
        precision_macro = self.precision_macro_metric(preds, batch["targets"])
        recall_macro = self.recall_macro_metric(preds, batch["targets"])
        precision_micro = self.precision_micro_metric(preds, batch["targets"])
        recall_micro = self.recall_micro_metric(preds, batch["targets"])
        f1 = self.f1_metric(preds, batch["targets"])

        self.log("val/loss", loss, prog_bar=True, on_step=True)
        self.log("val/acc", val_acc, prog_bar=True)
        self.log("val/precision_macro", precision_macro, prog_bar=True)
        self.log("val/recall_macro",recall_macro, prog_bar=True)
        self.log("val/precision_micro", precision_micro, prog_bar=True)
        self.log("val/recall_micro", recall_micro, prog_bar=True)
        self.log("val/f1", f1, prog_bar=True)
        return {"targets": batch["targets"], "logits": logits}

    def test_step(self, batch, batch_idx):
        logits = self.forward(batch["features"])
        loss = self.loss_func(logits, batch['targets'])
        _, preds = torch.max(logits, dim=1)
        test_acc = accuracy_score(preds.cpu(), batch["targets"].cpu())
        self.log("test_loss", loss)
        self.log("test_acc", test_acc) 
        

    def loss_func(self, pred, target):
        return F.cross_entropy(pred, target)

    def validation_epoch_end(self, outputs):
        labels = torch.cat([x["targets"] for x in outputs])
        logits = torch.cat([x["logits"] for x in outputs])
        preds = torch.argmax(logits, 1)

        ## There are multiple ways to track the metrics
        # 1. Confusion matrix plotting using inbuilt W&B method
        self.logger.experiment.log(
            {
                "conf": wandb.plot.confusion_matrix(
                    probs=logits.numpy(), y_true=labels.numpy()
                )
            }
        )

        # 2. Confusion Matrix plotting using scikit-learn method
        # wandb.log({"cm": wandb.sklearn.plot_confusion_matrix(labels.numpy(), preds)})

        # 3. Confusion Matric plotting using Seaborn
        # data = confusion_matrix(labels.numpy(), preds.numpy())
        # df_cm = pd.DataFrame(data, columns=np.unique(labels), index=np.unique(labels))
        # df_cm.index.name = "Actual"
        # df_cm.columns.name = "Predicted"
        # plt.figure(figsize=(7, 4))
        # plot = sns.heatmap(
        #     df_cm, cmap="Blues", annot=True, annot_kws={"size": 16}
        # )  # font size
        # self.logger.experiment.log({"Confusion Matrix": wandb.Image(plot)})

        # self.logger.experiment.log(
        #     {"roc": wandb.plot.roc_curve(labels.numpy(), logits.numpy())}
        # )

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams["lr"])

In [39]:
class SamplesVisualisationLogger(pl.Callback):
    def __init__(self, datamodule):
        super().__init__()
        self.datamodule = datamodule

    def on_validation_end(self, trainer, pl_module):
        # can be done on complete dataset also
        val_batch = next(iter(self.datamodule.val_dataloader()))
        input = val_batch["features"]

        # get the predictions
        outputs = pl_module(input)
        preds = torch.argmax(outputs, 1)
        labels = val_batch["targets"]

        # predicted and labelled data
        df = pd.DataFrame(data=input, columns=['age', 'year', 'n_aux_nodes'])
        df['Label'] = labels.numpy()
        df['Predicted'] = preds.numpy()

        # wrongly predicted data
        wrong_df = df[df["Label"] != df["Predicted"]]

        # Logging wrongly predicted dataframe as a table
        trainer.logger.experiment.log(
            {
                "examples": wandb.Table(dataframe=wrong_df, allow_mixed_types=True),
                "global_step": trainer.global_step,
            }
        )

# Training

In [28]:
# Callback
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath = "./models",
    monitor = "val/loss",
    mode = "min"
)

In [40]:
data = HabermanDataModule(Config.file_path)
model = HabermanANN()

trainer = pl.Trainer(
    # gpus=(1 if torch.cuda.is_available() else 0),
    max_epochs=10,
    fast_dev_run=False,
    accelerator='cpu',
    log_every_n_steps = 5,
    logger=wandb_logger,
    callbacks = [checkpoint_callback, SamplesVisualisationLogger(data)]
)
trainer.fit(model, data)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name                   | Type       | Params
------------------------------------------------------
0 | train_accuracy_metric  | Accuracy   | 0     
1 | val_accuracy_metric    | Accuracy   | 0     
2 | f1_metric              | F1         | 0     
3 | precision_macro_metric | Precision  | 0     
4 | recall_macro_metric    | Recall     | 0     
5 | precision_micro_metric | Precision  | 0     
6 | recall_micro_metric    | Recall     | 0     
7 | fc1                    | Linear     | 40    
8 | fc2                    | Linear     | 22    
9 | ann                    | Sequential | 62    
------------------------------------------------------
124       Trainable params
0         Non-trainable params
124       Total params
0.000     Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check:   0%|          | 0/2 [00:00<?, ?it/s]

  rank_zero_warn(


           age         year n_aux_nodes  Label  Predicted
0  tensor(39.)  tensor(66.)  tensor(0.)      1          1
1  tensor(35.)  tensor(63.)  tensor(0.)      0          1
2  tensor(56.)  tensor(60.)  tensor(0.)      0          1
3  tensor(75.)  tensor(62.)  tensor(1.)      0          1
4  tensor(52.)  tensor(60.)  tensor(4.)      0          1
5  tensor(38.)  tensor(60.)  tensor(0.)      0          1
6  tensor(53.)  tensor(60.)  tensor(9.)      1          1
7  tensor(76.)  tensor(67.)  tensor(0.)      0          1
                                                              

Global seed set to 42
  rank_zero_warn(


Epoch 0:  87%|████████▋ | 33/38 [00:00<00:00, 69.04it/s, loss=1.47, v_num=1f8u, train/loss_step=0.862, train/acc_step=0.667]            age         year n_aux_nodes  Label  Predicted
0  tensor(39.)  tensor(66.)  tensor(0.)      1          1
1  tensor(35.)  tensor(63.)  tensor(0.)      0          1
2  tensor(56.)  tensor(60.)  tensor(0.)      0          0
3  tensor(75.)  tensor(62.)  tensor(1.)      0          0
4  tensor(52.)  tensor(60.)  tensor(4.)      0          0
5  tensor(38.)  tensor(60.)  tensor(0.)      0          0
6  tensor(53.)  tensor(60.)  tensor(9.)      1          0
7  tensor(76.)  tensor(67.)  tensor(0.)      0          0
Epoch 1:  87%|████████▋ | 33/38 [00:00<00:00, 73.01it/s, loss=0.988, v_num=1f8u, train/loss_step=0.608, train/acc_step=0.667, val/loss_step=0.747, val/loss_epoch=0.933, val/acc=0.692, val/precision_macro=0.457, val/recall_macro=0.521, val/precision_micro=0.692, val/recall_micro=0.692, val/f1=0.692, train/loss_epoch=1.510, train/acc_epoch=0.595]       

In [25]:
# Perform evaluation
trainer.test(model, data)

  rank_zero_warn(


Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.6935483813285828, 'test_loss': 0.5761809349060059}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 8/8 [00:00<00:00, 250.02it/s]


[{'test_loss': 0.5761809349060059, 'test_acc': 0.6935483813285828}]

In [248]:
class HabermanPredictor:
    def __init__(self, model_path):
        self.model_path = model_path 
        self.model = HabermanANN.load_from_checkpoint(model_path)
        self.model.eval()
        self.model.freeze()
        self.processor = HabermanDataModule(Config.file_path)
        self.softmax = torch.nn.Softmax(dim=0)
        self.labels = ["survived", "did not survived"]

    def predict(self, input):
        logits = self.model(
            torch.tensor([input], dtype=torch.float)
        )
        scores = self.softmax(logits[0]).tolist()
        predictions = []
        for score, label in zip(scores, self.labels):
            predictions.append({'label': label, 'score': score})
        return predictions


In [249]:
predictor = HabermanPredictor("./models/epoch=8-step=116.ckpt")

In [250]:
predictor.predict([34, 60, 0])

[{'label': 'survived', 'score': 0.8074608445167542},
 {'label': 'did not survived', 'score': 0.19253917038440704}]