# EDA and model experimentation with Haberman survival Dataset

In [174]:
import pandas as pd
import torch
import torch.nn as nn 
import pytorch_lightning as pl

from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn import functional as F
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics import accuracy_score

In [217]:
# Seed everything
pl.seed_everything(42)

Global seed set to 42


42

In [2]:
torch.cuda.device_count()

0

In [104]:
df = pd.read_csv('../data/haberman.csv')

In [105]:
df.head()

Unnamed: 0,age,year,n_auxillary_nodes,status
0,30,64,1,1
1,30,62,3,1
2,30,65,0,1
3,31,59,2,1
4,31,65,4,1


In [106]:
df.dtypes

age                  int64
year                 int64
n_auxillary_nodes    int64
status               int64
dtype: object

In [107]:
df['status'].nunique()

2

In [193]:
class Config:
    file_path = '../data/haberman.csv'
    lr = 1e-5
    max_len = 64
    train_bs = 16
    val_bs = 8
    trainval_pcent = 0.80
    num_workers = 8

In [194]:
class HabermanData(Dataset):
    def __init__(self, data_df):
        data_df = data_df.reset_index(drop=True)
        self.features = data_df[['age','year', 'n_auxillary_nodes']].values
        self.target = data_df['status'].values
        
    def __len__(self):
        return len(self.target)

    def __getitem__(self, idx):
        features = torch.tensor(self.features[idx], dtype=torch.float)
        targets = torch.tensor(self.target[idx], dtype = torch.long)

        return {"features": features,
                "targets": targets}

In [223]:
class HabermanDataModule(pl.LightningDataModule):
    def __init__(self, data_path):
        super().__init__()

        self.data = pd.read_csv(data_path)
        
    def setup(self, stage=None):
        data = self.data.sample(frac=1).reset_index(drop=True)
        data['status'] = data['status'] - 1

        dataset = HabermanData(data)

        # Split data into train, validation, and testing
        trainval_size = int(Config.trainval_pcent * len(data))
        train_size = int(trainval_size*0.8)

        val_size = trainval_size - train_size
        test_size = len(data) - trainval_size

        trainval_set, test_set = random_split(dataset, [trainval_size, test_size])
        train_set, val_set = random_split(trainval_set, [train_size, val_size])
        
        if (stage == 'fit') or (stage is None):
            self.training_set = train_set
            self.validation_set = val_set
        
        if stage == 'test':
            self.testing_set = test_set
 
    def train_dataloader(self):
        train_loader = DataLoader(
            self.training_set,
            batch_size = Config.train_bs,
            shuffle=False,
            num_workers = 0
        )
        return train_loader

    def val_dataloader(self):
        val_loader = DataLoader(
            self.training_set,
            batch_size = Config.val_bs,
            shuffle=False,
            num_workers = 0
        )
        return val_loader

    def test_dataloader(self):
        test_loader = DataLoader(
            self.testing_set,
            batch_size = Config.val_bs,
            shuffle=False,
            num_workers = 0
        )
        return test_loader
        

In [229]:
class HabermanModel(pl.LightningModule):
    def __init__(self, lr=1e-2):
        super(HabermanModel, self).__init__()
        self.save_hyperparameters()
        self.fc1 = nn.Linear(3, 10)
        self.fc2 = nn.Linear(10, 2)

        self.ann = nn.Sequential(
                nn.Linear(3, 10),
                nn.ReLU(),
                nn.Linear(10, 2),
        )
        # self.dropout = nn.Dropout(0.4)
        
        # self.W = nn.Linear(self.distilbert.config.hidden_size, 2)
        # self.num_classes = 5

    def forward(self, x):
        x = self.ann(x)
        return x


    def training_step(self, batch, batch_idx):
        logits = self.forward(batch["features"])
        loss = self.loss_func(logits, batch['targets'])
        self.log("train_loss", loss, prog_bar=True)
        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch["features"])
        loss = self.loss_func(logits, batch['targets'])
        _, preds = torch.max(logits, dim=1)
        val_acc = accuracy_score(preds.cpu(), batch["targets"].cpu())
        val_acc = torch.tensor(val_acc)
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", val_acc, prog_bar=True)

    def test_step(self, batch, batch_idx):
        logits = self.forward(batch["features"])
        loss = self.loss_func(logits, batch['targets'])
        _, preds = torch.max(logits, dim=1)
        test_acc = accuracy_score(preds.cpu(), batch["targets"].cpu())
        self.log("test_loss", loss)
        self.log("test_acc", test_acc) 
        

    def loss_func(self, pred, target):
        return F.cross_entropy(pred, target)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams["lr"])

# Training

In [232]:
# Callback
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath = "./models",
    monitor = "val_loss",
    mode = "min"
)

In [233]:
data = HabermanDataModule(Config.file_path)
model = HabermanModel()

trainer = pl.Trainer(
    # gpus=(1 if torch.cuda.is_available() else 0),
    max_epochs=10,
    fast_dev_run=False,
    accelerator='cpu',
    log_every_n_steps = 5,
    callbacks = [checkpoint_callback]
)
trainer.fit(model, data)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs

  | Name | Type       | Params
------------------------------------
0 | fc1  | Linear     | 40    
1 | fc2  | Linear     | 22    
2 | ann  | Sequential | 62    
------------------------------------
124       Trainable params
0         Non-trainable params
124       Total params
0.000     Total estimated model params size (MB)


                                                              

  rank_zero_warn(
Global seed set to 42
  rank_zero_warn(


Epoch 9: 100%|██████████| 38/38 [00:00<00:00, 222.22it/s, loss=0.596, v_num=0, train_loss=0.755, val_loss=0.518, val_acc=0.785]


In [231]:
# Perform evaluation
trainer.test(model, data)

  rank_zero_warn(


Testing: 0it [00:00, ?it/s]--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': 0.6935483813285828, 'test_loss': 0.576137125492096}
--------------------------------------------------------------------------------
Testing: 100%|██████████| 8/8 [00:00<00:00, 347.94it/s]


[{'test_loss': 0.576137125492096, 'test_acc': 0.6935483813285828}]

In [234]:
trainer.logger.log_dir

'e:\\Projects\\mlops-experiment\\notebooks\\lightning_logs\\version_0'

In [248]:
class HabermanPredictor:
    def __init__(self, model_path):
        self.model_path = model_path 
        self.model = HabermanModel.load_from_checkpoint(model_path)
        self.model.eval()
        self.model.freeze()
        self.processor = HabermanDataModule(Config.file_path)
        self.softmax = torch.nn.Softmax(dim=0)
        self.labels = ["survived", "did not survived"]

    def predict(self, input):
        logits = self.model(
            torch.tensor([input], dtype=torch.float)
        )
        scores = self.softmax(logits[0]).tolist()
        predictions = []
        for score, label in zip(scores, self.labels):
            predictions.append({'label': label, 'score': score})
        return predictions


In [249]:
predictor = HabermanPredictor("./models/epoch=8-step=116.ckpt")

In [250]:
predictor.predict([34, 60, 0])

[{'label': 'survived', 'score': 0.8074608445167542},
 {'label': 'did not survived', 'score': 0.19253917038440704}]