# Model Experimentation Torch

This notebook contains the classification of cirrhosis outcomes with torch

In [1]:
import torch
from torchinfo import summary
from torch import nn
from tqdm.auto import tqdm
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Remove warnings
import warnings 
warnings.filterwarnings('ignore')

device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# Load the dataset 

train = pd.read_csv('../data/train_modified.csv')
test = pd.read_csv('../data/test_modified.csv')

# view the first 5 rows of the train dataset
train.head()

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status,N_Days_Years
0,D-penicillamine,58,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D,2.7
1,Placebo,52,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C,7.1
2,Placebo,37,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D,9.4
3,Placebo,50,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C,7.1
4,Placebo,45,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C,2.2


In [3]:
# view the first 5 rows of the test dataset
test.head()

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,N_Days_Years
0,D-penicillamine,54,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0,10.5
1,D-penicillamine,41,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0,6.8
2,Placebo,36,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0,0.1
3,D-penicillamine,56,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0,6.4
4,D-penicillamine,60,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0,4.4


In [4]:
# Get the informational statistics of the datasets for encoding and scaling
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Drug           7905 non-null   object 
 1   Age            7905 non-null   int64  
 2   Sex            7905 non-null   object 
 3   Ascites        7905 non-null   object 
 4   Hepatomegaly   7905 non-null   object 
 5   Spiders        7905 non-null   object 
 6   Edema          7905 non-null   object 
 7   Bilirubin      7905 non-null   float64
 8   Cholesterol    7905 non-null   float64
 9   Albumin        7905 non-null   float64
 10  Copper         7905 non-null   float64
 11  Alk_Phos       7905 non-null   float64
 12  SGOT           7905 non-null   float64
 13  Tryglicerides  7905 non-null   float64
 14  Platelets      7905 non-null   float64
 15  Prothrombin    7905 non-null   float64
 16  Stage          7905 non-null   float64
 17  Status         7905 non-null   object 
 18  N_Days_Y

## Data Validation 

This section involves checking and validation the data quality

In [5]:
# checking for duplicate values
train.duplicated().sum()

4

In [6]:
# check for missing values
train.isnull().sum()

Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
N_Days_Years     0
dtype: int64

## Data Preprocessing

In [7]:
# Splitting the data into features and target
features = train.drop(columns=['Status'])
target = train['Status']

# View the shapes of train
features.shape, target.shape

((7905, 18), (7905,))

In [8]:
num_classes = len(target.unique())
num_classes

3

In [9]:
# Split the data into training and validation sets 
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.05, random_state=42)

# Ensure the shapes of training and validation sets
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((7509, 18), (7509,), (396, 18), (396,))

# Let's create a function for preprocessing of categorical and numerical features

In [10]:
def create_preprocessor(dataset):
    
    # Get the names of categorical and numerical columns
    categorical_features = list(dataset.select_dtypes(include="object").columns)
    
    # Create a transformer for categorical cols
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    cat_scaler = StandardScaler()
    
    categorical_transformer = Pipeline(
        steps=[
            ("one_hot", one_hot_encoder),
            ("cat_scaler", cat_scaler)
        ]
    )
    
    
    preprocessor = ColumnTransformer(
            transformers = [
                    ("categorical", categorical_transformer, categorical_features)
            ]
    )
    
    
    return preprocessor

In [11]:
# Test our function
preprocessor = create_preprocessor(X_train)


X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

# Get the shapes of the preprocessed data
X_train.shape, X_val.shape

((7509, 13), (396, 13))

In [12]:
# Create scaler 
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

X_train.shape, X_val.shape

((7509, 13), (396, 13))

In [13]:
# Create a label encoder
enc = LabelEncoder()

y_train = enc.fit_transform(y_train)
y_val = enc.transform(y_val)

y_train.shape, y_val.shape

((7509,), (396,))

**Now let's convert these into tensors**

In [14]:
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.int64).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.int64).to(device)

# check the shape and size of tensors
X_train_tensor.size(), y_train_tensor.size(), X_val_tensor.size(), y_val_tensor.size()

(torch.Size([7509, 13]),
 torch.Size([7509]),
 torch.Size([396, 13]),
 torch.Size([396]))

**Let's create a Deep learning model**

In [15]:
class MultiClassCirrhosisOutcomes(nn.Module):
    def __init__(self, num_classes):
        self.num_classes = num_classes
        super().__init__()
        self.linear1 = nn.Linear(in_features=13, out_features=32)
        self.relu1 = nn.ReLU()
        self.dropout1 = nn.Dropout(0.2)
        self.linear2 = nn.Linear(in_features=32, out_features=16)
        self.relu2 = nn.ReLU()
        self.dropout2 = nn.Dropout(0.2)
        self.linear3 = nn.Linear(in_features=16, out_features=8)
        self.relu3 = nn.ReLU()
        self.output = nn.Linear(in_features=8, out_features=num_classes)
        self.softmax = nn.Softmax()
        
        
    def forward(self, x:torch.Tensor) -> torch.Tensor:
        return self.softmax(self.output(self.relu3(self.linear3(self.dropout2(self.relu2(self.linear2(self.dropout1(self.relu1(self.linear1(x))))))))))
        

In [16]:
# Create a model instance and print the summary fo the model
model_0 = MultiClassCirrhosisOutcomes(3).to(device)

# Get the summary of the model  
summary(model_0, input_size=(1, 13))

Layer (type:depth-idx)                   Output Shape              Param #
MultiClassCirrhosisOutcomes              [1, 3]                    --
├─Linear: 1-1                            [1, 32]                   448
├─ReLU: 1-2                              [1, 32]                   --
├─Dropout: 1-3                           [1, 32]                   --
├─Linear: 1-4                            [1, 16]                   528
├─ReLU: 1-5                              [1, 16]                   --
├─Dropout: 1-6                           [1, 16]                   --
├─Linear: 1-7                            [1, 8]                    136
├─ReLU: 1-8                              [1, 8]                    --
├─Linear: 1-9                            [1, 3]                    27
├─Softmax: 1-10                          [1, 3]                    --
Total params: 1,139
Trainable params: 1,139
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.00
Input size (MB): 0.00
Forward/backward pass 

In [17]:
# Create loss function and optimizer
loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(model_0.parameters(), lr=2e-3)

In [18]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100 
    return acc

**Training the model**

In [19]:
torch.manual_seed(42)
EPOCHS = 500


# Put data to target device
X_train_tensor, y_train_tensor = X_train_tensor.to(device), y_train_tensor.to(device)
X_val_tensor, y_val_tensor = X_val_tensor.to(device), y_val_tensor.to(device)

# For epoch in a range
for epoch in tqdm(range(EPOCHS)):
    
    ### Training loop
    # Set the model to traning mode
    model_0.train()
    
    # Do the forward pass
    y_pred = model_0(X_train_tensor).squeeze()
    
    # Calculate loss and accuracy
    loss = loss_fn(y_pred, y_train_tensor)
    acc = accuracy_fn(y_true=y_train_tensor, y_pred=y_pred.argmax(1))
    
    # Optimizer zero grad
    optimizer.zero_grad()
    
    # Loss backward
    loss.backward()
    
    # Optimizer step
    optimizer.step()
    
    
    ### Testing loop
    model_0.eval()
    
    with  torch.inference_mode():
        # Forward pass
        test_preds = model_0(X_val_tensor).squeeze()
        
        # Calculate the test loss and accuracy
        test_loss = loss_fn(test_preds,
                            y_val_tensor)
        test_acc = accuracy_fn(y_pred=test_preds.argmax(1), y_true=y_val_tensor)
        
        
    # Print what's happening
    if epoch % 10 == 0:
        print(f"Epoch: {epoch} | Loss: {loss:.5f} | Accuracy: {acc:.2f}% | Test Loss: {test_loss:.5f} | Test Accuracy: {test_acc:.2f}%")

  0%|          | 0/500 [00:00<?, ?it/s]

Epoch: 0 | Loss: -0.36039 | Accuracy: 31.58% | Test Loss: -0.36616 | Test Accuracy: 38.13%
Epoch: 10 | Loss: -0.38293 | Accuracy: 33.80% | Test Loss: -0.38891 | Test Accuracy: 39.39%
Epoch: 20 | Loss: -0.41465 | Accuracy: 66.96% | Test Loss: -0.42083 | Test Accuracy: 68.94%
Epoch: 30 | Loss: -0.46350 | Accuracy: 70.22% | Test Loss: -0.46961 | Test Accuracy: 67.17%
Epoch: 40 | Loss: -0.54446 | Accuracy: 70.06% | Test Loss: -0.54641 | Test Accuracy: 66.41%
Epoch: 50 | Loss: -0.64282 | Accuracy: 70.62% | Test Loss: -0.63349 | Test Accuracy: 68.18%
Epoch: 60 | Loss: -0.69540 | Accuracy: 71.99% | Test Loss: -0.67192 | Test Accuracy: 68.69%
Epoch: 70 | Loss: -0.71439 | Accuracy: 72.83% | Test Loss: -0.68387 | Test Accuracy: 69.19%
Epoch: 80 | Loss: -0.72052 | Accuracy: 72.67% | Test Loss: -0.68736 | Test Accuracy: 69.19%
Epoch: 90 | Loss: -0.72569 | Accuracy: 72.87% | Test Loss: -0.68723 | Test Accuracy: 68.69%
Epoch: 100 | Loss: -0.72791 | Accuracy: 73.06% | Test Loss: -0.68675 | Test Accur