# Model Experimentation Torch

This notebook contains the classification of cirrhosis outcomes with torch

In [1]:
import torch
from torchinfo import summary
from torch import nn
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Remove warnings
import warnings 
warnings.filterwarnings('ignore')

device = "cuda" if torch.cuda.is_available() else "cpu"

In [2]:
# Load the dataset 

train = pd.read_csv('../data/train_modified.csv')
test = pd.read_csv('../data/test_modified.csv')

# view the first 5 rows of the train dataset
train.head()

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,Status,N_Days_Years
0,D-penicillamine,58,M,N,N,N,N,2.3,316.0,3.35,172.0,1601.0,179.8,63.0,394.0,9.7,3.0,D,2.7
1,Placebo,52,F,N,N,N,N,0.9,364.0,3.54,63.0,1440.0,134.85,88.0,361.0,11.0,3.0,C,7.1
2,Placebo,37,F,N,Y,Y,Y,3.3,299.0,3.55,131.0,1029.0,119.35,50.0,199.0,11.7,4.0,D,9.4
3,Placebo,50,F,N,N,N,N,0.6,256.0,3.5,58.0,1653.0,71.3,96.0,269.0,10.7,3.0,C,7.1
4,Placebo,45,F,N,Y,N,N,1.1,346.0,3.65,63.0,1181.0,125.55,96.0,298.0,10.6,4.0,C,2.2


In [3]:
# view the first 5 rows of the test dataset
test.head()

Unnamed: 0,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage,N_Days_Years
0,D-penicillamine,54,F,N,Y,N,N,1.2,546.0,3.37,65.0,1636.0,151.9,90.0,430.0,10.6,2.0,10.5
1,D-penicillamine,41,F,N,N,N,N,1.1,660.0,4.22,94.0,1257.0,151.9,155.0,227.0,10.0,2.0,6.8
2,Placebo,36,F,N,Y,N,Y,2.0,151.0,2.96,46.0,961.0,69.75,101.0,213.0,13.0,4.0,0.1
3,D-penicillamine,56,F,N,N,N,N,0.6,293.0,3.85,40.0,554.0,125.55,56.0,270.0,10.6,2.0,6.4
4,D-penicillamine,60,F,N,Y,N,N,1.4,277.0,2.97,121.0,1110.0,125.0,126.0,221.0,9.8,1.0,4.4


In [4]:
# Get the informational statistics of the datasets for encoding and scaling
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Drug           7905 non-null   object 
 1   Age            7905 non-null   int64  
 2   Sex            7905 non-null   object 
 3   Ascites        7905 non-null   object 
 4   Hepatomegaly   7905 non-null   object 
 5   Spiders        7905 non-null   object 
 6   Edema          7905 non-null   object 
 7   Bilirubin      7905 non-null   float64
 8   Cholesterol    7905 non-null   float64
 9   Albumin        7905 non-null   float64
 10  Copper         7905 non-null   float64
 11  Alk_Phos       7905 non-null   float64
 12  SGOT           7905 non-null   float64
 13  Tryglicerides  7905 non-null   float64
 14  Platelets      7905 non-null   float64
 15  Prothrombin    7905 non-null   float64
 16  Stage          7905 non-null   float64
 17  Status         7905 non-null   object 
 18  N_Days_Y

## Data Validation 

This section involves checking and validation the data quality

In [5]:
# checking for duplicate values
train.duplicated().sum()

4

In [6]:
# check for missing values
train.isnull().sum()

Drug             0
Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
Status           0
N_Days_Years     0
dtype: int64

## Data Preprocessing

In [7]:
# Splitting the data into features and target
features = train.drop(columns=['Status'])
target = train['Status']

# View the shapes of train
features.shape, target.shape

((7905, 18), (7905,))

In [8]:
num_classes = len(target.unique())
num_classes

3

In [9]:
# Split the data into training and validation sets 
X_train, X_val, y_train, y_val = train_test_split(features, target, test_size=0.05, random_state=42)

# Ensure the shapes of training and validation sets
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((7509, 18), (7509,), (396, 18), (396,))

# Let's create a function for preprocessing of categorical and numerical features

In [10]:
def create_preprocessor(dataset):
    
    # Get the names of categorical and numerical columns
    categorical_features = list(dataset.select_dtypes(include="object").columns)
    
    # Create a transformer for categorical cols
    one_hot_encoder = OneHotEncoder(sparse_output=False)
    cat_scaler = StandardScaler()
    
    categorical_transformer = Pipeline(
        steps=[
            ("one_hot", one_hot_encoder),
            ("cat_scaler", cat_scaler)
        ]
    )
    
    
    preprocessor = ColumnTransformer(
            transformers = [
                    ("categorical", categorical_transformer, categorical_features)
            ]
    )
    
    
    return preprocessor

In [11]:
# Test our function
preprocessor = create_preprocessor(X_train)


X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)

# Get the shapes of the preprocessed data
X_train.shape, X_val.shape

((7509, 13), (396, 13))

In [12]:
# Create scaler 
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

X_train.shape, X_val.shape

((7509, 13), (396, 13))

In [13]:
# Create a label encoder
enc = LabelEncoder()

y_train = enc.fit_transform(y_train)
y_val = enc.transform(y_val)

y_train.shape, y_val.shape

((7509,), (396,))

In [14]:
enc.classes_

array(['C', 'CL', 'D'], dtype=object)

### Let's create a dataloader for the Dataset

In [15]:
class CirrhosisDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32).to(device)
        self.y = torch.tensor(y, dtype=torch.uint8).to(device)
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, index):
        return self.X[index], self.y[index]        

**Let's create training, validation Dataloaders**

In [16]:
# Create datasets
training_data = CirrhosisDataset(X_train, y_train)
validation_data = CirrhosisDataset(X_val, y_val)

BATCH_SIZE=32

# Create dataloaders
train_loader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(validation_data, batch_size=BATCH_SIZE, shuffle=True)

**Let's create a Deep learning model**

In [17]:
class MultiClassCirrhosisOutcomes(nn.Module):
    def __init__(self, num_classes):
        self.num_classes = num_classes
        super().__init__()
        self.linear1 = nn.Linear(in_features=13, out_features=32)
        self.relu1 = nn.ReLU()
        self.linear2 = nn.Linear(in_features=32, out_features=16)
        self.relu2 = nn.ReLU()
        self.output = nn.Linear(in_features=16, out_features=num_classes)
        self.softmax = nn.Softmax()
        
        
    def forward(self, x:torch.Tensor) -> torch.Tensor:
        return self.softmax(self.output(self.relu2(self.linear2(self.relu1(self.linear1(x))))))
        

In [18]:
# Create a model instance and print the summary fo the model
model_0 = MultiClassCirrhosisOutcomes(3).to(device)

# Get the summary of the model  
summary(model_0, input_size=(1, 13))

Layer (type:depth-idx)                   Output Shape              Param #
MultiClassCirrhosisOutcomes              [1, 3]                    --
├─Linear: 1-1                            [1, 32]                   448
├─ReLU: 1-2                              [1, 32]                   --
├─Linear: 1-3                            [1, 16]                   528
├─ReLU: 1-4                              [1, 16]                   --
├─Linear: 1-5                            [1, 3]                    51
├─Softmax: 1-6                           [1, 3]                    --
Total params: 1,027
Trainable params: 1,027
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.00
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.00
Estimated Total Size (MB): 0.00

In [19]:
# Create loss function and optimizer
loss_fn = nn.NLLLoss()
optimizer = torch.optim.Adam(model_0.parameters(), lr=2e-3)

In [20]:
# Calculate accuracy (a classification metric)
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item() # torch.eq() calculates where two tensors are equal
    acc = (correct / len(y_pred)) * 100 
    return acc

**Training the model**

In [21]:
EPOCHS = 100

# Set the seed for reproducibility
torch.manual_seed(42)

# Create training and testing loops
for epoch in tqdm(range(EPOCHS)):
    ### Training
    model_0.train()
    
    # Set the training loss and acc
    train_loss, train_acc = 0, 0
    
    # Loop for iterating through batches
    for X, y in train_loader:
        
        # Forward pass
        y_preds = model_0(X)
        
        # calculate the loss
        loss = loss_fn(y_preds, y)
        train_loss += loss
        train_acc += accuracy_fn(y_true=y, y_pred=y_preds.argmax(dim=1))
        
        # Optimizer zero grad
        optimizer.zero_grad()
        
        # Loss backward
        loss.backward()
        
        # Optmizer step
        optimizer.step()
        
        
    # Get the average training loss
    train_loss /= len(train_loader)
    
    # Get the average training accuracy
    train_acc /= len(train_loader)
    
    
    ### Testing Loop
    test_loss, test_acc = 0, 0
    
    model_0.eval()
    
    with torch.inference_mode():
        for X, y in val_loader:
            
            # Forward pass
            test_preds = model_0(X)
            
            # Calculate the loss and accuracy
            test_loss += loss_fn(test_preds, y)
            test_acc += accuracy_fn(y_true=y, y_pred=test_preds.argmax(dim=1))
            
        # Calculate average metrics
        test_loss /= len(val_loader)
        test_acc /= len(val_loader)
        
    
    # Print out what's happening
    if epoch % 10 == 0 or epoch == EPOCHS:
        print(f"Epoch: {epoch} Training Loss: {train_loss:.5f} | Training Accuracy: {train_acc:.2f}% \n Test Loss: {test_loss:.5f} | Test Accuracy: {test_acc:.2f}%")
       

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0 Training Loss: -0.71531 | Training Accuracy: 72.14% 
 Test Loss: -0.68029 | Test Accuracy: 68.03%
Epoch: 10 Training Loss: -0.72771 | Training Accuracy: 72.77% 
 Test Loss: -0.68029 | Test Accuracy: 68.03%
Epoch: 20 Training Loss: -0.72792 | Training Accuracy: 72.79% 
 Test Loss: -0.68429 | Test Accuracy: 68.43%
Epoch: 30 Training Loss: -0.72771 | Training Accuracy: 72.77% 
 Test Loss: -0.66827 | Test Accuracy: 66.83%
Epoch: 40 Training Loss: -0.72757 | Training Accuracy: 72.76% 
 Test Loss: -0.68429 | Test Accuracy: 68.43%
Epoch: 50 Training Loss: -0.72757 | Training Accuracy: 72.76% 
 Test Loss: -0.67228 | Test Accuracy: 67.23%
Epoch: 60 Training Loss: -0.60131 | Training Accuracy: 60.13% 
 Test Loss: -0.58173 | Test Accuracy: 58.17%
Epoch: 70 Training Loss: -0.60082 | Training Accuracy: 60.08% 
 Test Loss: -0.57772 | Test Accuracy: 57.77%
Epoch: 80 Training Loss: -0.60124 | Training Accuracy: 60.12% 
 Test Loss: -0.57372 | Test Accuracy: 57.37%
Epoch: 90 Training Loss: -0.6

In [22]:
# Save the model
torch.save(model_0, "../models/2_linear_layer_model.pt")

## Load the model and do predictions

In [23]:
# Load the model 
loaded_model = torch.load("../models/3_linear_layer_model.pt").to("cpu")

In [24]:
# Process the test data

# Apply the categorical preproessor on the test set
test_arr = preprocessor.transform(test)

# Apply the standard scaler 
test_arr = scaler.transform(test_arr)

In [25]:
# Check the shape sot that it can match the model's input shape
test_arr.shape

(5271, 13)

In [26]:
# Convert the arr to tensor
test_tensor = torch.tensor(test_arr, dtype=torch.float32)

test_tensor.shape

torch.Size([5271, 13])

In [27]:
# Get the predictions

loaded_model.eval()

with torch.inference_mode():
    preds = loaded_model(test_tensor.to("cpu"))

# View the first 5 predictions
preds[:5]

tensor([[9.9997e-01, 9.1733e-06, 2.0286e-05],
        [1.0000e+00, 1.3156e-07, 6.3359e-08],
        [4.7896e-05, 2.9267e-05, 9.9992e-01],
        [1.0000e+00, 1.3156e-07, 6.3359e-08],
        [9.9997e-01, 9.1733e-06, 2.0286e-05]])

In [28]:
preds = torch.round(preds, decimals=6)

In [29]:
preds[:5]

tensor([[9.9997e-01, 9.0000e-06, 2.0000e-05],
        [1.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.8000e-05, 2.9000e-05, 9.9992e-01],
        [1.0000e+00, 0.0000e+00, 0.0000e+00],
        [9.9997e-01, 9.0000e-06, 2.0000e-05]])

In [30]:
# Load the sample submissions and save the results
submissions = pd.read_csv('../data/sample_submission.csv')

submissions.head()

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.628084,0.034788,0.337128
1,7906,0.628084,0.034788,0.337128
2,7907,0.628084,0.034788,0.337128
3,7908,0.628084,0.034788,0.337128
4,7909,0.628084,0.034788,0.337128


In [31]:
# Fill in the values

submissions['Status_C'] = preds[:, 0]
submissions["Status_CL"] = preds[:, 1]
submissions["Status_D"] = preds[:, 2]

submissions.head()

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.999971,9e-06,2e-05
1,7906,1.0,0.0,0.0
2,7907,4.8e-05,2.9e-05,0.999923
3,7908,1.0,0.0,0.0
4,7909,0.999971,9e-06,2e-05


In [32]:
# Save the submissions dataframe
submissions.to_csv("../data/torch_2_linear_layer_model.csv", index=False, header=True, )