
## Lecture 09:

* Creating a softmax classifier with a kaggle dataset
* URL for the dataset: https://www.kaggle.com/c/otto-group-product-classification-challenge/data

- For the sake of this exercise, the data is saved in the `data` folder

In [1]:
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn import model_selection

In [2]:
import time
from sklearn.preprocessing import LabelEncoder

In [3]:
# Libraries for creating the dataloader
from torch.utils.data import DataLoader, Dataset

In [4]:
from torch import cuda
# Pick the compute device name once: 'cuda' when a GPU is visible, else 'cpu'.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [5]:
# Read the training CSV, discard its first column (presumably a row id --
# confirm against the file), integer-encode the string class labels, and
# carve out a stratified 80/20 train/validation split.

df = pd.read_csv('./data/otto_train.csv')
df = df[df.columns[1:]]

number = LabelEncoder()
df['target'] = number.fit_transform(df['target'].astype(str))

df_train, df_valid = model_selection.train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['target'].values,
)

In [6]:
class Mydataset(Dataset):
    """Map-style dataset over a DataFrame whose last column is the class label.

    All columns except the last are exposed as a float32 feature tensor and
    the last column as an int64 label tensor, so that indexing yields a
    ``(features, label)`` pair ready for ``torch.nn.CrossEntropyLoss``.

    Args:
        df: DataFrame with numeric feature columns followed by one
            integer-valued label column.
    """

    def __init__(self, df):
        super().__init__()
        self.xy = df
        self.len = self.xy.shape[0]
        self.x_data = torch.from_numpy(
            self.xy.iloc[:, 0:-1].to_numpy(dtype=np.float32)
        )
        # Build the labels directly as int64; a float32 detour is unnecessary
        # and cannot represent every large integer exactly. torch.tensor
        # copies, so the tensor never aliases the DataFrame's buffer.
        self.y_data = torch.tensor(
            self.xy.iloc[:, -1].to_numpy(), dtype=torch.long
        )

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return self.len

In [7]:
# Sanity check: (rows, columns) of the full frame after the first column was dropped.
df.shape

(61878, 94)

In [8]:
# Wrap the train and validation frames (in that order) in Mydataset objects
# so they can be handed to a DataLoader.

train_dataset, valid_dataset = map(Mydataset, (df_train, df_valid))

In [9]:
# Batch both splits in groups of 32. Training batches are reshuffled every
# epoch; the validation loader keeps a fixed order because shuffling does not
# change the accuracy and only makes per-batch results non-reproducible.

train_dataloader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True, num_workers=0)
valid_dataloader = DataLoader(dataset=valid_dataset, batch_size=32, shuffle=False, num_workers=0)

In [10]:
class Model(torch.nn.Module):
    """Fully connected classifier mapping feature vectors to class logits.

    Five hidden ``Linear`` layers (720, 540, 320, 240 and 120 units), each
    followed by ReLU, feed a final ``Linear`` layer that emits one raw score
    per class. No softmax is applied in ``forward``.

    Args:
        in_features: Size of each input feature vector (default 93).
        num_classes: Number of output scores per sample (default 9).
    """

    def __init__(self, in_features=93, num_classes=9):
        super().__init__()
        self.l1 = torch.nn.Linear(in_features, 720)
        self.l2 = torch.nn.Linear(720, 540)
        self.l3 = torch.nn.Linear(540, 320)
        self.l4 = torch.nn.Linear(320, 240)
        self.l5 = torch.nn.Linear(240, 120)
        self.l6 = torch.nn.Linear(120, num_classes)

        # Must be the module form: torch.nn.functional.relu() is a function
        # that requires an input tensor and raises TypeError when called bare.
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        """Return the logits of shape ``(batch, num_classes)`` for batch ``x``."""
        hidden = x
        for layer in (self.l1, self.l2, self.l3, self.l4, self.l5):
            hidden = self.relu(layer(hidden))
        return self.l6(hidden)


# Instantiate the network and move it to the selected device in one step
# (Module.to returns the module itself). The bare name on the last line keeps
# the notebook's repr display of the architecture.
model = Model().to(device)
model

Model(
  (l1): Linear(in_features=93, out_features=720, bias=True)
  (l2): Linear(in_features=720, out_features=540, bias=True)
  (l3): Linear(in_features=540, out_features=320, bias=True)
  (l4): Linear(in_features=320, out_features=240, bias=True)
  (l5): Linear(in_features=240, out_features=120, bias=True)
  (l6): Linear(in_features=120, out_features=9, bias=True)
  (relu): ReLU()
)

In [11]:
# Mean-reduced cross-entropy over the model's raw scores, minimized with
# AdamW at a learning rate of 0.01.
criterion = torch.nn.CrossEntropyLoss(reduction='mean')
optimus = torch.optim.AdamW(params=model.parameters(), lr=0.01)

In [18]:
def train(epoch):
    """Run one pass over ``train_dataloader``, updating ``model`` in place.

    Relies on the module-level ``model``, ``train_dataloader``, ``criterion``,
    ``optimus`` and ``device``. The batch loss is printed for every 500th
    batch, starting with the first.

    Args:
        epoch: Epoch number, used only in the progress message.
    """
    model.train()
    for step, (inputs, labels) in enumerate(train_dataloader):
        labels = labels.to(device)
        logits = model(inputs.to(device))
        loss = criterion(logits, labels)

        if not step % 500:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Standard update: clear stale gradients, backpropagate, apply the step.
        optimus.zero_grad()
        loss.backward()
        optimus.step()

In [19]:
# Train for two epochs (numbered 0 and 1).
for epoch in range(2):
    train(epoch)

Epoch: 0, Loss:  0.7016705274581909
Epoch: 0, Loss:  0.9477417469024658
Epoch: 0, Loss:  0.8648364543914795
Epoch: 0, Loss:  0.845124363899231
Epoch: 1, Loss:  0.9539099335670471
Epoch: 1, Loss:  1.0437977313995361
Epoch: 1, Loss:  0.5430974960327148
Epoch: 1, Loss:  0.8525347709655762


In [57]:
def valid(model, valid_valid_dataloader):
    """Return the classification accuracy, in percent, of ``model`` on a loader.

    The model is switched to eval mode and left there. Batches are moved to
    the device of the model's first parameter before the forward pass, so the
    function does not depend on any module-level state.

    Args:
        model: ``torch.nn.Module`` mapping a batch of inputs to per-class
            scores of shape ``(batch, classes)``.
        valid_valid_dataloader: Iterable yielding ``(inputs, labels)`` batches,
            e.g. a ``DataLoader``. The parameter name is kept as-is for
            backward compatibility with keyword callers.

    Returns:
        Percentage of correctly classified samples as a float, or ``0.0``
        when the loader yields no samples.
    """
    model.eval()

    # A parameter-free model has no device of its own; leave batches in place.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    n_correct = 0
    total = 0
    with torch.no_grad():
        # Iterate the loader that was passed in, not a module-level one.
        for inputs, labels in valid_valid_dataloader:
            if target_device is not None:
                inputs = inputs.to(target_device)
                labels = labels.to(target_device)
            output = model(inputs)
            _, predicted = torch.max(output, dim=1)
            total += labels.size(0)
            n_correct += (predicted == labels).sum().item()

    if total == 0:
        return 0.0
    return (n_correct * 100.0) / total


In [59]:
# Report accuracy on the validation split using the DataLoader-based helper.
print(
    'This is the validation section to print the accuracy and see how it performs',
    'Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch',
    sep='\n',
)

acc = valid(model, valid_dataloader)
print("Accuracy on test data = %0.2f%%" % acc)

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Accuracy on test data = 59.22%


In [25]:
def accuracy(model, data_x, data_y, batch_size=1024):
    """Return the classification accuracy, in percent, of ``model`` on arrays.

    The samples are scored in chunks of ``batch_size`` under
    ``torch.no_grad()`` instead of one forward pass per row. Inputs are moved
    to the device of the model's first parameter. The model's train/eval mode
    is not changed; callers should call ``model.eval()`` beforehand.

    Args:
        model: ``torch.nn.Module`` mapping a ``(batch, features)`` float tensor
            to per-class scores of shape ``(batch, classes)``.
        data_x: 2-D array-like of features, one row per sample.
        data_y: Array-like of integer-valued class labels, one per sample.
        batch_size: Number of rows scored per forward pass.

    Returns:
        Percentage of correctly classified samples as a float, or ``0.0``
        when ``data_x`` is empty.
    """
    features = torch.as_tensor(data_x, dtype=torch.float32)
    labels = torch.as_tensor(data_y).reshape(-1).long()

    n_items = features.shape[0]
    if n_items == 0:
        return 0.0

    # A parameter-free model has no device of its own; leave batches in place.
    first_param = next(model.parameters(), None)
    target_device = None if first_param is None else first_param.device

    n_correct = 0
    with torch.no_grad():
        for start in range(0, n_items, batch_size):
            batch_x = features[start:start + batch_size]
            batch_y = labels[start:start + batch_size]
            if target_device is not None:
                batch_x = batch_x.to(target_device)
                batch_y = batch_y.to(target_device)
            _, predicted = torch.max(model(batch_x), dim=1)
            n_correct += (predicted == batch_y).sum().item()

    return (n_correct * 100.0) / n_items

In [60]:
# Report accuracy on the validation split again, this time feeding the raw
# NumPy arrays of the validation frame to accuracy() instead of a DataLoader.
print(
    'This is the validation section to print the accuracy and see how it performs',
    'This is more of a hack using the original dataset and converting it into tensors and calculating it directly',
    sep='\n',
)

# Last column holds the labels; everything before it is a feature.
data_x = df_valid.iloc[:, :-1].to_numpy(dtype=np.float32)
data_y = df_valid.iloc[:, -1].to_numpy(dtype=np.float32)

model.eval()
acc = accuracy(model, data_x, data_y)
print("Accuracy on test data = %0.2f%%" % acc)

This is the validation section to print the accuracy and see how it performs
This is more of a hack using the original dataset and converting it into tensors and calculating it directly
Accuracy on test data = 59.22%


In [22]:
# if __name__ == '__main__':
#     since = time.time()
#     for epoch in range(1, 10):
#         epoch_start = time.time()
#         train(epoch)
#         m, s = divmod(time.time() - epoch_start, 60)
#         print(f'Training time: {m:.0f}m {s:.0f}s')
#         valid()
#         m, s = divmod(time.time() - epoch_start, 60)
#         print(f'Testing time: {m:.0f}m {s:.0f}s')

#     m, s = divmod(time.time() - since, 60)
#     print(f'Total Time: {m:.0f}m {s:.0f}s\nModel was trained on {device}!')