# First NN with PyTorch on Tabular Data

In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics # plot_roc_curve.
from sklearn.model_selection import train_test_split # Train/test/validation split of data.
from sklearn.preprocessing import StandardScaler # Scale the data. 

# Pytorch imports
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader # Not sure what "Dataset" is for atm.

# Select the compute device: use the GPU whenever CUDA is available, else the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print(f"Using '{device}' device.")

# Show the working directory as a sanity check (the data is read via relative paths).
import os
print(f"The working directory is {os.getcwd()}")

Using 'cuda' device.
The working directory is /home/ajo/gitRepos/master_thesis


## Load the Adult census data

In [89]:
# Column names for the two Adult files, which are read without a header row.
ADULT_COLUMNS = ["age","workclass","fnlwgt","education","education_num",
                 "marital_status","occupation","relationship","race","sex",
                 "capital_gain","capital_loss","hours_per_week","native_country", "y"]

# Read both parts of the data; " ?" (with the leading space) marks a missing value.
df1 = pd.read_csv("original_data/adult.data", header = None, na_values = " ?")
df2 = pd.read_csv("original_data/adult.test", header = None, na_values = " ?")
df1.columns = df2.columns = ADULT_COLUMNS

# Stack the two parts. ignore_index=True gives every row a unique label; without it the
# row labels of df2 would duplicate those of df1 and label-based selection would be ambiguous.
# Remove "education" column.
adult_full = pd.concat([df1, df2], ignore_index = True).drop(columns = ["education"])

# Check if there are any NA values.
print(adult_full.shape)
print(adult_full.isnull().values.any())
adult_data = adult_full.dropna() # Drop the NA values since we know they are few for this data set.
print(adult_data.shape)

# Normalise the response labels before encoding them.
# NOTE(review): in the UCI distribution the labels carry a leading space and the ones in
# adult.test additionally end in "." (" >50K.") - confirm against the local files.
labels = adult_data["y"].str.strip().str.rstrip(".")
unexpected_labels = set(labels.unique()) - {"<=50K", ">50K"}
if unexpected_labels:
    raise ValueError(f"Unexpected response labels: {sorted(unexpected_labels)}")

# Select covariates and response (response encoded as 1 for ">50K", 0 for "<=50K").
X = adult_data.loc[:, adult_data.columns != "y"]
y = (labels == ">50K").astype(int).tolist()

(48842, 14)
True
(45222, 14)


## Pre-processing of data

* One-hot encoding of categorical features.
* Train/test/validation split.
* Standardization of numerical features (first in the training data, then in the testing and validation data with the same center and scale).

In [103]:
# One hot encoding.

In [105]:
# Train/test/validation split, done in two stages:
#   1. hold out one third of the rows, leaving 2/3 for training;
#   2. split the hold-out 2:1 into a test set (2/9) and a validation set (1/9).
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, test_size=1/3, random_state=42)
X_test, X_valid, y_test, y_valid = train_test_split(X_holdout, y_holdout, test_size=1/3, random_state=42)

# Shapes in the order train (2/3), test (2/9), validation (1/9).
for split_part in (X_train, X_test, X_valid):
    print(split_part.shape)

(30148, 13)
(10049, 13)
(5025, 13)


In [104]:
# Standardization.

## Should use the Dataloader from Pytorch it seems

In [119]:
# This can be done as below (for example).

class Data(Dataset):
    """In-memory tabular dataset that serves ``(features, target)`` pairs as float32 tensors.

    Subclassing ``torch.utils.data.Dataset`` (implementing ``__getitem__`` and
    ``__len__``) is what allows a ``DataLoader`` to batch and shuffle the rows.

    Parameters
    ----------
    X : array-like
        Numeric feature matrix with one row per sample (NumPy array, pandas
        DataFrame or nested list).
    y : array-like
        Numeric targets, one entry per sample (NumPy array, pandas Series or list).

    Raises
    ------
    ValueError
        If ``X`` and ``y`` hold a different number of samples, or if a value
        cannot be converted to float32.
    """

    def __init__(self, X, y):
        # np.array accepts DataFrames, Series and plain lists, whereas
        # torch.from_numpy only takes ndarrays and lists have no .astype().
        # It also copies, so the tensors never alias the caller's data.
        features = np.array(X, dtype=np.float32)
        targets = np.array(y, dtype=np.float32)
        if features.shape[0] != targets.shape[0]:
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {features.shape[0]} and {targets.shape[0]}."
            )
        self.X = torch.from_numpy(features)
        self.y = torch.from_numpy(targets)
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        return self.X[index], self.y[index]

    def __len__(self):
        """Return the number of samples."""
        return self.len
    
    
# Wrap each of the three splits in a Data set.
# Precondition: the features must be fully numeric (categorical columns one-hot encoded)
# and the labels numeric 0/1; otherwise the float32 conversion raises a ValueError.
train = Data(np.asarray(X_train), np.asarray(y_train))
test = Data(np.asarray(X_test), np.asarray(y_test))
valid = Data(np.asarray(X_valid), np.asarray(y_valid))

# Create data loaders for the train, test and validation sets.
# Only the training batches are shuffled; evaluation does not depend on row order.
train_loader = DataLoader(train, batch_size=32, shuffle=True)
test_loader = DataLoader(test, batch_size=1024, shuffle=False)
valid_loader = DataLoader(valid, batch_size=1024, shuffle=False)

NameError: name 'Dataset' is not defined

## Build the Simple Classifier

The classifier has three hidden dense layers with 18, 9 and 3 units respectively, each followed by a ReLU activation. The output layer has a single unit with a sigmoid activation, since this is a binary classification problem.

In [117]:
# Hyperparameters of the training run.
learning_rate = 0.01
batch_size = 1024
num_epochs = 30

# Width of the first layer: one input per column of the feature matrix.
input_size = X_test.shape[1]

# Could have used Pytorch sequential for this.
class NeuralNet(nn.Module):
    """Fully connected binary classifier: input -> 18 -> 9 -> 3 -> 1.

    Each of the three hidden layers is followed by a ReLU and the single output
    unit by a sigmoid, so ``forward`` returns one value in (0, 1) per input row.

    Parameters
    ----------
    input_size : int
        Number of features in each input row.
    """

    def __init__(self, input_size):
        super().__init__()
        self.input_size = input_size

        # Hidden layers followed by the single-unit output layer.
        self.l1 = nn.Linear(input_size, 18)
        self.l2 = nn.Linear(18, 9)
        self.l3 = nn.Linear(9, 3)
        self.output = nn.Linear(3, 1)

        # Activations (stateless, shared between the layers).
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through the hidden layers and return the sigmoid of the output unit."""
        hidden = x
        for layer in (self.l1, self.l2, self.l3):
            hidden = self.relu(layer(hidden))
        return self.sigmoid(self.output(hidden))

# Build the classifier, then move its parameters to the selected device.
model = NeuralNet(input_size)
model = model.to(device)
model

NeuralNet(
  (l1): Linear(in_features=13, out_features=18, bias=True)
  (l2): Linear(in_features=18, out_features=9, bias=True)
  (l3): Linear(in_features=9, out_features=3, bias=True)
  (output): Linear(in_features=3, out_features=1, bias=True)
  (relu): ReLU()
  (sigmoid): Sigmoid()
)

# Next we need to test the network (and see if I can get the same results as in Keras earlier).

In [120]:
# Loss and optimizer
# The network ends in a sigmoid, so its output is already a probability and the matching
# loss is binary cross-entropy. nn.CrossEntropyLoss expects raw multi-class scores instead.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
n_total_steps = len(train_loader)
model.train()
for epoch in range(num_epochs):
    for i, (features, labels) in enumerate(train_loader):
        features = features.to(device).float()
        # BCELoss needs float targets with the same (batch, 1) shape as the model output.
        labels = labels.to(device).float().view(-1, 1)

        # Forward pass
        outputs = model(features)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 100 == 0:
            print (f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')

# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
model.eval()
with torch.no_grad():
    n_correct = 0
    n_samples = 0
    for features, labels in test_loader:
        features = features.to(device).float()
        labels = labels.to(device).float().view(-1)
        # The single sigmoid output is the predicted probability of class 1, so the class
        # is obtained by thresholding at 0.5. (torch.max over dim 1 of a one-column
        # output would always return index 0, i.e. predict class 0 for every row.)
        predicted = (model(features).view(-1) >= 0.5).float()
        n_samples += labels.size(0)
        n_correct += (predicted == labels).sum().item()

    # Guard against an empty loader instead of dividing by zero.
    acc = 100.0 * n_correct / n_samples if n_samples else float('nan')
    print(f'Accuracy of the network on the {n_samples} test samples: {acc} %')

NameError: name 'train_loader' is not defined

# Perhaps we could follow [this](https://www.datacamp.com/tutorial/pytorch-tutorial-building-a-simple-neural-network-from-scratch) tutorial.
