In [1]:
import numpy as np
import pandas as pd
import os
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch.utils.data import DataLoader, Dataset
from torch.utils.data import DataLoader
from collections import Counter
from sklearn.metrics import accuracy_score

In [2]:
# Seed every RNG the notebook relies on. Seeding NumPy alone does not cover
# PyTorch, which drives the weight initialisation and the DataLoader shuffling
# further down, so runs would otherwise differ after each kernel restart.
np.random.seed(42)
torch.manual_seed(42)


In [3]:
# Fix the NumPy seed before anything else in this cell runs
np.random.seed(42)
# Location of the Boston CSV file, relative to the working directory
DATA_FILE_TRAIN = './data/Boston.csv'
# Read the file into a DataFrame, then show its shape and its first rows
train_data = pd.read_csv(DATA_FILE_TRAIN)
print(train_data.shape, train_data.head(), sep="\n")


(506, 15)
   ID     crim    zn  indus  chas    nox     rm   age     dis  rad  tax  \
0   1  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296   
1   2  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242   
2   3  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242   
3   4  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222   
4   5  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222   

   ptratio   black  lstat  medv  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  


In [4]:
# Mini-batch sizes handed to the two DataLoaders
# (BATCH_SIZE_1 -> training loader, BATCH_SIZE_2 -> validation loader)
BATCH_SIZE_1, BATCH_SIZE_2 = 101, 51

In [5]:
# Look at the largest and smallest 'medv' value to pick the bin edges
print(train_data['medv'].max(), train_data['medv'].min(), sep="\n")

50.0
5.0


In [6]:

# Turn the continuous 'medv' target into a binary class label:
#   (0, 30]  -> 0
#   (30, 50] -> 1
# pd.cut uses right-closed intervals; values outside (0, 50] would become NaN.
bins = [0,30,50]
labels = [0,1]
# Only bin while the column is still numeric. After the first run 'medv' holds
# categorical 0/1 labels, and cutting those again with the same edges would no
# longer produce the intended classes, so a re-run must be a no-op.
if pd.api.types.is_numeric_dtype(train_data['medv']):
    train_data['medv'] = pd.cut(train_data['medv'], bins=bins, labels=labels)
print(train_data.head())
print(len(train_data.columns))

   ID     crim    zn  indus  chas    nox     rm   age     dis  rad  tax  \
0   1  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296   
1   2  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242   
2   3  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242   
3   4  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222   
4   5  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222   

   ptratio   black  lstat medv  
0     15.3  396.90   4.98    0  
1     17.8  396.90   9.14    0  
2     17.8  392.83   4.03    1  
3     18.7  394.63   2.94    1  
4     18.7  396.90   5.33    1  
15


In [7]:
# Count how many rows carry each target label
class_counts = Counter(train_data['medv'])
print(f"The target class ratio is {class_counts }")

The target class ratio is Counter({0: 422, 1: 84})


In [8]:
# Columns that are removed before modelling: the row identifier and the
# single categorical feature.
id_col = ['ID']
categorical_features = ['chas'] 
target_feature = 'medv'

dropped_cols = id_col+categorical_features
# errors='ignore' keeps this cell re-runnable once the columns are already gone
train_data = train_data.drop(dropped_cols, axis=1, errors='ignore')
all_features = train_data.columns.tolist()  #this will not have 'chas' and 'ID'

# Preserve the DataFrame's column order. A set difference yields an arbitrary
# order that can change between kernel sessions (string hashing is randomised
# per process), and the order of these names fixes the layout of the model's
# input vector - including for any checkpoint saved from it.
numerical_features = [col for col in all_features if col != target_feature]
#print(len(numerical_features))


In [9]:
# Separate the model inputs from the label column
train_data_inp = train_data[numerical_features]
train_data_tar = train_data[target_feature]
# Hold out 20% of the rows for validation, with a fixed random_state
Trn_input,  Val_inp, Trn_target,Val_target = train_test_split(
    train_data_inp,
    train_data_tar,
    test_size=0.2,
    random_state=123,
)
# Re-attach each label Series as the last column of its feature frame:
# Train_data is the training set, Valid_data the validation set.
Train_data = pd.concat([Trn_input, Trn_target.to_frame()], axis=1)
Valid_data = pd.concat([Val_inp, Val_target.to_frame()], axis=1)
print(Train_data.shape, Valid_data.shape, sep="\n")

(404, 13)
(102, 13)


In [10]:
class oversampdata(Dataset):
    """Dataset over a table whose last column is the target.

    The table is converted once into a float tensor. Indexing returns the
    pair ``(features, target)`` where ``features`` is every column except
    the last one and ``target`` is the last column of the selected row.
    """

    def __init__(self, data):
        # Cast the underlying array to float before building the FloatTensor
        as_float = data.values.astype('float')
        self.data = torch.FloatTensor(as_float)
        # Echo the tensor shape as a quick sanity check
        print(self.data.shape)

    def __len__(self):
        # Number of rows held by the tensor
        return self.data.shape[0]

    def __getitem__(self, index):
        row = self.data[index]
        return row[:-1], row[-1]

In [11]:
# Wrap the training frame and the validation frame (in that order) in Datasets
train_dataset, valid_dataset = map(oversampdata, (Train_data, Valid_data))

torch.Size([404, 13])
torch.Size([102, 13])


In [12]:
# Show whether PyTorch reports a usable CUDA device in this environment
torch.cuda.is_available()

False

In [13]:
# Use the GPU when PyTorch reports one, otherwise stay on the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# A worker process and pinned memory are only requested for CUDA runs
kwargs = {'num_workers': 1, 'pin_memory': True} if device=='cuda' else {}

# Training batches are reshuffled every epoch
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE_1, shuffle=True, **kwargs)
# Evaluation gains nothing from shuffling: accuracy does not depend on the
# sample order, and a fixed order keeps the held-out pass deterministic and
# stops it from consuming random numbers between training epochs.
test_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE_2, shuffle=False, **kwargs)


In [14]:
# Hyperparameters of the simple neural network

# Layer sizes: inputs -> hidden units -> outputs
input_size, hidden_size, num_classes = 12, 128, 1
# Training schedule
num_epochs, learning_rate = 5, 0.001

## BCEWithLogitsLoss( )

In [15]:
class LinearModel(nn.Module):
    """Two-layer fully connected network: Linear -> ReLU -> Linear.

    ``forward`` returns the raw output of the second linear layer; no
    activation is applied to it inside the model.

    Args:
        input_size: number of input features per sample.
        hidden_size: number of units in the hidden layer.
        num_classes: number of output units.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super(LinearModel, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def get_weights(self):
        """Return the weight parameters of both linear layers.

        The module has no ``weight`` attribute of its own (looking one up
        raises ``AttributeError``), so the weights are read from the layers
        that actually own them.

        Returns:
            tuple: ``(fc1.weight, fc2.weight)``.
        """
        return self.fc1.weight, self.fc2.weight

    def forward(self,x):
        """Apply fc1, ReLU and fc2 to ``x`` and return the result."""
        out = self.fc1(x)
        out = self.relu(out)
        out = self.fc2(out)
        return out

In [16]:
def train(model,device,train_loader,optimizer):
    """Run one training epoch and print the accuracy over all training batches.

    The loss function is read from the module-level name ``criterion``, which
    must be defined before this function is called.

    Args:
        model: network to optimise; it is switched to training mode.
        device: device the batches are moved to before the forward pass.
        train_loader: iterable yielding ``(data, target)`` batches.
        optimizer: optimizer that updates the parameters of ``model``.

    Returns:
        tuple: ``(mean_loss, accuracy)`` where ``mean_loss`` is the average of
        the per-batch loss values (``0.0`` when the loader yields no batch)
        and ``accuracy`` is the value that is printed.
    """
    model.train()
    loss_total = 0.0
    num_batches = 0
    y_true = []
    y_pred = []
    for data, target in train_loader:

        # moving the tensors to the configured device
        data, target = data.to(device), target.to(device)

        #FORWARD PASS
        target = target.float()
        output = model(data.float())
        # target gets a trailing dimension before it is compared with output
        loss = criterion(output, target.unsqueeze(1))

        # .item() yields a plain Python float. Adding the loss tensor itself
        # would chain every batch's autograd history into the running total.
        loss_total += loss.item()
        num_batches += 1

        #BACKWARD AND OPTIMIZE
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        #PREDICTIONS BCELogitsloss()
        # sigmoid maps the raw outputs into (0, 1); rounding gives 0/1 labels.
        # torch.round keeps the computation in torch, so it also works when
        # the tensors live on a CUDA device (np.round needs a CPU array).
        pred = torch.round(torch.sigmoid(output.detach()))
        y_true.extend(target.tolist())
        y_pred.extend(pred.reshape(-1).tolist())

    accuracy = accuracy_score(y_true,y_pred)
    print("Accuracy on training set is" , accuracy)
    mean_loss = loss_total / num_batches if num_batches else 0.0
    return mean_loss, accuracy

In [17]:
def test(model, device, test_loader):
    """Evaluate ``model`` on every batch of ``test_loader`` and print the accuracy.

    Args:
        model: network to evaluate; it is switched to evaluation mode.
        device: device the batches are moved to before the forward pass.
        test_loader: iterable yielding ``(data, target)`` batches.

    Returns:
        The accuracy computed by ``accuracy_score`` over all batches (the same
        value that is printed).
    """
    #model in eval mode skips Dropout etc
    model.eval()
    y_true = []
    y_pred = []

    # gradient tracking is switched off for the whole evaluation pass
    with torch.no_grad():
        for data, target in test_loader:

            # moving the tensors to the configured device
            data, target = data.to(device), target.to(device)

            output = model(data.float())

            #PREDICTIONS
            # sigmoid maps the raw outputs into (0, 1); rounding gives 0/1
            # labels. torch.round keeps the computation in torch, so it also
            # works when the tensors live on a CUDA device.
            pred = torch.round(torch.sigmoid(output))
            target = target.float()
            y_true.extend(target.tolist())
            y_pred.extend(pred.reshape(-1).tolist())

    accuracy = accuracy_score(y_true,y_pred)
    print("Accuracy on test set is" , accuracy)
    print("********************************************************")
    return accuracy
    print("********************************************************")

In [18]:
# Build the network and place it on the selected device
model = LinearModel(input_size, hidden_size, num_classes)
model = model.to(device)
# Loss applied to the raw model outputs, and the optimizer for its parameters
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [19]:
# Each epoch: one pass over the training loader, then one evaluation pass
for epoch in range(num_epochs):
    train(model, device, train_loader, optimizer)
    test(model, device, test_loader)

Accuracy on training set is 0.4183168316831683
Accuracy on test set is 0.6764705882352942
********************************************************
Accuracy on training set is 0.7673267326732673
Accuracy on test set is 0.7843137254901961
********************************************************
Accuracy on training set is 0.844059405940594
Accuracy on test set is 0.7941176470588235
********************************************************
Accuracy on training set is 0.844059405940594
Accuracy on test set is 0.7941176470588235
********************************************************
Accuracy on training set is 0.844059405940594
Accuracy on test set is 0.7941176470588235
********************************************************


In [20]:
# Write the model's state_dict (its parameters) to a checkpoint file
torch.save(obj=model.state_dict(), f='model_BCEWithLogitsloss.ckpt')