# Meetup Code for Comparing PyTorch NN to Regression

## Method

* Used a well-known internal dataset to compare learning algorithms.
* Compared a PyTorch feed-forward neural network with scikit-learn and XGBoost regressors.
* Demonstrated to the meetup group the benefits, challenges, and performance of each approach.


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import pandas as pd

In [None]:
#create data object
class Data(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        X: Indexable feature container exposing ``.shape``; the size of its
            first dimension is used as the number of samples.
        y: Indexable label container, addressed with the same indices as ``X``.
    """

    def __init__(self, X, y):
        self.x, self.y = X, y
        # The sample count is captured once, at construction time.
        self.len = X.shape[0]

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.x[index]
        label = self.y[index]
        return features, label

    def __len__(self):
        """Return the number of samples recorded at construction."""
        return self.len


#fully connected nn object
class Net(nn.Module):
    """Feed-forward regressor: Linear -> ReLU -> Linear.

    Args:
        D_in: Number of input features.
        H1: Width of the single hidden layer.
        H2: Accepted for signature compatibility; no layer is built from it.
        H3: Accepted for signature compatibility; no layer is built from it.
        D_out: Number of output units.
    """

    def __init__(self, D_in, H1, H2, H3, D_out):
        super().__init__()

        # Input -> hidden projection, its non-linearity, then hidden -> output.
        self.linear1 = nn.Linear(D_in, H1)
        self.act1 = nn.ReLU()
        self.linear2 = nn.Linear(H1, D_out)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        hidden = self.act1(self.linear1(x))
        return self.linear2(hidden)

#single linear nn obj
class LinearNet(nn.Module):
    """Single affine layer with no activation.

    Args:
        D_in: Number of input features.
        H1: Number of output units of the only layer (the model's output width).
        H2: Accepted for signature compatibility; not used.
        D_out: Accepted for signature compatibility; not used.
    """

    def __init__(self, D_in, H1, H2, D_out):
        super().__init__()

        # The whole model is this one projection from D_in to H1.
        self.linear1 = nn.Linear(D_in, H1)

    def forward(self, x):
        """Return ``linear1`` applied to the batch ``x``."""
        return self.linear1(x)

In [None]:
#scale data
# NOTE: this cell reads `train_df`, which is loaded in the "Regressor Code"
# cell further down, so that cell has to be run first.
# numpy and StandardScaler are imported locally so this cell does not fail
# with a NameError on `np` / `preprocessing` when run on its own.
import numpy as np
from sklearn.preprocessing import StandardScaler

X_train = train_df.iloc[:,:-1]                 # every column except the last
y_train_scaled = np.log(train_df.iloc[:,-1])   # natural log of the last (target) column
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
print(X_train_scaled.shape)

#make numpy arrays
X_train_scaled = np.array(X_train_scaled, dtype=np.float64)
# Column vector (n, 1) so it lines up with the (n, 1) network output.
y_train_scaled = np.array(y_train_scaled, dtype=np.float64).reshape(-1, 1)

#make tensors
numeric_x = torch.tensor(X_train_scaled, dtype=torch.float32)
numeric_y = torch.tensor(y_train_scaled, dtype=torch.float32)  # log-transformed labels

# Second name bound to the same tensor object (no copy is made).
numeric_x2 = numeric_x

In [None]:
#set up inputs
# Layer widths passed to Net. Net builds its layers from D_in, H1 and D_out
# only, so H2 and H3 do not change the architecture.
H1, H2, H3 = 5, 2, 1

D_in = numeric_x.shape[1]  # one input unit per feature column
D_out = 1                  # single regression output

# Pair features and labels row by row.
dataset = torch.utils.data.TensorDataset(numeric_x, numeric_y)

model1 = Net(D_in, H1, H2, H3, D_out)
# Demo alternative (single linear layer): LinearNet(D_in, 1, H2, D_out)

# Mean squared error, optimised with plain SGD.
criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model1.parameters(), lr=1e-1)

# Mini-batches of up to 5000 rows, served in dataset order (no shuffle requested).
trainloader = DataLoader(dataset=dataset, batch_size=5000)

# Per-batch loss history, filled by the training loop.
losses2 = []

In [None]:
#training loop and minibatch loop
# Number of full passes over `trainloader`; raise it to train for longer.
n_epochs = 1
# Set when a NaN loss is seen so that the epoch loop stops as well: a bare
# `break` would only leave the mini-batch loop and training would resume on
# the next epoch with already-diverged weights.
diverged = False

for t in range(n_epochs):
    for x, y in trainloader:
        y_pred = model1(x)           # forward pass on the current mini-batch
        loss = criterion(y_pred, y)  # MSE between predictions and labels

        loss.backward()              # accumulate gradients
        optimizer.step()             # SGD parameter update
        optimizer.zero_grad()        # clear gradients for the next batch

        print(t, loss.item())
        losses2.append(loss.item())

        if torch.isnan(loss):
            diverged = True
            break

    if diverged:
        break


print('Finished Training')

## Regressor Code

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import metrics 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

fname = r'some_file'
train_df = pd.read_csv(fname)

# Drop the 'Unnamed: 0' column (presumably a row index written out when the
# CSV was saved -- confirm against the data file).
train_df = train_df.drop(columns='Unnamed: 0')

# Shuffle the rows with a fixed seed. An unseeded shuffle here would make the
# seeded train/test split below select different rows on every run, so the
# reported metrics would not be reproducible.
train_df = train_df.sample(frac=1, random_state=123)

#model objects
# Ordinary least-squares baseline with default settings.
model_sk = LinearRegression()

# Gradient-boosted trees; hyper-parameters are collected in one mapping so
# they can be read (and tweaked) in a single place.
xgb_params = {
    "learning_rate": 0.1,
    "max_depth": 5,
    "colsample_bytree": 0.5,
    "reg_alpha": 0.1,
    "subsample": 0.8,
    "n_estimators": 500,
    "n_jobs": 6,
}
model_xgb = xgb.XGBRegressor(**xgb_params)

#scale data
# Features are every column except the last; the last column is the target.
features = train_df.iloc[:,:-1]
X_train_feature_names = list(features.columns)

#make numpy arrays
X_all = np.asarray(features, dtype=np.float64)
# NOTE(review): the target is used on its raw scale here, whereas the PyTorch
# cell trains on np.log of the same column -- confirm that comparing the two
# sets of metrics on different scales is intended.
y_train_scaled = np.asarray(train_df.iloc[:,-1], dtype=np.float64).reshape(-1, 1)

#get train and test data
# Split BEFORE scaling so the scaler never sees the held-out rows. Fitting it
# on the full data set would leak test-set mean/variance into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_all, y_train_scaled, test_size=0.3, random_state=123
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics come from the training rows only
X_test = scaler.transform(X_test_raw)        # reuse the training statistics

# Full feature matrix on the same (train-fitted) scale, kept under its
# original name for any later cell that reads it.
X_train_scaled = scaler.transform(X_all)
print(X_train_scaled.shape)

#fit the linear model and predict on the test set
# fit() returns the estimator itself, so training and prediction are chained.
y_pred_sk = model_sk.fit(X_train, y_train).predict(X_test)

# Held-out metrics: coefficient of determination and root mean squared error.
mse_sk = metrics.mean_squared_error(y_test, y_pred_sk)
r2 = metrics.r2_score(y_test, y_pred_sk)
rmse = np.sqrt(mse_sk)

# Learned weights and bias of the linear model.
print(model_sk.coef_, model_sk.intercept_)
print(f"the r2 value of the linear sklearn regressor is {r2}")
print(f"the rmse value of the linear sklearn regressor is {rmse}")

#fit the xgboost model and predict on the test set
# fit() returns the estimator itself, so training and prediction are chained.
y_pred_xgb = model_xgb.fit(X_train, y_train).predict(X_test)

# Held-out metrics; r2 and rmse are rebound, replacing the linear model's values.
mse_xgb = metrics.mean_squared_error(y_test, y_pred_xgb)
r2 = metrics.r2_score(y_test, y_pred_xgb)
rmse = np.sqrt(mse_xgb)

print(f"the r2 value of the xgboost regressor is {r2}")
print(f"the rmse value of the xgboost regressor is {rmse}")