<a href="https://colab.research.google.com/github/savitaChari/W207-Final-Project-Group3_Section6/blob/main/EDA/W207_Final_Project_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# W207 final project (Section:6 )(Group:3)
# Title: Forest Cover Type Prediction
### Members: Savita Chari, Tymon Silva, Blake Bormes, Andrew Beckerman
### Data Source: https://www.kaggle.com/c/forest-cover-type-prediction/data

### SETUP
<hr>

In [None]:
import pandas as pd
import sklearn as skl
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
import seaborn as sns
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as pyplt
import random 
import itertools 
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from numpy import argmax
from numpy import vstack

from zipfile import ZipFile, Path

import torch
import torch.nn as nn
import torch.optim
import torch.utils.data
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split

torch.manual_seed(26)


<torch._C.Generator at 0x2b690001270>

### DATA PREP
<hr>

In [None]:
# Get Data

class FCdataset(Dataset):
    """Forest cover-type dataset read from a CSV stored inside a zip archive.

    Every column except the last is treated as a feature and passed through
    ``preprocessing.scale``; the last column is the target and is label-encoded
    into consecutive integer class ids.

    Args:
        zip_path: Path of the zip archive to read.
        csv_name: Name of the CSV member inside the archive.
    """

    def __init__(self, zip_path="covtype.zip", csv_name="covtype.csv"):
        # Parse the CSV while both the archive and the member handle are still
        # open, and let the context managers close them afterwards.
        with ZipFile(zip_path) as archive, archive.open(csv_name) as member:
            df = pd.read_csv(member)

        values = df.values
        # Features: scaled, then stored as float32 for the torch layers.
        self.X = preprocessing.scale(values[:, :-1]).astype('float32')
        # Target: encoded as integer class indices (not floats), which is the
        # form nn.CrossEntropyLoss expects for its targets.
        self.y = LabelEncoder().fit_transform(values[:, -1]).astype('int64')

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``[features, label]`` for the row at ``idx``."""
        return [self.X[idx], self.y[idx]]

    def get_splits(self, n_test=0.33):
        """Randomly split the dataset into train and test subsets.

        Args:
            n_test: Fraction of rows assigned to the test subset.

        Returns:
            The ``[train, test]`` subsets produced by ``random_split``.
        """
        total = len(self.X)
        test_size = round(n_test * total)
        train_size = total - test_size
        return random_split(self, [train_size, test_size])

In [None]:
# prepare the dataset
def prepare_data():
    """Build the train and test data loaders for the forest-cover dataset.

    Returns:
        A ``(train_dl, test_dl)`` pair. Both loaders use batches of 1024 rows;
        only the training loader shuffles.
    """
    train_subset, test_subset = FCdataset().get_splits()
    rows_per_batch = 1024
    return (
        DataLoader(train_subset, batch_size=rows_per_batch, shuffle=True),
        DataLoader(test_subset, batch_size=rows_per_batch, shuffle=False),
    )

In [None]:
# Build both loaders once; the training and evaluation cells below reuse them.
train_dl, test_dl = prepare_data()
# Report how many rows ended up in the train and test subsets.
print(*(len(loader.dataset) for loader in (train_dl, test_dl)))

389278 191734


### BUILD MODEL
<hr>

In [None]:
# model architecture
class MLP(nn.Module):
    """Fully connected classifier with three hidden ReLU layers (1024-512-256).

    ``forward`` returns raw, unnormalized class scores. No softmax or sigmoid
    is applied here because ``nn.CrossEntropyLoss`` works on raw scores.

    Args:
        n_features: Number of input features per row.
        n_classes: Number of class scores produced per row.
    """

    def __init__(self, n_features=54, n_classes=7):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(n_features, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, n_classes),
        )

    def forward(self, x):
        """Return the class scores for a batch ``x`` of feature rows."""
        return self.layers(x)

In [None]:
# train the model
def train_model(train_dl, model, val_dl=None, epochs=10, lr=0.1, momentum=0.9):
    """Train ``model`` with SGD and report per-epoch train/validation loss.

    Args:
        train_dl: Loader yielding ``(inputs, targets)`` training batches.
        model: The ``nn.Module`` to optimize in place.
        val_dl: Optional loader used for the validation pass. When omitted,
            a module-level ``test_dl`` is used if one exists (the behavior
            this function previously hard-coded); otherwise validation is
            skipped.
        epochs: Number of passes over ``train_dl``.
        lr: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        A dict with keys ``"train_loss"`` and ``"val_loss"``, each a list of
        per-epoch mean batch losses (``nan`` for an epoch with no batches).
    """
    if val_dl is None:
        # Backward compatibility with callers that rely on the global loader.
        val_dl = globals().get("test_dl")

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum)
    history = {"train_loss": [], "val_loss": []}

    for epoch in range(epochs):
        # Validation switches the model to eval mode, so switch back before
        # every training pass.
        model.train()
        trn_losses = []
        for inputs, targets in train_dl:
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            optimizer.step()
            trn_losses.append(loss.item())

        train_mean = sum(trn_losses) / len(trn_losses) if trn_losses else float("nan")
        history["train_loss"].append(train_mean)
        # Report once per epoch instead of once per batch.
        print(f'Epoch {epoch} train loss: {train_mean:.4f}')

        if val_dl is None:
            continue

        # Validation step: evaluate the model that was passed in, with
        # gradient tracking disabled for the whole forward pass.
        model.eval()
        val_loss = []
        with torch.no_grad():
            for inputs, targets in val_dl:
                val_loss.append(criterion(model(inputs), targets).item())

        val_mean = sum(val_loss) / len(val_loss) if val_loss else float("nan")
        history["val_loss"].append(val_mean)
        print(f'Epoch {epoch} valid loss: {val_mean:.4f}')

    return history

In [None]:
# evaluate the model
def evaluate_model(test_dl, model):
    """Return the classification accuracy of ``model`` over ``test_dl``.

    The predicted class for each row is the index of its largest output score.
    The model is evaluated in eval mode with gradient tracking disabled, and
    its previous train/eval mode is restored afterwards.

    Args:
        test_dl: Loader yielding ``(inputs, targets)`` batches.
        model: The ``nn.Module`` to evaluate.

    Returns:
        The fraction of rows whose predicted class equals the target.

    Raises:
        ValueError: If ``test_dl`` yields no rows.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for inputs, targets in test_dl:
                # Flatten so an (n, 1) target column compares element-wise.
                targets = targets.reshape(-1)
                predicted = model(inputs).argmax(dim=1)
                correct += (predicted == targets).sum().item()
                total += targets.numel()
    finally:
        model.train(was_training)

    if total == 0:
        raise ValueError("evaluate_model received an empty data loader")
    return correct / total

In [None]:
# START TRAINING
# Instantiate the network and train it on the training loader.
ForestMLP = MLP()
train_model(train_dl, ForestMLP)

# evaluate the model
# Score the trained network on the test loader and print the accuracy to
# three decimal places.
acc = evaluate_model(test_dl, ForestMLP)
print('Accuracy: %.3f' % acc)

Epoch 0 train loss: 1.9187
Epoch 0 train loss: 1.8902
Epoch 0 train loss: 1.8401
Epoch 0 train loss: 1.7741
Epoch 0 train loss: 1.7068
Epoch 0 train loss: 1.6549
Epoch 0 train loss: 1.6069
Epoch 0 train loss: 1.5791
Epoch 0 train loss: 1.5426
Epoch 0 train loss: 1.5080
Epoch 0 train loss: 1.4747
Epoch 0 train loss: 1.4438
Epoch 0 train loss: 1.4194
Epoch 0 train loss: 1.3926
Epoch 0 train loss: 1.3683
Epoch 0 train loss: 1.3463
Epoch 0 train loss: 1.3225
Epoch 0 train loss: 1.3013
Epoch 0 train loss: 1.2799
Epoch 0 train loss: 1.2593
Epoch 0 train loss: 1.2398
Epoch 0 train loss: 1.2206
Epoch 0 train loss: 1.2042
Epoch 0 train loss: 1.1881
Epoch 0 train loss: 1.1724
Epoch 0 train loss: 1.1590
Epoch 0 train loss: 1.1482
Epoch 0 train loss: 1.1345
Epoch 0 train loss: 1.1230
Epoch 0 train loss: 1.1114
Epoch 0 train loss: 1.0991
Epoch 0 train loss: 1.0887
Epoch 0 train loss: 1.0779
Epoch 0 train loss: 1.0701
Epoch 0 train loss: 1.0610
Epoch 0 train loss: 1.0527
Epoch 0 train loss: 1.0448
E

### Explain how you will evaluate any challenges
Potential challenges include our scaling methodology causing binary features to take negative values. We will evaluate this challenge by normalizing our data to values between 0 and 1, and also by converting continuous features to binary while leaving our binary features untransformed. With these changes, we will see which has the best effect on our model performance.

An additional challenge could be training the neural network model for too many epochs. This could cause extended computation times for our model, so we will adjust for this by starting with a lower number of epochs as we fine-tune other model hyperparameters.

We may encounter overfitting, so we will use our test data to check that test accuracy improves at a similar rate to training accuracy. We can also evaluate whether changing the ratio of training to test data from our original dataset helps reduce overfitting.

Lastly, differing class distributions between the training and test set could cause poor model performance, so stratifying the dataset to ensure class distribution is identical within both training and test sets will adjust for this.


### Briefly describe what you still plan to do
Our team plans to also test a K Nearest Neighbors model to understand which model type, Neural Network or KNN, best predicts forest covers. As time permits, we will test additional models as well.

In order to improve both our KNN and Neural Network models, our team will leverage different feature engineering techniques and fine tune model hyperparameters by:
- Scaling our data by subtracting the mean and dividing by the standard deviation to ensure our features are on a comparable scale
- Normalizing our data to ensure all data points are between 0 and 1
- Converting continuous features to be binary to ensure all features (binary and continuous) are treated equally by the model
- Adding additional layers to our neural network, adjusting the learning rate or epochs, using softmax or sigmoid, changing the loss function or optimizer, and changing other neural network hyperparameters
- Adjusting the K value, smoothing factor/ alpha, and other KNN hyperparameters

Lastly, our team will use our metrics of evaluation to determine the best model, hyperparameters, and feature engineering techniques to predict forest cover types.
