In [1]:
import pandas as pd
import numpy as np
import torch

from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

from torch import nn
from torch.utils.data import TensorDataset, DataLoader

import os
import copy

In [2]:
base_dir = '../mnt_emission_data' # Need to have the data as CSV files in this path
#base_dir = '../data' # Need to have the data as CSV files in this path
#base_dir = '../data/mini' # Need to have the data as CSV files in this path

In [3]:
def load_source_data(path):
    content = sorted(filter(lambda x: x.endswith(".csv"), os.listdir(path)))
    X = pd.concat((pd.read_csv(f'{path}/{f}') for f in content))
    X = X[~X['co2_total'].isna()]

    # Use only 100000 samples to test
    #X = X[:100000]
    
    y = X['co2_total'].copy()
    X = X.drop('co2_total', axis=1)

    return X, y

In [4]:
class OneLayerModelRobust(nn.Module):
    def __init__(self, n_input, n_hidden1, n_output, bs, p):
        super().__init__()
        self.bs = bs
        self.drop_layer = nn.Dropout(p=p)
        self.input_layer = nn.Linear(n_input, n_hidden1)
        self.hidden1 = nn.Linear(n_hidden1, n_output)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.drop_layer(x)
        x = self.input_layer(x)
        x = self.relu(x)
        x = self.hidden1(x)
        return x

In [5]:
class TwoLayerModel(nn.Module):
    def __init__(self, n_input, n_hidden1, n_hidden2, n_output, bs):
        super().__init__()
        self.bs = bs
        self.input_layer = nn.Linear(n_input, n_hidden1)
        self.hidden1 = nn.Linear(n_hidden1, n_hidden2)
        self.hidden2 = nn.Linear(n_hidden2, n_output)
        self.relu = nn.ReLU()
        
    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu(x)
        x = self.hidden1(x)
        x = self.relu(x)
        x = self.hidden2(x)
        return x

In [6]:
class NeuralNetworkOneLayerFFRobust:
    """
    A feedforward neural network model with one hidden layer that uses feature dropout during training. 
    The number of neurons in the hidden layer can be given as a parameter to the constructor (default 1024).
    The model will train until 5 epochs have passed without the test RMSE improving. 
    """
    def __init__(self, hidden_neurons=1024, droprate=0.2):
        self.hidden_neurons = hidden_neurons
        self.droprate = droprate
        self.__set_filename()
        self.model = None
    
    def __set_filename(self):
        self.filename = f"neural_onelayer_robust-hidden_{self.hidden_neurons}.model"

    #def preprocess(self, X):
    def __preprocess(self, X):
        # Drop empty features (dataset v. 1.0.0): unspsc_code, label 
        X = X.drop(["label", "unspsc_code"], axis=1)

        # Use unordered caterogies for several columns. List category values to support use cases when some
        # values are absent from a batch of source data.
        brand_types = CategoricalDtype(categories=["b0", "b1", "b10", "b100", "b101", "b102", "b103", "b104", "b105", "b106", "b107", "b108", "b109", "b11", "b110", "b111", "b112", "b113", "b114", "b115", "b116", "b117", "b118", "b119", "b12", "b120", "b121", "b122", "b123", "b124", "b125", "b126", "b127", "b128", "b129", "b13", "b130", "b131", "b132", "b133", "b134", "b135", "b136", "b137", "b138", "b139", "b14", "b140", "b141", "b142", "b143", "b144", "b145", "b146", "b147", "b148", "b149", "b15", "b16", "b17", "b18", "b19", "b2", "b20", "b21", "b22", "b23", "b24", "b25", "b26", "b27", "b28", "b29", "b3", "b30", "b31", "b32", "b33", "b34", "b35", "b36", "b37", "b38", "b39", "b4", "b40", "b41", "b42", "b43", "b44", "b45", "b46", "b47", "b48", "b49", "b5", "b50", "b51", "b52", "b53", "b54", "b55", "b56", "b57", "b58", "b59", "b6", "b60", "b61", "b62", "b63", "b64", "b65", "b66", "b67", "b68", "b69", "b7", "b70", "b71", "b72", "b73", "b74", "b75", "b76", "b77", "b78", "b79", "b8", "b80", "b81", "b82", "b83", "b84", "b85", "b86", "b87", "b88", "b89", "b9", "b90", "b91", "b92", "b93", "b94", "b95", "b96", "b97", "b98", "b99"], ordered=False)
        X["brand"] = X["brand"].astype(brand_types)
        cat1_types =  CategoricalDtype(categories=["baby", "clothing", "home", "kidswear", "menswear", "womenswear"], ordered=False)
        X["category-1"] = X["category-1"].astype(cat1_types)
        cat2_types = CategoricalDtype(categories=["home", "footwear", "nightwear", "thermals", "outerwear", "accessory", "uniform", "suit", "swimwear", "headgear", "sportswear", "costume", "clothing", "undergarments", "baby", "dress", "beachwear", "men-undergarments", "hosiery", "women-beachwear", "women-undergarments", "women-sportswear"], ordered=False)
        X["category-2"] = X["category-2"].astype(cat2_types)
        cat3_types = CategoricalDtype(categories=["backpack", "bikin", "body", "boxer-brief", "bra", "brief", "briefs", "cap", "coats", "costume", "curtain", "dress", "evening-dress", "fancy-dress", "flat-cap", "gloves", "hat", "hoodie", "jacket", "jean-shorts", "jeans", "jersey", "knit-cap", "knitwear", "long-sleeved-top", "mat", "overalls", "panties", "pants", "pillow", "pyjama", "scarf", "sheets", "shorts", "skirts", "snow-suit", "socks", "sport-bra", "stockings", "swimsuit", "T-shirt", "tie", "tights", "top", "towel", "trousers", "underpants", "wedding-dress"], ordered=False)
        X["category-3"] = X["category-3"].astype(cat3_types)
        colour_types = CategoricalDtype(categories=["Ivory", "amber", "aquamarine", "black", "blue", "blue gray", "bondi blue", "brown", "colourful", "dark green", "dark grey", "gold", "golden", "gray", "green", "grey", "indigo", "light brown", "light grey", "lime", "maroon", "metal", "mosaic", "mustard", "natural", "navy", "neon", "orange", "peach", "pink", "purple", "red", "silver", "teal", "turquoise", "unbleached", "unknown", "violet", "wheat", "white", "yellow"], ordered=False)
        X["colour"] = X["colour"].astype(colour_types)
        fabric_type_types = CategoricalDtype(categories=["K", "W"], ordered=False)
        X["fabric_type"] = X["fabric_type"].astype(fabric_type_types)
        gender_types = CategoricalDtype(categories=["B", "G", "K", "M", "U", "Y", "W"], ordered=False)
        X["gender"] = X["gender"].astype(gender_types)
        made_in_types = CategoricalDtype(categories=["AU", "BD", "BE", "BG", "BR", "CN", "CO", "CY", "DE", "DK", "EG", "ES", "FI", "FR", "GB", "GE", "GR", "HK", "IE", "IN", "IT", "JP", "KR", "LT", "LV", "ML", "MX", "PK", "RO", "SE", "TH", "TR", "TW", "US", "VE", "VN"], ordered=False)
        X["made_in"] = X["made_in"].astype(made_in_types)
        season_types = CategoricalDtype(categories=["AYR", "MID", "SUM", "WIN"], ordered=False)
        X["season"] = X["season"].astype(season_types)

        # Use ordered categories for size
        size_type = CategoricalDtype(categories=["XS", "S", "M", "L", "XL", "XXL"], ordered=True)
        X["size"] = X["size"].astype(size_type)

        # Convert the categoricals into a one-hot vector of binary variables
        X = pd.get_dummies(X)
        #print(X)

        # Fill in 0 for NA in ftp_ columns
        X = X.fillna(0)
        #print(X)

        scaler = MinMaxScaler()
        X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
        #print(X_scaled)

        return X_scaled

    def __save_model(self, base_dir):
        print(f"Saving neural network one hidden layer model to disk at {base_dir}/{self.filename}")
        torch.save(self.model, f"{base_dir}/{self.filename}")

    def __get_dataloader(self, X, y, bs=1000, test_size=0.2):
        X = self.__preprocess(X)
        X = X.to_numpy(dtype='float32')
        y = y.to_numpy(dtype='float32')
        
        y = y.reshape((-1,1))
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

        train_dataloader = DataLoader(TensorDataset(
            torch.tensor(X_train, dtype=torch.float),
            torch.tensor(y_train, dtype=torch.float)),
            shuffle=True,
            batch_size=bs)

        test_dataloader = DataLoader(TensorDataset(
            torch.tensor(X_test, dtype=torch.float),
            torch.tensor(y_test, dtype=torch.float)),
            shuffle=True,
            batch_size=bs)      

        return train_dataloader, test_dataloader
    
    def __evaluate(self, dataloader, model, criterion, device):
        model.eval()

        rmse_scores = []
        r2_scores = []
        losses = []

        with torch.no_grad():
            for batch in dataloader:
                X, y = batch
                X = X.to(device)
                y = y.to(device)
                y_pred = model(X)

                loss = criterion(y_pred, y)
                losses.append(loss)

                with torch.no_grad():
                    y_pred = y_pred.cpu()
                    y = y.cpu()
                    s_rmse = mean_squared_error(y, y_pred, squared=False)
                    s_r2 = r2_score(y, y_pred)

                    rmse_scores.append(s_rmse)
                    r2_scores.append(s_r2)

        model.train()

        return torch.mean(torch.tensor(losses)), torch.mean(torch.tensor(rmse_scores)), torch.mean(torch.tensor(r2_scores))

    def __train(self, train_dataloader, test_dataloader, model, optimizer, criterion, device):
        model.train()

        best_model = None
        best_test_rmse_score = None
        best_test_r2_score = None
                
        fmt = '{:<5} {:12} {:12} {:<9} {:<9} {:<9} {:<9}'
        print(fmt.format('Epoch', 'Train loss', 'Valid loss', 'Train RMSE', 'Train R2', 'Test RMSE', 'Test R2'))

        epoch = 0
        best_score_epoch = 0
        while (epoch - best_score_epoch < 4):
            epoch = epoch + 1
        
            for i, batch in enumerate(train_dataloader):
                X, y = batch

                X = X.to(device)
                y = y.to(device)

                optimizer.zero_grad()
                y_pred = model(X)
                loss = criterion(y_pred, y)
                loss.backward()
                optimizer.step()

            
            train_loss, train_rmse_score, train_r2_score = self.__evaluate(train_dataloader, model, criterion, device)
            test_loss, test_rmse_score, test_r2_score = self.__evaluate(test_dataloader, model, criterion, device)

            fmt = '{:<5} {:03.2f} {:03.2f} {:02.2f} {:02.2f} {:02.2f} {:02.2f}'
            print(fmt.format(epoch, train_loss, test_loss, train_rmse_score, train_r2_score, test_rmse_score, test_r2_score))
        
            if ((best_test_rmse_score == None) or (test_rmse_score < best_test_rmse_score)):
                best_test_rmse_score = test_rmse_score
                best_test_r2_score = test_r2_score
                best_score_epoch = epoch
                best_model = copy.deepcopy(model)
            
        print(f"Neural network one hidden layer robust model trained in {best_score_epoch} epochs with stats RMSE = {best_test_rmse_score}, R2 = {best_test_r2_score}")

        best_model.eval()
        return best_model, best_test_r2_score
    
    def __select_device(self):
        if torch.cuda.is_available():
            device = torch.device('cuda')
            print("Using GPU!")
        else:
            device = torch.device('cpu')
            print("GPU not available, using CPU.")
        return device
    
    def load(self, base_dir):
        print(f"Loading neural network one hidden layer model from disk at {base_dir}/{self.filename}")
        self.model = torch.load(f"{base_dir}/{self.filename}")
        
    def train(self, X, y, base_dir=None):
        device = self.__select_device()
        lr = 0.01 # Learning rate
        bs = 1000 # Batch size
        droprate = self.droprate
        hidden_neurons = self.hidden_neurons # Number of hidden layer neurons to use

        print(f"Preparing batches of training data")
        train_dataloader, test_dataloader = self.__get_dataloader(X, y)        
        
        model = OneLayerModelRobust(334, hidden_neurons, 1, bs, p=droprate).to(device)
        
        #optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9) # Stochastic gradient descent
        #optimizer = torch.optim.Adagrad(model.parameters(), lr=lr)
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        
        criterion = nn.MSELoss(reduction='mean')
                
        print(f"Starting training of neural network one hidden layer robust model with {hidden_neurons} hidden layer neurons and batch size {bs}")
        model, _ = self.__train(train_dataloader, test_dataloader, model, optimizer, criterion, device)
        self.model = model
        print(f"Training complete")
        self.__save_model(base_dir)

    def eval(self, X, y):
        print(f"Evaluating neural network one hidden layer robust model with {hidden_neurons} hidden layer neurons and batch size {bs}")
        _, s_r2 = self.__train(X, y)
        return s_r2

    def predict(self, X):
        X = X.drop(["co2_total"], axis=1)
        device = self.__select_device()
        X = self.__preprocess(X)
        X = X.to_numpy(dtype='float32')
        X = torch.tensor(X, dtype=torch.float)
        X = X.to(device)        
        y_pred = self.model(X)
        y_pred = y_pred.detach().cpu().numpy().flatten().tolist()
        
        return y_pred

In [7]:
#model = NeuralNetworkOneLayerFF()
model = NeuralNetworkOneLayerFFRobust(hidden_neurons=1024, droprate=0.2)
X, y = load_source_data(base_dir)

#model.load(base_dir)

model.train(X, y, base_dir)

Using GPU!
Preparing batches of training data
Starting training of neural network one hidden layer robust model with 1024 hidden layer neurons and batch size 1000
Epoch Train loss   Valid loss   Train RMSE Train R2  Test RMSE Test R2  
1     169.60 173.75 12.94 0.78 13.07 0.77
2     163.42 167.60 12.72 0.79 12.88 0.78
3     160.27 164.82 12.58 0.79 12.75 0.79
4     152.84 158.06 12.29 0.80 12.50 0.79
5     148.98 154.39 12.14 0.80 12.35 0.80
6     156.60 162.12 12.44 0.79 12.65 0.79
7     147.44 153.88 12.07 0.81 12.32 0.80
8     153.01 160.21 12.31 0.80 12.58 0.79
9     142.36 148.80 11.87 0.81 12.12 0.81
10    148.77 157.46 12.12 0.80 12.47 0.79
11    151.52 160.49 12.24 0.80 12.58 0.79
12    153.18 162.94 12.31 0.80 12.68 0.79
13    151.66 162.36 12.24 0.80 12.65 0.79
Neural network one hidden layer robust model trained in 9 epochs with stats RMSE = 12.11983871459961, R2 = 0.8061007474138568
Training complete
Saving neural network one hidden layer model to disk at ../mnt_emission_da

In [8]:
nn_model = NeuralNetworkOneLayerFFRobust(hidden_neurons=1024)
nn_model.load(base_dir)
csv_file = "../testdata/test.csv"
X = pd.read_csv(csv_file)
pred = nn_model.predict(X)
pred

Loading neural network one hidden layer model from disk at ../mnt_emission_data/neural_onelayer_robust-hidden_1024.model
Using GPU!


[5.222492694854736,
 1.3370914459228516,
 11.089035987854004,
 0.6225480437278748,
 0.5705983638763428,
 5.210683345794678,
 16.268068313598633,
 3.0861122608184814,
 28.73525619506836,
 232.7754364013672,
 0.6697274446487427,
 2.678424119949341,
 30.004796981811523,
 29.08298110961914,
 16.281856536865234,
 11.206161499023438,
 12.915348052978516,
 11.595699310302734,
 27.123960494995117,
 299.264404296875,
 45.40665817260742,
 5.683190822601318,
 2.650118112564087,
 55.097442626953125,
 16.149085998535156,
 26.642549514770508,
 3.769782066345215,
 18.766408920288086,
 90.03750610351562,
 17.634769439697266,
 4.753632068634033,
 0.7565327882766724,
 5.281561374664307,
 58.60494613647461,
 50.02500534057617,
 176.358154296875,
 20.372486114501953,
 26.04391098022461,
 2.1113805770874023,
 16.879785537719727,
 0.8478763103485107,
 59.38308334350586,
 13.080726623535156,
 3.252758026123047,
 0.8297791481018066,
 25.80149269104004,
 1.2008739709854126,
 32.12127685546875,
 6.5890374183654