In [1]:
import pandas as pd
import numpy as np
import torch

from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler

from torch import nn
from torch.utils.data import TensorDataset, DataLoader

import os

In [2]:
# Root directory of the source data. It is listed for *.csv files by
# load_source_data() and is also used as the prefix for the cached dataloader
# and saved model paths further down. Exactly one assignment should be active.
base_dir = '../mnt_emission_data' # Need to have the data as CSV files in this path
#base_dir = '../data' # Need to have the data as CSV files in this path
#base_dir = '../data/mini' # Need to have the data as CSV files in this path

In [3]:
def load_source_data(path):
    """Load every CSV file found directly in ``path`` and split off the target.

    Files are read in sorted file-name order and concatenated into one frame.
    Rows whose ``co2_total`` value is missing are dropped, and the remaining
    rows are re-indexed 0..n-1 so that ``X`` and ``y`` share a unique index.

    Args:
        path: Directory containing the ``*.csv`` source files.

    Returns:
        Tuple ``(X, y)``: ``y`` is the ``co2_total`` column, ``X`` holds every
        other column.

    Raises:
        ValueError: If ``path`` contains no ``.csv`` files.
    """
    csv_files = sorted(name for name in os.listdir(path) if name.endswith(".csv"))
    if not csv_files:
        # pd.concat would otherwise fail with an unhelpful
        # "No objects to concatenate" message.
        raise ValueError(f"No .csv files found in {path!r}")

    # ignore_index: every file brings its own 0..k index; keeping those would
    # leave duplicate index labels in the combined frame.
    data = pd.concat(
        (pd.read_csv(os.path.join(path, name)) for name in csv_files),
        ignore_index=True,
    )

    # Rows without a target value cannot be used for training or evaluation.
    data = data[data["co2_total"].notna()].reset_index(drop=True)

    y = data["co2_total"].copy()
    X = data.drop("co2_total", axis=1)

    return X, y

In [4]:
def preprocess(X):
    """One-hot encode the categorical columns of ``X`` and min-max scale the result.

    Steps, in order:
      1. drop the ``label`` and ``unspsc_code`` columns;
      2. cast each known categorical column to a ``CategoricalDtype`` with an
         explicit category list, so the set of dummy columns does not depend
         on which values happen to occur in this batch of data;
      3. expand the categoricals with ``pd.get_dummies``;
      4. replace remaining missing values with 0;
      5. fit a new ``MinMaxScaler`` on the frame and transform it.

    Args:
        X: Feature frame as returned by ``load_source_data``.

    Returns:
        A new DataFrame of scaled features with the dummy-expanded column
        names and a fresh default index.
    """
    # Drop empty features (dataset v. 1.0.0): unspsc_code, label
    X = X.drop(["label", "unspsc_code"], axis=1)

    # Unordered categories. The values are listed explicitly to support use
    # cases where some values are absent from a batch of source data.
    unordered_categories = {
        "brand": ["b0", "b1", "b10", "b100", "b101", "b102", "b103", "b104", "b105", "b106", "b107", "b108", "b109", "b11", "b110", "b111", "b112", "b113", "b114", "b115", "b116", "b117", "b118", "b119", "b12", "b120", "b121", "b122", "b123", "b124", "b125", "b126", "b127", "b128", "b129", "b13", "b130", "b131", "b132", "b133", "b134", "b135", "b136", "b137", "b138", "b139", "b14", "b140", "b141", "b142", "b143", "b144", "b145", "b146", "b147", "b148", "b149", "b15", "b16", "b17", "b18", "b19", "b2", "b20", "b21", "b22", "b23", "b24", "b25", "b26", "b27", "b28", "b29", "b3", "b30", "b31", "b32", "b33", "b34", "b35", "b36", "b37", "b38", "b39", "b4", "b40", "b41", "b42", "b43", "b44", "b45", "b46", "b47", "b48", "b49", "b5", "b50", "b51", "b52", "b53", "b54", "b55", "b56", "b57", "b58", "b59", "b6", "b60", "b61", "b62", "b63", "b64", "b65", "b66", "b67", "b68", "b69", "b7", "b70", "b71", "b72", "b73", "b74", "b75", "b76", "b77", "b78", "b79", "b8", "b80", "b81", "b82", "b83", "b84", "b85", "b86", "b87", "b88", "b89", "b9", "b90", "b91", "b92", "b93", "b94", "b95", "b96", "b97", "b98", "b99"],
        "category-1": ["baby", "clothing", "home", "kidswear", "menswear", "womenswear"],
        "category-2": ["home", "footwear", "nightwear", "thermals", "outerwear", "accessory", "uniform", "suit", "swimwear", "headgear", "sportswear", "costume", "clothing", "undergarments", "baby", "dress", "beachwear", "men-undergarments", "hosiery", "women-beachwear", "women-undergarments", "women-sportswear"],
        "category-3": ["backpack", "bikin", "body", "boxer-brief", "bra", "brief", "briefs", "cap", "coats", "costume", "curtain", "dress", "evening-dress", "fancy-dress", "flat-cap", "gloves", "hat", "hoodie", "jacket", "jean-shorts", "jeans", "jersey", "knit-cap", "knitwear", "long-sleeved-top", "mat", "overalls", "panties", "pants", "pillow", "pyjama", "scarf", "sheets", "shorts", "skirts", "snow-suit", "socks", "sport-bra", "stockings", "swimsuit", "T-shirt", "tie", "tights", "top", "towel", "trousers", "underpants", "wedding-dress"],
        "colour": ["Ivory", "amber", "aquamarine", "black", "blue", "blue gray", "bondi blue", "brown", "colourful", "dark green", "dark grey", "gold", "golden", "gray", "green", "grey", "indigo", "light brown", "light grey", "lime", "maroon", "metal", "mosaic", "mustard", "natural", "navy", "neon", "orange", "peach", "pink", "purple", "red", "silver", "teal", "turquoise", "unbleached", "unknown", "violet", "wheat", "white", "yellow"],
        "fabric_type": ["K", "W"],
        "gender": ["B", "G", "K", "M", "U", "Y", "W"],
        "made_in": ["AU", "BD", "BE", "BG", "BR", "CN", "CO", "CY", "DE", "DK", "EG", "ES", "FI", "FR", "GB", "GE", "GR", "HK", "IE", "IN", "IT", "JP", "KR", "LT", "LV", "ML", "MX", "PK", "RO", "SE", "TH", "TR", "TW", "US", "VE", "VN"],
        "season": ["AYR", "MID", "SUM", "WIN"],
    }
    for column, values in unordered_categories.items():
        X[column] = X[column].astype(CategoricalDtype(categories=values, ordered=False))

    # Size is the only ordered categorical.
    X["size"] = X["size"].astype(
        CategoricalDtype(categories=["XS", "S", "M", "L", "XL", "XXL"], ordered=True)
    )

    # One-hot encode the categoricals, then fill in 0 for NA in ftp_ columns.
    encoded = pd.get_dummies(X).fillna(0)

    # Scale every column with a scaler fitted on this frame.
    scaled_values = MinMaxScaler().fit_transform(encoded)
    return pd.DataFrame(scaled_values, columns=encoded.columns)


In [5]:
def get_dataloader(path, bs=256, test_size=0.2):
    """Build the train and test ``DataLoader`` objects from the CSVs in ``path``.

    The data is read with ``load_source_data``, encoded and scaled with
    ``preprocess``, converted to float32 arrays and split with
    ``train_test_split`` using a fixed ``random_state=42``.

    Args:
        path: Directory holding the source CSV files.
        bs: Batch size used by both loaders.
        test_size: Forwarded to ``train_test_split``.

    Returns:
        Tuple ``(train_dataloader, test_dataloader)``. Only the training loader
        shuffles; the test loader keeps a fixed order so that repeated
        evaluations see identical batches.
    """
    X, y = load_source_data(path)
    X = preprocess(X).to_numpy(dtype='float32')
    # Targets become a column vector, one row per sample.
    y = y.to_numpy(dtype='float32').reshape((-1, 1))

    # Short diagnostics only: dumping the raw arrays floods the cell output.
    print('X dtype', X.dtype)
    print('X shape', X.shape)
    print('y dtype', y.dtype)
    print('Y shape after reshape', y.shape)
    print('NaN in X indices', np.where(np.isnan(X)))
    print('NaN in y indices', np.where(np.isnan(y)))

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    train_dataloader = DataLoader(
        TensorDataset(
            torch.tensor(X_train, dtype=torch.float),
            torch.tensor(y_train, dtype=torch.float)),
        shuffle=True,
        batch_size=bs)

    # No shuffling for evaluation data: order has no benefit there and a fixed
    # order keeps the reported metrics reproducible.
    test_dataloader = DataLoader(
        TensorDataset(
            torch.tensor(X_test, dtype=torch.float),
            torch.tensor(y_test, dtype=torch.float)),
        shuffle=False,
        batch_size=bs)

    return train_dataloader, test_dataloader

In [6]:
def evaluate(dataloader, model, criterion, device):
    """Compute the mean loss, RMSE and R2 of ``model`` over all of ``dataloader``.

    RMSE and R2 are computed once over every sample seen, not averaged per
    batch, so a short final batch does not skew them and a one-sample batch
    cannot make R2 undefined. The metrics are computed with torch directly,
    which avoids the ``squared=False`` keyword of ``mean_squared_error`` that
    newer scikit-learn releases no longer accept.

    Args:
        dataloader: Iterable yielding ``(X, y)`` tensor pairs with the sample
            dimension first.
        model: Module to evaluate. It is switched to eval mode for the
            duration of the call and restored to its previous mode afterwards.
        criterion: Loss callable returning a scalar tensor per batch.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Tuple ``(loss, rmse, r2)`` of 0-dim tensors. ``loss`` is the
        sample-weighted mean of the per-batch losses. With several output
        columns, RMSE and R2 are computed per column and averaged. All three
        are NaN when the dataloader yields no samples.
    """
    was_training = model.training
    model.eval()

    weighted_loss_sum = 0.0
    n_samples = 0
    predictions = []
    targets = []

    try:
        with torch.no_grad():
            for X, y in dataloader:
                X = X.to(device)
                y = y.to(device)
                y_pred = model(X)

                batch_len = y.shape[0]
                # .item() moves the scalar to the host so no device tensors
                # accumulate over the loop.
                weighted_loss_sum += criterion(y_pred, y).item() * batch_len
                n_samples += batch_len

                predictions.append(y_pred.cpu())
                targets.append(y.cpu())
    finally:
        # Restore the caller's mode instead of unconditionally forcing train().
        model.train(was_training)

    if n_samples == 0:
        return (torch.tensor(float('nan')),
                torch.tensor(float('nan')),
                torch.tensor(float('nan')))

    # float64 keeps the sums of squares accurate for large datasets.
    y_true = torch.cat(targets).double().reshape(n_samples, -1)
    y_hat = torch.cat(predictions).double().reshape(n_samples, -1)

    squared_error = (y_true - y_hat) ** 2
    rmse = squared_error.mean(dim=0).sqrt().mean()

    ss_res = squared_error.sum(dim=0)
    ss_tot = ((y_true - y_true.mean(dim=0)) ** 2).sum(dim=0)
    # Constant target column: report 1 for a perfect fit and 0 otherwise
    # instead of dividing by zero.
    constant_target_r2 = torch.where(
        ss_res == 0, torch.ones_like(ss_res), torch.zeros_like(ss_res))
    r2 = torch.where(ss_tot > 0, 1.0 - ss_res / ss_tot, constant_target_r2).mean()

    return torch.tensor(weighted_loss_sum / n_samples), rmse, r2

In [85]:
def train(train_dataloader, test_dataloader, model, optimizer, criterion, device, n_epochs=1, verbose=True):
    """Train ``model`` for ``n_epochs`` passes over ``train_dataloader``.

    Args:
        train_dataloader: Iterable of ``(X, y)`` batches used for the
            optimisation steps.
        test_dataloader: Iterable of ``(X, y)`` batches used only for the
            per-epoch report when ``verbose`` is true.
        model: Module being optimised; it is put into train mode.
        optimizer: Optimiser already bound to ``model``'s parameters.
        criterion: Loss callable taking ``(y_pred, y)``.
        device: Device every batch is moved to.
        n_epochs: Number of passes over the training data.
        verbose: When true, print a header and, after every epoch, one row
            with loss, RMSE and R2 from ``evaluate`` on both loaders.

    Returns:
        None. The model and optimiser are updated in place.
    """
    model.train()

    # Header and rows share the same column widths so the table lines up.
    header_fmt = '{:<5} {:<12} {:<12} {:<10} {:<9} {:<9} {:<9}'
    row_fmt = '{:<5} {:<12.2f} {:<12.2f} {:<10.2f} {:<9.2f} {:<9.2f} {:<9.2f}'

    if verbose:
        print(header_fmt.format('Epoch', 'Train loss', 'Test loss', 'Train RMSE', 'Train R2', 'Test RMSE', 'Test R2'))

    for epoch in range(n_epochs):

        for X, y in train_dataloader:
            X = X.to(device)
            y = y.to(device)

            optimizer.zero_grad()

            y_pred = model(X)
            loss = criterion(y_pred, y)
            loss.backward()
            optimizer.step()

        if verbose:
            train_loss, train_rmse_score, train_r2_score = evaluate(train_dataloader, model, criterion, device)
            test_loss, test_rmse_score, test_r2_score = evaluate(test_dataloader, model, criterion, device)

            # float() turns the 0-dim tensors from evaluate() into plain
            # numbers for the float format specs.
            print(row_fmt.format(
                epoch,
                float(train_loss), float(test_loss),
                float(train_rmse_score), float(train_r2_score),
                float(test_rmse_score), float(test_r2_score)))

In [4]:
class OneLayerModel(nn.Module):
    """Feed-forward network with one hidden layer.

    Computes ``Linear(n_input, n_hidden1) -> ReLU -> Linear(n_hidden1, n_output)``.
    ``bs`` is only stored on the instance; the forward pass does not read it.
    """

    def __init__(self, n_input, n_hidden1, n_output, bs):
        super().__init__()
        self.bs = bs
        # The attribute names are also the state_dict key prefixes. Despite its
        # name, ``hidden1`` is the layer that produces the network output.
        self.input_layer = nn.Linear(in_features=n_input, out_features=n_hidden1)
        self.hidden1 = nn.Linear(in_features=n_hidden1, out_features=n_output)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        activated = self.relu(self.input_layer(x))
        return self.hidden1(activated)

In [5]:
class TwoLayerModel(nn.Module):
    """Feed-forward network with two hidden layers.

    Computes ``Linear(n_input, n_hidden1) -> ReLU -> Linear(n_hidden1, n_hidden2)
    -> ReLU -> Linear(n_hidden2, n_output)``. ``bs`` is only stored on the
    instance; the forward pass does not read it.
    """

    def __init__(self, n_input, n_hidden1, n_hidden2, n_output, bs):
        super().__init__()
        self.bs = bs
        # The attribute names are also the state_dict key prefixes. Despite its
        # name, ``hidden2`` is the layer that produces the network output.
        self.input_layer = nn.Linear(in_features=n_input, out_features=n_hidden1)
        self.hidden1 = nn.Linear(in_features=n_hidden1, out_features=n_hidden2)
        self.hidden2 = nn.Linear(in_features=n_hidden2, out_features=n_output)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the network output for the input batch ``x``."""
        first = self.relu(self.input_layer(x))
        second = self.relu(self.hidden1(first))
        return self.hidden2(second)

In [86]:
# Learning rate handed to the SGD optimiser created in the setup cell below.
lr = 0.01
# Number of passes over the training data made by train().
n_epochs = 1

In [87]:
# Set to False to force CPU execution even when a GPU is present.
use_cuda = True

# Fall back to the CPU when CUDA is requested but not available, so the
# notebook also runs on machines without a GPU.
device = torch.device('cuda' if use_cuda and torch.cuda.is_available() else 'cpu')

bs = 1000
hidden_neurons = 1024

# 334 is the input width; it has to equal the number of feature columns that
# preprocess() produces (the batches printed in this notebook are [1000, 334]).
model = OneLayerModel(334, hidden_neurons, 1, bs).to(device)
#model = TwoLayerModel(334, 64, 32, 1, bs).to(device)

criterion = nn.MSELoss(reduction='mean')
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)

In [88]:
# When True, reuse the dataloaders cached under base_dir instead of rebuilding
# them from the CSV files.
read_dataloaders_from_disk = True
train_dataloader_path = f'{base_dir}/nn-model_train_dataloader.dat'
test_dataloader_path = f'{base_dir}/nn-model_test_dataloader.dat'

# Only trust the cache when both files are present; otherwise fall through to
# the rebuild branch instead of failing inside torch.load.
dataloaders_cached = (os.path.exists(train_dataloader_path)
                      and os.path.exists(test_dataloader_path))

if read_dataloaders_from_disk and dataloaders_cached:
    # torch.load unpickles arbitrary objects: only load files you created.
    # NOTE(review): recent PyTorch releases default torch.load to
    # weights_only=True, which rejects pickled DataLoader objects - confirm
    # against the installed version.
    train_dataloader = torch.load(train_dataloader_path)
    test_dataloader = torch.load(test_dataloader_path)
else:
    train_dataloader, test_dataloader = get_dataloader(base_dir, bs, test_size=0.2)
    torch.save(train_dataloader, train_dataloader_path)
    torch.save(test_dataloader, test_dataloader_path)

In [89]:
# Run the training loop. verbose is left at its default (True), so train()
# prints a header and one metrics row per epoch.
train(train_dataloader, test_dataloader, model, optimizer, criterion, device, n_epochs)

Epoch Train loss   Valid loss   Train RMSE Train R2  Test RMSE Test R2  
<built-in method type of Tensor object at 0x000001EB36AC4500>
torch.Size([1000, 334])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0435, 0.0400, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.1594,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.0725, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.1014, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB36AC4500>
torch.Size([1000, 334])
tensor([[0.0435, 0.8900, 0.0290,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.5100, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1884, 0.0000, 0.0580,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0200, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0290,  ..., 0.0000, 0.0000, 0.0000],


<built-in method type of Tensor object at 0x000001EB2A4C3840>
torch.Size([1000, 334])
tensor([[0.1304, 0.0000, 0.0435,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.0300, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0145,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0900, 0.0290,  ..., 0.0000, 0.0000, 0.0000],
        [0.0290, 0.0000, 0.0145,  ..., 1.0000, 0.0000, 0.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB308B4B80>
torch.Size([1000, 334])
tensor([[0.0870, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0870, 0.0100, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0580,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0900, 0.1449,  ..., 0.0000, 0.0000, 0.0000],
        [0.1449, 0.0000, 0.1304,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0800, 0.0000,  ..., 1.0000, 0.0000, 0.0000]],
       d

<built-in method type of Tensor object at 0x000001EB2A4C3840>
torch.Size([1000, 334])
tensor([[0.0000, 0.0300, 0.1304,  ..., 1.0000, 0.0000, 0.0000],
        [0.0725, 0.0400, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.1594, 0.0900, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.1304, 0.0000, 0.0145,  ..., 0.0000, 0.0000, 1.0000],
        [0.0870, 0.0000, 0.1159,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0300, 0.0000,  ..., 0.0000, 1.0000, 0.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB308B4B80>
torch.Size([1000, 334])
tensor([[0.1594, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.1594,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.8700, 0.0290,  ..., 0.0000, 0.0000, 0.0000],
        [0.1739, 0.0000, 0.0290,  ..., 0.0000, 0.0000, 1.0000],
        [0.0580, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       d

<built-in method type of Tensor object at 0x000001EB2A4C3840>
torch.Size([1000, 334])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1449, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.5300, 0.1739,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0290, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0290, 0.6000, 0.1159,  ..., 0.0000, 1.0000, 0.0000],
        [0.0000, 0.0700, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB308B4B80>
torch.Size([1000, 334])
tensor([[0.0000, 0.0000, 0.0435,  ..., 0.0000, 0.0000, 0.0000],
        [0.2754, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.6900, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0290, 0.0100, 0.0145,  ..., 0.0000, 1.0000, 0.0000],
        [0.0145, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.2899, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000]],
       d

tensor([[0.0725, 0.0000, 0.0435,  ..., 0.0000, 0.0000, 0.0000],
        [0.2464, 0.0000, 0.0290,  ..., 1.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        ...,
        [0.2029, 0.3400, 0.2029,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0290, 0.0500, 0.0000,  ..., 0.0000, 1.0000, 0.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB308B4B80>
torch.Size([1000, 334])
tensor([[0.1159, 0.0200, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0870, 0.0000, 0.0000,  ..., 1.0000, 0.0000, 0.0000],
        [0.1594, 0.0000, 0.1449,  ..., 0.0000, 1.0000, 0.0000],
        ...,
        [0.1304, 0.0000, 0.1594,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0000, 0.7900, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB2A4C3840>
torch.Si

<built-in method type of Tensor object at 0x000001EB308B4B80>
torch.Size([1000, 334])
tensor([[0.0725, 0.0000, 0.0725,  ..., 0.0000, 0.0000, 1.0000],
        [0.0145, 0.0000, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0145, 0.0000, 0.2319,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.0290, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.6700, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0870, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB2A4C3840>
torch.Size([1000, 334])
tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.0400, 0.0145,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.2174, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0145, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0290, 0.0300, 0.1449,  ..., 0.0000, 0.0000, 0.0000]],
       d

tensor([[0.0145, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.7900, 0.1449,  ..., 0.0000, 1.0000, 0.0000],
        [0.0290, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0725, 0.7200, 0.0435,  ..., 0.0000, 0.0000, 0.0000],
        [0.0580, 0.0000, 0.1304,  ..., 1.0000, 0.0000, 0.0000],
        [0.0580, 0.1100, 0.1594,  ..., 0.0000, 0.0000, 1.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB2A4C3840>
torch.Size([1000, 334])
tensor([[0.0725, 0.8100, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.6900, 0.0000,  ..., 0.0000, 1.0000, 0.0000],
        [0.0435, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        ...,
        [0.0000, 0.0000, 0.0580,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0100, 0.0870,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB308B4B80>
torch.Si

tensor([[0.1014, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0290, 0.0300, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0145,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 1.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0870, 0.0000, 0.2319,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB2A4C3840>
torch.Size([1000, 334])
tensor([[0.1884, 0.0000, 0.0870,  ..., 0.0000, 0.0000, 1.0000],
        [0.1449, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 1.0000],
        [0.0000, 0.5900, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.1449,  ..., 0.0000, 0.0000, 0.0000],
        [0.0580, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.2029,  ..., 0.0000, 0.0000, 0.0000]],
       device='cuda:0')
<built-in method type of Tensor object at 0x000001EB308B4B80>
torch.Si

KeyboardInterrupt: 

In [60]:
# File used by the save and load cells below. The name encodes the
# architecture, the hidden layer width and the batch size.
# NOTE(review): the "epochs_10" suffix is hard-coded while n_epochs is set to 1
# in the hyper-parameter cell - confirm the name matches the run being saved.
model_path = f'{base_dir}/neural-network_OneLayer_hidden_{hidden_neurons}_bs_{bs}_epochs_10.dat'
#model_path = f'{base_dir}/neural-network_TwoLayer_bs_{bs}_epochs_5.dat'

#model_path = f'{base_dir}/neural-network_OneLayer_bs_{bs}_epochs_10.dat'

In [61]:
#Save model to disk
# Toggle: write the current model to model_path.
# torch.save receives the module object itself, so the whole model is pickled
# rather than only its state_dict.
save_model_to_disk = True
if save_model_to_disk:
    torch.save(model, model_path)

In [47]:
#Load model from disk
# Toggle: replace the in-memory model with the one stored at model_path.
# torch.load unpickles the file, so the model classes defined in this notebook
# must already be defined when this cell runs, and the file must be trusted.
load_model_from_disk = True
if load_model_from_disk:
    model = torch.load(model_path)