In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, preprocessing
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [40]:
print(device)

cuda:0


In [4]:
torch.set_default_device('cuda:0')

In [5]:
from zipfile import ZipFile
import pandas as pd
zf = ZipFile("train_dataset.zip")
train_df = pd.read_csv(zf.open('train_dataset.csv'))

In [6]:
train_df = train_df.drop("Unnamed: 0",axis = 1)

In [7]:
zf = ZipFile("validation_dataset.zip")
val_df = pd.read_csv(zf.open('validation_dataset.csv'))

In [8]:
val_df = val_df.drop("Unnamed: 0",axis = 1)

In [9]:
zf = ZipFile("test_dataset.zip")
test_df = pd.read_csv(zf.open('test_dataset.csv'))

In [10]:
test_df = test_df.drop("Unnamed: 0",axis = 1)

In [11]:
train_df.head()

Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,aisle_id,department_id,add_to_cart_order,reordered
0,1,1,2,8,0.0,196,77,7,1,0
1,1,1,2,8,0.0,14084,91,16,2,0
2,1,1,2,8,0.0,12427,23,19,3,0
3,1,1,2,8,0.0,26088,23,19,4,0
4,1,1,2,8,0.0,26405,54,17,5,0


In [12]:
val_df.head()

Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,aisle_id,department_id,add_to_cart_order,reordered
0,1,11,4,8,14.0,196,77,7,1,1
1,1,11,4,8,14.0,25133,21,16,2,1
2,1,11,4,8,14.0,38928,120,16,3,1
3,1,11,4,8,14.0,26405,54,17,4,1
4,1,11,4,8,14.0,39657,45,19,5,1


In [13]:
test_df = test_df.drop(['order_id','eval_set'],axis=1)
test_df

Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,3,13,5,15,11.0
1,4,6,3,12,30.0
2,6,4,3,16,22.0
3,11,8,6,11,8.0
4,12,6,1,20,30.0
...,...,...,...,...,...
74995,206202,23,2,17,6.0
74996,206204,5,4,14,14.0
74997,206206,68,0,20,0.0
74998,206207,17,2,13,14.0


In [14]:
columns = train_df.columns

In [15]:
columns

Index(['user_id', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order', 'product_id', 'aisle_id', 'department_id',
       'add_to_cart_order', 'reordered'],
      dtype='object')

In [16]:
input_cols = columns[:5]
input_cols

Index(['user_id', 'order_number', 'order_dow', 'order_hour_of_day',
       'days_since_prior_order'],
      dtype='object')

In [17]:
output_cols = columns[[5,8,9]]
output_cols

Index(['product_id', 'add_to_cart_order', 'reordered'], dtype='object')

In [18]:
normalized_train_df = train_df
normalized_train_df[input_cols] = preprocessing.normalize(train_df[input_cols])

In [19]:
normalized_train_df.head()

Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,aisle_id,department_id,add_to_cart_order,reordered
0,0.119523,0.119523,0.239046,0.956183,0.0,196,77,7,1,0
1,0.119523,0.119523,0.239046,0.956183,0.0,14084,91,16,2,0
2,0.119523,0.119523,0.239046,0.956183,0.0,12427,23,19,3,0
3,0.119523,0.119523,0.239046,0.956183,0.0,26088,23,19,4,0
4,0.119523,0.119523,0.239046,0.956183,0.0,26405,54,17,5,0


In [20]:
normalized_val_df = val_df
normalized_val_df[input_cols] = preprocessing.normalize(val_df[input_cols])

In [21]:
normalized_val_df.head()

Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,aisle_id,department_id,add_to_cart_order,reordered
0,0.050125,0.55138,0.200502,0.401004,0.701757,196,77,7,1,1
1,0.050125,0.55138,0.200502,0.401004,0.701757,25133,21,16,2,1
2,0.050125,0.55138,0.200502,0.401004,0.701757,38928,120,16,3,1
3,0.050125,0.55138,0.200502,0.401004,0.701757,26405,54,17,4,1
4,0.050125,0.55138,0.200502,0.401004,0.701757,39657,45,19,5,1


In [22]:
normalized_test_df = test_df
normalized_test_df[input_cols] = preprocessing.normalize(test_df[input_cols])

In [23]:
normalized_test_df.head()

Unnamed: 0,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,0.128037,0.554826,0.213395,0.640184,0.469469
1,0.120331,0.180497,0.090249,0.360994,0.902485
2,0.212,0.141333,0.106,0.565332,0.777332
3,0.545921,0.397033,0.297775,0.545921,0.397033
4,0.31182,0.15591,0.025985,0.5197,0.77955


In [24]:
zf = ZipFile('products.csv.zip')
products_df = pd.read_csv(zf.open('products.csv'))
products_df

Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13
...,...,...,...,...
49683,49684,"Vodka, Triple Distilled, Twist of Vanilla",124,5
49684,49685,En Croute Roast Hazelnut Cranberry,42,1
49685,49686,Artisan Baguette,112,3
49686,49687,Smartblend Healthy Metabolism Dry Cat Food,41,8


In [73]:
total_product = len(products_df.product_id)
total_product

49688

In [84]:
class ConsumerDataset:
    def __init__(self, df):
        self.x = torch.from_numpy(df.values[:,:5]).to(device)
        self.y = torch.from_numpy(df.values[:,[5]]).to(device)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, item):
        return [self.x[item].float(), self.y[item].int()]

In [42]:
class ConsumerDataset_test:
    def __init__(self, df):
        self.x = torch.from_numpy(df.values[:,:5]).to(device)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, item):
        return [self.x[item].float()]

In [43]:

output_mapping = dict(zip(products_df['product_id'].values,products_df['product_name'].values))

In [85]:
train_set = ConsumerDataset(normalized_train_df)


In [86]:
val_set = ConsumerDataset(normalized_val_df)

In [87]:
test_set = ConsumerDataset_test(normalized_test_df)

In [47]:
input_size = len(val_set.x[0])
print(input_size)
print(val_set.x[0].shape)

5
torch.Size([5])


In [101]:
train_set.x.shape[0]

32434489

In [49]:
from sklearn.metrics import accuracy_score
from torch import Tensor
from torch.nn import Linear
from torch.nn import LeakyReLU, Dropout
from torch.nn import Softmax
from torch.nn import Module
from torch.optim import SGD
from torch.nn.init import kaiming_uniform_

In [74]:
class RecSysModel(nn.Module):
    def __init__(self, input_size,n_hidden,output_features):
        super(RecSysModel, self).__init__()
        self.input_layer = Linear(input_size,n_hidden)
        self.dropout_input = Dropout(0.1)
        self.act_input = LeakyReLU()
        
        self.hidden1 = Linear(n_hidden,n_hidden)
        self.dropout1 = Dropout(0.1)
        self.act1 = LeakyReLU()

        self.hidden2 = Linear(n_hidden,n_hidden)
        self.dropout2 = Dropout(0.1)
        self.act2 = LeakyReLU()

        self.hidden3 = Linear(n_hidden,n_hidden)
        self.dropout3 = Dropout(0.1)
        self.act3 = LeakyReLU()

        # self.hidden4 = Linear(5,3)
        # self.dropout4 = Dropout(0.1)
        # self.act4 = LeakyReLU()
        
        self.output_layer = Linear(n_hidden,output_features)
        self.dropout_out = Dropout(0.4)
        self.act_out =Softmax(dim=-1)

        
    def forward(self, X):
        X = self.input_layer(X)
        X = self.dropout_input(X)
        X = self.act_input(X)        

        X = self.hidden1(X)
        X = self.dropout1(X)
        X = self.act1(X)

        X = self.hidden2(X)
        X = self.dropout2(X)
        X = self.act2(X)

        X = self.hidden3(X)
        X = self.dropout3(X)
        X = self.act3(X)

        # X = self.hidden4(X)
        # X = self.dropout4(X)
        # X = self.act4(X)
        
        X = self.output_layer(X)
        X = self.dropout_out(X)
        X = self.act_out(X)
        return X

In [88]:
model = RecSysModel(input_size,512,total_product)

In [None]:

model(tensor)

In [89]:
loss_fn = torch.nn.CrossEntropyLoss()
lr = 1e-4
optimizer = torch.optim.SGD(model.parameters(),lr=lr)

In [92]:
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, threshold=0.0001, threshold_mode='abs',verbose=True)

In [97]:
import random
def train(train_df, epoch, tb_writter):

    running_loss = 0.
    last_loss = 0.
    batch_size = 10000
    for i, [input,target] in enumerate(rn_train_set):
        optimizer.zero_grad()
        output = model(input)

        loss = torch.sqrt(loss_fn(output,target))
        loss.backward()

        optimizer.step()
        running_loss += loss.item()
        if i % batch_size == 0 and i > 0:
            last_loss = running_loss / batch_size
            print('  batch {} loss: {}  lr:'.format(i, last_loss, ))
            tb_x = epoch * len(rn_train_set) + i
            tb_writter.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            scheduler.step(last_loss)

    return last_loss

In [103]:
# Initializing in a separate cell so we can easily add more epochs to the same run
from datetime import datetime
from torch.utils.tensorboard.writer import SummaryWriter
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/RS_trainer_{}'.format(timestamp))
epoch_number = 0
EPOCHS = 5

batch_size = int(train_set.x.shape[0]/EPOCHS)
best_vloss = 1_000_000.

for epoch in range(EPOCHS):
    start,end = epoch*batch_size, (epoch+1)*batch_size
    if end > train_set.x.shape[0]:
        batch_train_set = train_set[start:]
    else:
        batch_train_set = train_set[start:end]
    print('EPOCH {}:'.format(epoch_number + 1))

    # Make sure gradient tracking is on, and do a pass over the data
    model.train(True)
    avg_loss = train(batch_train_set,epoch_number, writer)


    running_vloss = 0.0
    # Set the model to evaluation mode, disabling dropout and using population
    # statistics for batch normalization.
    model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for i, [val_input,val_target] in enumerate(val_set):
            val_predict = model(val_input)
            val_loss = torch.sqrt(loss_fn(val_predict, val_target))
            running_vloss += val_loss

    avg_vloss = running_vloss / (i + 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))
    # Log the running loss averaged per batch
    # for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Track best performance, and save the model's state
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        model_path = 'model_{}_{}'.format(timestamp, epoch_number)
        torch.save(model.state_dict(), model_path)

    

    epoch_number += 1
    

EPOCH 1:


RuntimeError: size mismatch (got input: [49688], target: [1])

In [105]:
model.eval()
with torch.no_grad():
    for i, [val_input,val_target] in enumerate(val_set):
            val_predict = model(val_input)
            print(val_predict,val_target)
            if i == 10:
                break

TypeError: can't convert cuda:0 device type tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.