# Modeling - 8

In [26]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from xgboost import XGBClassifier
from sklearn.metrics import log_loss

from utils.utils import *
from utils.modeling import *
from features.ohe_dept import *
from features.base_features import *
from preprocess.pipe1 import PreprocessingV1
from neuralnet.torch import *

### Preprocessing Pipeline 1

In [27]:
# Import file
df = import_data("../data-ignore/train.csv")

In [28]:
# OHE the departments
ohe_dept_df, transformer = ohe_dept(df)

ohe_dept_groupby_df = ohe_dept_groupby(ohe_dept_df)
ohe_dept_groupby_df.reset_index(inplace=True)
ohe_dept_groupby_df

Unnamed: 0,VisitNumber,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,DepartmentDescription_BEDDING,DepartmentDescription_BOOKS AND MAGAZINES,DepartmentDescription_BOYS WEAR,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95670,191344,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95671,191345,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95672,191346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [29]:
# Create a new feature indicating if scancount is negative
df['returns'] = df['ScanCount'].apply(lambda x: 1 if x < 0 else 0)

# Goupby VisitNumber
groupby_df = groupby_visitnumber(df)

groupby_df['weekend'] = groupby_df['weekday'].apply(lambda x: 1 if x in ['Saturday','Sunday'] else 0)
groupby_df['avg_scancount_per_upc'] = groupby_df['total_scancount'] / (groupby_df['num_unique_upc'] + 1)
groupby_df['avg_scancount_per_dept'] = groupby_df['total_scancount'] / (groupby_df['num_unique_dept'] + 1)

groupby_df

Unnamed: 0,VisitNumber,triptype,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline,contains_return,weekend,avg_scancount_per_upc,avg_scancount_per_dept
0,5,999,Friday,1,-1.000000,-1,1,1,1,0,-0.500000,-0.500000
1,7,30,Friday,2,1.000000,2,2,2,0,0,0.666667,0.666667
2,8,26,Friday,20,1.217391,28,6,16,1,0,1.333333,4.000000
3,9,8,Friday,3,1.000000,3,2,3,0,0,0.750000,1.000000
4,10,8,Friday,3,1.000000,3,2,3,0,0,0.750000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191343,25,Sunday,7,1.285714,9,3,5,0,1,1.125000,2.250000
95670,191344,22,Sunday,5,1.000000,5,2,3,0,1,0.833333,1.666667
95671,191345,39,Sunday,13,1.307692,17,8,12,0,1,1.214286,1.888889
95672,191346,39,Sunday,17,1.000000,17,8,16,0,1,0.944444,1.888889


In [30]:
# Join the 2 groupby'ed dataframes on the VisitNumber
df = pd.merge(groupby_df, ohe_dept_groupby_df, on="VisitNumber", how="inner")
df

Unnamed: 0,VisitNumber,triptype,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline,contains_return,weekend,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,5,999,Friday,1,-1.000000,-1,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,30,Friday,2,1.000000,2,2,2,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,26,Friday,20,1.217391,28,6,16,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,9,8,Friday,3,1.000000,3,2,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,8,Friday,3,1.000000,3,2,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191343,25,Sunday,7,1.285714,9,3,5,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95670,191344,22,Sunday,5,1.000000,5,2,3,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95671,191345,39,Sunday,13,1.307692,17,8,12,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95672,191346,39,Sunday,17,1.000000,17,8,16,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Pre-preprocess
X_train, X_test, y_train, y_test = pre_preprocess(df)

# Preprocessing
preprocessing = PreprocessingV1()
X_train_proc = preprocessing.fit_transform(X_train)
X_test_proc = preprocessing.transform(X_test)

X_train_proc

X_train: (81322, 79), y_train: (81322,), X_test: (14352, 79), y_test: (14352,)


Unnamed: 0,num__num_unique_upc,num__avg_scancount,num__total_scancount,num__num_unique_dept,num__num_unique_fileline,num__contains_return,num__weekend,num__avg_scancount_per_upc,num__avg_scancount_per_dept,num__DepartmentDescription_1-HR PHOTO,...,num__DepartmentDescription_TOYS,num__DepartmentDescription_WIRELESS,num__DepartmentDescription_nan,cat__weekday_Friday,cat__weekday_Monday,cat__weekday_Saturday,cat__weekday_Sunday,cat__weekday_Thursday,cat__weekday_Tuesday,cat__weekday_Wednesday
74630,-0.663062,-0.038488,-0.632608,-0.823057,-0.666973,-0.361107,1.345875,-0.610983,-0.713340,-0.039015,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,1.0,0.0,0.0,0.0
61468,-0.663062,-3.064435,-0.827308,-0.823057,-0.666973,2.769260,-0.743011,-2.527609,-1.559269,-0.039015,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10207,-0.424896,-0.038488,-0.437909,-0.473153,-0.393816,-0.361107,-0.743011,-0.131827,-0.290375,-0.039015,...,-0.1359,-0.092219,-0.072906,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25328,0.765932,-0.038488,0.535589,1.976168,0.835391,-0.361107,1.345875,0.210428,-0.036596,-0.039015,...,-0.1359,-0.092219,-0.072906,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15352,0.408683,-0.038488,0.243540,0.576556,0.562234,-0.361107,-0.743011,0.173091,0.273578,-0.039015,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57115,0.051435,0.177651,0.048840,-0.123250,-0.120659,-0.361107,-0.743011,0.347329,0.555555,-0.039015,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,0.0,0.0,1.0,0.0
360,-0.663062,1.474485,-0.535258,-0.823057,-0.666973,-0.361107,-0.743011,0.347329,-0.290375,-0.039015,...,-0.1359,-0.092219,-0.072906,1.0,0.0,0.0,0.0,0.0,0.0,0.0
89473,-0.067648,-0.254627,-0.145859,-0.123250,-0.393816,2.769260,1.345875,0.073526,0.132590,-0.039015,...,-0.1359,-0.092219,-0.072906,0.0,0.0,1.0,0.0,0.0,0.0,0.0
51195,0.051435,-0.038488,-0.048510,0.226653,0.015920,-0.361107,1.345875,0.107751,0.047997,-0.039015,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,1.0,0.0,0.0,0.0


### PyTorch

In [32]:
# Change data to tensor
X_train_tensor = torch.tensor(X_train_proc.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)

X_test_tensor = torch.tensor(X_test_proc.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Define the model instance
model = Net3(input_dim=X_train_tensor.shape[1])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=True)

In [33]:
epochs = 20
model.train()

for epoch in range(epochs):
  train_probs = []
  train_targets = []

  for X_batch, y_batch in train_loader:
    optimizer.zero_grad()
    logits = model(X_batch)
    loss = criterion(logits, y_batch)
    loss.backward()
    optimizer.step()
    
    probs = F.softmax(logits, dim=1)
    train_probs.append(probs)
    train_targets.append(y_batch)

  train_probs = torch.cat(train_probs).detach().cpu().numpy()
  train_targets = torch.cat(train_targets).detach().cpu().numpy()

  train_loss = log_loss(train_targets, train_probs)

  model.eval()
  test_probs = []
  test_targets = []
  with torch.no_grad():
    for X_batch, y_batch in test_loader:
      logits = model(X_batch)
      probs = F.softmax(logits, dim=1)
      test_probs.append(probs)
      test_targets.append(y_batch)

  test_probs = torch.cat(test_probs).cpu().numpy()
  test_targets = torch.cat(test_targets).cpu().numpy()

  test_loss = log_loss(test_targets, test_probs)

  print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

Epoch 1, Train Loss: 1.5659, Test Loss: 1.4986
Epoch 2, Train Loss: 0.9945, Test Loss: 0.9756
Epoch 3, Train Loss: 0.9226, Test Loss: 0.9292
Epoch 4, Train Loss: 0.8903, Test Loss: 0.9081
Epoch 5, Train Loss: 0.8695, Test Loss: 0.9034
Epoch 6, Train Loss: 0.8544, Test Loss: 0.8872
Epoch 7, Train Loss: 0.8424, Test Loss: 0.8875
Epoch 8, Train Loss: 0.8326, Test Loss: 0.8828
Epoch 9, Train Loss: 0.8249, Test Loss: 0.8734
Epoch 10, Train Loss: 0.8179, Test Loss: 0.8682
Epoch 11, Train Loss: 0.8111, Test Loss: 0.8791
Epoch 12, Train Loss: 0.8058, Test Loss: 0.8771
Epoch 13, Train Loss: 0.8020, Test Loss: 0.8726
Epoch 14, Train Loss: 0.7982, Test Loss: 0.8814
Epoch 15, Train Loss: 0.7937, Test Loss: 0.8683
Epoch 16, Train Loss: 0.7907, Test Loss: 0.8673
Epoch 17, Train Loss: 0.7861, Test Loss: 0.8693
Epoch 18, Train Loss: 0.7844, Test Loss: 0.8703
Epoch 19, Train Loss: 0.7813, Test Loss: 0.8801
Epoch 20, Train Loss: 0.7805, Test Loss: 0.8800


In [34]:
# epochs = 5
# model.train()

# for epoch in range(epochs):
#   train_probs = []
#   train_targets = []

#   for X_batch, y_batch in train_loader:
#     optimizer.zero_grad()
#     logits = model(X_batch)
#     loss = criterion(logits, y_batch)
#     loss.backward()
#     optimizer.step()
    
#     probs = F.softmax(logits, dim=1)
#     train_probs.append(probs)
#     train_targets.append(y_batch)

#   train_probs = torch.cat(train_probs).detach().cpu().numpy()
#   train_targets = torch.cat(train_targets).detach().cpu().numpy()

#   train_loss = log_loss(train_targets, train_probs)

#   model.eval()
#   test_probs = []
#   test_targets = []
#   with torch.no_grad():
#     for X_batch, y_batch in test_loader:
#       logits = model(X_batch)
#       probs = F.softmax(logits, dim=1)
#       test_probs.append(probs)
#       test_targets.append(y_batch)

#   test_probs = torch.cat(test_probs).cpu().numpy()
#   test_targets = torch.cat(test_targets).cpu().numpy()

#   test_loss = log_loss(test_targets, test_probs)

#   print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

In [35]:
# epochs = 5
# model.train()

# for epoch in range(epochs):
#   train_probs = []
#   train_targets = []

#   for X_batch, y_batch in train_loader:
#     optimizer.zero_grad()
#     logits = model(X_batch)
#     loss = criterion(logits, y_batch)
#     loss.backward()
#     optimizer.step()
    
#     probs = F.softmax(logits, dim=1)
#     train_probs.append(probs)
#     train_targets.append(y_batch)

#   train_probs = torch.cat(train_probs).detach().cpu().numpy()
#   train_targets = torch.cat(train_targets).detach().cpu().numpy()

#   train_loss = log_loss(train_targets, train_probs)

#   model.eval()
#   test_probs = []
#   test_targets = []
#   with torch.no_grad():
#     for X_batch, y_batch in test_loader:
#       logits = model(X_batch)
#       probs = F.softmax(logits, dim=1)
#       test_probs.append(probs)
#       test_targets.append(y_batch)

#   test_probs = torch.cat(test_probs).cpu().numpy()
#   test_targets = torch.cat(test_targets).cpu().numpy()

#   test_loss = log_loss(test_targets, test_probs)

#   print(f"Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")

# Predict on Kaggle

In [36]:
# Import data
kaggle_test = pd.read_csv('../data-ignore/test.csv', dtype={'Upc': str})
kaggle_test.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503389714,1,SHOES,3002.0
1,1,Friday,1707710732,1,DAIRY,1526.0
2,1,Friday,89470001026,1,DAIRY,1431.0
3,1,Friday,88491211470,1,GROCERY DRY GOODS,3555.0
4,2,Friday,2840015224,1,DSD GROCERY,4408.0


In [37]:
# OHE the departments
kaggle_ohe_dept_df, transformer = ohe_dept(kaggle_test)

kaggle_ohe_dept_groupby_df = ohe_dept_groupby(kaggle_ohe_dept_df)
kaggle_ohe_dept_groupby_df.reset_index(inplace=True)
kaggle_ohe_dept_groupby_df

Unnamed: 0,VisitNumber,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,DepartmentDescription_BEDDING,DepartmentDescription_BOOKS AND MAGAZINES,DepartmentDescription_BOYS WEAR,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95670,191339,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95671,191340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95672,191341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
# Create a new feature indicating if scancount is negative
kaggle_test['returns'] = kaggle_test['ScanCount'].apply(lambda x: 1 if x < 0 else 0)

# Goupby VisitNumber
kaggle_groupby_df = groupby_visitnumber_kaggle(kaggle_test)

kaggle_groupby_df['weekend'] = kaggle_groupby_df['weekday'].apply(lambda x: 1 if x in ['Saturday','Sunday'] else 0)
kaggle_groupby_df['avg_scancount_per_upc'] = kaggle_groupby_df['total_scancount'] / (kaggle_groupby_df['num_unique_upc'] + 1)
kaggle_groupby_df['avg_scancount_per_dept'] = kaggle_groupby_df['total_scancount'] / (kaggle_groupby_df['num_unique_dept'] + 1)

kaggle_groupby_df

Unnamed: 0,VisitNumber,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline,contains_return,weekend,avg_scancount_per_upc,avg_scancount_per_dept
0,1,Friday,4,1.000000,4,3,4,0,0,0.800000,1.00
1,2,Friday,4,1.000000,4,3,3,0,0,0.800000,1.00
2,3,Friday,1,0.000000,0,1,1,1,0,0.000000,0.00
3,4,Friday,1,1.000000,1,1,1,0,0,0.500000,0.50
4,6,Friday,1,0.000000,0,1,1,1,0,0.000000,0.00
...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,Sunday,7,1.714286,12,3,6,0,1,1.500000,3.00
95670,191339,Sunday,5,1.200000,6,3,5,0,1,1.000000,1.50
95671,191340,Sunday,1,2.000000,2,1,1,0,1,1.000000,1.00
95672,191341,Sunday,10,1.200000,12,5,10,0,1,1.090909,2.00


In [39]:
# Join the 2 groupby'ed dataframes on the VisitNumber
kaggle_df = pd.merge(kaggle_groupby_df, kaggle_ohe_dept_groupby_df, on="VisitNumber", how="inner")
kaggle_df.fillna(0, inplace=True)
kaggle_df

Unnamed: 0,VisitNumber,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline,contains_return,weekend,avg_scancount_per_upc,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,1,Friday,4,1.000000,4,3,4,0,0,0.800000,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Friday,4,1.000000,4,3,3,0,0,0.800000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Friday,1,0.000000,0,1,1,1,0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Friday,1,1.000000,1,1,1,0,0,0.500000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,Friday,1,0.000000,0,1,1,1,0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,Sunday,7,1.714286,12,3,6,0,1,1.500000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95670,191339,Sunday,5,1.200000,6,3,5,0,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95671,191340,Sunday,1,2.000000,2,1,1,0,1,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95672,191341,Sunday,10,1.200000,12,5,10,0,1,1.090909,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
kaggle_df.isna().sum().sum()

np.int64(0)

In [41]:
# Save the VisitNumber
kg_index = kaggle_df['VisitNumber'].astype(str)
kg_index.head()

0    1
1    2
2    3
3    4
4    6
Name: VisitNumber, dtype: object

In [42]:
# Preprocessing
preprocessing = PreprocessingV1()
preprocessing.fit_transform(X_train)
kaggle_proc = preprocessing.transform(kaggle_df)

In [43]:
kaggle_proc.fillna(0, inplace=True)
kaggle_proc.isna().sum().sum()

np.int64(0)

In [44]:
# Change data to tensor
kaggle_tensor = torch.tensor(kaggle_proc.values, dtype=torch.float32)
kaggle_dataset = TensorDataset(kaggle_tensor)

# Create dataloaders
kaggle_loader = DataLoader(kaggle_dataset, batch_size=64, shuffle=False)

In [45]:
# Predict
model.eval()
kaggle_probs = []

with torch.no_grad():
  for (X_batch,) in kaggle_loader:
    logits = model(X_batch)
    probs = torch.softmax(logits, dim=1)
    kaggle_probs.append(probs)

kaggle_probs = torch.cat(kaggle_probs).cpu().numpy()
kaggle_probs

array([[6.7211654e-06, 1.5675697e-04, 9.7626331e-04, ..., 1.7717512e-02,
        7.1935561e-03, 3.5716665e-03],
       [7.7480930e-05, 1.2136684e-04, 7.2043658e-05, ..., 2.3028755e-03,
        1.0100832e-04, 3.1218342e-03],
       [1.4313976e-05, 9.8874580e-06, 2.7920456e-05, ..., 7.9545138e-08,
        2.6474126e-08, 9.9644703e-01],
       ...,
       [3.1896252e-02, 5.5154378e-05, 3.8876002e-05, ..., 1.8256758e-06,
        5.7218825e-09, 1.1681947e-02],
       [5.1515436e-10, 1.5383030e-07, 1.3798045e-07, ..., 9.7901328e-03,
        2.6921134e-02, 5.4919790e-04],
       [1.6979055e-05, 3.7585257e-10, 1.2285875e-07, ..., 1.0667936e-05,
        6.7738110e-06, 5.8898074e-04]], shape=(95674, 38), dtype=float32)

In [46]:
# Import headers from sample_submission
sample_sub = pd.read_csv('../data-ignore/sample_submission.csv')
headers = sample_sub.columns.to_list()
headers

['VisitNumber',
 'TripType_3',
 'TripType_4',
 'TripType_5',
 'TripType_6',
 'TripType_7',
 'TripType_8',
 'TripType_9',
 'TripType_12',
 'TripType_14',
 'TripType_15',
 'TripType_18',
 'TripType_19',
 'TripType_20',
 'TripType_21',
 'TripType_22',
 'TripType_23',
 'TripType_24',
 'TripType_25',
 'TripType_26',
 'TripType_27',
 'TripType_28',
 'TripType_29',
 'TripType_30',
 'TripType_31',
 'TripType_32',
 'TripType_33',
 'TripType_34',
 'TripType_35',
 'TripType_36',
 'TripType_37',
 'TripType_38',
 'TripType_39',
 'TripType_40',
 'TripType_41',
 'TripType_42',
 'TripType_43',
 'TripType_44',
 'TripType_999']

In [47]:
# Create submission dataframe with predictions
submission = pd.DataFrame(np.round(kaggle_probs, 4), index=kg_index, columns=headers[1:])
submission.reset_index(inplace=True)
submission

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,0.0000,0.0002,0.0010,0.0002,0.0070,0.0281,0.0286,0.0029,0.0000,...,0.0094,0.0018,0.2408,0.1019,0.0005,0.0078,0.0100,0.0177,0.0072,0.0036
1,2,0.0001,0.0001,0.0001,0.0016,0.0048,0.0029,0.0006,0.0013,0.0000,...,0.0057,0.0014,0.0515,0.1030,0.0015,0.0001,0.0177,0.0023,0.0001,0.0031
2,3,0.0000,0.0000,0.0000,0.0005,0.0000,0.0013,0.0003,0.0000,0.0000,...,0.0000,0.0000,0.0004,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.9964
3,4,0.0002,0.0001,0.0004,0.0002,0.0001,0.0120,0.9474,0.0000,0.0000,...,0.0001,0.0001,0.0001,0.0000,0.0000,0.0000,0.0004,0.0000,0.0000,0.0141
4,6,0.0000,0.0000,0.0000,0.0005,0.0000,0.0013,0.0003,0.0000,0.0000,...,0.0000,0.0000,0.0004,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.9964
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0000,0.0000,0.0012,0.0178,0.0001,0.0000,0.0000,0.0000,0.0000,0.0001
95670,191339,0.0000,0.0001,0.0009,0.0001,0.0000,0.0000,0.0001,0.0007,0.0001,...,0.0210,0.0014,0.0029,0.0058,0.0002,0.0360,0.3197,0.0904,0.0012,0.0008
95671,191340,0.0319,0.0001,0.0000,0.0010,0.0026,0.7531,0.1820,0.0000,0.0000,...,0.0003,0.0003,0.0003,0.0000,0.0000,0.0000,0.0010,0.0000,0.0000,0.0117
95672,191341,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0005,0.0000,...,0.0000,0.0000,0.0007,0.0223,0.0001,0.0017,0.0032,0.0098,0.0269,0.0005


In [48]:
# Save submission to csv
submission.to_csv('../data-ignore/submission10.csv', header=True, index=False)

In [49]:
# Kaggle score: private = 0.90157, public = 0.92346
# Kaggle place 