# Modeling - 4

In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.metrics import log_loss

from utils.utils import *
from utils.modeling import *
from features.ohe_dept import *
from features.base_features import *
from preprocess.pipe1 import PreprocessingV1

### Preprocessing Pipeline 1

In [2]:
# Import file
df = import_data("../data-ignore/train.csv")

In [3]:
# OHE the departments
ohe_dept_df, transformer = ohe_dept(df)

ohe_dept_groupby_df = ohe_dept_groupby(ohe_dept_df)
ohe_dept_groupby_df.reset_index(inplace=True)
ohe_dept_groupby_df

Unnamed: 0,VisitNumber,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,DepartmentDescription_BEDDING,DepartmentDescription_BOOKS AND MAGAZINES,DepartmentDescription_BOYS WEAR,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191343,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95670,191344,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95671,191345,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95672,191346,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Create a new feature indicating if scancount is negative
df['returns'] = df['ScanCount'].apply(lambda x: 1 if x < 0 else 0)

# Goupby VisitNumber
groupby_df = groupby_visitnumber(df)

groupby_df['weekend'] = groupby_df['weekday'].apply(lambda x: 1 if x in ['Saturday','Sunday'] else 0)

groupby_df

Unnamed: 0,VisitNumber,triptype,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline,contains_return,weekend
0,5,999,Friday,1,-1.000000,-1,1,1,1,0
1,7,30,Friday,2,1.000000,2,2,2,0,0
2,8,26,Friday,20,1.217391,28,6,16,1,0
3,9,8,Friday,3,1.000000,3,2,3,0,0
4,10,8,Friday,3,1.000000,3,2,3,0,0
...,...,...,...,...,...,...,...,...,...,...
95669,191343,25,Sunday,7,1.285714,9,3,5,0,1
95670,191344,22,Sunday,5,1.000000,5,2,3,0,1
95671,191345,39,Sunday,13,1.307692,17,8,12,0,1
95672,191346,39,Sunday,17,1.000000,17,8,16,0,1


In [5]:
# Join the 2 groupby'ed dataframes on the VisitNumber
df = pd.merge(groupby_df, ohe_dept_groupby_df, on="VisitNumber", how="inner")
df

Unnamed: 0,VisitNumber,triptype,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline,contains_return,weekend,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,5,999,Friday,1,-1.000000,-1,1,1,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,7,30,Friday,2,1.000000,2,2,2,0,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,8,26,Friday,20,1.217391,28,6,16,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,9,8,Friday,3,1.000000,3,2,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10,8,Friday,3,1.000000,3,2,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191343,25,Sunday,7,1.285714,9,3,5,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95670,191344,22,Sunday,5,1.000000,5,2,3,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
95671,191345,39,Sunday,13,1.307692,17,8,12,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95672,191346,39,Sunday,17,1.000000,17,8,16,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Pre-preprocess
X_train, X_test, y_train, y_test = pre_preprocess(df)

# Preprocessing
preprocessing = PreprocessingV1()
X_train_proc = preprocessing.fit_transform(X_train)
X_test_proc = preprocessing.transform(X_test)

X_train_proc

X_train: (81322, 77), y_train: (81322,), X_test: (14352, 77), y_test: (14352,)


Unnamed: 0,num__num_unique_upc,num__avg_scancount,num__total_scancount,num__num_unique_dept,num__num_unique_fileline,num__contains_return,num__weekend,num__DepartmentDescription_1-HR PHOTO,num__DepartmentDescription_ACCESSORIES,num__DepartmentDescription_AUTOMOTIVE,...,num__DepartmentDescription_TOYS,num__DepartmentDescription_WIRELESS,num__DepartmentDescription_nan,cat__weekday_Friday,cat__weekday_Monday,cat__weekday_Saturday,cat__weekday_Sunday,cat__weekday_Thursday,cat__weekday_Tuesday,cat__weekday_Wednesday
74630,-0.663062,-0.038488,-0.632608,-0.823057,-0.666973,-0.361107,1.345875,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,1.0,0.0,0.0,0.0
61468,-0.663062,-3.064435,-0.827308,-0.823057,-0.666973,2.769260,-0.743011,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,0.0,0.0,0.0,1.0
10207,-0.424896,-0.038488,-0.437909,-0.473153,-0.393816,-0.361107,-0.743011,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25328,0.765932,-0.038488,0.535589,1.976168,0.835391,-0.361107,1.345875,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,1.0,0.0,0.0,0.0,0.0
15352,0.408683,-0.038488,0.243540,0.576556,0.562234,-0.361107,-0.743011,-0.039015,10.708021,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
57115,0.051435,0.177651,0.048840,-0.123250,-0.120659,-0.361107,-0.743011,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,0.0,0.0,1.0,0.0
360,-0.663062,1.474485,-0.535258,-0.823057,-0.666973,-0.361107,-0.743011,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,1.0,0.0,0.0,0.0,0.0,0.0,0.0
89473,-0.067648,-0.254627,-0.145859,-0.123250,-0.393816,2.769260,1.345875,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,1.0,0.0,0.0,0.0,0.0
51195,0.051435,-0.038488,-0.048510,0.226653,0.015920,-0.361107,1.345875,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [7]:
# Train on 3 models
results_df = train_model(X_train_proc, y_train, X_test_proc, y_test)
results_df


Running Logistic Regression...
Running Random Forest...
Running XGBoost...


Unnamed: 0,Model,Train Loss,Test Loss
0,LogReg,1.157849,1.146657
1,RForest,1.639791,1.633458
2,XGBoost,0.912502,0.883545


# Predict on Kaggle

In [8]:
# Import data
kaggle_test = pd.read_csv('../data-ignore/test.csv', dtype={'Upc': str})
kaggle_test.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503389714,1,SHOES,3002.0
1,1,Friday,1707710732,1,DAIRY,1526.0
2,1,Friday,89470001026,1,DAIRY,1431.0
3,1,Friday,88491211470,1,GROCERY DRY GOODS,3555.0
4,2,Friday,2840015224,1,DSD GROCERY,4408.0


In [9]:
# OHE the departments
kaggle_ohe_dept_df, transformer = ohe_dept(kaggle_test)

kaggle_ohe_dept_groupby_df = ohe_dept_groupby(kaggle_ohe_dept_df)
kaggle_ohe_dept_groupby_df.reset_index(inplace=True)
kaggle_ohe_dept_groupby_df

Unnamed: 0,VisitNumber,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,DepartmentDescription_BEDDING,DepartmentDescription_BOOKS AND MAGAZINES,DepartmentDescription_BOYS WEAR,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95670,191339,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95671,191340,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95672,191341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# Create a new feature indicating if scancount is negative
kaggle_test['returns'] = kaggle_test['ScanCount'].apply(lambda x: 1 if x < 0 else 0)

# Goupby VisitNumber
kaggle_groupby_df = groupby_visitnumber_kaggle(kaggle_test)

kaggle_groupby_df['weekend'] = kaggle_groupby_df['weekday'].apply(lambda x: 1 if x in ['Saturday','Sunday'] else 0)

kaggle_groupby_df

Unnamed: 0,VisitNumber,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline,contains_return,weekend
0,1,Friday,4,1.000000,4,3,4,0,0
1,2,Friday,4,1.000000,4,3,3,0,0
2,3,Friday,1,0.000000,0,1,1,1,0
3,4,Friday,1,1.000000,1,1,1,0,0
4,6,Friday,1,0.000000,0,1,1,1,0
...,...,...,...,...,...,...,...,...,...
95669,191338,Sunday,7,1.714286,12,3,6,0,1
95670,191339,Sunday,5,1.200000,6,3,5,0,1
95671,191340,Sunday,1,2.000000,2,1,1,0,1
95672,191341,Sunday,10,1.200000,12,5,10,0,1


In [11]:
# Join the 2 groupby'ed dataframes on the VisitNumber
kaggle_df = pd.merge(kaggle_groupby_df, kaggle_ohe_dept_groupby_df, on="VisitNumber", how="inner")
kaggle_df

Unnamed: 0,VisitNumber,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline,contains_return,weekend,DepartmentDescription_1-HR PHOTO,...,DepartmentDescription_SEASONAL,DepartmentDescription_SERVICE DELI,DepartmentDescription_SHEER HOSIERY,DepartmentDescription_SHOES,DepartmentDescription_SLEEPWEAR/FOUNDATIONS,DepartmentDescription_SPORTING GOODS,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,DepartmentDescription_nan
0,1,Friday,4,1.000000,4,3,4,0,0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Friday,4,1.000000,4,3,3,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Friday,1,0.000000,0,1,1,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Friday,1,1.000000,1,1,1,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6,Friday,1,0.000000,0,1,1,1,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,Sunday,7,1.714286,12,3,6,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95670,191339,Sunday,5,1.200000,6,3,5,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95671,191340,Sunday,1,2.000000,2,1,1,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
95672,191341,Sunday,10,1.200000,12,5,10,0,1,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Save the VisitNumber
kg_index = kaggle_df['VisitNumber'].astype(str)
kg_index.head()

0    1
1    2
2    3
3    4
4    6
Name: VisitNumber, dtype: object

In [13]:
# Preprocessing
preprocessing = PreprocessingV1()
preprocessing.fit_transform(X_train)
kaggle_proc = preprocessing.transform(kaggle_df)

kaggle_proc

Unnamed: 0,num__num_unique_upc,num__avg_scancount,num__total_scancount,num__num_unique_dept,num__num_unique_fileline,num__contains_return,num__weekend,num__DepartmentDescription_1-HR PHOTO,num__DepartmentDescription_ACCESSORIES,num__DepartmentDescription_AUTOMOTIVE,...,num__DepartmentDescription_TOYS,num__DepartmentDescription_WIRELESS,num__DepartmentDescription_nan,cat__weekday_Friday,cat__weekday_Monday,cat__weekday_Saturday,cat__weekday_Sunday,cat__weekday_Thursday,cat__weekday_Tuesday,cat__weekday_Wednesday
0,-0.305814,-0.038488,-0.340559,-0.123250,-0.257237,-0.361107,-0.743011,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,-0.305814,-0.038488,-0.340559,-0.123250,-0.393816,-0.361107,-0.743011,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.663062,-1.551462,-0.729958,-0.823057,-0.666973,2.769260,-0.743011,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.663062,-0.038488,-0.632608,-0.823057,-0.666973,-0.361107,-0.743011,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.663062,-1.551462,-0.729958,-0.823057,-0.666973,2.769260,-0.743011,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,0.051435,1.042207,0.438239,-0.123250,0.015920,-0.361107,1.345875,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,1.0,0.0,0.0,0.0
95670,-0.186731,0.264106,-0.145859,-0.123250,-0.120659,-0.361107,1.345875,-0.039015,-0.069445,2.213364,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,1.0,0.0,0.0,0.0
95671,-0.663062,1.474485,-0.535258,-0.823057,-0.666973,-0.361107,1.345875,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,1.0,0.0,0.0,0.0
95672,0.408683,0.264106,0.438239,0.576556,0.562234,-0.361107,1.345875,-0.039015,-0.069445,-0.134757,...,-0.1359,-0.092219,-0.072906,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [14]:
# Train on full train dataset
full_X_train = pd.concat([X_train_proc, X_test_proc])
full_y_train = pd.concat([y_train, y_test])

# Retrain model
xgb_model = XGBClassifier()
xgb_model.fit(full_X_train, full_y_train)
train_proba = xgb_model.predict_proba(full_X_train)

# Check performance on train
train_loss = log_loss(full_y_train, train_proba)
print(f"Train loss: {train_loss:.4f}")

# Predict on kaggle test
kg_proba = xgb_model.predict_proba(kaggle_proc)
kg_proba

Train loss: 0.5722


array([[1.4810600e-05, 6.4351707e-06, 1.0459673e-04, ..., 5.3031468e-03,
        4.6744861e-04, 5.4428601e-03],
       [4.2483382e-05, 2.3636883e-05, 3.0214927e-04, ..., 8.3127869e-03,
        3.3856512e-04, 1.1520214e-02],
       [2.2197112e-06, 6.1485473e-08, 8.6560902e-07, ..., 2.1547082e-07,
        1.4294328e-07, 9.9851400e-01],
       ...,
       [1.3571969e-03, 1.3599028e-05, 8.6199856e-05, ..., 1.6874077e-05,
        2.2677683e-05, 5.7532605e-02],
       [2.2814338e-06, 2.2686070e-06, 8.7386434e-06, ..., 1.5650799e-02,
        2.1192158e-02, 1.5960138e-04],
       [4.3522724e-07, 8.5559634e-08, 1.2877972e-07, ..., 7.8899233e-04,
        9.7172779e-05, 1.3211250e-04]], shape=(95674, 38), dtype=float32)

In [15]:
# Import headers from sample_submission
sample_sub = pd.read_csv('../data-ignore/sample_submission.csv')
headers = sample_sub.columns.to_list()
headers

['VisitNumber',
 'TripType_3',
 'TripType_4',
 'TripType_5',
 'TripType_6',
 'TripType_7',
 'TripType_8',
 'TripType_9',
 'TripType_12',
 'TripType_14',
 'TripType_15',
 'TripType_18',
 'TripType_19',
 'TripType_20',
 'TripType_21',
 'TripType_22',
 'TripType_23',
 'TripType_24',
 'TripType_25',
 'TripType_26',
 'TripType_27',
 'TripType_28',
 'TripType_29',
 'TripType_30',
 'TripType_31',
 'TripType_32',
 'TripType_33',
 'TripType_34',
 'TripType_35',
 'TripType_36',
 'TripType_37',
 'TripType_38',
 'TripType_39',
 'TripType_40',
 'TripType_41',
 'TripType_42',
 'TripType_43',
 'TripType_44',
 'TripType_999']

In [16]:
# Create submission dataframe with predictions
submission = pd.DataFrame(np.round(kg_proba, 4), index=kg_index, columns=headers[1:])
submission.reset_index(inplace=True)
submission

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,0.0000,0.0,0.0001,0.0001,0.0069,0.0221,0.0204,0.0011,0.0000,...,0.0001,0.0005,0.3639,0.0630,0.0002,0.0041,0.0071,0.0053,0.0005,0.0054
1,2,0.0000,0.0,0.0003,0.0002,0.0677,0.0231,0.0106,0.0043,0.0001,...,0.0006,0.0008,0.0313,0.1119,0.0003,0.0019,0.0041,0.0083,0.0003,0.0115
2,3,0.0000,0.0,0.0000,0.0000,0.0001,0.0006,0.0006,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.9985
3,4,0.0003,0.0,0.0001,0.0003,0.0061,0.0654,0.8376,0.0000,0.0000,...,0.0003,0.0001,0.0005,0.0003,0.0001,0.0001,0.0007,0.0000,0.0000,0.0384
4,6,0.0000,0.0,0.0000,0.0000,0.0001,0.0006,0.0006,0.0000,0.0000,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.9985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,0.0000,0.0,0.0000,0.0002,0.0008,0.0001,0.0000,0.0002,0.0000,...,0.0000,0.0001,0.0023,0.1031,0.0009,0.0002,0.0004,0.0020,0.0007,0.0003
95670,191339,0.0000,0.0,0.0000,0.0001,0.0009,0.0006,0.0019,0.0001,0.0000,...,0.0001,0.0003,0.0019,0.0221,0.0001,0.0078,0.0936,0.0121,0.0008,0.0027
95671,191340,0.0014,0.0,0.0001,0.0032,0.0043,0.7003,0.1626,0.0001,0.0000,...,0.0006,0.0001,0.0005,0.0004,0.0001,0.0000,0.0082,0.0000,0.0000,0.0575
95672,191341,0.0000,0.0,0.0000,0.0000,0.0001,0.0000,0.0001,0.0001,0.0000,...,0.0001,0.0001,0.0004,0.0740,0.0026,0.0119,0.0178,0.0157,0.0212,0.0002


In [17]:
# Save submission to csv
submission.to_csv('../data-ignore/submission6.csv', header=True, index=False)

In [18]:
# Kaggle score: private = 0.87181, public = 0.89295
# Kaggle place 373 / 1043 = 36%