# Create Baseline Model

- Goal: finish in the top 40% of the leaderboard, i.e. 418th place or better out of 1043 participants
- Score to beat: 0.98853 (the 418th-place score)

In [42]:
# Imports
import pandas as pd
import numpy as np
from torch.utils.tensorboard import SummaryWriter

from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn import set_config
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer

# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy arrays
set_config(transform_output='pandas')

In [43]:
# Load the raw training rows; Upc is read as text rather than parsed as a number
original_data = pd.read_csv(
    '../data-ignore/train.csv',
    dtype={'Upc': str},
)
original_data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538815980,1,SHOES,8931.0
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017.0


In [44]:
# Collapse the line-item rows into one row per visit: the target (triptype) plus
# simple basket-level features. TripType and Weekday are taken from the first row of
# each visit; the remaining columns are counts/aggregates over the visit's rows.
df = (
    original_data
    .groupby('VisitNumber')
    .agg(
        triptype=('TripType', 'first'),
        weekday=('Weekday', 'first'),
        num_unique_upc=('Upc', 'nunique'),
        avg_scancount=('ScanCount', 'mean'),
        total_scancount=('ScanCount', 'sum'),
        num_unique_dept=('DepartmentDescription', 'nunique'),
        num_unique_fileline=('FinelineNumber', 'nunique'),
    )
    .reset_index()  # turn the VisitNumber group key back into a regular column
)
df

Unnamed: 0,VisitNumber,triptype,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,5,999,Friday,1,-1.000000,-1,1,1
1,7,30,Friday,2,1.000000,2,2,2
2,8,26,Friday,20,1.217391,28,6,16
3,9,8,Friday,3,1.000000,3,2,3
4,10,8,Friday,3,1.000000,3,2,3
...,...,...,...,...,...,...,...,...
95669,191343,25,Sunday,7,1.285714,9,3,5
95670,191344,22,Sunday,5,1.000000,5,2,3
95671,191345,39,Sunday,13,1.307692,17,8,12
95672,191346,39,Sunday,17,1.000000,17,8,16


## Start Experiment Tracking

In [45]:
# Open the TensorBoard event writer for this run; events are saved under runs/exp1.
# NOTE(review): none of the cells visible here log a metric to `writer` or close it -
# confirm that this happens later in the notebook.
writer = SummaryWriter(
    log_dir='runs/exp1',
)

# Pipeline

In [46]:
# Preprocessing plan:
# - Drop VisitNumber
# - Convert triptype to string
# - One-hot encode weekday
# - Apply StandardScaler to all numeric features

### Preprocessing

In [47]:
# Drop the VisitNumber column (the groupby key of the aggregation; it is not used as a feature).
# Reassigning instead of using inplace=True, together with errors='ignore', keeps this cell
# safe to re-run: the in-place version raised KeyError on a second execution because the
# column was already gone.
df = df.drop(columns='VisitNumber', errors='ignore')

In [48]:
# Store the trip-type codes as text so they are handled as category names, not quantities
df = df.astype({'triptype': str})
print(type(df.loc[0, 'triptype']))
df

<class 'str'>


Unnamed: 0,triptype,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,999,Friday,1,-1.000000,-1,1,1
1,30,Friday,2,1.000000,2,2,2
2,26,Friday,20,1.217391,28,6,16
3,8,Friday,3,1.000000,3,2,3
4,8,Friday,3,1.000000,3,2,3
...,...,...,...,...,...,...,...
95669,25,Sunday,7,1.285714,9,3,5
95670,22,Sunday,5,1.000000,5,2,3
95671,39,Sunday,13,1.307692,17,8,12
95672,39,Sunday,17,1.000000,17,8,16


In [49]:
# Fit the label encoder on the string trip types, then overwrite the column with integer codes.
# The labels are strings, so the codes follow lexicographic order ('3' is encoded after '29').
# NOTE(review): re-running this cell fits the encoder on the integer codes instead, which
# would lose the original labels stored in le.classes_.
le = LabelEncoder().fit(df['triptype'])
df['triptype'] = le.transform(df['triptype'])
df['triptype'].value_counts()

triptype
35    12161
25     9896
36     9464
37     8444
27     6130
34     5752
32     4593
10     3698
15     3643
22     3005
24     2912
23     2788
9      2609
21     2030
18     1984
29     1858
19     1315
33     1277
31     1187
16     1081
2       978
7       928
30      872
12      785
20      719
6       641
5       637
17      594
28      583
3       549
11      503
13      492
14      433
4       375
26      346
0       269
8       139
1         4
Name: count, dtype: int64

In [50]:
# Get the list of classes: the original trip-type labels, where position i holds the
# label that the encoder mapped to integer code i
class_list = le.classes_
class_list

array(['12', '14', '15', '18', '19', '20', '21', '22', '23', '24', '25',
       '26', '27', '28', '29', '3', '30', '31', '32', '33', '34', '35',
       '36', '37', '38', '39', '4', '40', '41', '42', '43', '44', '5',
       '6', '7', '8', '9', '999'], dtype=object)

In [51]:
# Lookup table from encoded class index back to the original trip-type label
class_map = dict(enumerate(class_list))

class_map

{0: '12',
 1: '14',
 2: '15',
 3: '18',
 4: '19',
 5: '20',
 6: '21',
 7: '22',
 8: '23',
 9: '24',
 10: '25',
 11: '26',
 12: '27',
 13: '28',
 14: '29',
 15: '3',
 16: '30',
 17: '31',
 18: '32',
 19: '33',
 20: '34',
 21: '35',
 22: '36',
 23: '37',
 24: '38',
 25: '39',
 26: '4',
 27: '40',
 28: '41',
 29: '42',
 30: '43',
 31: '44',
 32: '5',
 33: '6',
 34: '7',
 35: '8',
 36: '9',
 37: '999'}

### Train test split

In [52]:
# Separate the encoded target from the feature columns
y = df['triptype'].copy()
X = df.drop(columns='triptype').copy()

# Hold out 15% of the visits as a test set; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((81322, 6), (81322,), (14352, 6), (14352,))

In [53]:
# Preprocess features: one-hot encode weekday, standardize every numeric column

num_cols = list(X_train.select_dtypes(include='number').columns)

transformer = ColumnTransformer(
    transformers=[
        # dense output is required because sklearn is configured to return DataFrames
        ('ohe', OneHotEncoder(handle_unknown='error', sparse_output=False), ['weekday']),
        ('scaler', StandardScaler(), num_cols),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

# Fit on the training split only, then reuse the fitted encoder/scaler on the holdout split
X_train_proc = transformer.fit_transform(X_train)
X_test_proc = transformer.transform(X_test)

In [54]:
# Inspect the processed training features: one-hot weekday columns followed by the scaled numeric columns
X_train_proc

Unnamed: 0,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
1068,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.288343,-0.035788,0.145449,0.923428,0.151481
57381,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.663953,1.471232,-0.535534,-0.823618,-0.668220
37682,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.645454,0.215382,0.631865,0.923428,0.834565
58642,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.187805,-0.035788,-0.243684,-0.124799,-0.121753
733,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663953,1.471232,-0.535534,-0.823618,-0.668220
...,...,...,...,...,...,...,...,...,...,...,...,...
19700,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.544916,-0.035788,-0.535534,-0.823618,-0.531603
1591,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.425879,-0.035788,-0.438251,-0.823618,-0.394986
11887,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.068768,0.215382,-0.049118,0.574019,0.014864
68871,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.544916,0.466552,-0.340968,-0.823618,-0.531603


### Model 1: LogisticRegression

In [55]:
# Train model
model_lr = LogisticRegression(solver='lbfgs', max_iter=1000)

# Cross validate train.
# The raw X_train is passed through a pipeline so that the one-hot encoder and scaler are
# re-fitted on the training part of every fold. Scoring the already-transformed X_train_proc
# would let each validation fold contribute to the scaling statistics it is evaluated with.
# cross_val_predict fits clones of the pipeline, so `transformer` and `model_lr` themselves
# are left in their current state.
cv_pipeline = make_pipeline(transformer, model_lr)
y_proba = cross_val_predict(cv_pipeline, X_train, y_train, cv=3, method='predict_proba')
train_loss = log_loss(y_train, y_proba)

print(f"Cross-validated multi-class log loss: {train_loss:.4f}")

Cross-validated multi-class log loss: 2.3046


In [57]:
# Predict on test (note: this overwrites the cross-validated y_proba from the previous cell)
model_lr.fit(X_train_proc, y_train)
y_proba = model_lr.predict_proba(X_test_proc)
# Pass the fitted classes explicitly: predict_proba returns one column per class seen in
# training, and log_loss raises ValueError when y_test is missing any of them. The rarest
# class has only 4 visits in the whole dataset, so a different split could leave it out.
test_loss = log_loss(y_test, y_proba, labels=model_lr.classes_)
test_loss

2.299661943341656