# Baseline Model using RandomForest

- Goal: finish in the top 40% of the leaderboard (418th place out of 1043 participants)
- Score to beat (418th place): 0.98853

In [53]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn import set_config
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer

# Have scikit-learn transformers return pandas DataFrames, so the processed
# feature frames shown further down keep their column names.
set_config(transform_output='pandas')

In [54]:
# Load the raw training rows; Upc is read as a string rather than a number
train_csv = '../data-ignore/train.csv'
original_data = pd.read_csv(train_csv, dtype={'Upc': str})
original_data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538815980,1,SHOES,8931.0
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017.0


In [55]:
# Collapse the line-item rows into one row per visit: the target (triptype)
# plus basket-level summary features.
# Each entry is output_column=(source_column, aggregation).
visit_aggregations = {
    'triptype': ('TripType', 'first'),
    'weekday': ('Weekday', 'first'),
    'num_unique_upc': ('Upc', 'nunique'),
    'avg_scancount': ('ScanCount', 'mean'),
    'total_scancount': ('ScanCount', 'sum'),
    'num_unique_dept': ('DepartmentDescription', 'nunique'),
    'num_unique_fileline': ('FinelineNumber', 'nunique'),
}

# reset_index() turns the VisitNumber group key back into a regular column
df = original_data.groupby('VisitNumber').agg(**visit_aggregations).reset_index()
df

Unnamed: 0,VisitNumber,triptype,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,5,999,Friday,1,-1.000000,-1,1,1
1,7,30,Friday,2,1.000000,2,2,2
2,8,26,Friday,20,1.217391,28,6,16
3,9,8,Friday,3,1.000000,3,2,3
4,10,8,Friday,3,1.000000,3,2,3
...,...,...,...,...,...,...,...,...
95669,191343,25,Sunday,7,1.285714,9,3,5
95670,191344,22,Sunday,5,1.000000,5,2,3
95671,191345,39,Sunday,13,1.307692,17,8,12
95672,191346,39,Sunday,17,1.000000,17,8,16


# Pipeline

### Preprocessing

In [56]:
# Preprocess `df` exactly once. The presence of 'VisitNumber' marks a frame that
# has not been processed yet, so re-running this cell no longer fails with a
# KeyError on the drop and can never re-encode labels that are already encoded.
# (On a re-run the guard is skipped and `le` from the first run is reused.)
if 'VisitNumber' in df.columns:
    # Drop the VisitNumber column (rebinding instead of mutating in place)
    df = df.drop(columns='VisitNumber')

    # Use LabelEncoder to map the raw trip types to consecutive integer labels
    le = LabelEncoder()
    df['triptype'] = le.fit_transform(df['triptype'])

# Get the list of original class labels, ordered by their encoded value
class_list = le.classes_
class_list

array([  3,   4,   5,   6,   7,   8,   9,  12,  14,  15,  18,  19,  20,
        21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,
        34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44, 999])

### Train test split

In [57]:
# Separate the encoded target from the feature columns
target_col = 'triptype'
y = df[target_col].copy()
X = df.drop(columns=target_col).copy()

# Hold out 15% of the visits as a test set, stratified on the target and seeded
split_parts = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

# Show the shape of every piece of the split
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((81322, 6), (81322,), (14352, 6), (14352,))

### Scaling & Encoding

In [58]:
# Preprocessing: one-hot encode the weekday and standardise every numeric column

# Numeric feature names are discovered from the training split
num_cols = list(X_train.select_dtypes(include='number').columns)

weekday_encoder = OneHotEncoder(handle_unknown='error', sparse_output=False)
numeric_scaler = StandardScaler()

transformer = ColumnTransformer(
    transformers=[
        ('ohe', weekday_encoder, ['weekday']),
        ('scaler', numeric_scaler, num_cols),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

# Fit on the training split only; the holdout is transformed with the fitted state
X_train_proc = transformer.fit_transform(X_train)
X_test_proc = transformer.transform(X_test)

X_train_proc

Unnamed: 0,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
74630,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.663062,-0.038488,-0.632608,-0.823057,-0.666973
61468,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.663062,-3.064435,-0.827308,-0.823057,-0.666973
10207,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.424896,-0.038488,-0.437909,-0.473153,-0.393816
25328,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.765932,-0.038488,0.535589,1.976168,0.835391
15352,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.408683,-0.038488,0.243540,0.576556,0.562234
...,...,...,...,...,...,...,...,...,...,...,...,...
57115,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.051435,0.177651,0.048840,-0.123250,-0.120659
360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,1.474485,-0.535258,-0.823057,-0.666973
89473,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.067648,-0.254627,-0.145859,-0.123250,-0.393816
51195,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.051435,-0.038488,-0.048510,0.226653,0.015920


### Modeling

In [59]:
# Train model
# random_state pins the forest's randomness so the losses reported below (and the
# submission produced from this model) are reproducible from run to run; the
# train/holdout split above is already seeded with the same value.
model = RandomForestClassifier(max_depth=10, n_estimators=100, random_state=42)

# Cross validate train: 3-fold out-of-fold class probabilities for every training row
y_proba = cross_val_predict(model, X_train_proc, y_train, cv=3, method='predict_proba')
train_loss = log_loss(y_train, y_proba)

print(f"Cross-validated multi-class log loss on train: {train_loss:.4f}")

Cross-validated multi-class log loss on train: 2.1602


In [60]:
# Predict on test: refit on the full training split, then score the holdout
model.fit(X_train_proc, y_train)
y_proba = model.predict_proba(X_test_proc)

# Pass the fitted classes explicitly so each probability column is matched to the
# class the model assigned to it, instead of letting log_loss infer the label set
# from whatever classes happen to appear in y_test.
test_loss = log_loss(y_test, y_proba, labels=model.classes_)
test_loss

2.145043863034057

### Save to experiment tracker

In [61]:
# Markdown summary of the preprocessing and feature set, logged as text below
preprocessing_notes = """
### Preprocessing
- Scaled numerical features with StandardScaler
- One-hot encoded 'weekday'

### Features used
- weekday	
- num_unique_upc	
- avg_scancount	
- total_scancount	
- num_unique_dept	
- num_unique_fileline
"""

# Create a summary writer.
# Used as a context manager so the writer is flushed and closed when the cell
# finishes (also on error); events still queued in an unclosed writer can be
# lost if the kernel stops.
with SummaryWriter(log_dir='../runs/baseline_rf') as writer:
    # Add model metrics
    writer.add_scalar('Loss/Train', train_loss, 0)
    writer.add_scalar('Loss/Test', test_loss, 0)

    # Log model feature importances: one scalar per feature index plus a histogram
    importances = model.feature_importances_
    for i, imp in enumerate(importances):
        writer.add_scalar(f"FeatureImportance/Feature_{i}", imp, 0)

    writer.add_histogram("FeatureImportances", importances, 0)

    writer.add_text("Preprocessing/Features", preprocessing_notes)

    # Flatten all predicted probabilities (y_proba holds the holdout predictions here)
    writer.add_histogram("Probabilities/AllClasses", y_proba.flatten(), 0)

    # log per-class (class_idx is the encoded label / probability column index)
    for class_idx in range(y_proba.shape[1]):
        writer.add_histogram(f"Probabilities/Class_{class_idx}", y_proba[:, class_idx], 0)

    # Distribution of the model's confidence in its top prediction per visit
    fig, ax = plt.subplots()
    ax.hist(y_proba.max(axis=1), bins=20)
    ax.set_title("Max Class Probability Distribution")
    writer.add_figure("Confidence Histogram", fig, 0)

# Predict on Kaggle Test

In [62]:
# Load the Kaggle test rows; Upc is read as a string, as for the training data
kaggle_csv = '../data-ignore/test.csv'
kaggle_test = pd.read_csv(kaggle_csv, dtype={'Upc': str})
kaggle_test.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503389714,1,SHOES,3002.0
1,1,Friday,1707710732,1,DAIRY,1526.0
2,1,Friday,89470001026,1,DAIRY,1431.0
3,1,Friday,88491211470,1,GROCERY DRY GOODS,3555.0
4,2,Friday,2840015224,1,DSD GROCERY,4408.0


In [63]:
# Collapse the Kaggle line-item rows into one row of features per visit.
# Each entry is output_column=(source_column, aggregation).
kaggle_aggregations = {
    'weekday': ('Weekday', 'first'),
    'num_unique_upc': ('Upc', 'nunique'),
    'avg_scancount': ('ScanCount', 'mean'),
    'total_scancount': ('ScanCount', 'sum'),
    'num_unique_dept': ('DepartmentDescription', 'nunique'),
    'num_unique_fileline': ('FinelineNumber', 'nunique'),
}

# reset_index() turns the VisitNumber group key back into a regular column
kg = kaggle_test.groupby('VisitNumber').agg(**kaggle_aggregations).reset_index()
kg

Unnamed: 0,VisitNumber,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,1,Friday,4,1.000000,4,3,4
1,2,Friday,4,1.000000,4,3,3
2,3,Friday,1,0.000000,0,1,1
3,4,Friday,1,1.000000,1,1,1
4,6,Friday,1,0.000000,0,1,1
...,...,...,...,...,...,...,...
95669,191338,Sunday,7,1.714286,12,3,6
95670,191339,Sunday,5,1.200000,6,3,5
95671,191340,Sunday,1,2.000000,2,1,1
95672,191341,Sunday,10,1.200000,12,5,10


In [64]:
# Keep the visit identifiers (as strings) for later use as the submission index
visit_numbers = kg['VisitNumber']
kg_index = visit_numbers.astype(str)
kg_index.head()

0    1
1    2
2    3
3    4
4    6
Name: VisitNumber, dtype: object

In [65]:
# Drop the VisitNumber column.
# Rebinding with errors='ignore' (instead of an in-place drop) leaves `kg` in the
# same end state but lets this cell be re-run without a KeyError.
kg = kg.drop(columns='VisitNumber', errors='ignore')

# Run the preprocessing transformer (already fitted on the training split)
kg_proc = transformer.transform(kg)
kg_proc.head(10)

Unnamed: 0,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.305814,-0.038488,-0.340559,-0.12325,-0.257237
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.305814,-0.038488,-0.340559,-0.12325,-0.393816
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,-1.551462,-0.729958,-0.823057,-0.666973
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,-0.038488,-0.632608,-0.823057,-0.666973
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,-1.551462,-0.729958,-0.823057,-0.666973
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.543979,-0.038488,-0.535258,-0.473153,-0.530394
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408683,0.112809,0.340889,0.926459,0.425655
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170518,0.528877,0.340889,0.226653,0.152498
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051435,-0.038488,-0.04851,0.576556,0.01592
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051435,-0.038488,-0.04851,0.226653,0.152498


In [66]:
# Class probabilities for every Kaggle visit, one row per visit
kaggle_features = kg_proc
kg_proba = model.predict_proba(kaggle_features)
kg_proba

array([[1.49628776e-03, 3.84610555e-03, 4.57479421e-02, ...,
        1.73624405e-02, 1.99553048e-03, 9.97683604e-03],
       [1.41094245e-03, 2.37179609e-03, 4.10019411e-02, ...,
        1.41711304e-02, 2.75951888e-03, 1.87254796e-02],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 9.99032739e-01],
       ...,
       [6.34079701e-03, 2.93903142e-03, 3.92686192e-02, ...,
        0.00000000e+00, 0.00000000e+00, 4.16767626e-02],
       [1.44717800e-05, 7.20626666e-04, 1.29951633e-02, ...,
        2.17491814e-02, 2.86317377e-02, 2.20738454e-03],
       [9.00330890e-04, 3.38505317e-03, 4.34675873e-02, ...,
        1.35005214e-02, 4.44071130e-03, 8.03290362e-03]],
      shape=(95674, 38))

In [67]:
# Read the sample submission to get the expected column headers
sample_sub = pd.read_csv('../data-ignore/sample_submission.csv')
headers = list(sample_sub.columns)
headers

['VisitNumber',
 'TripType_3',
 'TripType_4',
 'TripType_5',
 'TripType_6',
 'TripType_7',
 'TripType_8',
 'TripType_9',
 'TripType_12',
 'TripType_14',
 'TripType_15',
 'TripType_18',
 'TripType_19',
 'TripType_20',
 'TripType_21',
 'TripType_22',
 'TripType_23',
 'TripType_24',
 'TripType_25',
 'TripType_26',
 'TripType_27',
 'TripType_28',
 'TripType_29',
 'TripType_30',
 'TripType_31',
 'TripType_32',
 'TripType_33',
 'TripType_34',
 'TripType_35',
 'TripType_36',
 'TripType_37',
 'TripType_38',
 'TripType_39',
 'TripType_40',
 'TripType_41',
 'TripType_42',
 'TripType_43',
 'TripType_44',
 'TripType_999']

In [68]:
# Create submission dataframe with predictions

# The probability columns follow model.classes_ (encoded labels). Map them back to
# the original trip types and check they line up with the sample-submission
# headers, so a mismatch fails loudly instead of silently mislabelling columns.
expected_cols = [f"TripType_{trip_type}" for trip_type in le.inverse_transform(model.classes_)]
assert headers[1:] == expected_cols, (
    "sample_submission columns do not match the model's class order"
)

# Index by visit number, then move it back into a regular 'VisitNumber' column
submission = pd.DataFrame(kg_proba, index=kg_index, columns=headers[1:]).reset_index()
submission

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,0.001496,0.003846,0.045748,0.010821,0.077726,0.019570,0.010536,0.005599,0.000019,...,0.104025,0.036305,0.052658,0.108593,0.000459,0.008859,0.030194,0.017362,0.001996,0.009977
1,2,0.001411,0.002372,0.041002,0.014130,0.079200,0.019165,0.008717,0.005879,0.000027,...,0.079553,0.019370,0.066132,0.093735,0.000796,0.008538,0.023526,0.014171,0.002760,0.018725
2,3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000101,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000026,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.999033
3,4,0.078952,0.009929,0.062181,0.030884,0.075299,0.320487,0.283928,0.000138,0.000000,...,0.002586,0.001624,0.000695,0.000489,0.000076,0.000353,0.000894,0.000000,0.000003,0.047041
4,6,0.000000,0.000000,0.000000,0.000000,0.000000,0.000101,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000026,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.999033
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,0.000986,0.002144,0.033588,0.006050,0.083906,0.001089,0.000407,0.006214,0.000230,...,0.063078,0.038226,0.056911,0.174549,0.009901,0.006133,0.024571,0.008034,0.004041,0.008883
95670,191339,0.000703,0.003285,0.046391,0.007078,0.077001,0.005569,0.002817,0.007158,0.000039,...,0.073870,0.034529,0.062160,0.126788,0.001337,0.008731,0.025331,0.016692,0.003566,0.008894
95671,191340,0.006341,0.002939,0.039269,0.030882,0.102185,0.387940,0.240326,0.003884,0.000000,...,0.000963,0.000565,0.000619,0.000577,0.000009,0.000000,0.000740,0.000000,0.000000,0.041677
95672,191341,0.000014,0.000721,0.012995,0.002507,0.068195,0.000030,0.000226,0.006211,0.000005,...,0.044095,0.051470,0.073917,0.391903,0.012308,0.014742,0.047890,0.021749,0.028632,0.002207


In [69]:
# Write the submission file with a header row and without the dataframe index
submission_path = '../data-ignore/submission2.csv'
submission.to_csv(submission_path, header=True, index=False)

In [70]:
# Kaggle score for this submission: private = 2.14829, public = 2.16805
# Kaggle rank: 647 / 1043 (top 62%)