# Baseline Model using XGBoost

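- Goal: finish in the top 40% of the leaderboard (rank 418 or better out of 1043 participants)
- Score to beat: 418th place = 0.98853 (multi-class log loss, lower is better)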

In [21]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter

from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn import set_config
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer

from xgboost import XGBClassifier

# Have scikit-learn transformers return pandas DataFrames, so the processed
# feature matrices below keep their column names and row index.
set_config(transform_output='pandas')

In [22]:
# Load the raw training scans; several rows can share one VisitNumber.
train_csv = '../data-ignore/train.csv'
# Upc is read as text (presumably so the codes stay identifiers rather than numbers).
original_data = pd.read_csv(train_csv, dtype={'Upc': str})
original_data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538815980,1,SHOES,8931.0
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017.0


In [23]:
# Collapse the item-level rows into one row per visit: the target (triptype)
# plus a handful of basket-size features.
visit_aggregations = {
    'triptype': ('TripType', 'first'),
    'weekday': ('Weekday', 'first'),
    'num_unique_upc': ('Upc', 'nunique'),
    'avg_scancount': ('ScanCount', 'mean'),
    'total_scancount': ('ScanCount', 'sum'),
    'num_unique_dept': ('DepartmentDescription', 'nunique'),
    'num_unique_fileline': ('FinelineNumber', 'nunique'),
}
df = original_data.groupby('VisitNumber').agg(**visit_aggregations)
# Turn the VisitNumber group key back into an ordinary column.
df.reset_index(inplace=True)
df

Unnamed: 0,VisitNumber,triptype,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,5,999,Friday,1,-1.000000,-1,1,1
1,7,30,Friday,2,1.000000,2,2,2
2,8,26,Friday,20,1.217391,28,6,16
3,9,8,Friday,3,1.000000,3,2,3
4,10,8,Friday,3,1.000000,3,2,3
...,...,...,...,...,...,...,...,...
95669,191343,25,Sunday,7,1.285714,9,3,5
95670,191344,22,Sunday,5,1.000000,5,2,3
95671,191345,39,Sunday,13,1.307692,17,8,12
95672,191346,39,Sunday,17,1.000000,17,8,16


# Pipeline

### Preprocessing

In [24]:
# Map the raw TripType codes onto integer labels with LabelEncoder.
# NOTE: this cell mutates df in place, so it is meant to be run once per fresh df.
le = LabelEncoder()
encoded_triptype = le.fit_transform(df['triptype'])

# VisitNumber is not used as a feature, so remove it.
df.drop(columns='VisitNumber', inplace=True)
df['triptype'] = encoded_triptype

# Original TripType codes, in the order of their encoded labels
class_list = le.classes_
class_list

array([  3,   4,   5,   6,   7,   8,   9,  12,  14,  15,  18,  19,  20,
        21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,
        34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44, 999])

### Train test split

In [25]:
# Separate the encoded target from the feature columns
target_col = 'triptype'
y = df[target_col].copy()
X = df.drop(columns=target_col).copy()

# Hold out 15% of the visits as a test set, stratified on the target
split = train_test_split(X, y, test_size=0.15, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((81322, 6), (81322,), (14352, 6), (14352,))

### Scaling & Encoding

In [26]:
# Preprocessing: one-hot encode weekday, standardize every numeric column
cat_cols = ['weekday']
num_cols = X_train.select_dtypes(include='number').columns.to_list()

ohe = OneHotEncoder(handle_unknown='error', sparse_output=False)
scaler = StandardScaler()
transformer = ColumnTransformer(
    transformers=[('ohe', ohe, cat_cols), ('scaler', scaler, num_cols)],
    remainder='drop',
    verbose_feature_names_out=False,
)

# Fit on the training split only, then apply the fitted transformer to the holdout
X_train_proc = transformer.fit_transform(X_train)
X_test_proc = transformer.transform(X_test)

X_train_proc

Unnamed: 0,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
74630,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.663062,-0.038488,-0.632608,-0.823057,-0.666973
61468,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.663062,-3.064435,-0.827308,-0.823057,-0.666973
10207,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.424896,-0.038488,-0.437909,-0.473153,-0.393816
25328,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.765932,-0.038488,0.535589,1.976168,0.835391
15352,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.408683,-0.038488,0.243540,0.576556,0.562234
...,...,...,...,...,...,...,...,...,...,...,...,...
57115,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.051435,0.177651,0.048840,-0.123250,-0.120659
360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,1.474485,-0.535258,-0.823057,-0.666973
89473,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.067648,-0.254627,-0.145859,-0.123250,-0.393816
51195,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.051435,-0.038488,-0.048510,0.226653,0.015920


### Modeling

In [27]:
# Train model
# Take the class count from the fitted label encoder instead of hard-coding 38,
# so the model stays consistent if the label set changes.
n_classes = len(class_list)
model = XGBClassifier(objective='multi:softprob', num_class=n_classes, eval_metric='mlogloss')

# Cross validate train: out-of-fold class probabilities for every training row
y_proba = cross_val_predict(model, X_train_proc, y_train, cv=3, method='predict_proba')
# Pass the full set of encoded labels explicitly so the metric does not rely on
# every class being present in y_train.
train_loss = log_loss(y_train, y_proba, labels=list(range(n_classes)))

print(f"Cross-validated multi-class log loss on train: {train_loss:.4f}")

Cross-validated multi-class log loss on train: 2.1975


In [28]:
# Fit on the full training split and score the holdout set
model.fit(X_train_proc, y_train)
y_proba = model.predict_proba(X_test_proc)
# Give log_loss the model's label set: without it the call raises when a rare
# class has no rows in y_test, because y_proba still has a column for it.
test_loss = log_loss(y_test, y_proba, labels=model.classes_)
test_loss

2.1652875473731754

### Save to experiment tracker

In [29]:
# Free-text description of this run, stored alongside the metrics
preprocessing_notes = """
### Preprocessing
- Scaled numerical features with StandardScaler
- One-hot encoded 'weekday'

### Features used
- weekday	
- num_unique_upc	
- avg_scancount	
- total_scancount	
- num_unique_dept	
- num_unique_fileline
"""

# Use the writer as a context manager so the event file is flushed and closed
# even if one of the logging calls below raises.
with SummaryWriter(log_dir='../runs/baseline_xgb') as writer:
    # Add model metrics
    writer.add_scalar('Loss/Train', train_loss, 0)
    writer.add_scalar('Loss/Test', test_loss, 0)

    # Log model feature importances (one scalar per processed feature column,
    # plus a histogram over all of them)
    importances = model.feature_importances_
    for i, imp in enumerate(importances):
        writer.add_scalar(f"FeatureImportance/Feature_{i}", imp, 0)

    writer.add_histogram("FeatureImportances", importances, 0)

    writer.add_text("Preprocessing/Features", preprocessing_notes)

    # y_proba holds the holdout-set probabilities computed in the previous cell.
    # Flatten all predicted probabilities
    writer.add_histogram("Probabilities/AllClasses", y_proba.flatten(), 0)

    # log per-class (class_idx is the encoded label, not the raw TripType code)
    for class_idx in range(y_proba.shape[1]):
        writer.add_histogram(f"Probabilities/Class_{class_idx}", y_proba[:, class_idx], 0)

    # Distribution of the model's top predicted probability per row
    fig, ax = plt.subplots()
    ax.hist(y_proba.max(axis=1), bins=20)
    ax.set_title("Max Class Probability Distribution")
    writer.add_figure("Confidence Histogram", fig, 0)

# Predict on Kaggle Test

In [30]:
# Load the Kaggle test scans (same columns as train, minus TripType)
kaggle_test_csv = '../data-ignore/test.csv'
kaggle_test = pd.read_csv(kaggle_test_csv, dtype={'Upc': str})
kaggle_test.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503389714,1,SHOES,3002.0
1,1,Friday,1707710732,1,DAIRY,1526.0
2,1,Friday,89470001026,1,DAIRY,1431.0
3,1,Friday,88491211470,1,GROCERY DRY GOODS,3555.0
4,2,Friday,2840015224,1,DSD GROCERY,4408.0


In [31]:
# Build the same per-visit features as for training (there is no target column here)
kaggle_aggregations = {
    'weekday': ('Weekday', 'first'),
    'num_unique_upc': ('Upc', 'nunique'),
    'avg_scancount': ('ScanCount', 'mean'),
    'total_scancount': ('ScanCount', 'sum'),
    'num_unique_dept': ('DepartmentDescription', 'nunique'),
    'num_unique_fileline': ('FinelineNumber', 'nunique'),
}
kg = kaggle_test.groupby('VisitNumber').agg(**kaggle_aggregations)
# Turn the VisitNumber group key back into an ordinary column
kg.reset_index(inplace=True)
kg

Unnamed: 0,VisitNumber,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,1,Friday,4,1.000000,4,3,4
1,2,Friday,4,1.000000,4,3,3
2,3,Friday,1,0.000000,0,1,1
3,4,Friday,1,1.000000,1,1,1
4,6,Friday,1,0.000000,0,1,1
...,...,...,...,...,...,...,...
95669,191338,Sunday,7,1.714286,12,3,6
95670,191339,Sunday,5,1.200000,6,3,5
95671,191340,Sunday,1,2.000000,2,1,1
95672,191341,Sunday,10,1.200000,12,5,10


In [32]:
# Keep the visit ids (as strings) so the submission rows can be labelled later.
# This must run before VisitNumber is dropped from kg.
kg_index = kg.loc[:, 'VisitNumber'].astype(str)
kg_index.head()

0    1
1    2
2    3
3    4
4    6
Name: VisitNumber, dtype: object

In [33]:
# Drop the VisitNumber column.
# Non-in-place drop with errors='ignore' so re-running this cell does not raise
# a KeyError once the column is already gone.
kg = kg.drop(columns='VisitNumber', errors='ignore')

# Run the preprocessing transformer fitted on the training split
kg_proc = transformer.transform(kg)
kg_proc.head(10)

Unnamed: 0,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.305814,-0.038488,-0.340559,-0.12325,-0.257237
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.305814,-0.038488,-0.340559,-0.12325,-0.393816
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,-1.551462,-0.729958,-0.823057,-0.666973
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,-0.038488,-0.632608,-0.823057,-0.666973
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,-1.551462,-0.729958,-0.823057,-0.666973
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.543979,-0.038488,-0.535258,-0.473153,-0.530394
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408683,0.112809,0.340889,0.926459,0.425655
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170518,0.528877,0.340889,0.226653,0.152498
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051435,-0.038488,-0.04851,0.576556,0.01592
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051435,-0.038488,-0.04851,0.226653,0.152498


In [34]:
# Class probabilities for every Kaggle test visit: one row per visit,
# one column per encoded trip type
kg_proba = model.predict_proba(kg_proc)
kg_proba

array([[6.69823668e-04, 8.50425381e-03, 4.87887710e-02, ...,
        1.92619376e-02, 1.67543942e-03, 5.64805744e-03],
       [5.97441453e-04, 1.46969117e-03, 3.17238942e-02, ...,
        1.33483885e-02, 3.13356821e-03, 1.34360380e-02],
       [2.96764738e-05, 2.29443299e-06, 3.23339154e-05, ...,
        9.26129502e-08, 6.34022754e-08, 9.98272181e-01],
       ...,
       [3.64044006e-03, 2.96663260e-03, 3.06051336e-02, ...,
        8.84115798e-06, 5.78014669e-06, 3.98550890e-02],
       [3.62725063e-06, 6.57929559e-05, 1.59967840e-02, ...,
        1.54962586e-02, 1.82930604e-02, 1.45816681e-04],
       [2.38359300e-03, 6.24049781e-03, 4.70122099e-02, ...,
        9.85510554e-03, 6.09356118e-03, 3.78366839e-03]],
      shape=(95674, 38), dtype=float32)

In [35]:
# Read the sample submission to get the expected column layout
sample_sub_csv = '../data-ignore/sample_submission.csv'
sample_sub = pd.read_csv(sample_sub_csv)
headers = list(sample_sub.columns)
headers

['VisitNumber',
 'TripType_3',
 'TripType_4',
 'TripType_5',
 'TripType_6',
 'TripType_7',
 'TripType_8',
 'TripType_9',
 'TripType_12',
 'TripType_14',
 'TripType_15',
 'TripType_18',
 'TripType_19',
 'TripType_20',
 'TripType_21',
 'TripType_22',
 'TripType_23',
 'TripType_24',
 'TripType_25',
 'TripType_26',
 'TripType_27',
 'TripType_28',
 'TripType_29',
 'TripType_30',
 'TripType_31',
 'TripType_32',
 'TripType_33',
 'TripType_34',
 'TripType_35',
 'TripType_36',
 'TripType_37',
 'TripType_38',
 'TripType_39',
 'TripType_40',
 'TripType_41',
 'TripType_42',
 'TripType_43',
 'TripType_44',
 'TripType_999']

In [36]:
# Create submission dataframe with predictions.
# The model was trained on label-encoded targets, so the probability columns are
# in class_list order. Name them from class_list rather than assuming the
# sample-submission header happens to be in the same order.
proba_columns = [f"TripType_{trip_type}" for trip_type in class_list]
expected_columns = headers[1:]  # headers[0] is the VisitNumber id column

if set(proba_columns) != set(expected_columns):
    raise ValueError(
        "Predicted classes do not match the sample submission columns: "
        f"{sorted(set(proba_columns) ^ set(expected_columns))}"
    )

submission = pd.DataFrame(kg_proba, index=kg_index, columns=proba_columns)
# Put the columns in the order the submission file expects, then move the
# VisitNumber index into the first column.
submission = submission[expected_columns].reset_index()
submission

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,0.000670,0.008504,0.048789,0.014986,0.074966,0.028406,0.012976,0.003796,1.277288e-06,...,0.092038,0.025643,0.047978,0.108621,1.559826e-04,0.008955,0.042377,1.926194e-02,1.675439e-03,0.005648
1,2,0.000597,0.001470,0.031724,0.022029,0.078093,0.020806,0.009684,0.004464,6.030197e-06,...,0.063087,0.009508,0.050005,0.116448,2.000954e-04,0.007364,0.034995,1.334839e-02,3.133568e-03,0.013436
2,3,0.000030,0.000002,0.000032,0.000018,0.000030,0.000200,0.000075,0.000003,2.595615e-08,...,0.000010,0.000005,0.000013,0.000002,7.211104e-07,0.000012,0.000002,9.261295e-08,6.340228e-08,0.998272
3,4,0.086811,0.010125,0.060122,0.031385,0.076197,0.316479,0.284150,0.000060,3.059809e-06,...,0.003143,0.002273,0.000877,0.000350,1.031009e-04,0.000311,0.000741,2.231420e-05,1.228524e-05,0.046275
4,6,0.000030,0.000002,0.000032,0.000018,0.000030,0.000200,0.000075,0.000003,2.595615e-08,...,0.000010,0.000005,0.000013,0.000002,7.211104e-07,0.000012,0.000002,9.261295e-08,6.340228e-08,0.998272
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,0.000048,0.000017,0.074071,0.001318,0.137350,0.000009,0.000003,0.002615,1.608705e-04,...,0.016267,0.118280,0.045342,0.194098,8.458779e-03,0.002835,0.039198,4.901049e-04,3.506094e-04,0.004923
95670,191339,0.000054,0.002503,0.054425,0.004602,0.072113,0.000881,0.000147,0.005641,1.317236e-06,...,0.044439,0.044832,0.063925,0.128068,1.313457e-04,0.011267,0.028184,2.174642e-02,4.549739e-03,0.004450
95671,191340,0.003640,0.002967,0.030605,0.028778,0.111439,0.393907,0.236336,0.004228,6.378179e-06,...,0.001016,0.000279,0.000233,0.000082,9.434490e-05,0.000006,0.000203,8.841158e-06,5.780147e-06,0.039855
95672,191341,0.000004,0.000066,0.015997,0.000351,0.078230,0.000006,0.000003,0.002946,3.130251e-06,...,0.033098,0.069901,0.087919,0.408950,1.513500e-02,0.009274,0.030790,1.549626e-02,1.829306e-02,0.000146


In [37]:
# Write the submission file: header row included, no index column
submission_csv = '../data-ignore/submission3.csv'
submission.to_csv(submission_csv, index=False)

In [39]:
# Kaggle score (multi-class log loss): private = 2.16854, public = 2.18641
# Kaggle place: 647 / 1043 (top 62%) - short of the top-40% goal