# Create Baseline Model

- Goal: finish in the top 40% of the leaderboard (418th of 1043 participants)
- Score of the 418th-place entry: 0.98853 (multi-class log loss; lower is better)

In [73]:
# Imports
import pandas as pd
import datetime
from torch.utils.tensorboard import SummaryWriter

from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn import set_config
from sklearn.metrics import log_loss
from sklearn.compose import ColumnTransformer

# Have scikit-learn transformers return pandas DataFrames, so the processed
# feature matrices keep their column names
set_config(transform_output='pandas')

In [74]:
# Import data
# Load the training scans; `Upc` is read as a string instead of being parsed as a number
original_data = pd.read_csv('../data-ignore/train.csv', dtype={'Upc': str})
# Preview the first rows
original_data.head()

Unnamed: 0,TripType,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,999,5,Friday,68113152929,-1,FINANCIAL SERVICES,1000.0
1,30,7,Friday,60538815980,1,SHOES,8931.0
2,30,7,Friday,7410811099,1,PERSONAL CARE,4504.0
3,26,8,Friday,2238403510,2,PAINT AND ACCESSORIES,3565.0
4,26,8,Friday,2006613744,2,PAINT AND ACCESSORIES,1017.0


In [75]:
# Collapse the scan-level rows into one row per visit: the label (triptype)
# plus a handful of basket summary features
df = (
    original_data
    .groupby('VisitNumber')
    .agg(
        triptype=pd.NamedAgg(column='TripType', aggfunc='first'),
        weekday=pd.NamedAgg(column='Weekday', aggfunc='first'),
        num_unique_upc=pd.NamedAgg(column='Upc', aggfunc='nunique'),
        avg_scancount=pd.NamedAgg(column='ScanCount', aggfunc='mean'),
        total_scancount=pd.NamedAgg(column='ScanCount', aggfunc='sum'),
        num_unique_dept=pd.NamedAgg(column='DepartmentDescription', aggfunc='nunique'),
        num_unique_fileline=pd.NamedAgg(column='FinelineNumber', aggfunc='nunique'),
    )
    .reset_index()  # turn VisitNumber back into an ordinary column
)
df

Unnamed: 0,VisitNumber,triptype,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,5,999,Friday,1,-1.000000,-1,1,1
1,7,30,Friday,2,1.000000,2,2,2
2,8,26,Friday,20,1.217391,28,6,16
3,9,8,Friday,3,1.000000,3,2,3
4,10,8,Friday,3,1.000000,3,2,3
...,...,...,...,...,...,...,...,...
95669,191343,25,Sunday,7,1.285714,9,3,5
95670,191344,22,Sunday,5,1.000000,5,2,3
95671,191345,39,Sunday,13,1.307692,17,8,12
95672,191346,39,Sunday,17,1.000000,17,8,16


# Pipeline

### Preprocessing

In [76]:
# Remove the visit identifier so it is not picked up as a feature.
# NOTE: this mutates `df` in place, so re-running this cell without first
# re-running the aggregation cell raises a KeyError.
del df['VisitNumber']

# Map the raw TripType codes onto consecutive integer class ids
le = LabelEncoder().fit(df['triptype'])
df['triptype'] = le.transform(df['triptype'])

# Original TripType codes, positioned by their encoded class id
class_list = le.classes_
class_list

array([  3,   4,   5,   6,   7,   8,   9,  12,  14,  15,  18,  19,  20,
        21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,
        34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44, 999])

### Train test split

In [77]:
# Separate the encoded target from the feature columns
y = df['triptype'].copy()
X = df.drop(columns='triptype').copy()

# Hold out 15% of the visits, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    stratify=y,
)
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((81322, 6), (81322,), (14352, 6), (14352,))

### Scaling & Encoding

In [78]:
# One-hot encode the weekday and standardise every numeric feature

num_cols = list(X_train.select_dtypes(include='number').columns)

transformer = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(handle_unknown='error', sparse_output=False), ['weekday']),
        ('scaler', StandardScaler(), num_cols),
    ],
    remainder='drop',
    verbose_feature_names_out=False,
)

# Fit on the training split only, then apply the fitted transformer to the hold-out split
X_train_proc = transformer.fit_transform(X_train)
X_test_proc = transformer.transform(X_test)

X_train_proc

Unnamed: 0,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
74630,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.663062,-0.038488,-0.632608,-0.823057,-0.666973
61468,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.663062,-3.064435,-0.827308,-0.823057,-0.666973
10207,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.424896,-0.038488,-0.437909,-0.473153,-0.393816
25328,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.765932,-0.038488,0.535589,1.976168,0.835391
15352,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.408683,-0.038488,0.243540,0.576556,0.562234
...,...,...,...,...,...,...,...,...,...,...,...,...
57115,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.051435,0.177651,0.048840,-0.123250,-0.120659
360,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,1.474485,-0.535258,-0.823057,-0.666973
89473,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.067648,-0.254627,-0.145859,-0.123250,-0.393816
51195,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.051435,-0.038488,-0.048510,0.226653,0.015920


### Model 1: LogisticRegression

In [79]:
# Baseline classifier
model_lr = LogisticRegression(max_iter=1000, solver='lbfgs')

# Out-of-fold class probabilities from 3-fold cross-validation on the training split
y_proba = cross_val_predict(
    estimator=model_lr,
    X=X_train_proc,
    y=y_train,
    cv=3,
    method='predict_proba',
)
train_loss = log_loss(y_train, y_proba)

print(f"Cross-validated multi-class log loss on train: {train_loss:.4f}")

Cross-validated multi-class log loss on train: 2.3045


In [80]:
# Fit on the full training split and score the hold-out split
model_lr.fit(X_train_proc, y_train)
y_proba = model_lr.predict_proba(X_test_proc)

# predict_proba returns one column per class seen in training. Pass that class
# list explicitly: without it log_loss infers the labels from y_test, and a
# hold-out split that happens to miss a rare class would no longer line up
# with the probability columns.
test_loss = log_loss(y_test, y_proba, labels=model_lr.classes_)
test_loss

2.296101731762037

### Save to experiment tracker

In [81]:
# Free-text description of the preprocessing and feature set, logged with the run
preprocessing_notes = """
### Preprocessing
- Scaled numerical features with StandardScaler
- One-hot encoded 'weekday'

### Features used
- weekday	
- num_unique_upc	
- avg_scancount	
- total_scancount	
- num_unique_dept	
- num_unique_fileline
"""

# Use the writer as a context manager so the event file is flushed and closed
# when logging finishes (or fails); otherwise buffered events may never reach disk.
with SummaryWriter(log_dir='runs/baseline') as writer:
  # Add model metrics
  writer.add_scalar('Loss/Train', train_loss, 0)
  writer.add_scalar('Loss/Test', test_loss, 0)

  # Log model coefficients (one histogram per encoded class) and intercepts
  for i, coef in enumerate(model_lr.coef_):
    writer.add_histogram(f"Coef/Class_{i}", coef, 0)

  writer.add_histogram("Intercept", model_lr.intercept_, 0)

  writer.add_text("Preprocessing/Features", preprocessing_notes)

  # Log hyperparameters
  writer.add_text("Hyperparameters", f"max_iter={model_lr.max_iter}, solver={model_lr.solver}")

# Predict on Kaggle Test

In [82]:
# Import data
# Load the Kaggle test scans; `Upc` is read as a string, matching how train.csv was loaded
kaggle_test = pd.read_csv('../data-ignore/test.csv', dtype={'Upc': str})
# Preview the first rows
kaggle_test.head()

Unnamed: 0,VisitNumber,Weekday,Upc,ScanCount,DepartmentDescription,FinelineNumber
0,1,Friday,72503389714,1,SHOES,3002.0
1,1,Friday,1707710732,1,DAIRY,1526.0
2,1,Friday,89470001026,1,DAIRY,1431.0
3,1,Friday,88491211470,1,GROCERY DRY GOODS,3555.0
4,2,Friday,2840015224,1,DSD GROCERY,4408.0


In [83]:
# Collapse the Kaggle test scans into one row per visit, using the same
# aggregations as the training features (the test file has no TripType column)
kg = (
    kaggle_test
    .groupby('VisitNumber')
    .agg(
        weekday=pd.NamedAgg(column='Weekday', aggfunc='first'),
        num_unique_upc=pd.NamedAgg(column='Upc', aggfunc='nunique'),
        avg_scancount=pd.NamedAgg(column='ScanCount', aggfunc='mean'),
        total_scancount=pd.NamedAgg(column='ScanCount', aggfunc='sum'),
        num_unique_dept=pd.NamedAgg(column='DepartmentDescription', aggfunc='nunique'),
        num_unique_fileline=pd.NamedAgg(column='FinelineNumber', aggfunc='nunique'),
    )
    .reset_index()  # turn VisitNumber back into an ordinary column
)
kg

Unnamed: 0,VisitNumber,weekday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,1,Friday,4,1.000000,4,3,4
1,2,Friday,4,1.000000,4,3,3
2,3,Friday,1,0.000000,0,1,1
3,4,Friday,1,1.000000,1,1,1
4,6,Friday,1,0.000000,0,1,1
...,...,...,...,...,...,...,...
95669,191338,Sunday,7,1.714286,12,3,6
95670,191339,Sunday,5,1.200000,6,3,5
95671,191340,Sunday,1,2.000000,2,1,1
95672,191341,Sunday,10,1.200000,12,5,10


In [84]:
# Save the VisitNumber
# Keep the visit ids as strings; they become the row labels of the submission frame
kg_index = kg['VisitNumber'].astype(str)
kg_index.head()

0    1
1    2
2    3
3    4
4    6
Name: VisitNumber, dtype: object

In [85]:
# Build the model input without mutating `kg`: dropping VisitNumber in place
# made this cell fail with a KeyError on a second run and removed the column
# that the VisitNumber-saving cell reads.
kg_features = kg.drop(columns='VisitNumber')

# Apply the transformer fitted on the training split (no refit on test data)
kg_proc = transformer.transform(kg_features)
kg_proc.head(10)

Unnamed: 0,weekday_Friday,weekday_Monday,weekday_Saturday,weekday_Sunday,weekday_Thursday,weekday_Tuesday,weekday_Wednesday,num_unique_upc,avg_scancount,total_scancount,num_unique_dept,num_unique_fileline
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.305814,-0.038488,-0.340559,-0.12325,-0.257237
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.305814,-0.038488,-0.340559,-0.12325,-0.393816
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,-1.551462,-0.729958,-0.823057,-0.666973
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,-0.038488,-0.632608,-0.823057,-0.666973
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.663062,-1.551462,-0.729958,-0.823057,-0.666973
5,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.543979,-0.038488,-0.535258,-0.473153,-0.530394
6,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408683,0.112809,0.340889,0.926459,0.425655
7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.170518,0.528877,0.340889,0.226653,0.152498
8,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051435,-0.038488,-0.04851,0.576556,0.01592
9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.051435,-0.038488,-0.04851,0.226653,0.152498


In [86]:
# Predict on kaggle test
# One row per visit and one probability column per encoded class,
# with columns ordered as in model_lr.classes_
kg_proba = model_lr.predict_proba(kg_proc)
kg_proba

array([[2.41178048e-02, 5.16175548e-03, 6.26173942e-02, ...,
        1.56683450e-02, 2.20143351e-03, 2.56816679e-02],
       [1.03660284e-02, 5.05312398e-03, 6.75433494e-02, ...,
        1.91752119e-02, 2.42813203e-03, 2.71472878e-02],
       [2.59255918e-02, 8.05605396e-04, 2.80276914e-02, ...,
        1.69880581e-04, 1.26930675e-05, 7.96918025e-01],
       ...,
       [7.42008167e-02, 5.73085145e-03, 2.43516681e-02, ...,
        1.13533848e-03, 5.58796968e-05, 7.78009431e-04],
       [3.21196123e-05, 1.10630017e-04, 1.44109965e-03, ...,
        1.03536175e-02, 1.21975494e-02, 2.18051656e-05],
       [6.11243593e-03, 1.60568525e-03, 8.35066174e-03, ...,
        7.10584420e-03, 2.88870902e-03, 4.90363566e-03]],
      shape=(95674, 38))

In [87]:
# Read the sample submission only for its column layout:
# 'VisitNumber' followed by one 'TripType_<code>' column per class
sample_sub = pd.read_csv('../data-ignore/sample_submission.csv')
headers = list(sample_sub.columns)
headers

['VisitNumber',
 'TripType_3',
 'TripType_4',
 'TripType_5',
 'TripType_6',
 'TripType_7',
 'TripType_8',
 'TripType_9',
 'TripType_12',
 'TripType_14',
 'TripType_15',
 'TripType_18',
 'TripType_19',
 'TripType_20',
 'TripType_21',
 'TripType_22',
 'TripType_23',
 'TripType_24',
 'TripType_25',
 'TripType_26',
 'TripType_27',
 'TripType_28',
 'TripType_29',
 'TripType_30',
 'TripType_31',
 'TripType_32',
 'TripType_33',
 'TripType_34',
 'TripType_35',
 'TripType_36',
 'TripType_37',
 'TripType_38',
 'TripType_39',
 'TripType_40',
 'TripType_41',
 'TripType_42',
 'TripType_43',
 'TripType_44',
 'TripType_999']

In [88]:
# Create submission dataframe with predictions.
# Name every probability column after the TripType the model actually assigned
# to it (encoded class id -> original code) instead of assuming the sample
# submission header happens to be in the same order as the model's classes.
proba_columns = [
    f"TripType_{trip_type}" for trip_type in le.inverse_transform(model_lr.classes_)
]

# Fail loudly if the model's classes and the expected header disagree;
# reindexing alone would silently fill the missing columns with NaN.
if set(proba_columns) != set(headers[1:]):
    raise ValueError(
        "Model classes do not match the sample submission columns: "
        f"{sorted(set(proba_columns) ^ set(headers[1:]))}"
    )

submission1 = (
    pd.DataFrame(kg_proba, index=kg_index, columns=proba_columns)
    .reindex(columns=headers[1:])  # column order required by the sample submission
    .reset_index()                 # VisitNumber becomes the first column
)
submission1

Unnamed: 0,VisitNumber,TripType_3,TripType_4,TripType_5,TripType_6,TripType_7,TripType_8,TripType_9,TripType_12,TripType_14,...,TripType_36,TripType_37,TripType_38,TripType_39,TripType_40,TripType_41,TripType_42,TripType_43,TripType_44,TripType_999
0,1,0.024118,0.005162,0.062617,0.023886,0.127033,0.109903,0.064964,0.003160,3.043056e-05,...,0.049529,0.020436,0.025955,0.092932,0.001787,0.005942,0.016564,0.015668,0.002201,0.025682
1,2,0.010366,0.005053,0.067543,0.034850,0.116975,0.097688,0.081227,0.002733,2.921484e-05,...,0.039136,0.011074,0.022838,0.085009,0.001521,0.006965,0.021029,0.019175,0.002428,0.027147
2,3,0.025926,0.000806,0.028028,0.002436,0.003758,0.062474,0.066156,0.000018,1.316747e-07,...,0.000702,0.000238,0.000143,0.000263,0.000002,0.000112,0.000115,0.000170,0.000013,0.796918
3,4,0.125835,0.005821,0.066705,0.018840,0.032612,0.321856,0.282376,0.000286,3.179363e-06,...,0.007457,0.001538,0.002183,0.004579,0.000043,0.000308,0.000904,0.001058,0.000062,0.057227
4,6,0.025926,0.000806,0.028028,0.002436,0.003758,0.062474,0.066156,0.000018,1.316747e-07,...,0.000702,0.000238,0.000143,0.000263,0.000002,0.000112,0.000115,0.000170,0.000013,0.796918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95669,191338,0.000083,0.000649,0.022188,0.002337,0.075375,0.000038,0.000014,0.010878,1.415784e-04,...,0.061197,0.041381,0.088491,0.118277,0.015938,0.003962,0.022486,0.006075,0.003272,0.000005
95670,191339,0.006346,0.002607,0.033445,0.009999,0.122288,0.019307,0.009417,0.008536,5.497050e-05,...,0.063778,0.040896,0.051615,0.133918,0.004349,0.007025,0.021647,0.011465,0.002550,0.003292
95671,191340,0.074201,0.005731,0.024352,0.022598,0.054982,0.348978,0.272967,0.001654,1.559106e-05,...,0.013897,0.002414,0.008080,0.017844,0.000251,0.000227,0.001646,0.001135,0.000056,0.000778
95672,191341,0.000032,0.000111,0.001441,0.000297,0.033706,0.000032,0.000007,0.011775,7.712512e-05,...,0.064887,0.117948,0.094380,0.265177,0.037381,0.011959,0.038424,0.010354,0.012198,0.000022


In [None]:
# Save submission to csv
# Write the column header row but not the DataFrame's positional index
submission1.to_csv('../data-ignore/submission1.csv', header=True, index=False)

In [90]:
# Kaggle score: private = 2.29623, public = 2.31532
# Kaggle place 660th / 1043 = 63.3%