In [55]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [111]:
import pandas as pd, numpy as np, gc
from sklearn.model_selection import KFold, GroupKFold
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from numerize import numerize
import gameplay_utils as utils

# Feature Pre-Processing

In [57]:
train = pd.read_csv(utils.data_wd / 'train.csv', dtype=utils.dtypes)
train.head(3).T

Unnamed: 0,0,1,2
session_id,20090312431273200,20090312431273200,20090312431273200
index,0,1,2
elapsed_time,0,1323,831
event_name,cutscene_click,person_click,person_click
name,basic,basic,basic
level,0,0,0
page,,,
room_coor_x,-413.991394,-413.991394,-413.991394
room_coor_y,-159.314682,-159.314682,-159.314682
screen_coor_x,380.0,380.0,380.0


In [71]:
train_events = train.pivot_table(values='level', index=['session_id', 'level_group'],
                                 columns='event_name', aggfunc='count')
train_events

Unnamed: 0_level_0,event_name,checkpoint,cutscene_click,map_click,map_hover,navigate_click,notebook_click,notification_click,object_click,object_hover,observation_click,person_click
session_id,level_group,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
20090312431273200,0-4,1,28,2,4,81,0,8,11,4,4,22
20090312431273200,13-22,1,60,6,14,170,0,10,20,13,3,123
20090312431273200,5-12,1,12,8,9,103,0,9,28,21,1,104
20090312433251036,0-4,1,36,3,3,49,2,5,15,5,2,18
20090312433251036,13-22,1,65,45,186,637,50,14,83,66,5,145
...,...,...,...,...,...,...,...,...,...,...,...,...
22100219442786200,13-22,1,54,8,13,181,26,8,15,11,5,101
22100219442786200,5-12,1,11,7,10,85,18,9,23,18,1,95
22100221145014656,0-4,1,31,2,2,92,6,9,27,9,5,27
22100221145014656,13-22,1,76,16,65,363,22,6,48,20,4,139


# Label Pre-Processing

In [None]:
labels = (pd.read_csv(utils.data_wd / 'train_labels.csv')
          .rename(columns=dict(session_id='sq_id')))
labels.loc[:, ['session_id', 'q']] = labels.sq_id.str.split('_', expand=True).values
labels = (labels.set_index('sq_id')
          .assign(session_id=lambda x: x.session_id.astype('int64'),
                  q=lambda x: x.q.str.replace('q', '').astype(int),
                  level_group=lambda x: pd.Categorical(np.where(x.q <= 3, '0-4', 
                                                                     np.where(x.q <= 12, '5-12', '13-22')),
                                                      **utils.level_group_kwargs)
                 )
          .set_index(['session_id', 'level_group', 'q']).sort_index()
         )
labels

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,correct
session_id,level_group,q,Unnamed: 3_level_1
20090312431273200,0-4,1,1
20090312431273200,0-4,2,1
20090312431273200,0-4,3,1
20090312431273200,5-12,4,1
20090312431273200,5-12,5,1
...,...,...,...
22100221145014656,13-22,14,0
22100221145014656,13-22,15,0
22100221145014656,13-22,16,0
22100221145014656,13-22,17,1


# Model Data

In [83]:
model_data = labels.join(train_events)
target_name = 'correct'
feature_names = [x for x in model_data.columns.tolist() if x != target_name]
print(f'Target Name: {target_name}')
print('Features Names:\n' + '-' * 40 + '\n' + ', '.join(feature_names))
X, y = model_data.loc[:, feature_names], model_data.loc[:, target_name]

Target Name: correct
Features Names:
----------------------------------------
checkpoint, cutscene_click, map_click, map_hover, navigate_click, notebook_click, notification_click, object_click, object_hover, observation_click, person_click


One question I'm thinking about is do we have to stratify by `session_id`? Probably!

The reason is that the evaluation set will most likely have only new sessions.

I'm going to skip that step for the MVP. My priority right now is to get to a submissions. We can figure out details like this in future models.

In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=69)

In [101]:
print(f'Our training data has {numerize.numerize(X_train.shape[0], 0)} records and '
      f'our test data has {numerize.numerize(X_test.shape[0], 0)} records')

Our training data has 284K records and our test data has 140K records


# Model

I'm using XGBoost based on [this notebook](https://www.kaggle.com/code/cdeotte/xgboost-baseline-0-680) from Chris Deotte!
I'm sourcing the model params directly from there.

I'm baselining even harder with only a train test split instead of cross validation!
Cross validation is a great idea by Chris though and I should definitely incorporate in future models!

In [107]:
xgb_params = dict(
    objective='binary:logistic',
    eval_metric='logloss',
    learning_rate= 0.05,
    max_depth= 4,
    n_estimators= 1000,
    early_stopping_rounds= 50,
    tree_method='hist',
    subsample=0.8,
    colsample_bytree= 0.4,
    use_label_encoder=False
)
clf =  XGBClassifier(**xgb_params)
clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=0)

In [114]:
yhat_train = clf.predict(X_train)
train_accuracy, train_f1 = accuracy_score(yhat_train, y_train), f1_score(yhat_train, y_train)
print(f'Train Accuracy: {train_accuracy:,.0%}')
print(f'Train F1: {train_f1:,.0%}')

yhat_test = clf.predict(X_test)
test_accuracy, test_f1 = accuracy_score(yhat_test, y_test), f1_score(yhat_test, y_test)
print(f'Test Accuracy: {test_accuracy:,.0%}')
print(f'Test F1: {test_f1:,.0%}')

Train Accuracy: 72%
Train F1: 83%
Test Accuracy: 71%
Test F1: 83%


Cool beans!
Looks like we predicting almost 3/4 of questions correct with an F1 score of 83%.
Not too shabby for a baseline model!

Our train and test scores are pretty close too, which means we not dangerously overfit.
There's definitely a ton of room from improvement but great start here!

# Submission

This [sample notebook](https://www.kaggle.com/code/philculliton/basic-submission-demo) shows hhow to submit.

You need to import an API (`jo_wilder`). This API delivers two data frames in specific order, for every session+level grouping (one group per session for each checkpoint).

In [2]:
import jo_wilder_310
env = jo_wilder_310.make_env()
iter_test = env.iter_test()

ModuleNotFoundError: No module named 'jo_wilder_310.competition'

In [3]:
import sys; sys.version

'3.11.3 (main, Apr 19 2023, 23:54:32) [GCC 11.2.0]'