In [1]:
import pandas as pd
import numpy as np

import  sklearn

from sklearn.preprocessing import LabelEncoder

import optuna 

import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

import warnings

import dask.array as da
import dask.distributed

In [2]:
# Start a local Dask cluster with 8 workers (one thread each) and connect a
# client to it.
cluster = dask.distributed.LocalCluster(n_workers=8, threads_per_worker=1)
client = dask.distributed.Client(cluster)

In [4]:
# Display the client summary (scheduler address, dashboard link, workers).
client

0,1
Client  Scheduler: tcp://127.0.0.1:41335  Dashboard: http://127.0.0.1:44373/status,Cluster  Workers: 8  Cores: 8  Memory: 67.45 GB


In [11]:
# Explicit dtypes for the interaction log: narrow integer/float widths, plus the
# nullable 'boolean' dtype for the explanation flag.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# For a quicker run on a subset, add nrows=5e6 to this call.
train = pd.read_csv(
    "~/kaggledatasets/riiid-test-answer-prediction/train.csv",
    dtype=train_dtypes,
)

# Question and lecture metadata, read with pandas' inferred dtypes.
questions = pd.read_csv("~/kaggledatasets/riiid-test-answer-prediction/questions.csv")
lectures = pd.read_csv("~/kaggledatasets/riiid-test-answer-prediction/lectures.csv")

In [12]:
def split_train_validation():
    """Hold out the last 20 rows of every user as validation data.

    Rebinds the module-level `train` to the rows that were not held out, so
    calling this twice removes a further 20 rows per user.

    Returns:
        (train, validation): the reduced training frame and the held-out frame.
    """
    global train
    holdout = train.groupby("user_id").tail(20)
    keep_mask = ~train.index.isin(holdout.index)
    train = train[keep_mask]
    # The ratio is held-out rows relative to the *remaining* training rows.
    ratio = round((holdout.shape[0] * 100) / train.shape[0], 2)
    print(f"valaidation data percentage={ratio}")
    return train, holdout


def feature_engineering_utils(m_content_id, m_task_container_id):
    """Build the smoothed target encodings and the elapsed-time mean from `train`.

    Args:
        m_content_id: smoothing weight for the per-`content_id` encoding.
        m_task_container_id: smoothing weight for the per-`task_container_id`
            encoding.

    Returns:
        (te_content_id, te_task_container_id, elapsed_mean): two Series indexed
        by group key and named "te_content_id" / "te_task_container_id", and the
        mean of `train.prior_question_elapsed_time`.
    """
    # `name` is a required argument of calc_smooth_mean; omitting it raised a
    # TypeError. Passing it also names the Series, so no rename is needed after.
    te_content_id = calc_smooth_mean(
        by="content_id",
        on="answered_correctly",
        m=m_content_id,
        name="te_content_id",
    )
    te_task_container_id = calc_smooth_mean(
        by="task_container_id",
        on="answered_correctly",
        m=m_task_container_id,
        name="te_task_container_id",
    )
    elapsed_mean = train.prior_question_elapsed_time.mean()
    return te_content_id, te_task_container_id, elapsed_mean
        
        
def calc_smooth_mean(by, on, m, name=None, data=None):
    """Smoothed (shrunk-towards-the-global-mean) target encoding of `on` by `by`.

    For every group the result is
    (count * group_mean + m * global_mean) / (count + m).

    Args:
        by: column (or columns) to group on.
        on: column whose mean is encoded.
        m: smoothing weight; 0 gives the plain group means, larger values pull
            small groups towards the global mean.
        name: name given to the returned Series. Defaults to f"te_{by}".
        data: frame to compute on. Defaults to the module-level `train`.

    Returns:
        pandas Series indexed by group key, holding the smoothed means.
    """
    frame = train if data is None else data
    # Mean over the whole frame, used as the prior.
    prior = frame[on].mean()
    # Size and mean of each group.
    stats = frame.groupby(by)[on].agg(['count', 'mean'])
    counts = stats['count']
    smooth = (counts * stats['mean'] + m * prior) / (counts + m)
    smooth.name = f"te_{by}" if name is None else name
    return smooth
    
def get_train_data(data, te_content_id, te_task_container_id):
    """Attach both target encodings to `data` as new columns.

    Args:
        data: frame with `content_id` and `task_container_id` columns.
        te_content_id: named Series indexed by content id.
        te_task_container_id: named Series indexed by task container id.

    Returns:
        A new frame with one column per encoding, named after the Series. Keys
        missing from an encoding are filled with that encoding's mean. The
        input frame is not modified.
    """
    data = pd.merge(data, te_content_id, left_on='content_id', right_index=True, how='left')
    data = pd.merge(data, te_task_container_id, left_on='task_container_id', right_index=True, how='left')
    # Assign the filled column back instead of `data.col.fillna(..., inplace=True)`:
    # the chained in-place form acts on a temporary under pandas copy-on-write
    # and leaves the NaNs in place.
    for encoding in (te_content_id, te_task_container_id):
        data[encoding.name] = data[encoding.name].fillna(encoding.mean())
    return data


    
class DataPipeline:
    """Filter and impute the train/validation frames.

    On construction both frames are passed through `clean_impute`; the cleaned
    results are available as `self.train` and `self.validation`.
    """

    def __init__(self, train, validation):
        """Store and clean both frames.

        Args:
            train: training frame; its `prior_question_elapsed_time` mean
                (taken before any filtering) is the fill value for both frames.
            validation: validation frame.
        """
        self.train = train
        self.validation = validation
        self.elapsed_mean = self.train.prior_question_elapsed_time.mean()
        self.train = self.clean_impute(self.train)
        self.validation = self.clean_impute(self.validation)
        self.is_state_reset = False
        # Attribute name (including its spelling) is kept for existing callers.
        self.usefull_columns = ['user_id',
                                'timestamp',
                                'content_id',
                                'task_container_id',
                                'prior_question_elapsed_time',
                                'prior_question_had_explanation',
                                'answered_correctly',
                                ]

    def clean_impute(self, data):
        """Return the `content_type_id == 0` rows of `data` with gaps filled.

        Missing `prior_question_elapsed_time` becomes `self.elapsed_mean`;
        missing `prior_question_had_explanation` becomes False and the column
        is cast to int. The input frame is left untouched.
        """
        # Work on an explicit copy and assign columns back. In-place fillna on
        # a filtered slice triggers SettingWithCopyWarning and is a no-op under
        # pandas copy-on-write, after which astype(int) would fail on the NAs.
        cleaned = data[data.content_type_id == 0].copy()
        cleaned["prior_question_elapsed_time"] = (
            cleaned["prior_question_elapsed_time"].fillna(self.elapsed_mean)
        )
        cleaned["prior_question_had_explanation"] = (
            cleaned["prior_question_had_explanation"].fillna(False).astype(int)
        )
        return cleaned


def get_preprocessed_train_val():
    """Split the global `train` into train/validation and clean both parts.

    Returns:
        (train, validation): the frames after `DataPipeline` has filtered and
        imputed them.
    """
    raw_train, raw_validation = split_train_validation()
    pipeline = DataPipeline(raw_train, raw_validation)
    # DataPipeline.__init__ already ran clean_impute on both frames; reuse those
    # results instead of filtering and imputing the raw frames a second time.
    return pipeline.train, pipeline.validation



In [13]:
# Hold out each user's last 20 rows as validation, then filter and impute both
# frames. `train` is rebound here, so re-running this cell splits the already
# reduced frame again.
train, validation = get_preprocessed_train_val()

valaidation data percentage=8.12


In [14]:
# Number of distinct users left in the training split.
train.user_id.nunique()

332654

In [15]:
# Number of distinct users in the validation split. A user with 20 or fewer rows
# has all of them held out, so this can exceed the training count.
validation.user_id.nunique()

393644

In [16]:
# Preview the cleaned training rows.
train.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,13741.267578,0
1,1,56943,115,5716,0,2,2,1,37000.0,0
2,2,118363,115,128,0,0,0,1,55000.0,0
3,3,131167,115,7860,0,3,0,1,19000.0,0
4,4,137965,115,7922,0,4,1,1,11000.0,0


In [17]:
# Preview the question metadata; `tags` is a space-separated string per question.
questions.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92
3,3,3,0,1,131 149 162 29
4,4,4,3,1,131 5 162 38


In [18]:
# Preview the lecture metadata.
lectures.head()

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


### Feature Engineering
* Remove the lecture rows
* Include question features (`bundle_id`, `part`, `tags`) from `questions.csv`

In [44]:
# Encode each question's tag string as a single integer id (LabelEncoder is
# imported in the first cell). The raw strings are kept in `tags_raw` so that
# re-running this cell encodes the original values again rather than the
# integers produced by a previous run.
if 'tags_raw' not in questions.columns:
    questions['tags_raw'] = questions['tags']
questions['tags'] = LabelEncoder().fit_transform(questions['tags_raw'].astype(str))


In [50]:
# Attach bundle, part and encoded tags to every training row: left join on
# content_id == question_id, so all training rows are kept.
question_features = questions[['question_id', 'bundle_id', 'part', 'tags']]
train = train.merge(question_features, left_on='content_id', right_on='question_id', how='left')

In [51]:
# Attach bundle, part and encoded tags to every validation row: left join on
# content_id == question_id, so all validation rows are kept.
question_features = questions[['question_id', 'bundle_id', 'part', 'tags']]
validation = validation.merge(question_features, left_on='content_id', right_on='question_id', how='left')

In [54]:
# Check that the question columns were attached to the validation rows.
validation.head()

Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,question_id,bundle_id,part,tags
0,26,621464,115,6,0,26,2,1,20000.0,0,6,6,1,26
1,27,645415,115,172,0,27,1,1,22000.0,0,172,172,1,9
2,28,670520,115,7898,0,28,2,1,22000.0,0,7898,7898,1,337
3,29,692971,115,175,0,29,0,0,23000.0,0,175,175,1,1376
4,30,710402,115,100,0,30,0,1,20000.0,0,100,100,1,322


In [55]:
pwd

'/home/aravind/Riiid-Answer-Correctness-Prediction/notebooks'

In [56]:
import os

# Create the output directory if needed. exist_ok makes the cell safe to re-run
# and avoids the gap between checking for the directory and creating it.
os.makedirs("../data", exist_ok=True)

In [58]:
# Columns written to disk for both splits: the original interaction columns
# (without user_answer) plus the question features added by the merge.
useful_columns = [
    'row_id',
    'timestamp',
    'user_id',
    'content_id',
    'content_type_id',
    'task_container_id',
    'answered_correctly',
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    'bundle_id',
    'part',
    'tags',
]

In [61]:
# Write the feather files into ../data, where the read-back check loads
# '../data/train.feather' from and where the parquet exports also go. Writing
# them to the working directory left that read pointing at a missing file.
train[useful_columns].reset_index(drop=True).to_feather('../data/train.feather')
validation[useful_columns].reset_index(drop=True).to_feather('../data/test.feather')


In [65]:
# Read the training feather file back from ../data as a sanity check.
tmp = pd.read_feather('../data/train.feather')

In [69]:
import gc
# Drop the read-back copy and run a collection so its memory can be reclaimed.
del tmp
gc.collect()

21111

In [70]:
import dask.dataframe as dd

# Persist both splits as gzip-compressed parquet files under ../data.
parquet_targets = {
    '../data/train.parquet.gzip': train,
    '../data/validation.parquet.gzip': validation,
}
for parquet_path, split_frame in parquet_targets.items():
    split_frame[useful_columns].reset_index(drop=True).to_parquet(parquet_path, compression='gzip')

In [75]:
# Replace the 8-worker cluster created at the top of the notebook with a
# 4-worker one. Close the old client and cluster first; rebinding the names
# alone would leave the previous workers running.
client.close()
cluster.close()
cluster = dask.distributed.LocalCluster(n_workers=4, threads_per_worker=1)
client = dask.distributed.Client(cluster)

In [76]:
# Display the summary of the new client (scheduler address, dashboard, workers).
client

0,1
Client  Scheduler: tcp://127.0.0.1:43959  Dashboard: http://127.0.0.1:43019/status,Cluster  Workers: 4  Cores: 4  Memory: 67.45 GB


In [83]:
def objective(trial):
    """Optuna objective: train XGBoost on the Dask cluster, return validation AUC.

    Reads the module-level `train`, `validation` and `client`. Every column
    except `answered_correctly` is used as a feature.

    Args:
        trial: the `optuna.trial.Trial` used to sample hyper-parameters.

    Returns:
        ROC AUC of the trained model's predictions on `validation`.
    """
    # Local imports: the notebook only runs `import sklearn`, which does not
    # guarantee that `sklearn.metrics` is loaded, and the Dask interface of
    # XGBoost is imported explicitly rather than relied on as `xgb.dask`.
    from sklearn.metrics import roc_auc_score
    from xgboost import dask as dxgb

    # NOTE(review): these two smoothing weights are sampled (and so recorded in
    # the study) but nothing below uses them; the target-encoding features they
    # were presumably meant for are never built here.
    trial.suggest_loguniform("m_content_id", 1, 1000)
    trial.suggest_loguniform("m_task_container_id", 1, 1000)

    # from_pandas needs an explicit partition count on older Dask versions; the
    # validation call previously omitted it.
    n_partitions = 16
    train_ddf = dd.from_pandas(train, npartitions=n_partitions)
    val_ddf = dd.from_pandas(validation, npartitions=n_partitions)

    # NOTE(review): the feature set includes row_id, user_answer and question_id
    # because only the label is dropped; confirm that is intended.
    train_x = train_ddf.drop("answered_correctly", axis=1)
    train_y = train_ddf["answered_correctly"]
    test_x = val_ddf.drop("answered_correctly", axis=1)
    test_y = val_ddf["answered_correctly"]

    # Both matrices must be DaskDMatrix: a plain xgb.DMatrix cannot be built
    # from Dask collections, and the Dask trainer expects Dask eval sets.
    dtrain = dxgb.DaskDMatrix(client, train_x, train_y)
    dtest = dxgb.DaskDMatrix(client, test_x, test_y)
    del train_ddf, val_ddf, train_x, train_y
    gc.collect()

    # NOTE(review): the GPU settings are kept as written; the cluster above is
    # a plain LocalCluster, so confirm the workers can actually see GPU 0.
    param = {
        "objective": "binary:logistic",
        "eval_metric": "auc",
        'gpu_id': 0,
        'tree_method': 'gpu_hist',
        "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
    }

    # Tree-specific parameters are only sampled for the tree boosters.
    if param["booster"] in ("gbtree", "dart"):
        param["max_depth"] = trial.suggest_int("max_depth", 1, 20)
        param["eta"] = trial.suggest_loguniform("eta", 1e-8, 1.0)
        param["gamma"] = trial.suggest_loguniform("gamma", 1e-8, 1.0)
        param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])
    if param["booster"] == "dart":
        param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
        param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
        param["rate_drop"] = trial.suggest_loguniform("rate_drop", 1e-8, 1.0)
        param["skip_drop"] = trial.suggest_loguniform("skip_drop", 1e-8, 1.0)

    # Distributed training goes through xgboost.dask.train, which returns a
    # dict holding the booster and the evaluation history. Pruning via
    # optuna.integration.XGBoostPruningCallback is intentionally left out.
    output = dxgb.train(client, param, dtrain, evals=[(dtest, "validation")])

    # Predict on the Dask frame and materialise predictions and labels locally
    # before scoring; both come from the same partitions of the validation frame.
    preds = dxgb.predict(client, output, test_x).compute()
    labels = test_y.compute()
    return roc_auc_score(labels, preds)

In [84]:
# Run a short in-memory study (2 trials) that maximises the AUC returned by
# `objective`.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=2)

[I 2020-11-20 11:17:53,482] A new study created in memory with name: no-name-ace4738e-6c00-4bf9-8f37-c0db4a79cbfd
