In [15]:
import cudf
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os

from pathlib import Path
from sklearn.base import TransformerMixin

### Read data

In [5]:
base_dir = Path('/rapids/ml')
data_dir = base_dir/'data/kaggle/riiideducation/'
!ls -lrth $data_dir

total 6.9G
-rw-rw-r-- 1 1000 1000 290K Oct 14 17:45 questions.csv
-rw-rw-r-- 1 1000 1000 9.5K Oct 14 17:45 lectures.csv
-rw-rw-r-- 1 1000 1000 6.0K Oct 14 17:45 example_test.csv
-rw-rw-r-- 1 1000 1000  971 Oct 14 17:45 example_sample_submission.csv
-rw-rw-r-- 1 1000 1000 5.5G Oct 14 17:46 train.csv
-rw-rw-r-- 1 1000 1000 1.5G Dec 18 12:43 train.pq


In [6]:
%time
train_csv_path = data_dir/'train.csv'
train_pq_path = data_dir/'train.pq'

# parquet saves data types and also reduces disk space. Create it if not exists.
if train_pq_path.is_file():
    print(f'{train_pq_path} exists')
else:
    print(f'training file {train_pq_path} does not exist. Creating...')
    train_df = cudf.read_csv((data_dir/'train.csv').as_posix(),
                       dtype={'row_id': 'int64', 'timestamp': 'int64', 'user_id': 'int32', 'content_id': 'int16', 'content_type_id': 'int8',
                              'task_container_id': 'int16', 'user_answer': 'int8', 'answered_correctly': 'int8', 'prior_question_elapsed_time': 'float32', 
                             'prior_question_had_explanation': 'boolean',
                             }
                      )
    train_df.to_parquet(train_pq_path)       

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 7.15 µs
/rapids/ml/data/kaggle/riiideducation/train.pq exists


In [7]:
# read training data
train_df = cudf.read_parquet(train_pq_path); print(f'train_df.shape - {train_df.shape}')
train_df.head(3)

train_df.shape - (101230332, 10)


Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
0,0,0,115,5692,0,1,3,1,,
1,1,56943,115,5716,0,2,2,1,37000.0,False
2,2,118363,115,128,0,0,0,1,55000.0,False


In [8]:
%time
questions_df = cudf.read_csv((data_dir/'questions.csv').as_posix())
lectures_df = cudf.read_csv((data_dir/'lectures.csv').as_posix())
example_test = cudf.read_csv((data_dir/'example_test.csv').as_posix())

print(f'qustions.shape - {questions_df.shape}')
print(f'lectures.shape - {lectures_df.shape}')

CPU times: user 2 µs, sys: 2 µs, total: 4 µs
Wall time: 9.06 µs
qustions.shape - (13523, 5)
lectures.shape - (418, 4)


In [9]:
questions_df.head(3)

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags
0,0,0,0,1,51 131 162 38
1,1,1,1,1,131 36 81
2,2,2,0,1,131 101 162 92


In [10]:
lectures_df.head(3)

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept


In [11]:
example_test.head(3)

Unnamed: 0,row_id,group_num,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
0,0,0,0,275030867,5729,0,0,,,[],[]
1,1,0,13309898705,554169193,12010,0,4427,19000.0,True,,
2,2,0,4213672059,1720860329,457,0,240,17000.0,True,,


In [12]:
print(f'Number of unique users \t\t- {train_df.user_id.nunique()}')
print(f'Number of unique content ids \t- {train_df.content_id.nunique()}')
print(f'Number of unique content types\t- {train_df.content_type_id.nunique()}')
print(f'prior_question_elapsed_time.mean() \t- {train_df.prior_question_elapsed_time.mean()}')
print(f'num_rows per user \t - {train_df.shape[0]/train_df.user_id.nunique()}')

Number of unique users 		- 393656
Number of unique content ids 	- 13782
Number of unique content types	- 2
prior_question_elapsed_time.mean() 	- 25423.810042960275
num_rows per user 	 - 257.1542971528441


In [13]:
train_df.prior_question_elapsed_time.fillna(0, inplace=True)

## Preprocessing

In [16]:
class Preprocess(TransformerMixin):
    def __init__(self, questions_df, lectures_df):
        self.questions_df = questions_df
        self.lectures_df = lectures_df
    
    def fit(self, df):
        return self
    
    def transform(self, df):
        df = df.merge(self.questions_df, left_on='content_id',right_on='question_id', how='left')\
                .merge(self.lectures_df, left_on='content_id',right_on='lecture_id', how='left')

        # fill nulls
        df.type_of.fillna('', inplace=True)
        df.part_y.fillna(0, inplace=True)
        df.part_x.fillna(0, inplace=True)
        df.correct_answer.fillna(-1, inplace=True)
        df.question_id.fillna(-99, inplace=True)
        df.bundle_id.fillna(-99, inplace=True)
        df.lecture_id.fillna(-100, inplace=True)
        df.tag.fillna(-99, inplace=True)

        df.prior_question_had_explanation = train2.prior_question_had_explanation.astype(np.int8)
        df.prior_question_had_explanation.fillna(-1, inplace=True)  
        return df

In [14]:
#         train2 = train_df.merge(questions_df, left_on='content_id',right_on='question_id', how='left')\
#                 .merge(lectures_df, left_on='content_id',right_on='lecture_id', how='left')

#         # fill nulls
#         train2.type_of.fillna('', inplace=True)
#         train2.part_y.fillna(0, inplace=True)
#         train2.part_x.fillna(0, inplace=True)
#         train2.correct_answer.fillna(-1, inplace=True)
#         train2.question_id.fillna(-99, inplace=True)
#         train2.bundle_id.fillna(-99, inplace=True)
#         train2.lecture_id.fillna(-100, inplace=True)
#         train2.tag.fillna(-99, inplace=True)

#         train2.prior_question_had_explanation = train2.prior_question_had_explanation.astype(np.int8)
#         train2.prior_question_had_explanation.fillna(-1, inplace=True) 

In [12]:
train2.isna().sum(axis=0)

row_id                                  0
timestamp                               0
user_id                                 0
content_id                              0
content_type_id                         0
task_container_id                       0
user_answer                             0
answered_correctly                      0
prior_question_elapsed_time             0
prior_question_had_explanation          0
question_id                             0
bundle_id                               0
correct_answer                          0
part_x                                  0
tags                              1191808
lecture_id                              0
tag                                     0
part_y                                  0
type_of                                 0
dtype: uint64

In [26]:
cont_cols = ['timestamp','prior_question_elapsed_time']
cat_cols = ['user_id','content_id', 'content_type_id', 'task_container_id', 'user_answer',
           'prior_question_had_explanation', 'question_id', 'bundle_id', 'correct_answer',
           'lecture_id','tag']
target = 'answered_correctly'

In [None]:
train2.prior_question_had_explanation = train2.prior_question_had_explanation.astype(np.int8)

## Training
### Random Forest

In [19]:
import numpy as np
from cuml.ensemble import RandomForestClassifier as cuRFC

train2_df = Preprocess(questions_df, lectures_df).fit_transform(train_df)

In [27]:
model = cuRFC()
model.fit(train2_df[cat_cols + cont_cols], train2_df[target])

ValueError: The labels need to be consecutive values from 0 to the number of unique label values

In [30]:
train2_df[target].value_counts()

 1    65244627
 0    34026673
-1     1959032
Name: answered_correctly, dtype: int32

### Neural Network

In [None]:
train_df.shape[0]

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pytorch_lightning as pl

In [None]:
class EduModel(pl.LightningModule):
    def __init__(hparams, *args, **kwargs):
        pass
    
    def forward