<a href="https://colab.research.google.com/github/ajenningsfrankston/riiid_kt/blob/master/Riiid_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

import os,shutil, random 
import feather



In [3]:
# Install the Kaggle command-line client (quiet mode).
# NOTE(review): unpinned `!pip install`; consider `%pip install -q kaggle==<version>` so the
# install targets the kernel's environment and is reproducible.
!pip install -q kaggle

# Mount Google Drive into the Colab filesystem at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

# Point the Kaggle client at a config directory on the mounted Drive.
# (presumably this folder holds the kaggle.json API token — confirm)
import os
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# Download the competition files; files that already exist locally are skipped
# (see the recorded output of this cell).
!kaggle competitions download  -c riiid-test-answer-prediction 


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
__init__.py: Skipping, found more recently modified local copy (use --force to force download)
competition.cpython-37m-x86_64-linux-gnu.so: Skipping, found more recently modified local copy (use --force to force download)
questions.csv: Skipping, found more recently modified local copy (use --force to force download)
lectures.csv: Skipping, found more recently modified local copy (use --force to force download)
example_test.csv: Skipping, found more recently modified local copy (use --force to force download)
train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
example_sample_submission.csv: Skipping, found more recently modified local copy (use --force to force download)


In [4]:


# Make sure the '../input/riiid-test-answer-prediction' directory exists.
# exist_ok=True makes the call idempotent and avoids the check-then-create
# race of `if not os.path.exists(...): os.makedirs(...)`.
os.makedirs('../input/riiid-test-answer-prediction', exist_ok=True)

# Load the training data from the feather file stored on the mounted Drive.
train_df = pd.read_feather('/content/gdrive/My Drive/Kaggle/train.feather')


In [5]:
!pwd

/content


In [6]:
    
# Move the small competition files from the download directory into the
# input folder, leaving alone any file that is already present there.
source_dir = './'
target_dir = '../input/riiid-test-answer-prediction'

file_names = ['lectures.csv','questions.csv','example_test.csv','example_sample_submission.csv']

for file_name in file_names:
    already_in_place = os.path.exists(target_dir + '/' + file_name)
    if already_in_place:
        continue
    shutil.move(os.path.join(source_dir, file_name), target_dir)

# Show what ended up in the input folder.
os.listdir('../input/riiid-test-answer-prediction')


['questions.csv',
 'example_test.csv',
 'lectures.csv',
 'train.csv',
 'example_sample_submission.csv']

We need to create the test set (by splitting train into train/test) before creating the validation set. The splitting code below is taken from the notebook "Riiid: Creating a test dataset".



In [7]:
def dataset_split(df, q=0.975, print_timestamps=False):
    """Split the rows of ``df`` in time order on a randomised common timeline.

    Every user is given a random start offset drawn from
    ``[0, max_timestamp - <that user's last timestamp>)``, where
    ``max_timestamp`` is the largest timestamp in the whole frame. The offset
    is added to each of the user's timestamps, the rows are sorted by the
    shifted timestamp, and the sorted order is cut at the requested quantiles.

    The offsets come from the global NumPy random state (``np.random.random``),
    so call ``np.random.seed`` beforehand for a repeatable split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id`` and ``timestamp``.
    q : float or sequence of float
        Cut point(s) as fractions of the number of rows. A scalar yields two
        parts; a list, tuple or array of k fractions yields k + 1 parts.
        Each cut position is ``q * n_rows`` truncated to an integer.
    print_timestamps : bool
        When true, print the intermediate frame holding the original and the
        shifted timestamps.

    Returns
    -------
    list of numpy.ndarray
        Arrays of row positions, in shifted-timestamp order, one per part.
        They are intended for ``df.iloc[...]``; this relies on the merge
        below keeping the row order of ``df``.
    """
    max_timestamp_per_user = df[['user_id','timestamp']].groupby('user_id').max()
    max_timestamp = max_timestamp_per_user.timestamp.max()

    # One random draw per user: how far that user's whole history is shifted.
    headroom = max_timestamp - max_timestamp_per_user.timestamp
    max_timestamp_per_user['initial_timestamp'] = (
        headroom * np.random.random(size=len(max_timestamp_per_user))
    ).astype('uint64')

    # Broadcast each user's offset back onto that user's rows.
    timestamps_df = pd.merge(
        df[['user_id','timestamp']],
        max_timestamp_per_user['initial_timestamp'],
        left_on='user_id',
        right_index=True,
    )
    timestamps_df['modified_timestamp'] = (
        timestamps_df.initial_timestamp + timestamps_df.timestamp
    ).astype('uint64')
    if print_timestamps:
        print(timestamps_df)

    ordered_rows = timestamps_df.modified_timestamp.values.argsort()

    # Turn the fraction(s) into integer cut positions. np.atleast_1d lets a
    # scalar, list, tuple or ndarray of quantiles share one code path (the
    # previous `type(q)==list` test mishandled tuples and arrays).
    cut_points = (
        np.atleast_1d(np.asarray(q, dtype=float)) * len(ordered_rows)
    ).astype(np.int64)

    # np.split returns [rows[:c0], rows[c0:c1], ..., rows[c_last:]].
    return np.split(ordered_rows, cut_points.tolist())



In [8]:
# Seed the global NumPy generator so the random per-user offsets used by
# dataset_split (and therefore the resulting split) are repeatable.
np.random.seed(seed=49)
# With q=0.25 the first part (train_idx) holds the earliest 25% of rows on the
# shifted timeline and the second part (test_idx) holds the remaining 75%.
train_idx, test_idx = dataset_split(df=train_df, q=0.25, print_timestamps=True)

              user_id  timestamp  initial_timestamp  modified_timestamp
0                 115          0        26110978556         26110978556
1                 115      56943        26110978556         26111035499
2                 115     118363        26110978556         26111096919
3                 115     131167        26110978556         26111109723
4                 115     137965        26110978556         26111116521
...               ...        ...                ...                 ...
101230327  2147482888  428564420         1942710825          2371275245
101230328  2147482888  428585000         1942710825          2371295825
101230329  2147482888  428613475         1942710825          2371324300
101230330  2147482888  428649406         1942710825          2371360231
101230331  2147482888  428692118         1942710825          2371402943

[101230332 rows x 4 columns]


In [None]:
import gc
print(train_idx,test_idx)

# The split indices were computed from `train_df`, so both subsets must be
# sliced from that same full frame. (The previous code indexed `data`, a name
# that is never defined in this notebook and fails on a fresh kernel.)
# Keep a handle on the full frame because `train_df` is rebound below.
# NOTE: this cell is not re-runnable on its own; after it runs, `train_df` is
# the subset, so re-run the loading cell first.
full_df = train_df
test_df = full_df.iloc[test_idx]
train_df = full_df.iloc[train_idx]

# Drop the last reference to the full frame so its memory can be reclaimed.
del full_df
gc.collect()



[32933156 32933157 32933158 ... 98726965 46494984 13161058] [100264781   7982876   7982875 ...  18511028  18511029  32938743]


Test generator that emulates the Riiid test environment.


In [None]:
class TestGenerator:
    """Serve a labelled frame in batches through an `iter_test()` / `predict()` pair.

    `iter_test()` yields consecutive row batches of `df` with the columns
    `answered_correctly` and `user_answer` removed. Each batch gets two extra
    columns, `prior_group_answers_correct` and `prior_group_responses`; their
    first row holds the string form of the previous batch's
    `answered_correctly` / `user_answer` lists (``'[]'`` for the first batch)
    and all other rows hold ``None``.

    `predict()` must be called once for every yielded batch. When the last
    batch has been predicted, the ROC AUC over all rows is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `answered_correctly` and `user_answer`.
    grp_size : sequence of two numbers
        Mean and standard deviation of the normal distribution the batch size
        is drawn from (global NumPy random state). At least one row is served.
    """

    def __init__(self, df, grp_size=(1000, 100)):
        self.df = df
        self.answered_correctly = self.df.answered_correctly.values
        self.predictions = np.zeros(len(self.answered_correctly))
        # Store a private copy so instances never share (or alias) the
        # caller's / the default sequence.
        self.grp_size = list(grp_size)
        self.start_idx = 0
        self.end_idx = 0
        self.prediction_called = True
        self.test_cols = [c for c in df.columns if c not in ['answered_correctly','user_answer']]
        self.current_batch = {'prior_group_answers_correct':[], 'prior_group_responses':[]}

    def iter_test(self):
        """Yield the next batch; `predict()` must be called between batches."""
        n_rows = len(self.df)
        while self.start_idx < n_rows:
            assert self.prediction_called, "You must call `predict()` successfully before you can continue with `iter_test()`"
            self.prediction_called = False

            # Random batch size (>= 1), clamped so end_idx never passes the end.
            raw_end = int(self.start_idx + max(1, np.random.normal(self.grp_size[0], self.grp_size[1])))
            self.end_idx = min(n_rows, raw_end)

            batch = self.df.iloc[self.start_idx:self.end_idx]
            batch_correct = list(batch['answered_correctly'])
            batch_responses = list(batch['user_answer'])

            # Only the first row carries the previous batch's labels.
            prior_correct = [None] * len(batch)
            prior_responses = [None] * len(batch)
            prior_correct[0] = str(self.current_batch['prior_group_answers_correct'])
            prior_responses[0] = str(self.current_batch['prior_group_responses'])

            # .assign builds a new frame, so nothing is written into a slice of
            # self.df (no SettingWithCopyWarning), and filling by position
            # keeps the "first row only" rule even if index labels repeat.
            test_df = batch[self.test_cols].assign(
                prior_group_answers_correct=prior_correct,
                prior_group_responses=prior_responses,
            )

            # The labels of this batch become the "prior group" of the next.
            self.current_batch['prior_group_answers_correct'] = batch_correct
            self.current_batch['prior_group_responses'] = batch_responses
            yield test_df

    def predict(self, prediction_df):
        """Record predictions for the current batch.

        `prediction_df` must have an `answered_correctly` column with one
        value per row of the batch last yielded by `iter_test()`.
        Raises ValueError if the number of predictions does not match.
        """
        assert not self.prediction_called, "You must get the next test sample from `iter_test()` first."
        batch_predictions = np.asarray(prediction_df['answered_correctly'])
        expected = self.end_idx - self.start_idx
        if len(batch_predictions) != expected:
            raise ValueError(
                "Expected {0} predictions for the current batch, got {1}".format(expected, len(batch_predictions))
            )
        self.predictions[self.start_idx:self.end_idx] = batch_predictions
        self.start_idx = self.end_idx
        self.prediction_called = True
        if self.end_idx >= len(self.df):
            print("Final AUC score: {0}".format(roc_auc_score(self.answered_correctly,self.predictions)))

      

Write the train and validation sets to CSV files.