<a href="https://colab.research.google.com/github/ajenningsfrankston/Dynamic-Memory-Network-Plus-master/blob/master/Riiid_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

import os,shutil, random 



In [4]:
# Install the Kaggle command-line client into the Colab runtime.
!pip install -q kaggle

# Mount Google Drive so files stored there are visible under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

import os
# Point the Kaggle client at a configuration directory on the mounted Drive
# (presumably where the kaggle.json API token lives - confirm).
os.environ['KAGGLE_CONFIG_DIR'] = "/content/gdrive/My Drive/Kaggle"

# Download the competition files into the current working directory.
!kaggle competitions download  -c riiid-test-answer-prediction 


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:


import zipfile

# Recreate the Kaggle-kernel directory layout locally. exist_ok=True makes the
# cell safe to re-run without a separate (racy) existence check.
os.makedirs('../input/riiid-test-answer-prediction', exist_ok=True)

# Unpack train.csv from the downloaded archive straight into the input directory.
with zipfile.ZipFile('train.csv.zip', 'r') as zip_ref:
    zip_ref.extractall('../input/riiid-test-answer-prediction')

# Remove a stray copy of train.csv from the working directory, if one exists.
# NOTE(review): the archive is extracted into the input directory above, so this
# only matters for a leftover ./train.csv; if the intent was to free disk space
# by deleting 'train.csv.zip', this line targets the wrong file - confirm.
if os.path.exists('train.csv'):
    os.remove('train.csv')


In [7]:
# Show the current working directory; the relative paths used in this notebook resolve against it.
!pwd

/content


In [9]:
    
# Relocate the remaining competition files from the download directory into the
# Kaggle-style input directory, skipping any file that is already there.
source_dir = './'
target_dir = '../input/riiid-test-answer-prediction'

file_names = [
    'lectures.csv',
    'questions.csv',
    'example_test.csv',
    'example_sample_submission.csv',
]

for file_name in file_names:
    if os.path.exists(os.path.join(target_dir, file_name)):
        continue
    shutil.move(os.path.join(source_dir, file_name), target_dir)

# Last expression of the cell: display the resulting directory listing.
os.listdir('../input/riiid-test-answer-prediction')


['example_test.csv',
 'example_sample_submission.csv',
 'train.csv',
 'questions.csv',
 'lectures.csv']

We need to create the test set (split train into train/test) before creating the validation set. The split function below is taken from the notebook "Riiid: Creating a test dataset".



In [10]:
def dataset_split(df, q=0.975, print_timestamps=False):
    """Split the rows of ``df`` into consecutive chunks of a simulated timeline.

    Timestamps in ``df`` are relative to each user, so every user is first
    given a random start offset: a uniform draw from NumPy's global RNG scaled
    by ``max_timestamp - <that user's last timestamp>``. Rows are then ordered
    by ``offset + timestamp`` and cut at the requested fractions.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``user_id`` and ``timestamp``.
    q : float or sequence of float
        Cut point(s) expressed as fractions of the number of rows. A scalar
        yields two chunks; a list, tuple or array with ``k`` entries yields
        ``k + 1`` chunks.
    print_timestamps : bool
        When true, print the intermediate frame holding the shifted timestamps.

    Returns
    -------
    list of numpy.ndarray
        Row positions (intended for ``df.iloc``), one array per chunk, in
        order of the shifted timestamp.

    Notes
    -----
    Call ``np.random.seed`` beforehand to obtain a reproducible split.
    """
    max_timestamp_per_user = df[['user_id', 'timestamp']].groupby('user_id').max()
    max_timestamp = max_timestamp_per_user.timestamp.max()

    # One random draw per user; the scaling keeps every shifted timeline at or
    # below the overall maximum timestamp.
    max_timestamp_per_user['initial_timestamp'] = (
        (max_timestamp - max_timestamp_per_user.timestamp)
        * np.random.random(size=len(max_timestamp_per_user))
    ).astype('uint64')

    # Attach each user's offset to every one of that user's rows.
    # NOTE(review): the positions returned below index into this merged frame;
    # using them with df.iloc relies on the merge keeping df's row order.
    timestamps_df = pd.merge(
        df[['user_id', 'timestamp']],
        max_timestamp_per_user['initial_timestamp'],
        left_on='user_id',
        right_index=True,
    )
    timestamps_df['modified_timestamp'] = (
        timestamps_df.initial_timestamp + timestamps_df.timestamp
    ).astype('uint64')
    if print_timestamps:
        print(timestamps_df)

    ordered_rows = timestamps_df.modified_timestamp.values.argsort()

    # Translate fractions into row counts (truncating, as before).
    cuts = (np.asarray(q) * len(ordered_rows)).astype('uint64')

    # Dispatch on dimensionality rather than on `type(q) == list`, so tuples and
    # arrays are treated as sequences instead of falling into the scalar branch.
    if np.ndim(cuts) == 0:
        cut = int(cuts)
        return [ordered_rows[:cut], ordered_rows[cut:]]

    idx_list = []
    min_idx = 0
    for cut in cuts:
        cut = int(cut)
        idx_list.append(ordered_rows[min_idx:cut])
        min_idx = cut
    idx_list.append(ordered_rows[min_idx:])
    return idx_list



In [15]:
%%time 

# Explicit column dtypes for train.csv, passed to read_csv so pandas does not
# have to infer them.
train_dtypes = {
    'row_id': 'int64',
    'timestamp': 'int64',
    'user_id': 'int32',
    'content_id': 'int16',
    'content_type_id': 'int8',
    'task_container_id': 'int16',
    'user_answer': 'int8',
    'answered_correctly': 'int8',
    'prior_question_elapsed_time': 'float32',
    'prior_question_had_explanation': 'boolean',
}

# Load the full training log into memory.
data = pd.read_csv(
    '../input/riiid-test-answer-prediction/train.csv',
    dtype=train_dtypes,
)

CPU times: user 2min 25s, sys: 13 s, total: 2min 38s
Wall time: 2min 40s


In [16]:
# Seed NumPy's global RNG first: dataset_split draws its per-user offsets from
# np.random, so the seed is what makes the split repeatable.
np.random.seed(49)
# NOTE(review): with q=0.25, train_idx receives the first 25% of the ordered rows
# and test_idx the remaining 75% (dataset_split defaults to q=0.975) - confirm
# that this ratio is intended.
train_idx, test_idx = dataset_split(data, q=0.25, print_timestamps=True)

              user_id  timestamp  initial_timestamp  modified_timestamp
0                 115          0        26110978556         26110978556
1                 115      56943        26110978556         26111035499
2                 115     118363        26110978556         26111096919
3                 115     131167        26110978556         26111109723
4                 115     137965        26110978556         26111116521
...               ...        ...                ...                 ...
101230327  2147482888  428564420         1942710825          2371275245
101230328  2147482888  428585000         1942710825          2371295825
101230329  2147482888  428613475         1942710825          2371324300
101230330  2147482888  428649406         1942710825          2371360231
101230331  2147482888  428692118         1942710825          2371402943

[101230332 rows x 4 columns]


In [18]:
# Row positions produced by dataset_split.
print(train_idx, test_idx)

# Materialise both partitions by position.
train_df = data.iloc[train_idx]
test_df = data.iloc[test_idx]

# Only the last expression of a cell is rendered automatically, so the original
# bare `train_df` in the middle of the cell displayed nothing. Show it explicitly
# (display is provided by the IPython/Colab kernel), then let test_df render as
# the cell result.
display(train_df)
test_df



[32933156 32933157 32933158 ... 98726965 46494984 13161058] [100264781   7982876   7982875 ...  18511028  18511029  32938743]


Unnamed: 0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,user_answer,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
100264781,100264781,1808204,2128229377,152,0,8,2,0,24000.0,False
7982876,7982876,23805166242,173373240,2370,0,608,2,0,26000.0,True
7982875,7982875,23805166242,173373240,2368,0,608,3,1,26000.0,True
7982874,7982874,23805166242,173373240,2369,0,608,0,1,26000.0,True
93583296,93583296,644848044,1986571669,3455,0,284,1,1,23333.0,True
...,...,...,...,...,...,...,...,...,...,...
18511026,18511026,22386599451,397841566,3998,0,2497,1,1,12000.0,True
18511027,18511027,22386635812,397841566,13363,0,2498,2,1,15000.0,True
18511028,18511028,22386667282,397841566,4344,0,2499,1,1,19000.0,True
18511029,18511029,22386699460,397841566,11428,0,2500,3,0,14000.0,True


Test generator for emulating the riiid test environment. 


In [11]:
class TestGenerator:
    """Serve a labelled frame in batches, mimicking an iterate-then-predict test loop.

    Usage::

        gen = TestGenerator(test_df)
        for batch in gen.iter_test():
            gen.predict(predictions_for(batch))

    Each batch hides ``answered_correctly`` and ``user_answer``; the values of
    the previous batch are exposed, as stringified lists, in the first row of
    the columns ``prior_group_answers_correct`` and ``prior_group_responses``.
    After the final batch has been predicted, the overall ROC AUC is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``answered_correctly`` and ``user_answer``.
    grp_size : sequence of two numbers
        Mean and standard deviation of the normally distributed batch size.
    """

    def __init__(self, df, grp_size=(1000, 100)):
        self.df = df
        self.answered_correctly = self.df.answered_correctly.values
        self.predictions = np.zeros(len(self.answered_correctly))
        # Copy into a per-instance list so no default object is shared between
        # instances (the original used a mutable list as the default value).
        self.grp_size = list(grp_size)
        self.start_idx = 0
        # True means "the previous batch has been answered"; starts True so the
        # first call to iter_test() is allowed.
        self.prediction_called = True
        self.test_cols = [c for c in df.columns if c not in ['answered_correctly', 'user_answer']]
        # Labels of the most recently served batch, revealed with the next one.
        self.current_batch = {'prior_group_answers_correct': [], 'prior_group_responses': []}

    def iter_test(self):
        """Yield successive batches; `predict()` must be called between batches."""
        while self.start_idx < len(self.df):
            assert self.prediction_called, "You must call `predict()` successfully before you can continue with `iter_test()`"
            self.prediction_called = False
            # Random batch size, at least one row.
            self.end_idx = int(self.start_idx + max(1, np.random.normal(self.grp_size[0], self.grp_size[1])))
            batch = self.df.iloc[self.start_idx:self.end_idx]
            answered_correctly_this_batch = list(batch['answered_correctly'])
            user_answer_this_batch = list(batch['user_answer'])

            # Work on an explicit copy so adding columns never touches, or warns
            # about, the source frame.
            test_df = batch[self.test_cols].copy()
            for col in ('prior_group_answers_correct', 'prior_group_responses'):
                test_df[col] = None
                # Positional write: only the first row receives the value, even
                # when index labels repeat (a label-based .loc write would hit
                # every row sharing the first label).
                test_df.iloc[0, test_df.columns.get_loc(col)] = str(self.current_batch[col])

            self.current_batch['prior_group_answers_correct'] = answered_correctly_this_batch
            self.current_batch['prior_group_responses'] = user_answer_this_batch
            yield test_df

    def predict(self, prediction_df):
        """Record predictions for the batch most recently yielded by `iter_test()`.

        ``prediction_df`` must provide an ``answered_correctly`` column with one
        value per row of that batch (NumPy raises on a length mismatch).
        """
        assert not self.prediction_called, "You must get the next test sample from `iter_test()` first."
        self.predictions[self.start_idx:self.end_idx] = np.asarray(prediction_df['answered_correctly'], dtype=float)
        self.start_idx = self.end_idx
        self.prediction_called = True
        if self.end_idx >= len(self.df):
            # Score only rows that carry a real label. NOTE(review): -1 is taken
            # to be the "no label" marker used for lecture rows in the
            # competition data - confirm against the data description. With -1
            # left in, the target has three classes and roc_auc_score rejects
            # the 1-D score vector.
            scored = np.asarray(self.answered_correctly != -1, dtype=bool)
            print("Final AUC score: {0}".format(roc_auc_score(self.answered_correctly[scored], self.predictions[scored])))

      

Notebook "train-val-split on subset of users" 

**Have a (random) subset of users' data to enable quick model experiments, split into train, val w/o much hassle**

principles:
* keep each user's time course healthy, i.e. do not downsample for the sake of smaller data
* first part of time course should go into train - remaining part into val
* across users, have a fixed portion go into train and val, respectively

In [None]:
import pandas as pd
import random

# Seed Python's `random` module: the user subset below is drawn with random.sample.
random.seed(33)

# Fraction of distinct users to keep (0.001 keeps 0.1% of the users).
fraction_users = 0.001
# Per-user timestamp quantile used as the train/val boundary: rows at or below
# it go to train, the remaining rows go to val.
split = 0.8

take a subset of users

In [None]:
# All distinct users, and how many of them to keep.
users = data.user_id.unique()
no_users_sample = int(round(len(users)*fraction_users,0))

print(f'no. of unique users: {len(users)}')
print(f'no. of users in sample: {no_users_sample}')

# Draw the subset with Python's `random` module (seeded in the config cell) and
# keep every row belonging to the sampled users.
users = random.sample(list(users), no_users_sample)
mask = data.user_id.isin(users)
# NOTE: this overwrites `data` with the subset, so re-running the cell samples
# from the already reduced frame; reload train.csv first for a fresh draw.
# The explicit copy makes the subset an independent frame, so adding columns to
# it later is a plain assignment rather than a write into a slice of the
# original frame.
data = data[mask].copy()

get xth percentile of timestamp, to prepare split

In [None]:
# One row per user_id; the `timestamp` column holds that user's `split`-quantile
# of timestamp, i.e. the per-user train/val boundary used by the next cell.
my_planet = data[['timestamp', 'user_id']].groupby('user_id').quantile(split).reset_index()

mark rows with train, val flag

In [None]:
# https://stackoverflow.com/questions/20625582/how-to-deal-with-settingwithcopywarning-in-pandas
# Kept so the notebook-wide warning setting stays as it was; the vectorized
# assignment below no longer relies on chained indexing.
pd.options.mode.chained_assignment = None

# Look up every row's per-user boundary in one pass instead of looping over all
# rows in Python. The original loop also wrote through `data.split.iloc[i]`
# (chained assignment) and called float() on a one-element Series.
cutoff_by_user = my_planet.set_index('user_id')['timestamp']
row_cutoff = data['user_id'].map(cutoff_by_user)
if row_cutoff.isna().any():
    raise ValueError('my_planet has no timestamp quantile for some users in data; '
                     're-run the quantile cell on the current data')

# Rows at or before the user's boundary go to train, later rows to val.
data['split'] = (data['timestamp'] <= row_cutoff).map({True: 'train', False: 'val'})


split

In [None]:
# Select rows by their flag and discard the helper column in the same step.
is_train = data['split'] == 'train'
is_val = data['split'] == 'val'

train = data[is_train].drop(columns='split')
val = data[is_val].drop(columns='split')

check properties for train and val

In [None]:
# Every sampled user should appear on both sides of the split.
train_user_count = len(train['user_id'].unique())
val_user_count = len(val['user_id'].unique())

print(f'no. of users in train: {train_user_count}')
print(f'no. of users in val: {val_user_count}')
print('-> should be the same')

# Row counts of the two partitions.
train_record_count = len(train)
val_record_count = len(val)

print(f'no. of records in train: {train_record_count}')
print(f'no. of records in val: {val_record_count}')

Oops - the user counts differ. Which users appear in train but not in val?

In [None]:
train_ids = set(train['user_id'].unique())
val_ids = set(val['user_id'].unique())
cnt = 0  # not used by any visible cell; left in place in case another cell reads it

# Users that have rows in train but none in val.
missing_ids = train_ids - val_ids

# The loop variable was named `id`, which replaced the builtin id() for the rest
# of the kernel session; use a descriptive name instead. Iterate in sorted order
# so the printed output is stable.
for missing_user_id in sorted(missing_ids):
    print(data.loc[data['user_id'] == missing_user_id, ['timestamp', 'user_id']])

Interesting: there are a number of entries with equal timestamps for these users. For now, let us just remove these users from train; by construction they are already absent from val.

In [None]:
# Keep only train rows whose user also has rows in val (missing_ids = train_ids - val_ids).
train = train[~train.user_id.isin(missing_ids)]

check again

In [None]:
# Repeat the sanity check now that users without val rows have been removed.
train_user_count = len(train['user_id'].unique())
val_user_count = len(val['user_id'].unique())

print(f'no. of users in train: {train_user_count}')
print(f'no. of users in val: {val_user_count}')
print('-> should be the same')

# Row counts of the two partitions.
train_record_count = len(train)
val_record_count = len(val)

print(f'no. of records in train: {train_record_count}')
print(f'no. of records in val: {val_record_count}')

look at mean, standard deviation

In [None]:
# Restrict both partitions to rows with content_type_id == 0 before looking at
# answer accuracy.
train_question = train[train['content_type_id'] == 0]
valid_question = val[val['content_type_id'] == 0]

train_correct = train_question['answered_correctly']
valid_correct = valid_question['answered_correctly']

# Mean and standard deviation of the correctness flag, rounded to two decimals.
print(f'accuracy of answering in train: {round(train_correct.mean(), 2)} (stdev={round(train_correct.std(), 2)})')
print(f'accuracy of answering in val: {round(valid_correct.mean(), 2)} (stdev={round(valid_correct.std(), 2)})')

In [None]:
# Make sure the output directory exists; exist_ok=True makes the cell re-runnable
# without a separate (racy) existence check.
os.makedirs('../working', exist_ok=True)


write train, val to csv files

In [None]:
# Export is currently disabled (both lines are commented out).
# NOTE(review): with fraction_users = 0.001, int(fraction_users*100) evaluates to 0,
# so these names would become 'train_0percent.csv' / 'val_0percent.csv' - adjust
# the filename pattern before re-enabling.
#train.to_csv(f'../working/train_{int(fraction_users*100)}percent.csv')
#val.to_csv(f'../working/val_{int(fraction_users*100)}percent.csv')


thank you, tito, https://www.kaggle.com/its7171/cv-strategy for valuable insights