# Brief Description

My final solution is an ensemble of LGBM and SAKT, which scores 0.786 on the private leaderboard and 0.784 on the public leaderboard. This notebook runs inference with the single LGBM model only.

In [1]:
# Clear the interactive namespace (-f skips the confirmation prompt) so the notebook starts from a clean state.
%reset -f

In [2]:
import numpy as np
import pandas as pd
from random import sample 
from collections import defaultdict
from tqdm import tqdm
import time
import pickle

import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#sns.set()

import gc
import riiideducation

import os

gc.collect()

4

In [3]:
# Run switches. When a RUN_* flag is False the matching artefact is loaded from a
# saved pickle instead of being recomputed from `train`.
# NOTE(review): as written, `train` is loaded with only four columns and is deleted
# in the attempt-count cell when RUN_ATTEMPT_NO_AGG is False, so turning on
# RUN_USER_AGG or RUN_CONTENT_AGG requires reloading the full train data first.
RUN_CONTENT_AGG = False  # True: rebuild the per-question aggregates with get_content_agg(train)
RUN_USER_AGG = False  # True: rebuild the per-user aggregates and write user_agg.pkl
RUN_ATTEMPT_NO_AGG = False  # True: rebuild the (user_id, content_id) attempt counts and write attempt_no_agg.pkl
DEBUG = False  # True: keep only the first 1,000,000 train rows

# Load Data

In [4]:
# Mean of prior_question_elapsed_time over all of train; used to fill missing values.
prior_question_elapsed_time_mean = 25423.84
def preprocess_df(path):
    """Load the pickled interaction log and keep only answered question rows.

    Parameters
    ----------
    path : str
        Location of a pickled DataFrame containing at least the columns
        user_id, answered_correctly, content_id and timestamp.

    Returns
    -------
    pandas.DataFrame
        Those four columns, restricted to rows whose answered_correctly is
        not -1, with a fresh 0..n-1 index.
    """
    wanted = ['user_id', 'answered_correctly', 'content_id', 'timestamp']
    frame = pd.read_pickle(path).loc[:, wanted]
    # Rows marked -1 carry no graded answer, so they are dropped.
    has_answer = frame['answered_correctly'] != -1
    return frame.loc[has_answer].reset_index(drop=True)

train_path = "/kaggle/input/riiid-train-data-multiple-formats/riiid_train.pkl.gzip"
train = preprocess_df(train_path)

# DEBUG runs work on the first million interactions only.
if DEBUG:
    train = train.iloc[:1_000_000]

print("Train size:", train.shape)
train.head()

Train size: (99271300, 4)


Unnamed: 0,user_id,answered_correctly,content_id,timestamp
0,115,1,5692,0
1,115,1,5716,56943
2,115,1,128,118363
3,115,1,7860,131167
4,115,1,7922,137965


# Feature Engineering

Generate dictionaries that store each user's statistics over the full train data.

### dict_user_previous_ts & dict_user_continuous_correct
**dict_user_previous_ts:** dictionary recording the timestamps of each user's previous 3 answers  
**dict_user_continuous_correct:** dictionary recording each user's current streak of continuous correct answers

In [5]:
# user_id -> timestamps of that user's most recent answers (at most 3, oldest first)
dict_user_previous_ts = defaultdict(list)
# user_id -> length of the user's current run of correct answers
dict_user_continuous_correct = defaultdict(int)


def _replay_history(history_rows):
    """Fold rows of (user_id, timestamp, answered_correctly) into both dictionaries."""
    for user_id, ts, correct in tqdm(history_rows):
        recent_ts = dict_user_previous_ts[user_id]
        # Keep a sliding window of the three latest timestamps.
        if len(recent_ts) == 3:
            recent_ts.pop(0)
        recent_ts.append(ts)

        # A wrong answer resets the streak; anything else extends it.
        if correct == 0:
            dict_user_continuous_correct[user_id] = 0
        else:
            dict_user_continuous_correct[user_id] += 1


history_cols = ['user_id', 'timestamp', 'answered_correctly']

# The log is replayed in two label-based slices (.loc bounds are inclusive, so no
# row is skipped or repeated) rather than materialising one array for all rows.
_replay_history(train.loc[:50000000, history_cols].values)

time.sleep(15)

_replay_history(train.loc[50000001:, history_cols].values)

100%|██████████| 50000001/50000001 [04:12<00:00, 198234.45it/s]
100%|██████████| 49271299/49271299 [04:11<00:00, 195924.31it/s]


### dict_user_content_total_questions  

This dictionary keeps track of (user_id, content_id) pairs, i.e. how many times the user has previously seen the question.

In [6]:
# with trace("create default"):
#     dict_user_content_total_questions = defaultdict(lambda: defaultdict(int))
# with trace("run loop"):    
#     for i, row in enumerate(train[['user_id', 'content_id']].values):
#         dict_user_content_total_questions[row[0]][row[1]] += 1

if RUN_ATTEMPT_NO_AGG:
    # Only the (user_id, content_id) keys are needed from here on.
    train.drop(['timestamp', 'answered_correctly'], axis=1, inplace=True)
    gc.collect()
    time.sleep(15)

    # Count rows per (user, question) by summing a constant int8 column of ones.
    train["attempt_no"] = 1
    train.attempt_no = train.attempt_no.astype('int8')
    # NOTE(review): the int8 cast assumes no pair occurs more than 127 times - confirm.
    pair_counts = train.groupby(["user_id", "content_id"])["attempt_no"].agg(['sum']).astype('int8')
    # Only pairs seen more than once are kept.
    attempt_no_agg = pair_counts[pair_counts['sum'] > 1]
    attempt_no_agg.to_pickle('attempt_no_agg.pkl')
else:
    # The train frame is not needed when the counts are loaded from disk.
    del train
    gc.collect()
    time.sleep(15)
    attempt_no_agg = pd.read_pickle('/kaggle/input/riid-budingtanke/attempt_no_agg.pkl')

# user_id -> {content_id -> number of times the user has answered that question}
dict_user_content_total_questions = defaultdict(lambda: defaultdict(int))

for (user_id, content_id), n_attempts in attempt_no_agg['sum'].items():
    dict_user_content_total_questions[user_id][content_id] = n_attempts


In [7]:
# The counts now live in dict_user_content_total_questions, so the frame can be released.
del attempt_no_agg
gc.collect()
# NOTE(review): the pause presumably gives the kernel time to reclaim memory - confirm it is needed.
time.sleep(15)

### user_agg
Get user aggregations.

In [8]:
# user aggregation
if RUN_USER_AGG:
    questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')
    questions['part'] = questions['part'].astype('int8')

    # One row per user: answers given, answers correct, and running sums of the
    # two prior_question_* columns.
    user_agg = train.groupby('user_id').agg(
        user_total_questions=('answered_correctly', 'count'),
        user_correct_questions=('answered_correctly', 'sum'),
        prior_question_elapsed_time_sum=('prior_question_elapsed_time', 'sum'),
        prior_question_had_explanation_sum=('prior_question_had_explanation', 'sum'),
    )
    user_agg = user_agg.astype({'user_correct_questions': int,
                                'prior_question_had_explanation_sum': 'float32'})
    user_agg.index.name = None
    user_agg.to_pickle('user_agg.pkl')
else:
    user_agg = pd.read_pickle('/kaggle/input/riid-budingtanke/user_agg.pkl')

user_agg.head()

Unnamed: 0,user_total_questions,user_correct_questions,prior_question_elapsed_time_sum,prior_question_had_explanation_sum
115,46,32,922422.8,6.0
124,30,7,570420.8,0.0
2746,19,11,350423.8,11.0
5382,125,84,4495424.0,113.0
8623,109,70,2845024.0,96.0


In [9]:
# create dictionaries from user aggregation
def _as_counter(column):
    """Return a defaultdict(int) mapping user_id -> user_agg[column]; unseen users read as 0."""
    return defaultdict(int, dict(user_agg[column]))


dict_user_total_questions = _as_counter('user_total_questions')
dict_user_correct_questions = _as_counter('user_correct_questions')
dict_user_prior_question_elapsed_time_sum = _as_counter('prior_question_elapsed_time_sum')
dict_user_prior_question_had_explanation_sum = _as_counter('prior_question_had_explanation_sum')

# Everything needed from the frame is now held in the dictionaries.
del user_agg
gc.collect()

20

### content_agg
Get content aggregations.

In [10]:
# content aggregation
def get_content_agg(train, questions=None):
    """Build the per-question aggregate table and save it to 'content_agg.pkl'.

    Parameters
    ----------
    train : pandas.DataFrame
        Interaction log with the columns content_id, answered_correctly,
        prior_question_elapsed_time and prior_question_had_explanation.
        Both explanation values (False and True) must occur in the data.
    questions : pandas.DataFrame, optional
        Question metadata with the columns question_id and part. When omitted
        it is read from the competition's questions.csv, so the function no
        longer depends on a global that another cell may or may not have
        created.

    Returns
    -------
    pandas.DataFrame
        Indexed by content_id, with the columns content_total_questions,
        content_accuracy, content_accuracy_std,
        content_prior_question_elapsed_time_avg,
        content_prior_question_had_explanation_avg, part,
        content_explanation_false_mean and content_explanation_true_mean.
    """
    if questions is None:
        questions = pd.read_csv('/kaggle/input/riiid-test-answer-prediction/questions.csv')

    # 'std' is included because the downstream column selection and the model's
    # feature list both require content_accuracy_std.
    content_agg = train.groupby('content_id').aggregate({'answered_correctly': ['count', np.nanmean, 'std'],
                                                         'prior_question_elapsed_time': np.nanmean,
                                                         'prior_question_had_explanation': np.nanmean})
    content_agg.columns = ['content_total_questions', 'content_accuracy', 'content_accuracy_std',
                           'content_prior_question_elapsed_time_avg',
                           'content_prior_question_had_explanation_avg']

    # Join on the index so the table stays keyed by content_id explicitly, instead of
    # depending on the index that merge(left_index=True, right_on=...) returns.
    content_agg = content_agg.join(questions.set_index('question_id')['part'], how='left')

    # Accuracy per question, split by whether the prior question had an explanation.
    content_explanation_agg = (train.groupby(['content_id', 'prior_question_had_explanation'])
                               ['answered_correctly'].mean().unstack())
    content_explanation_agg.columns = ['content_explanation_false_mean', 'content_explanation_true_mean']
    content_agg = content_agg.join(content_explanation_agg, how='left')

    column_type = {'content_total_questions': 'int64',
                   'content_accuracy': 'float16',
                   'content_prior_question_elapsed_time_avg': 'float32',
                   'content_prior_question_had_explanation_avg': 'float16',
                   'part': 'int8',
                   'content_explanation_false_mean': 'float16',
                   'content_explanation_true_mean': 'float16'}
    content_agg = content_agg.astype(column_type)
    content_agg.index.name = None

    content_agg.to_pickle('content_agg.pkl')
    return content_agg

if not RUN_CONTENT_AGG:
    # saved content_agg is calculated on all train
    content_agg = pd.read_pickle('/kaggle/input/riid-budingtanke/content_agg_3.pkl')
else:
    content_agg = get_content_agg(train)

# Per-question columns kept for inference ('bundle_id' is currently left out).
content_feature_columns = [
    'content_total_questions',
    'content_accuracy',
    'content_accuracy_std',
    'content_prior_question_elapsed_time_avg',
    'content_prior_question_had_explanation_avg',
    'part',
    'content_explanation_false_mean',
    'content_explanation_true_mean',
]
content_agg = content_agg[content_feature_columns]

content_agg.head()

Unnamed: 0,content_total_questions,content_accuracy,content_accuracy_std,content_prior_question_elapsed_time_avg,content_prior_question_had_explanation_avg,part,content_explanation_false_mean,content_explanation_true_mean
0,6903,0.907715,0.28944,21876.357422,0.947754,1,0.830566,0.912109
1,7398,0.890625,0.312104,22091.626953,0.980469,1,0.813965,0.89209
2,44905,0.554199,0.49705,23546.447266,0.888184,1,0.490967,0.562012
3,22973,0.779297,0.414636,23318.945312,0.958496,1,0.686035,0.783691
4,31736,0.613281,0.487021,23126.990234,0.530273,1,0.566895,0.654297


# Load Model

In [11]:
# Load the LightGBM booster from its saved model file.
model = lgb.Booster(model_file='/kaggle/input/riid-budingtanke/model-15.txt')

# Inference

In [12]:
# Name of the column that receives the predictions.
target = 'answered_correctly'

# Model inputs, in the order they are passed to model.predict.
features = [
    # taken directly from the test batch
    'prior_question_had_explanation',
    'prior_question_elapsed_time',
    # per-user running statistics
    'user_cum_total_questions',
    'user_cum_accuracy',
    'user_cum_prior_question_elapsed_time_avg',
    'user_cum_prior_question_had_explanation_avg',
    'user_cum_content_total_questions',
    # per-question aggregates from content_agg
    'content_total_questions',
    'content_accuracy',
    'content_accuracy_std',
    'content_prior_question_elapsed_time_avg',
    'content_prior_question_had_explanation_avg',
    'content_explanation_false_mean',
    'content_explanation_true_mean',
    # time since the user's previous answers and current correct streak
    'user_ts_lag_1',
    'user_ts_lag_2',
    'user_ts_lag_3',
    'user_continuous_correct',
    'part',
]

In [13]:
# Competition environment object; it supplies the test batches and receives the predictions.
env = riiideducation.make_env()

In [14]:
# Iterator over test batches; each item is unpacked as (test_df, sample_prediction_df) in the inference loop.
iter_test = env.iter_test()

In [15]:
def _preprocess_test_df(raw_df):
    """Select the columns used for inference and fill missing prior_question_* values.

    Returns an independent copy, so the column assignments below (and the
    feature columns added later) never write into a slice of the frame handed
    out by the competition API.
    """
    cols = ['row_id',
            'user_id',
            'content_id',
            'content_type_id',
            'prior_question_had_explanation',
            'prior_question_elapsed_time',
            'timestamp']
    df = raw_df[cols].copy()
    df['prior_question_had_explanation'] = df['prior_question_had_explanation'].fillna(0).astype('bool')
    df['prior_question_elapsed_time'] = df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
    return df


def _update_user_state(answered_df):
    """Fold graded question rows of the previous batch into the per-user dictionaries.

    answered_df must hold question rows only and carry the `answered_correctly`
    column revealed by the following batch.
    """
    state_cols = ['user_id',
                  'answered_correctly',
                  'prior_question_elapsed_time',
                  'prior_question_had_explanation',
                  'content_id',
                  'timestamp']
    for user_id, correct, elapsed, had_explanation, content_id, ts in answered_df[state_cols].values:
        dict_user_total_questions[user_id] += 1
        dict_user_correct_questions[user_id] += correct
        dict_user_prior_question_elapsed_time_sum[user_id] += elapsed
        dict_user_prior_question_had_explanation_sum[user_id] += had_explanation
        dict_user_content_total_questions[user_id][content_id] += 1

        # Sliding window of the three most recent answer timestamps (oldest first).
        recent_ts = dict_user_previous_ts[user_id]
        if len(recent_ts) == 3:
            recent_ts.pop(0)
        recent_ts.append(ts)

        # A wrong answer resets the streak; anything else extends it.
        if correct == 0:
            dict_user_continuous_correct[user_id] = 0
        else:
            dict_user_continuous_correct[user_id] += 1


def _add_user_features(df):
    """Add the per-user feature columns to `df` in place and return it.

    All values are read from the state dictionaries as they stand before this
    batch is graded. Lookups use .get() so that merely reading a user or a
    question (including lecture rows) does not insert new dictionary entries.
    """
    n_rows = df.shape[0]
    user_cum_total_questions = np.zeros(n_rows, dtype=np.int32)
    user_cum_accuracy = np.zeros(n_rows, dtype=np.float32)
    user_cum_prior_question_elapsed_time_avg = np.zeros(n_rows, dtype=np.float32)
    user_cum_prior_question_had_explanation_avg = np.zeros(n_rows, dtype=np.float32)
    user_cum_content_total_questions = np.zeros(n_rows, dtype=np.int32)
    user_ts_lag_1 = np.zeros(n_rows, dtype=np.float32)
    user_ts_lag_2 = np.zeros(n_rows, dtype=np.float32)
    user_ts_lag_3 = np.zeros(n_rows, dtype=np.float32)
    user_continuous_correct = np.zeros(n_rows, dtype=np.float16)

    for i, (user_id, content_id, ts) in enumerate(tqdm(df[['user_id', 'content_id', 'timestamp']].values)):
        # Running averages are undefined for users without any graded answer yet.
        total = dict_user_total_questions.get(user_id, 0)
        user_cum_total_questions[i] = total
        if total != 0:
            user_cum_accuracy[i] = dict_user_correct_questions.get(user_id, 0) / total
            user_cum_prior_question_elapsed_time_avg[i] = dict_user_prior_question_elapsed_time_sum.get(user_id, 0) / total
            user_cum_prior_question_had_explanation_avg[i] = dict_user_prior_question_had_explanation_sum.get(user_id, 0) / total
        else:
            user_cum_accuracy[i] = np.nan
            user_cum_prior_question_elapsed_time_avg[i] = np.nan
            user_cum_prior_question_had_explanation_avg[i] = np.nan

        seen_contents = dict_user_content_total_questions.get(user_id)
        user_cum_content_total_questions[i] = seen_contents.get(content_id, 0) if seen_contents is not None else 0

        # Lags to the newest, second-newest and third-newest stored timestamps;
        # positions with no stored timestamp are NaN.
        recent_ts = dict_user_previous_ts.get(user_id, [])
        lags = [ts - previous for previous in recent_ts[::-1]][:3]
        lags.extend([np.nan] * (3 - len(lags)))
        user_ts_lag_1[i], user_ts_lag_2[i], user_ts_lag_3[i] = lags

        # Rows with timestamp 0 get NaN instead of a streak value.
        if ts == 0:
            user_continuous_correct[i] = np.nan
        else:
            user_continuous_correct[i] = dict_user_continuous_correct.get(user_id, 0)

    df['user_cum_total_questions'] = user_cum_total_questions
    df['user_cum_accuracy'] = user_cum_accuracy
    df['user_cum_prior_question_elapsed_time_avg'] = user_cum_prior_question_elapsed_time_avg
    df['user_cum_prior_question_had_explanation_avg'] = user_cum_prior_question_had_explanation_avg
    df['user_cum_content_total_questions'] = user_cum_content_total_questions
    df['user_ts_lag_1'] = user_ts_lag_1
    df['user_ts_lag_2'] = user_ts_lag_2
    df['user_ts_lag_3'] = user_ts_lag_3
    df['user_continuous_correct'] = user_continuous_correct
    return df


# Preprocessed copy of the previous batch; it is graded once the next batch arrives.
prior_test_df = None

for (test_df, sample_prediction_df) in iter_test:

    if prior_test_df is not None:
        ################ update dicts ####################
        # The first row of each batch carries the previous batch's answers as a
        # string, one entry per previous row (lectures included), so it is attached
        # before the lecture rows are filtered out.
        # SECURITY NOTE(review): eval() executes arbitrary Python; the string comes
        # from the competition API here, but ast.literal_eval would be the safe parser.
        prior_test_df[target] = eval(test_df["prior_group_answers_correct"].iloc[0])
        _update_user_state(prior_test_df[prior_test_df.content_type_id == 0])

    ############# preprocess test_df #################
    test_df = _preprocess_test_df(test_df)
    prior_test_df = test_df.copy()

    ############# get features ################
    test_df = _add_user_features(test_df)
    test_df = test_df.merge(content_agg, right_index=True, left_on='content_id', how='left')
    test_df['answered_correctly'] = model.predict(test_df[features])

    # Only question rows (content_type_id == 0) are submitted.
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

100%|██████████| 18/18 [00:00<00:00, 3039.35it/s]
100%|██████████| 27/27 [00:00<00:00, 19819.08it/s]
100%|██████████| 26/26 [00:00<00:00, 18331.13it/s]
100%|██████████| 33/33 [00:00<00:00, 21956.22it/s]


# Draft

In [16]:
# cols = ['user_id', 'answered_correctly', 'content_id', 'content_type_id', \
#         'prior_question_had_explanation', 'prior_question_elapsed_time', 'timestamp']
# target_df = pd.read_pickle('../input/riiid-cross-validation-files/cv1_train.pickle')[50_000_000:52_500_000]#[cols]

# target_df = preprocess_test_df(target_df)

In [17]:
# class Iter_Valid(object):
#     def __init__(self, df, max_user=1000):
#         df = df.reset_index(drop=True)
#         self.df = df
#         self.user_answer = df['user_answer'].astype(str).values
#         self.answered_correctly = df['answered_correctly'].astype(str).values
#         df['prior_group_responses'] = "[]"
#         df['prior_group_answers_correct'] = "[]"
#         self.sample_df = df[df['content_type_id'] == 0][['row_id']]
#         self.sample_df['answered_correctly'] = 0
#         self.len = len(df)
#         self.user_id = df.user_id.values
#         self.task_container_id = df.task_container_id.values
#         self.content_type_id = df.content_type_id.values
#         self.max_user = max_user
#         self.current = 0
#         self.pre_user_answer_list = []
#         self.pre_answered_correctly_list = []

#     def __iter__(self):
#         return self
    
#     def fix_df(self, user_answer_list, answered_correctly_list, pre_start):
#         df= self.df[pre_start:self.current].copy()
#         sample_df = self.sample_df[pre_start:self.current].copy()
#         df.loc[pre_start,'prior_group_responses'] = '[' + ",".join(self.pre_user_answer_list) + ']'
#         df.loc[pre_start,'prior_group_answers_correct'] = '[' + ",".join(self.pre_answered_correctly_list) + ']'
#         self.pre_user_answer_list = user_answer_list
#         self.pre_answered_correctly_list = answered_correctly_list
#         return df, sample_df

#     def __next__(self):
#         added_user = set()
#         pre_start = self.current
#         pre_added_user = -1
#         pre_task_container_id = -1

#         user_answer_list = []
#         answered_correctly_list = []
#         while self.current < self.len:
#             crr_user_id = self.user_id[self.current]
#             crr_task_container_id = self.task_container_id[self.current]
#             crr_content_type_id = self.content_type_id[self.current]
#             if crr_content_type_id == 1:
#                 # no more than one task_container_id of "questions" from any single user
#                 # so we only care for content_type_id == 0 to break loop
#                 user_answer_list.append(self.user_answer[self.current])
#                 answered_correctly_list.append(self.answered_correctly[self.current])
#                 self.current += 1
#                 continue
#             if crr_user_id in added_user and ((crr_user_id != pre_added_user) or (crr_task_container_id != pre_task_container_id)):
#                 # known user(not prev user or differnt task container)
#                 return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
#             if len(added_user) == self.max_user:
#                 if  crr_user_id == pre_added_user and crr_task_container_id == pre_task_container_id:
#                     user_answer_list.append(self.user_answer[self.current])
#                     answered_correctly_list.append(self.answered_correctly[self.current])
#                     self.current += 1
#                     continue
#                 else:
#                     return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
#             added_user.add(crr_user_id)
#             pre_added_user = crr_user_id
#             pre_task_container_id = crr_task_container_id
#             user_answer_list.append(self.user_answer[self.current])
#             answered_correctly_list.append(self.answered_correctly[self.current])
#             self.current += 1
#         if pre_start < self.current:
#             return self.fix_df(user_answer_list, answered_correctly_list, pre_start)
#         else:
#             raise StopIteration()

In [18]:
# iter_test = Iter_Valid(target_df,max_user=1000)
# predicted = []
# def set_predict(df):
#     predicted.append(df)

In [19]:
# pbar = tqdm(total=2500000)
# prior_test_df = None

# for (test_df, sample_prediction_df) in iter_test:
    
#     if prior_test_df is not None:
#         prior_test_df[target] = eval(test_df["prior_group_answers_correct"].iloc[0])
#         prior_test_df = prior_test_df[prior_test_df.content_type_id == 0]
    
#         for i, row in enumerate(prior_test_df[['user_id', \
#                                                'answered_correctly', \
#                                                'prior_question_elapsed_time', \
#                                                'prior_question_had_explanation', \
#                                                'content_id', \
#                                                'timestamp']].values):
#             dict_user_total_questions[row[0]] += 1
#             dict_user_correct_questions[row[0]] += row[1]
#             dict_user_prior_question_elapsed_time_sum[row[0]] += row[2]
#             dict_user_prior_question_had_explanation_sum[row[0]] += row[3]
#             dict_user_content_total_questions[row[0]][row[4]] += 1
            
#             if len(dict_user_previous_ts[row[0]]) == 3:
#                 dict_user_previous_ts[row[0]].pop(0)
#                 dict_user_previous_ts[row[0]].append(row[5])
#             else:
#                 dict_user_previous_ts[row[0]].append(row[5])

#         test_df = preprocess_test_df(test_df)
#         prior_test_df = test_df.copy()
#         test_df = get_features(test_df, \
#                                dict_user_total_questions, \
#                                dict_user_correct_questions, \
#                                dict_user_prior_question_elapsed_time_sum, \
#                                dict_user_prior_question_had_explanation_sum, \
#                                dict_user_content_total_questions, \
#                                dict_user_previous_ts)
#         test_df = test_df.merge(content_agg, right_index=True, left_on='content_id', how='left')
#         test_df['answered_correctly'] =  model.predict(test_df[features])
        
#     else:
        
#         test_df = preprocess_test_df(test_df)
#         prior_test_df = test_df.copy()
#         test_df = test_df[test_df.content_type_id == 0]
#         test_df = get_features(test_df, \
#                                dict_user_total_questions, \
#                                dict_user_correct_questions, \
#                                dict_user_prior_question_elapsed_time_sum, \
#                                dict_user_prior_question_had_explanation_sum, \
#                                dict_user_content_total_questions, \
#                                dict_user_previous_ts)
#         test_df = test_df.merge(content_agg, right_index=True, left_on='content_id', how='left')
#         test_df['answered_correctly'] =  model.predict(test_df[features])
    
#     test_df = test_df[test_df.content_type_id == 0]
#     set_predict(test_df.loc[:,['row_id', 'answered_correctly']])
#     pbar.update(len(test_df))