## Importing required modules

In [1]:
import pandas as pd
import numpy as np
import pickle
import gc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from tqdm.notebook import tqdm
import lightgbm as lgb
import dill

## Reading dataset
The train and validation files come from [here](https://www.kaggle.com/its7171/riiid-cross-validation-files). The **community** feature comes from this great [kernel](https://www.kaggle.com/spacelx/2020-r3id-clustering-question-tags), in which questions were clustered into communities using the information provided by their tags.

In [2]:
# Locations of the input files used by this notebook
train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
question_file = '../input/riiid-test-answer-prediction/questions.csv'
lecture_file = '../input/riiid-test-answer-prediction/lectures.csv'
# Unlike the paths above, the community table is read right away
community = pd.read_csv('../input/2020-r3id-clustering-question-tags/question_cmnts.csv')

In [3]:
questions = pd.read_csv(question_file)
# Append the community label as an extra column. concat(axis = 1) aligns on the row index,
# so this assumes question_cmnts.csv lists the questions in the same row order as questions.csv — TODO confirm
questions = pd.concat([questions, community.community], axis = 1)
questions.head()

Unnamed: 0,question_id,bundle_id,correct_answer,part,tags,community
0,0,0,0,1,51 131 162 38,2
1,1,1,1,1,131 36 81,2
2,2,2,0,1,131 101 162 92,2
3,3,3,0,1,131 149 162 29,2
4,4,4,3,1,131 5 162 38,2


Let us see the maximum number of tags associated with a single question

In [4]:
# maximum no. of tags for a question
# Questions without tags hold a missing value in `tags`; dropna() skips them
# instead of comparing the string form of each value against 'nan'.
max_tags = max((len(tags.split()) for tags in questions.tags.dropna()), default = 0)
max_tags

6

In [5]:
questions.correct_answer.unique()

array([0, 1, 3, 2])

In [6]:
lectures = pd.read_csv(lecture_file)
lectures.head()

Unnamed: 0,lecture_id,tag,part,type_of
0,89,159,5,concept
1,100,70,1,concept
2,185,45,6,concept
3,192,79,5,solving question
4,317,156,5,solving question


In [7]:
# Index both lookup tables by their id column so rows can be fetched by id
questions = questions.set_index('question_id')
lectures = lectures.set_index('lecture_id')

In [8]:
# Discard the question columns that are not used as features
questions = questions.drop(columns = ['bundle_id', 'tags'])
gc.collect()

0

In [9]:
questions = questions.astype('int8')

As we can see, the **type_of** lecture carries some important information. So I encoded them into numerical form for further use

In [10]:
enc = LabelEncoder()
enc.fit(lectures.type_of)

LabelEncoder()

In [11]:
lectures['type'] = enc.transform(lectures.type_of)
lectures.head()

Unnamed: 0_level_0,tag,part,type_of,type
lecture_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
89,159,5,concept,0
100,70,1,concept,0
185,45,6,concept,0
192,79,5,solving question,2
317,156,5,solving question,2


In [12]:
# Drop the tag and the raw type_of label, then downcast the remaining columns to int8
lectures = lectures.drop(columns = ['tag', 'type_of']).astype('int8')

## Feature engineering

The **add_features** function takes a Pandas DataFrame as input and computes features based **only on past information**. If `update` is set to `True`, it also updates the **dynamic feature accumulation dictionaries** with the rows it has just processed.

In [13]:
def add_features(df, update = True):
    """Append user / question / lecture features computed from the accumulator dictionaries.

    Rows are processed in the order they appear in ``df``. For every row the features
    are first read from the module-level accumulator dictionaries (i.e. from whatever
    earlier rows have contributed); only afterwards, and only if ``update`` is true,
    is the row itself folded into those dictionaries.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user_id', 'answered_correctly', 'content_id',
        'user_answer', 'prior_question_had_explanation', 'timestamp',
        'content_type_id' and 'prior_question_elapsed_time'.
    update : bool, default True
        When true, the accumulator dictionaries are mutated with each processed row.
        When false, they are only read (missing keys may still be created with their
        default value, because the dictionaries are defaultdicts).

    Returns
    -------
    pandas.DataFrame
        ``df`` with the feature columns appended. The feature columns are built on
        ``df``'s own index, so the result has exactly one row per input row even when
        ``df`` does not carry a 0..n-1 index.
    """
    n_rows = len(df)

    # Student features
    answered_correctly_u_avg = np.zeros(n_rows, dtype = np.float32)
    explanation_u_avg = np.zeros(n_rows, dtype = np.float32)
    timestamp_u_recency_1 = np.zeros(n_rows, dtype = np.float32)
    timestamp_u_recency_2 = np.zeros(n_rows, dtype = np.float32)
    timestamp_u_recency_3 = np.zeros(n_rows, dtype = np.float32)
    timestamp_u_incorrect_recency = np.zeros(n_rows, dtype = np.float32)
    answer_freq_u = [np.zeros(n_rows, dtype = np.int16) for i in range(4)]
    # recency_k is the time since the k-th most recent stored timestamp
    recency_arrays = (timestamp_u_recency_1, timestamp_u_recency_2, timestamp_u_recency_3)

    # Question features
    answered_correctly_q_avg = np.zeros(n_rows, dtype = np.float32)
    explanation_q_avg = np.zeros(n_rows, dtype = np.float32)
    elapsed_time_q_avg = np.zeros(n_rows, dtype = np.float32)
    comm = np.zeros(n_rows, dtype = np.int8)

    # Lecture features
    num_lect = np.zeros(n_rows, dtype = np.int16)
    part = np.zeros(n_rows, dtype = np.int8)

    # User Question features
    answered_correctly_uq_count = np.zeros(n_rows, dtype = np.int32)

    feature_cols = ['user_id', 'answered_correctly', 'content_id', 'user_answer', 'prior_question_had_explanation', 'timestamp', 'content_type_id', 'prior_question_elapsed_time']
    for num, row in enumerate(tqdm(df[feature_cols].values)):
        user, correct, content, answer, had_explanation, timestamp, content_type, elapsed = row

        # Student features assignation
        num_lect[num] = lecture_count[user]

        # Assigning values to average accuracy of the student, and % explanations seen by the student till now
        n_answered = answered_correctly_u_count[user]
        if n_answered != 0:
            answered_correctly_u_avg[num] = answered_correctly_u_sum[user] / n_answered
            explanation_u_avg[num] = explanation_u_sum[user] / n_answered
        else:
            answered_correctly_u_avg[num] = np.nan
            explanation_u_avg[num] = np.nan

        # Assigning values to time recencies which are time elapsed since the last stored timestamps.
        # The history list is kept at 3 entries at most (see the update step below);
        # slots for which no timestamp exists yet are NaN.
        history = timestamp_u[user]
        for back, recency in enumerate(recency_arrays, start = 1):
            recency[num] = timestamp - history[-back] if back <= len(history) else np.nan

        # Assigning value to incorrect recency which is the time elapsed since last wrong answer given by the student
        wrong_history = timestamp_u_incorrect[user]
        timestamp_u_incorrect_recency[num] = timestamp - wrong_history[0] if wrong_history else np.nan

        # Assigning values to frequencies of each answer given by the student
        user_answers = answer_u_count[user]
        for i in range(4):
            answer_freq_u[i][num] = user_answers[i]

        # Question features assignation (looked up by content_id for every row, before the row type is checked)
        # Assigning values to average accuracy, % explanations seen, and average time elapsed for the question till now
        n_asked = answered_correctly_q_count[content]
        if n_asked != 0:
            answered_correctly_q_avg[num] = answered_correctly_q_sum[content] / n_asked
            elapsed_time_q_avg[num] = elapsed_time_q_sum[content] / n_asked
            explanation_q_avg[num] = explanation_q_sum[content] / n_asked
        else:
            answered_correctly_q_avg[num] = np.nan
            elapsed_time_q_avg[num] = np.nan
            explanation_q_avg[num] = np.nan

        # Assigning the number of times this student answered this question till now
        user_questions = answered_correctly_uq[user]
        answered_correctly_uq_count[num] = user_questions[content]

        # If the user interaction is a question
        if content_type == 0:
            # Assigning part and community of the question
            comm[num] = questions.community[content]
            part[num] = questions.part[content]
            # Flag for training and inference
            if update:
                # Student features updates
                answered_correctly_u_count[user] += 1
                explanation_u_sum[user] += int(had_explanation)
                # Keep only the three most recent timestamps
                if len(history) == 3:
                    history.pop(0)
                history.append(timestamp)
                answered_correctly_u_sum[user] += correct
                if correct == 0:
                    # Keep only the most recent wrong-answer timestamp
                    if len(wrong_history) == 1:
                        wrong_history.pop(0)
                    wrong_history.append(timestamp)
                user_answers[answer] += 1

                # Question features updates
                answered_correctly_q_sum[content] += correct
                elapsed_time_q_sum[content] += elapsed
                answered_correctly_q_count[content] += 1
                explanation_q_sum[content] += int(had_explanation)

                user_questions[content] += 1
        elif update:
            # If the user interaction is a lecture
            lecture_count[user] += 1

    # Build the feature frame on df's own index: pd.concat(axis = 1) aligns on index labels,
    # so a default 0..n-1 index here would misalign (and pad with NaN) any df whose index differs.
    user_df = pd.DataFrame({'answered_correctly_u_avg': answered_correctly_u_avg, 'explanation_u_avg': explanation_u_avg, 'part' : part, 'community' : comm,
                            'answered_correctly_q_avg': answered_correctly_q_avg, 'explanation_q_avg': explanation_q_avg, 'elapsed_time_q_avg' : elapsed_time_q_avg,
                            'timestamp_u_recency_1': timestamp_u_recency_1, 'timestamp_u_recency_2': timestamp_u_recency_2,
                            'timestamp_u_recency_3': timestamp_u_recency_3, 'timestamp_u_incorrect_recency': timestamp_u_incorrect_recency,
                            'num_lectures' : num_lect, 'num_0' : answer_freq_u[0], 'num_1' : answer_freq_u[1],
                            'num_2' : answer_freq_u[2], 'num_3' : answer_freq_u[3], 'answered_correctly_uq_count': answered_correctly_uq_count,},
                           index = df.index)

    # Merging calculated features with the DataFrame
    df = pd.concat([df, user_df], axis = 1)
    return df

In [14]:
def update_features(df):
    """Fold the rows of ``df`` into the module-level accumulator dictionaries.

    Applies the same state updates that ``add_features(..., update = True)`` performs,
    without computing any feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'user_id', 'answered_correctly', 'content_id',
        'content_type_id', 'timestamp', 'user_answer',
        'prior_question_had_explanation' and 'prior_question_elapsed_time'.

    Returns
    -------
    None
        The accumulator dictionaries are mutated in place.
    """
    state_cols = ['user_id', 'answered_correctly', 'content_id', 'content_type_id', 'timestamp', 'user_answer', 'prior_question_had_explanation', 'prior_question_elapsed_time']
    for row in df[state_cols].values:
        user, correct, content, content_type, timestamp, answer, had_explanation, elapsed = row
        if content_type == 0:
            # ------------------------------------------------------------------
            # Client features updates
            answered_correctly_u_count[user] += 1
            answered_correctly_u_sum[user] += correct
            explanation_u_sum[user] += int(had_explanation)
            # The timestamp history advances on every answered question (as in add_features),
            # not only on incorrect answers; keep the three most recent entries.
            history = timestamp_u[user]
            if len(history) == 3:
                history.pop(0)
            history.append(timestamp)
            if correct == 0:
                # Keep only the most recent wrong-answer timestamp
                wrong_history = timestamp_u_incorrect[user]
                if len(wrong_history) == 1:
                    wrong_history.pop(0)
                wrong_history.append(timestamp)
            answer_u_count[user][answer] += 1
            # ------------------------------------------------------------------
            # Question features updates
            answered_correctly_q_sum[content] += correct
            answered_correctly_q_count[content] += 1
            elapsed_time_q_sum[content] += elapsed
            explanation_q_sum[content] += int(had_explanation)
            # ------------------------------------------------------------------
            answered_correctly_uq[user][content] += 1
        else:
            lecture_count[user] += 1
    return


In [15]:
# read data
feld_needed = ['user_id', 'content_id', 'content_type_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'user_answer','timestamp']
# Keep only the first int(98730332/3) rows (98730332 is presumably the row count of the train pickle — TODO confirm)
train = pd.read_pickle(train_pickle)[feld_needed].iloc[:int(98730332/3)]
# changing dtype to avoid lightgbm error
# Value used to fill missing prior_question_elapsed_time entries
prior_question_elapsed_time_mean = 25439.41
train['prior_question_had_explanation'] = train.prior_question_had_explanation.fillna(False).astype('int8')
# Assign the filled column back instead of calling fillna(inplace = True) on train[...]:
# the chained in-place form acts on an intermediate object and is not guaranteed to modify train.
train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
# add_features concatenates its feature columns row by row, so give train a clean 0..n-1 index
train = train.reset_index(drop = True)
gc.collect()

13

In [16]:
def dd():
    """Return a new, empty ``defaultdict`` whose missing keys start at 0."""
    counter = defaultdict(int)
    return counter

### Dynamic Feature accumulation 
**answered_correctly_u_count** : Number of questions answered by each student 

**answered_correctly_u_sum** : Number of questions answered correctly by each student 

**explanation_u_sum** : Number of explanations seen by each student 

**timestamp_u** : Timestamps of last 3 user interactions of each student

**timestamp_u_incorrect** : Timestamp of last incorrectly answered question of each student

**lecture_count** : Number of lectures attended by each student 

**answer_u_count** : Frequency of each answer given by each student

**answered_correctly_q_count** : Number of times each question is answered 

**answered_correctly_q_sum** : Number of times each question is answered correctly

**elapsed_time_q_sum** : Total time taken to answer all occurrences of each question

**explanation_q_sum** : Number of times an explanation is seen after answering each question

**answered_correctly_uq** : Number of times each question is answered by each student 

In [17]:
# Student dictionaries
answered_correctly_u_count = defaultdict(int)
answered_correctly_u_sum = defaultdict(int)
explanation_u_sum = defaultdict(int)
timestamp_u = defaultdict(list)
timestamp_u_incorrect = defaultdict(list)
lecture_count = defaultdict(int)
answer_u_count = defaultdict(dd)

# Question dictionaries
answered_correctly_q_count = defaultdict(int)
answered_correctly_q_sum = defaultdict(int)
elapsed_time_q_sum = defaultdict(int)
explanation_q_sum = defaultdict(int)

answered_correctly_uq = defaultdict(dd)

In [18]:
train = add_features(train)
gc.collect()

HBox(children=(FloatProgress(value=0.0, max=32910110.0), HTML(value='')))




3

Removing rows which are lectures as they cannot be used for training. After that I removed some columns which don't have any significance by themselves

In [19]:
# Keep only question rows (content_type_id is falsy for questions) and renumber the index
train = train[train.content_type_id == False].reset_index(drop = True)

In [20]:
# These raw columns were only needed to build the features above
train = train.drop(columns = ['content_type_id', 'user_answer', 'timestamp'])
gc.collect()

20

In [21]:
train.columns

Index(['user_id', 'content_id', 'answered_correctly',
       'prior_question_elapsed_time', 'prior_question_had_explanation',
       'answered_correctly_u_avg', 'explanation_u_avg', 'part', 'community',
       'answered_correctly_q_avg', 'explanation_q_avg', 'elapsed_time_q_avg',
       'timestamp_u_recency_1', 'timestamp_u_recency_2',
       'timestamp_u_recency_3', 'timestamp_u_incorrect_recency',
       'num_lectures', 'num_0', 'num_1', 'num_2', 'num_3',
       'answered_correctly_uq_count'],
      dtype='object')

In [22]:
train.dtypes

user_id                             int32
content_id                          int16
answered_correctly                   int8
prior_question_elapsed_time       float32
prior_question_had_explanation       int8
answered_correctly_u_avg          float32
explanation_u_avg                 float32
part                                 int8
community                            int8
answered_correctly_q_avg          float32
explanation_q_avg                 float32
elapsed_time_q_avg                float32
timestamp_u_recency_1             float32
timestamp_u_recency_2             float32
timestamp_u_recency_3             float32
timestamp_u_incorrect_recency     float32
num_lectures                        int16
num_0                               int16
num_1                               int16
num_2                               int16
num_3                               int16
answered_correctly_uq_count         int32
dtype: object

Saving the train DataFrame as feather and all the feature accumulated dictionaries as pickles

In [23]:
def save_obj(obj, name ):
    """Pickle ``obj`` to ``./<name>.pkl`` using the highest available pickle protocol."""
    target = ''.join(['./', name, '.pkl'])
    with open(target, 'wb') as handle:
        pickle.dump(obj, handle, protocol = pickle.HIGHEST_PROTOCOL)

def load_obj(name ):
    """Load and return the object pickled at ``./<name>.pkl``.

    Unpickling can execute arbitrary code, so only use this on files
    this notebook (or another trusted source) has written.
    """
    source = ''.join(['./', name, '.pkl'])
    with open(source, 'rb') as handle:
        restored = pickle.load(handle)
    return restored

In [24]:
train.to_feather('riiid_train1.feather')

In [25]:
%%time
save_obj(answered_correctly_u_count, 'answered_correctly_u_count')
save_obj(answered_correctly_u_sum, 'answered_correctly_u_sum')
save_obj(explanation_u_sum, 'explanation_u_sum')
save_obj(timestamp_u, 'timestamp_u')
save_obj(timestamp_u_incorrect, 'timestamp_u_incorrect')
save_obj(lecture_count, 'lecture_count')
save_obj(answer_u_count, 'answer_u_count')
save_obj(answered_correctly_q_count, 'answered_correctly_q_count')
save_obj(answered_correctly_q_sum, 'answered_correctly_q_sum')
save_obj(elapsed_time_q_sum, 'elapsed_time_q_sum')
save_obj(explanation_q_sum, 'explanation_q_sum')
save_obj(answered_correctly_uq, 'answered_correctly_uq')

CPU times: user 3min 4s, sys: 10.7 s, total: 3min 14s
Wall time: 3min 14s
