# Part I: Feature Engineering

In [None]:
import numpy as np
from numpy import savez_compressed
import  pandas as pd
pd.set_option('display.max_columns', 50)
import os
# exist_ok=True keeps this cell re-runnable: without it a second run raises FileExistsError.
os.makedirs('./compressed_features', exist_ok=True)
import gc

## Helper functions

In [None]:
def np_compress(feature):
    """Cast one column of ``train_df`` to its storage dtype and save it compressed.

    Parameters
    ----------
    feature : str
        Column name in the global ``train_df``; also the key looked up in
        ``np_dict`` for the target dtype and the stem of the output file.

    The array is written to ``./compressed_features/<feature>.npz`` under the
    archive key ``data``. Its length and contents are printed as a sanity check.

    NOTE(review): ``load_feature`` reads the archive key ``'v'`` from a different
    directory, so files written here are not readable by it as-is - confirm
    whether that is intended.
    """
    # The original pre-allocated an (n_rows, 1) array that was overwritten on
    # the next line; that allocation only cost memory and is dropped.
    data = train_df[feature].values.astype(np_dict[feature])
    print(f'The length of {feature}: {len(data)}')
    print(data)
    np.savez_compressed(f'./compressed_features/{feature}', data = data)

# Default index length used when building a new DataFrame from stored features
# (presumably the row count of the stored feature arrays - verify against the dataset).
N_ROWS = 99271300
def get_index_np(n_rows=None):
    """Return the integer index ``0 .. n_rows - 1`` as a numpy array.

    Parameters
    ----------
    n_rows : int, optional
        Length of the index. Defaults to the module-level ``N_ROWS``, which is
        what the original zero-argument call produced.
    """
    return np.arange(N_ROWS if n_rows is None else n_rows)

def load_feature(feature, idxs=None, dtype=None):
    """Load one stored feature array, optionally selecting rows and casting.

    Parameters
    ----------
    feature : str
        File stem of the ``.npz`` archive inside the FEATURES_V1G input directory.
    idxs : array-like, optional
        Row indices to keep (e.g. the training or validation rows).
    dtype : numpy dtype, optional
        dtype the returned array is cast to (e.g. ``np.float32``).

    Returns
    -------
    numpy.ndarray
        The array stored under the archive key ``'v'``, after the optional row
        selection and cast.

    The original returned ``None`` without warning when only one of ``idxs`` /
    ``dtype`` was passed; both are now applied independently.
    """
    file_path = f'/kaggle/input/riiid-answer-correctness-prediction-features-temp/FEATURES_V1G/{feature}.npz'
    # NOTE(review): allow_pickle=True permits unpickling object arrays, which
    # can execute code from the file - only use with a trusted dataset.
    # The context manager closes the archive handle once the array is read.
    with np.load(file_path, allow_pickle=True) as archive:
        values = archive['v']
    if idxs is not None:
        values = values[idxs]
    if dtype is not None:
        values = values.astype(dtype)
    return values

In [None]:
# Name of the label column.
target = 'answered_correctly'

# dtypes handed to pd.read_csv for the raw train.csv columns
data_types_dict = dict(
    timestamp='int64',
    user_id='int32',
    content_id='int16',
    task_container_id='int16',
    answered_correctly='int8',
    prior_question_elapsed_time='float32',
)

# dtypes listed per engineered feature
features_dtypes = dict(
    mean_user_accuracy='float32',
    answered_user='uint16',
    mean_content_accuracy='float32',
    content_count='int32',
    user_attempts='uint16',
    user_rating='float32',
    mean_user_part_accuracy='float32',
    part_cumcount='uint16',
    last_interaction_elapsed_time_l1='float64',
    last_interaction_elapsed_time_l2='float64',
    last_interaction_elapsed_time_l3='float64',
    prior_tag='int16',
)

# numpy dtype each column is cast to by np_compress before it is written to disk
np_dict = {
    'timestamp':np.float32,
    'user_id':np.int32,
    'content_id':np.int16,
    'task_container_id':np.int16,
    'answered_correctly':np.int8,
    'mean_user_accuracy':np.float32,
    'answered_user':np.int16,
    'answered_correctly_user':np.int16,
    'user_attempts':np.int16,
    'hmean_user_content_accuracy':np.float32,
    'mean_content_accuracy':np.float32,
    'content_count':np.float32,
    'user_rating':np.float32,
    'part':np.int8,
    # int16 instead of int8: int8 tops out at 127 and larger tag codes would
    # wrap to negative values on the cast.
    # NOTE(review): the Riiid questions.csv tag codes are understood to run
    # 0-187 - confirm against the data.
    'tags1':np.int16,
    'tags2':np.int16,
    'mean_user_part_accuracy':np.float32,
    'last_interaction_elapsed_time_l1':np.float32,
    'last_interaction_elapsed_time_l2':np.float32,
    'last_interaction_elapsed_time_l3':np.float32,
    'last_correct_time_elapsed':np.float32,
    'last_incorrect_time_elapsed':np.float32,
    'prior_question_elapsed_time':np.float32,
}

## Load data

In [None]:
# Read the first million rows of train.csv, restricted to the columns that
# data_types_dict declares (its keys are exactly the columns needed here).
train_df = pd.read_csv(
    '/kaggle/input/riiid-test-answer-prediction/train.csv',
    usecols=list(data_types_dict),
    dtype=data_types_dict,
    nrows=1_000_000,
    low_memory=False,
)

# Drop rows whose target is -1 (presumably lectures, which carry no answer -
# verify) and renumber the index from 0.
train_df = train_df.loc[train_df[target].ne(-1)].reset_index(drop=True)

In [None]:
# Replace missing elapsed times with the constant 23916.
# Assigning the result back (instead of a chained inplace fillna on the column
# selection) also works when pandas Copy-on-Write is enabled.
train_df['prior_question_elapsed_time'] = train_df['prior_question_elapsed_time'].fillna(23916)
np_compress('prior_question_elapsed_time')

* **last_interaction_elapsed_time_l1**
* **last_interaction_elapsed_time_l2**
* **last_interaction_elapsed_time_l3**

In [None]:
# Rescale the timestamp by 1 / (1000 * 3600) (milliseconds -> hours, assuming
# the raw column is in milliseconds) and store it as float32.
train_df['timestamp'] = (train_df['timestamp'] / (1000 * 3600)).astype('float32')

# First row of every (user, task container) pair. The explicit copy makes this
# an independent frame, so adding columns below does not write into a slice of
# train_df (SettingWithCopyWarning).
timestamp_df = train_df.groupby(['user_id', 'task_container_id']).head(1)[['user_id', 'task_container_id', 'timestamp']].copy()

# NOTE(review): these columns hold the user's container timestamp lagged by
# 1 / 2 / 3 containers, not a difference to the current timestamp, although the
# names say "elapsed time" and get_user_data stores differences at prediction
# time. Confirm which definition is intended before changing it.
timestamp_df['last_interaction_elapsed_time_l1'] = timestamp_df.groupby('user_id')['timestamp'].shift(1)
timestamp_df['last_interaction_elapsed_time_l2'] = timestamp_df.groupby('user_id')['timestamp'].shift(2)
timestamp_df['last_interaction_elapsed_time_l3'] = timestamp_df.groupby('user_id')['timestamp'].shift(3)

# Broadcast the per-container values back onto every row of the container.
timestamp_df = timestamp_df.drop(columns=['timestamp'])
train_df = pd.merge(train_df, timestamp_df, on=['user_id', 'task_container_id'], how='left')

# Column means are taken before filling, so the fill value is not influenced by itself.
time_diff1_mean = train_df['last_interaction_elapsed_time_l1'].mean()
time_diff2_mean = train_df['last_interaction_elapsed_time_l2'].mean()
time_diff3_mean = train_df['last_interaction_elapsed_time_l3'].mean()

# Results are assigned back instead of using a chained inplace fillna, which is
# a silent no-op when pandas Copy-on-Write is enabled.
train_df['timestamp'] = train_df['timestamp'].fillna(0)
train_df['last_interaction_elapsed_time_l1'] = train_df['last_interaction_elapsed_time_l1'].fillna(time_diff1_mean)
train_df['last_interaction_elapsed_time_l2'] = train_df['last_interaction_elapsed_time_l2'].fillna(time_diff2_mean)
train_df['last_interaction_elapsed_time_l3'] = train_df['last_interaction_elapsed_time_l3'].fillna(time_diff3_mean)

np_compress('last_interaction_elapsed_time_l1')
np_compress('last_interaction_elapsed_time_l2')
np_compress('last_interaction_elapsed_time_l3')
np_compress('timestamp')

* **user_attempts**

In [None]:
# 0-based number of earlier rows in which the same user saw the same content,
# capped at 4 (i.e. "5th attempt or later" shares one value).
train_df["user_attempts"] = train_df.groupby(["user_id", "content_id"]).cumcount().clip(upper=4)
print(train_df['user_attempts'].value_counts())
np_compress('user_attempts')

* **mean_user_accuracy**
* **answered_correctly_user**
* **answered_user**

In [None]:
# 'lag' is the user's previous answer, so every running statistic below only
# uses answers given before the current row.
train_df['lag'] = train_df.groupby('user_id')[target].shift()
lag_by_user = train_df.groupby('user_id')['lag']

# Running number of correct answers and running number of answered questions
# per user; kept in `cum` because a later cell joins it onto train_df.
cum = pd.DataFrame({
    'answered_correctly_user': lag_by_user.cumsum(),
    'answered_user': lag_by_user.cumcount(),
})

# The first row of each user has no history (NaN / 0) and gets the constant 0.680.
# The filled result is assigned directly instead of a chained inplace fillna,
# which is a silent no-op when pandas Copy-on-Write is enabled.
train_df['mean_user_accuracy'] = (cum['answered_correctly_user'] / cum['answered_user']).fillna(0.680)
train_df.drop(columns=['lag'], inplace=True)
del lag_by_user

* **tags1**
* **tags2**
* **mean_content_accuracy**
* **content_count**

In [None]:
questions_df = pd.read_csv('../input/riiid-test-answer-prediction/questions.csv',
                           usecols=['question_id', 'part', 'tags'],
                           dtype={'question_id': 'int16', 'part': 'int8', 'tags': 'str'}
                           )

# Split the space-separated tag string and keep the first two tag columns.
# reindex guarantees exactly two columns regardless of how many tags the
# longest question has (the original hard-coded six column names).
tag = questions_df["tags"].str.split(" ", n = 10, expand = True).reindex(columns=[0, 1])
tag.columns = ['tags1', 'tags2']

questions_df = pd.concat([questions_df.drop(columns=['tags']), tag], axis=1)
# Missing tags become -1 in both columns; the original filled only tags2, which
# left tags1 NaNs to be cast to an integer dtype by np_compress.
for tag_col in ('tags1', 'tags2'):
    questions_df[tag_col] = pd.to_numeric(questions_df[tag_col], errors='coerce').fillna(-1)
del tag

# Mean correctness and answer count per content_id over the loaded train rows.
# The join matches on question_id explicitly rather than relying on the row
# position of questions_df being equal to question_id.
content_stats = train_df.groupby('content_id')[target].agg(['mean', 'count'])
content_stats.columns = ['mean_content_accuracy', 'content_count']
questions_df = questions_df.join(content_stats, on='question_id')

train_df = pd.merge(train_df, questions_df, left_on='content_id', right_on='question_id', how='left')
train_df.drop(columns=['question_id'], inplace=True)
np_compress('tags1')
np_compress('tags2')
np_compress('content_count')
del questions_df, content_stats, tag_col

# Attach the running per-user counters computed in the mean_user_accuracy cell.
train_df = train_df.join(cum)
# Assigned back instead of a chained inplace fillna (no-op under Copy-on-Write).
train_df['answered_correctly_user'] = train_df['answered_correctly_user'].fillna(0)
train_df['answered_user'] = train_df['answered_user'].fillna(0.0)
np_compress('answered_correctly_user')
del(cum)

* **part**
* **mean_user_part_accuracy**

In [None]:
# Previous answer of the same user within the same part.
part_keys = ['user_id', 'part']
train_df['lag'] = train_df.groupby(part_keys)[target].shift()

# Running correct-answer sum and running row count per (user, part),
# both based on the lagged answers only.
lag_by_user_part = train_df.groupby(part_keys)['lag']
running_correct = lag_by_user_part.cumsum()
running_count = lag_by_user_part.cumcount()

train_df['cumcount_p'] = running_count
# The first question of a user in a part has no history and stays NaN.
train_df['mean_user_part_accuracy'] = running_correct / running_count
train_df.drop(columns=['lag'], inplace=True)

np_compress('part')
np_compress('mean_user_part_accuracy')
del part_keys, lag_by_user_part, running_correct, running_count

* **user_rating**

In [None]:
# Per-row surplus: 1/0 answer minus the mean accuracy of that content.
train_df['user_rating'] = train_df['answered_correctly'] - train_df['mean_content_accuracy']
# Lag by one row per user, then accumulate, so a row only sees earlier answers.
train_df['user_rating'] = train_df.groupby('user_id')['user_rating'].shift()
train_df['user_rating'] = train_df.groupby('user_id')['user_rating'].cumsum()

# Use the value at the first row of each (user, task container) for every row
# of that container.
df_ = train_df.groupby(['user_id', 'task_container_id']).head(1)[['user_id', 'task_container_id', 'user_rating']]
train_df.drop(columns=['user_rating'], inplace=True)
train_df = pd.merge(train_df, df_, on=['user_id', 'task_container_id'], how='left')

# Normalise by the number of questions answered so far; rows without history
# (NaN) become 0. The result is assigned directly instead of a chained inplace
# fillna, which is a silent no-op when pandas Copy-on-Write is enabled.
train_df['user_rating'] = (train_df['user_rating'] / train_df['answered_user']).fillna(0)

np_compress('user_rating')
np_compress('task_container_id')
np_compress('answered_user')
del df_

* **hmean_user_content_accuracy**

In [None]:
# Harmonic mean of the user's accuracy and the content's accuracy.
# 0 / 0 (both accuracies zero) yields NaN, which is replaced by 0.0. The filled
# result is assigned directly instead of a chained inplace fillna, which is a
# silent no-op when pandas Copy-on-Write is enabled.
accuracy_product = train_df['mean_user_accuracy'] * train_df['mean_content_accuracy']
accuracy_sum = train_df['mean_user_accuracy'] + train_df['mean_content_accuracy']
train_df['hmean_user_content_accuracy'] = (2 * (accuracy_product / accuracy_sum)).fillna(0.0)
del accuracy_product, accuracy_sum

np_compress('hmean_user_content_accuracy')
np_compress('mean_content_accuracy')
np_compress('mean_user_accuracy')

In [None]:
# Store the identifier columns and the label alongside the engineered features.
for key_column in ('user_id', 'content_id', 'answered_correctly'):
    np_compress(key_column)

In [None]:
# Number of rows left in the training frame after filtering.
n_rows = train_df.shape[0]
n_rows

In [None]:
# Count of missing values per column.
train_df.isna().sum()

In [None]:
# The deletion of train_df is left disabled, so the frame is still available
# for the display in the next cell; only unreferenced objects are collected.
# del(train_df)
gc.collect()

In [None]:
# Display the training frame with all engineered columns.
train_df

# Part II: Training

In [None]:
import ast
import pickle
import re
from glob import glob
from io import BytesIO

import lightgbm as lgb
import matplotlib.pyplot as plt
import seaborn as sns
from cairosvg import svg2png
from PIL import Image
from tqdm.notebook import tqdm

In [None]:
# Print the name of every feature file written to ./compressed_features.
# The pattern is a raw string with an escaped dot: the original non-raw string
# contained the invalid escape "\/" and let "." match any character.
feature_file_pattern = re.compile(r'(?<=./)([a-z_0-9]*)(\.npz)')
for file_path in glob('./compressed_features/*.npz'):
    print(feature_file_pattern.findall(file_path)[0][0])

In [None]:
# These are the given features; bundle_id is retrieved by merging the questions df with the train df
given_features = [
    'prior_question_elapsed_time',
    'prior_question_had_explanation',
    # 'bundle_id',
]

deduced_features = [
    # user features
    'mean_user_accuracy',
    'answered_correctly_user',
    'answered_user',
    # content features
    'mean_content_accuracy',
    'content_count',
    # part features
    'part',
    'mean_user_part_accuracy',
    # tag features
    'tag_1',
    'tag_2',
    # user content features
    'hmean_user_content_accuracy',
    # last interaction elapsed time
    'last_interaction_elapsed_time_l1',
    'last_interaction_elapsed_time_l2',
    'last_interaction_elapsed_time_l3',
    # other
    'attempt',
    # 'user_ratings',

    # lastly added features
    'last_correct_time_elapsed',
    'last_incorrect_time_elapsed',
]

# Column order of the model input: given features first, then deduced ones.
features = given_features + deduced_features

features_df_cols = [
    'user_id', 'content_id', 'part', 'tags', # merge keys
    'tags_label', 'answered_user', # deduced data
    'answered_correctly_user', 'mean_user_accuracy', 'mean_content_accuracy', # deduced features
]

target = 'answered_correctly'

# specify the indices of the columns with categorical features to LightGBM
categorical_feature = ['part', 'retry', 'prior_question_had_explanation', 'bundle_id', 'tag_1', 'tag_2']
# Candidates that are not part of `features` are skipped. An explicit
# membership test replaces the bare `except: pass`, which would also have
# hidden unrelated errors.
categorical_feature_idxs = [features.index(name) for name in categorical_feature if name in features]

## Make train and validation datasets

In [None]:
def get_train_val_idxs(TRAIN_SIZE, VAL_SIZE):
    """Pick row indices for the training and validation sets, user by user.

    Users are visited in a fixed random order. While the validation set holds
    fewer than ``VAL_SIZE`` rows, a user either goes to validation entirely
    (with probability ``NEW_USER_FRAC``) or is cut at a random offset, with the
    rows before the cut going to training and the rest to validation. After
    that, whole users go to training until it exceeds ``TRAIN_SIZE`` rows.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``train_idxs`` and ``val_idxs`` as uint32 arrays.
    """
    NEW_USER_FRAC = 1/4 # fraction of new users, 25% of validation rows are new users
    np.random.seed(42)

    # one array of row indices per user
    df = pd.DataFrame(index=get_index_np())
    df['user_id'] = load_feature('user_id')
    df['index'] = df.index.values.astype(np.uint32)
    rows_per_user = df.groupby('user_id')['index'].apply(np.array)

    train_idxs, val_idxs = [], []
    # iterate over users in random order
    for indices in rows_per_user.sample(rows_per_user.size, random_state=42):
        if len(train_idxs) > TRAIN_SIZE:
            break

        # validation is full: the whole user goes to training
        if len(val_idxs) >= VAL_SIZE:
            train_idxs.extend(indices)
            continue

        if np.random.rand() < NEW_USER_FRAC:
            # user appears in validation only
            val_idxs.extend(indices)
        else:
            # rows before the random cut go to training, the rest to validation
            offset = np.random.randint(0, indices.size)
            train_idxs.extend(indices[:offset])
            val_idxs.extend(indices[offset:])

    return np.array(train_idxs, dtype=np.uint32), np.array(val_idxs, dtype=np.uint32)

train_idxs, val_idxs = get_train_val_idxs(int(1e6), 0.25e6)
print(f'len train_idxs: {len(train_idxs)}, len validation_idxs: {len(val_idxs)}')

In [None]:
def make_x_y(train_idxs, val_idxs):
    """Assemble float32 feature matrices and int8 targets for both splits.

    Each feature is read from its ``.npz`` file and written into one column of
    the pre-allocated matrices, so only a single feature array is in memory in
    addition to the matrices at any time.

    Returns
    -------
    X_train, y_train, X_val, y_val
    """
    n_features = len(features)
    # uninitialised buffers; every column is overwritten in the loop below
    X_train = np.empty((len(train_idxs), n_features), dtype=np.float32)
    X_val = np.empty((len(val_idxs), n_features), dtype=np.float32)

    for col, feature in enumerate(tqdm(features)):
        X_train[:, col] = load_feature(feature, train_idxs, np.float32)
        X_val[:, col] = load_feature(feature, val_idxs, np.float32)

    y_train = load_feature(target, train_idxs, np.int8)
    y_val = load_feature(target, val_idxs, np.int8)

    return X_train, y_train, X_val, y_val

X_train, y_train, X_val, y_val = make_x_y(train_idxs, val_idxs)

In [None]:
# Sanity check: each X/y pair of a split should have the same number of rows.
print(f'X_train.shape: {X_train.shape}\t y_train.shape: {y_train.shape}')
print(f'X_val.shape: {X_val.shape}\t y_val.shape: {y_val.shape}')

In [None]:
# Show the first 25 training rows with the feature names as column headers
display(pd.DataFrame(X_train[:25], columns=features))

In [None]:
# Sanity check: show the first 25 target values (answered_correctly)
display(y_train[:25])

In [None]:
# Wrap both splits in LightGBM datasets. No categorical columns are declared
# here; lgb.train receives them in the training cell.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=None)
val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=None)

# The datasets hold what training needs, so the raw arrays and index arrays
# are released to free up RAM.
del X_train, y_train, X_val, y_val, train_idxs, val_idxs
gc.collect()

## Training

In [None]:
%%time

# Simple LightGBM parameters
lgbm_params = dict(
    objective='binary',
    metric=['auc'],
    num_leaves=200,
    learning_rate=0.1,
)

def train():
    """Train the LightGBM model on ``train_data`` and save it to ``model.lgb``.

    Training runs for at most 5000 rounds, evaluates on ``val_data`` and stops
    early after 10 rounds without improvement; progress is logged every 10 rounds.

    NOTE(review): ``early_stopping_rounds``, ``verbose_eval`` and
    ``evals_result`` are keyword arguments of older LightGBM releases; newer
    releases expect callbacks instead - check the installed version.

    Returns
    -------
    (lgb.Booster, dict)
        The trained model and the recorded evaluation history.
    """
    history = {}
    booster = lgb.train(
        lgbm_params,
        train_data,
        num_boost_round=5000,
        valid_sets=[val_data],
        feature_name=features,
        categorical_feature=categorical_feature_idxs,
        early_stopping_rounds=10,
        evals_result=history,
        verbose_eval=10,
    )

    # persist the model so the prediction part can reload it
    booster.save_model('model.lgb')

    return booster, history

model, evals_result = train()

## Training History

In [None]:
def plot_history(evals_result):
    """Plot the AUC recorded per boosting round for every evaluated dataset.

    Parameters
    ----------
    evals_result : dict
        Mapping ``dataset name -> {metric name -> list of scores}`` as filled
        in by ``lgb.train``.

    Nothing is drawn when ``evals_result`` is empty (the original raised a
    NameError in that case).
    """
    for metric in ['auc']:
        histories = {name: scores[metric] for name, scores in evals_result.items()}
        if not histories:
            continue

        plt.figure(figsize=(20,8))

        for name, history in histories.items():
            history_len = len(history)
            x_axis = np.arange(1, history_len + 1)
            plt.plot(x_axis, history, label=name)

        # Tick spacing is derived from the last plotted history: roughly ten
        # ticks per hundred rounds. max(1, ...) avoids a modulo by zero for
        # histories shorter than 100 rounds, where every round gets a tick.
        tick_step = max(1, history_len // 100 * 10)
        x_ticks = [e for e in x_axis if e % tick_step == 0 or e == 1]
        plt.xticks(x_ticks, fontsize=12)
        plt.yticks(fontsize=12)

        plt.title(f'{metric.upper()} History of training', fontsize=18);
        plt.xlabel('EPOCH', fontsize=16)
        plt.ylabel(metric.upper(), fontsize=16)

        # higher-is-better metrics rise to the right, so the legend goes top left
        legend_loc = 'upper left' if metric in ['auc'] else 'upper right'
        plt.legend(loc=legend_loc, fontsize=14)
        plt.grid()
        plt.show()

plot_history(evals_result)

In [None]:
def show_feature_importances(model, importance_type, max_num_features=10**10):
    """Rank the features by importance, save the ranking as CSV and plot it.

    Parameters
    ----------
    model : lgb.Booster
        Trained model providing ``feature_importance``.
    importance_type : str
        Passed through to ``model.feature_importance`` and used in the CSV
        file name and the plot title.
    max_num_features : int
        Only this many top-ranked features are plotted; the CSV always
        contains all of them.
    """
    ranking = pd.DataFrame({'feature': features})
    ranking['value'] = pd.Series(model.feature_importance(importance_type))
    ranking = ranking.sort_values(by='value', ascending=False)
    # the full ranking is written out before it is truncated for the plot
    ranking.to_csv(f'feature_importances_{importance_type}.csv')
    top = ranking[:max_num_features]

    plt.figure(figsize=(18, 8))
    # leave 10% headroom so the value labels fit next to the longest bar
    plt.xlim([0, top.value.max()*1.1])
    plt.title(f'Feature {importance_type}', fontsize=18);
    sns.barplot(data=top, x='value', y='feature', palette='rocket');
    for bar_pos, importance in enumerate(top.value):
        plt.text(importance, bar_pos, "  {:.2e}".format(importance))

show_feature_importances(model, 'gain')
show_feature_importances(model, 'split')

In [None]:
def save_tree_diagraph(model):
    """Render a tree of ``model`` as a PNG, save it and display it.

    The graph is created with split gain and internal count annotations,
    converted from SVG to a 3840 px wide PNG, written to
    ``create_tree_digraph.png`` and shown in the notebook.
    """
    digraph = lgb.create_tree_digraph(model, show_info=['split_gain', 'internal_count'])

    # SVG -> PNG bytes -> PIL image
    png_bytes = svg2png(digraph._repr_svg_(), output_width=3840)
    tree_image = Image.open(BytesIO(png_bytes))

    tree_image.save('create_tree_digraph.png')

    display(tree_image)

save_tree_diagraph(model)

In [None]:
# Remove the train and validation datasets to free memory before the prediction phase.
# The trained model itself stays in `model`.
del train_data, val_data
gc.collect()

# Part III: Prediction

In [None]:
def get_features_questions_df():
    """Build a lookup table with one row per ``content_id``.

    The listed per-row feature arrays are loaded, reduced to the first row seen
    for each ``content_id``, sorted by ``content_id`` and re-indexed from 0.

    Returns
    -------
    pandas.DataFrame
        Columns ``content_id``, ``part``, ``tag_1``, ``tag_2``,
        ``content_count`` and ``bundle_id``.
    """
    question_cols = [
        'content_id',
        'part',
        'tag_1',
        'tag_2',
        'content_count',
        'bundle_id',
    ]

    per_row = pd.DataFrame(index=get_index_np())
    for feature in tqdm(question_cols):
        per_row[feature] = load_feature(feature)

    # keep one row per content, ordered by content_id
    return (
        per_row
        .drop_duplicates(subset='content_id')
        .sort_values('content_id')
        .reset_index(drop=True)
    )

features_questions_df = get_features_questions_df()
print(f'features_questions_df, rows: {features_questions_df.shape[0]}')
display(features_questions_df.head())

## STATE

This next function is the beating heart of my prediction phase, a massive dictionary to keep track of all features of all users and update them with every interaction.

I agree, the code is somewhat unreadable, but the basic idea is as follows:

Compute features over all user data (mean_user_accuracy, answered_user, answered_correctly_user) 
as these features have a lag of 1

Get the last data point for the other features (lectures seen, mean_content_accuracy, etc.)

Create a dictionary where for each user all features are kept track of, an example of a user is shown below the function

In [None]:
def get_state():
    """Build the per-user state dictionary used during prediction.

    The precomputed state (with the per-user attempt counters) is loaded from
    ``state.pkl`` and every user entry is extended with:

    * aggregates over all stored rows of the user (``mean_user_accuracy``,
      ``answered_correctly_user``, ``answered_user``,
      ``mean_content_accuracy_sum``),
    * the values of the user's last stored row (``timestamp``,
      ``last_mean_content_accuracy``, ``last_interaction_elapsed_time_l1..l3``),
    * the timestamp of the user's last correct and last incorrect answer
      (NaN when there is none).

    All per-user values are looked up by ``user_id``. The original matched them
    by enumeration position, which is only correct while the key order of the
    pickled state equals the sorted user order of the groupby results.

    Returns
    -------
    dict
        ``user_id -> dict`` of state values.
    """
    # create DataFrame of features
    features_df = pd.DataFrame(index=get_index_np())

    # last_correct_time_elapsed / last_incorrect_time_elapsed are no longer
    # loaded: they were dropped again without ever reaching the state.
    cols = ['user_id', 'content_id', 'answered_correctly', 'mean_content_accuracy',
            'timestamp', 'last_interaction_elapsed_time_l1', 'last_interaction_elapsed_time_l2', 'last_interaction_elapsed_time_l3']
    for f in tqdm(cols):
        features_df[f] = load_feature(f)

    # values of each user's last stored row
    last_features = features_df.groupby('user_id')[['timestamp', 'mean_content_accuracy',
                                                    'last_interaction_elapsed_time_l1', 'last_interaction_elapsed_time_l2',
                                                    'last_interaction_elapsed_time_l3']].last()

    # last timestamp per (user, answered_correctly) pair
    last_correct_features = features_df.groupby(['user_id', 'answered_correctly'])['timestamp'].last()

    # drop features only used for last feature computation
    features_df.drop(['timestamp', 'last_interaction_elapsed_time_l1', 'last_interaction_elapsed_time_l2',
                      'last_interaction_elapsed_time_l3'], axis=1, inplace=True)

    # compute user features over all train data, kept as Series indexed by user_id
    answers_by_user = features_df.groupby('user_id')['answered_correctly']
    mean_user_accuracy = answers_by_user.mean().astype(np.float32)
    answered_correctly_user = answers_by_user.sum().astype(np.uint16)
    answered_user = answers_by_user.count().astype(np.uint16)
    # user_mean_content_accuracy_sum for computing mean_user_content_accuracy
    mean_content_accuracy_sum = features_df.groupby('user_id')['mean_content_accuracy'].sum()

    del answers_by_user, features_df
    gc.collect()

    # get state with precomputed attempts
    # NOTE(review): pickle.load can execute arbitrary code contained in the
    # file - only load this from a trusted dataset.
    with open('/kaggle/input/riiid-answer-correctness-prediction-features/state.pkl', 'rb') as state_pickle_file:
        state = pickle.load(state_pickle_file)

    # add all features to state
    for user_id in tqdm(state.keys(), total=len(state)):
        user_state = state[user_id]
        user_state['mean_user_accuracy'] = mean_user_accuracy.loc[user_id]
        user_state['answered_correctly_user'] = answered_correctly_user.loc[user_id]
        user_state['answered_user'] = answered_user.loc[user_id]
        user_state['mean_content_accuracy_sum'] = mean_content_accuracy_sum.loc[user_id]
        # last features
        user_state['timestamp'] = last_features.loc[user_id, 'timestamp']
        user_state['last_mean_content_accuracy'] = last_features.loc[user_id, 'mean_content_accuracy']
        user_state['last_correct_timestamp'] = last_correct_features.loc[user_id, True] if (user_id, True) in last_correct_features else np.nan
        user_state['last_incorrect_timestamp'] = last_correct_features.loc[user_id, False] if (user_id, False) in last_correct_features else np.nan
        user_state['last_interaction_elapsed_time_l1'] = last_features.loc[user_id, 'last_interaction_elapsed_time_l1']
        user_state['last_interaction_elapsed_time_l2'] = last_features.loc[user_id, 'last_interaction_elapsed_time_l2']
        user_state['last_interaction_elapsed_time_l3'] = last_features.loc[user_id, 'last_interaction_elapsed_time_l3']

    return state

state = get_state()
gc.collect()

In [None]:
# Example of the state for user 124
display(state[124])

In [None]:
def add_new_users(test_df):
    """Register every user of ``test_df`` that is not yet in the global ``state``.

    A missing user gets the default state from ``get_new_user``, built from the
    first row in which that user appears.
    """
    for _, row in test_df.iterrows():
        user_id = row['user_id']
        if user_id not in state:
            state[user_id] = get_new_user(row)

def get_new_user(row):
    """Return the default state for a user that has not been seen before.

    Parameters
    ----------
    row : mapping
        Must provide ``row['timestamp']``, which becomes the user's last
        known timestamp.

    Returns
    -------
    dict
        Fresh state: prior accuracy 0.680, zeroed counters, an empty attempts
        dict and NaN for the last correct/incorrect timestamps.
    """
    new_user = dict(
        mean_user_accuracy=0.680,
        answered_correctly_user=0,
        answered_user=0,
        user_content_attempts={},
        timestamp=row['timestamp'],
        last_mean_content_accuracy=0,
        last_correct_timestamp=np.nan,
        last_incorrect_timestamp=np.nan,
    )
    # no previous interactions yet, so all three gaps start at 0
    for n_back in (1, 2, 3):
        new_user[f'last_interaction_elapsed_time_l{n_back}'] = 0
    new_user['mean_content_accuracy_sum'] = 0
    return new_user

def get_user_data_dict():
    """Return a dict with one new, empty list per user feature."""
    user_feature_names = (
        'mean_user_accuracy',
        'answered_correctly_user',
        'answered_user',
        'last_interaction_elapsed_time_l1',
        'last_interaction_elapsed_time_l2',
        'last_interaction_elapsed_time_l3',
        'mean_user_part_accuracy',
        'last_correct_time_elapsed',
        'last_incorrect_time_elapsed',
    )
    # each feature gets its own list object
    return {name: [] for name in user_feature_names}

The next function retrieves the features for all users in the test batch from the state and returns a dictionary with one list per feature, as shown above.

In [None]:
def get_user_data(state, test_df):
    """Collect the user features for every row of a test batch and update ``state``.

    Parameters
    ----------
    state : dict
        ``user_id -> dict`` of per-user state; updated in place (timestamps,
        interaction gaps, last content accuracy, per-part counters).
    test_df : pandas.DataFrame
        Test batch with at least the columns ``user_id``, ``content_id``,
        ``content_type_id``, ``timestamp``, ``part`` and ``mean_content_accuracy``.

    Returns
    -------
    dict
        One list per user feature (see ``get_user_data_dict``), each with one
        entry per row of ``test_df``. Rows with a truthy ``content_type_id``
        (lectures) get the dummy value 0 for every feature.
    """
    # updated data
    user_data = get_user_data_dict()

    # mean first question part accuracies
    part_first_question_mean_accuracy_dict = {1: 0.75, 2: 0.60, 3: 0.49, 4: 0.41, 5: 0.52, 6: 0.51, 7: 0.47}
    cols = ['user_id', 'content_id', 'content_type_id', 'timestamp', 'part', 'mean_content_accuracy']
    # features that are copied from the state unchanged
    state_features = [
        'mean_user_accuracy', 'answered_correctly_user', 'answered_user',
        'last_interaction_elapsed_time_l1', 'last_interaction_elapsed_time_l2', 'last_interaction_elapsed_time_l3',
    ]

    for idx, (user_id, content_id, is_lecture, timestamp, part, mean_content_accuracy) in test_df[cols].iterrows():
        # LECTURE
        if is_lecture:
            # fill user data with dummy value
            for key in user_data.keys():
                user_data[key].append(0)
            continue

        # QUESTION
        part = int(part)
        user_state = state[user_id]

        # Update the interaction gaps only when the timestamp changes. The
        # original shifted l1 -> l2 -> l3 on every question row, so questions
        # sharing one timestamp (the same bundle) received different,
        # progressively overwritten gaps.
        if timestamp != user_state['timestamp']:
            user_state['last_interaction_elapsed_time_l3'] = user_state['last_interaction_elapsed_time_l2']
            user_state['last_interaction_elapsed_time_l2'] = user_state['last_interaction_elapsed_time_l1']
            user_state['last_interaction_elapsed_time_l1'] = timestamp - user_state['timestamp']
            user_state['timestamp'] = timestamp

        # add various features
        for feature in state_features:
            user_data[feature].append(user_state[feature])

        user_state['last_mean_content_accuracy'] = mean_content_accuracy

        # user part features
        if f'part_{part}_count' in user_state:
            # a zero sum also covers a zero count, so no division by zero
            if user_state[f'part_{part}_sum'] == 0:
                user_data['mean_user_part_accuracy'].append(0)
            else:
                user_data['mean_user_part_accuracy'].append(user_state[f'part_{part}_sum'] / user_state[f'part_{part}_count'])
        else:
            # first question of this user in this part: start the counters and
            # fall back to the fixed per-part first-question accuracy
            user_state[f'part_{part}_sum'] = 0
            user_state[f'part_{part}_count'] = 0
            user_data['mean_user_part_accuracy'].append(part_first_question_mean_accuracy_dict[part])

        # last correct/incorrect time elapsed (NaN while no such answer exists)
        user_data['last_correct_time_elapsed'].append(timestamp - user_state['last_correct_timestamp'])
        user_data['last_incorrect_time_elapsed'].append(timestamp - user_state['last_incorrect_timestamp'])

    return user_data

In [None]:
def add_attempt_retry(test_df):
    """Add the ``attempt`` and ``retry`` columns to ``test_df`` in place.

    ``attempt`` is how many times the user has met the content before (0 on the
    first encounter), taken from and written back to the attempt counters in
    the global ``state``. ``retry`` is True whenever ``attempt`` is above 0.
    """
    attempt = []
    for user_id, content_id in test_df[['user_id', 'content_id']].itertuples(name=None, index=False):
        seen = state[user_id]['user_content_attempts']
        # first encounter starts at 0, every later one adds 1
        seen[content_id] = seen[content_id] + 1 if content_id in seen else 0
        attempt.append(seen[content_id])

    test_df['attempt'] = attempt
    test_df['retry'] = test_df['attempt'] > 0

This next function adds the mean_content_accuracy, taking the prior_question_had_explanation and retry features into account. An example of the effect of prior_question_had_explanation and retry is given below.

In [None]:
# Lookup table keyed by (content_id, prior_question_had_explanation, retry),
# as its use in the cells below shows; the values are mean content accuracies.
# NOTE(review): pickle.load can execute arbitrary code contained in the file -
# only load this from a trusted dataset.
with open('/kaggle/input/riiid-answer-correctness-prediction-features/mean_content_accuracy_cases_dict.pickle', 'rb') as f:
    mean_content_accuracy_cases_dict = pickle.load(f)

def add_mean_content_accuracy(test_df):
    """Add the ``mean_content_accuracy`` column to ``test_df`` in place.

    Each row is looked up in ``mean_content_accuracy_cases_dict`` by its
    ``(content_id, prior_question_had_explanation, retry)`` triple; rows
    without an entry get 0.
    """
    case_keys = test_df[['content_id', 'prior_question_had_explanation', 'retry']].itertuples(name=None, index=False)
    test_df['mean_content_accuracy'] = [
        mean_content_accuracy_cases_dict.get(key, 0) for key in case_keys
    ]

In [None]:
# Example of the mean_content_accuracy for question 6116, the most answered question:
# prints one line per (prior_question_had_explanation, retry) case stored for it
for (content_id, prior_question_had_explanation, retry), mean_content_accuracy in mean_content_accuracy_cases_dict.items():
    if content_id == 6116:
        print(f'content_id {content_id}, prior_question_had_explanation: {prior_question_had_explanation}, retry: {retry}, mean_content_accuracy: {mean_content_accuracy}')

After each prediction iteration the user features are updated

In [None]:
def update_user_data(state, prev_test_df):
    """Fold the now-known answers of the previous test batch into ``state``.

    Parameters
    ----------
    state : dict
        ``user_id -> dict`` of per-user state; updated in place.
    prev_test_df : pandas.DataFrame
        Previous batch with the columns ``content_type_id``,
        ``answered_correctly``, ``user_id``, ``part``,
        ``mean_content_accuracy`` and ``timestamp``.

    Rows with a truthy ``content_type_id`` are skipped. For every other row the
    user's answer counters, running accuracy, per-part counters, content
    accuracy sum and last correct/incorrect timestamp are updated.
    """
    for idx, row in prev_test_df.iterrows():
        if row['content_type_id']:
            continue

        answered_correctly = row['answered_correctly']
        part = int(row['part'])
        user_state = state[row['user_id']]

        # update user features
        user_state['answered_correctly_user'] += answered_correctly
        user_state['answered_user'] += 1
        user_state['mean_user_accuracy'] = user_state['answered_correctly_user'] / user_state['answered_user']

        # add user part features, initialize if no part answered yet
        # (the original raised a KeyError when the counters did not exist)
        sum_key, count_key = f'part_{part}_sum', f'part_{part}_count'
        user_state[sum_key] = user_state.get(sum_key, 0) + answered_correctly
        user_state[count_key] = user_state.get(count_key, 0) + 1

        # update other user features
        user_state['mean_content_accuracy_sum'] += row['mean_content_accuracy']

        # last correct/incorrect time elapsed
        if answered_correctly:
            user_state['last_correct_timestamp'] = row['timestamp']
        else:
            user_state['last_incorrect_timestamp'] = row['timestamp']

## Submission

In [None]:
import riiideducation

# Create the competition environment and the iterator the prediction loop consumes.
env = riiideducation.make_env()
iter_test = env.iter_test()
# There is no previous batch before the first iteration.
prev_test_df = None
# Reload the booster that the training section saved as model.lgb
model = lgb.Booster(model_file='./model.lgb')

In [None]:
for idx, (test_df, _) in tqdm(enumerate(iter_test)):
    # from 2nd iteration, update user data
    if prev_test_df is not None:
        # The first row carries the previous batch's answers as a list literal.
        # ast.literal_eval parses it without executing code, unlike eval.
        prev_test_df['answered_correctly'] = ast.literal_eval(test_df['prior_group_answers_correct'].iloc[0])
        update_user_data(state, prev_test_df)
        # show the first few batches for inspection
        if idx < 4:
            display(test_df)
            display(prev_test_df)

    # fill prior question had explanation
    test_df['prior_question_had_explanation'] = test_df['prior_question_had_explanation'].fillna(value = False).astype(bool)
    # assigned back instead of a chained inplace fillna (no-op under Copy-on-Write)
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'].fillna(23916)

    # merge with all features; the right merge keeps every test row
    test_df = features_questions_df.merge(test_df, how='right', on='content_id')

    # add new users to state
    add_new_users(test_df)

    # add attempt, retry and mean_content_accuracy
    add_attempt_retry(test_df)
    add_mean_content_accuracy(test_df)

    # get user data from state and update attempt
    user_data = get_user_data(state, test_df)
    for feature, values in user_data.items():
        test_df[feature] = values

    # add harmonic mean; 0 / 0 is filled with 0.0 exactly as in the feature
    # engineering part, so training and prediction see the same value
    test_df['hmean_user_content_accuracy'] = (2 * (
        (test_df['mean_user_accuracy'] * test_df['mean_content_accuracy']) /
        (test_df['mean_user_accuracy'] + test_df['mean_content_accuracy'])
    )).fillna(0.0)

    test_df['answered_correctly'] = model.predict(test_df[features])

    # only rows with content_type_id == 0 are submitted
    env.predict(test_df.loc[test_df['content_type_id'] == 0, ['row_id', 'answered_correctly']])

    # set previous test_df; its answers arrive with the next batch
    prev_test_df = test_df.copy()

In [None]:
# Read back submission.csv (presumably written by the competition environment
# during the prediction loop - verify) and summarise its columns.
submission = pd.read_csv('./submission.csv')
submission.info()

In [None]:
# Show the first 5 rows of the submission file
submission.head()