In [2]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm.notebook import tqdm
import lightgbm as lgb

In [3]:
# Notebook-wide display settings: allow up to 500 columns in DataFrame output
# and render every float with five decimal places.
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', '{:.5f}'.format)

In [4]:
import pickle
def load_model(model_name):
    """Return the object stored in the pickle file at ``model_name``.

    The file is opened in binary mode and closed again before returning.
    Unpickling executes code embedded in the file, so only pass paths to
    artifacts you trust.
    """
    with open(model_name, 'rb') as pickled:
        return pickle.load(pickled)

In [5]:
# Unpickle the trained model from the 'valid-upload' input dataset.
# Later cells call model.feature_name() and model.predict() on it
# (presumably a LightGBM Booster, going by the file name -- confirm).
model = load_model('../input/valid-upload/lgbm_11_25_20.pkl')

In [6]:
model.feature_name()

['answered_correctly_avg_u',
 'answered_correctly_sum_u',
 'count_u',
 'answered_correctly_avg_c',
 'content_sum',
 'run_diff_time',
 'diff_pass_rate',
 'prior_question_elapsed_time_mean',
 'part_2',
 'part_5',
 'part_7',
 'diff_win_sos']

In [7]:
# Example test file shipped with the competition data. Note that the name
# test_df is rebound on every iteration of the inference loops below.
test_df = pd.read_csv('../input/riiid-test-answer-prediction/example_test.csv')

In [8]:
# Per-content lookup table. The inference loop merges it on 'content_id' and
# reads 'diff_pass_rate', 'part_2', 'part_5', 'part_7', 'content_sum' and the
# per-content answer average from it.
results_c_r3 = pd.read_csv('../input/valid-upload/results_c_r3')

In [9]:
# Precomputed table holding the latest 'diff_win_sos' value for each user_id
# (it also carries 'run_sos', which seeds the per-user SOS totals later on).
combo_sos_fin = pd.read_csv('../input/valid-combo2/combo_sos_fin')

In [10]:
# Precomputed table holding the latest 'run_diff_time' value for each user_id;
# merged onto every test batch by user_id in the inference loop.
combo_final = pd.read_csv('../input/combo-final/combo_final')

In [11]:
combo_sos_fin.head()

Unnamed: 0,index,user_id,task_container_id,answered_correctly_content,answered_correctly_avg_u,run_sos,sos_fin,lag_sos_fin,diff_win_sos,run_diff_win_sos
0,93828029,115,41,0.54716,0.68889,32.88323,0.78293,0.76991,0.08102,1.07061
1,9965798,124,15,0.27143,0.24138,13.23457,0.82716,0.8102,0.56882,10.64802
2,12360138,2746,19,0.73937,0.55556,13.88044,0.69402,0.69164,0.13608,4.87217
3,83207367,5382,127,0.63091,0.67742,81.80157,0.63907,0.63914,-0.03828,-1.30695
4,66781417,8623,99,0.61029,0.63889,64.37545,0.64375,0.64409,0.0052,-6.31754


## feature engineering

In [12]:
# Taken from https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering 
# Thanks Tito!
# funcs for user stats with loop
def add_user_feats(df, answered_correctly_sum_u_dict, count_u_dict):
    """Attach running per-user answer statistics and update the running totals.

    Rows are processed in order. For each row the totals currently stored for
    its user are recorded as features first, and only then is the row's own
    answer added to the totals, so a row never sees its own label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id' and 'answered_correctly'.
    answered_correctly_sum_u_dict, count_u_dict : mapping
        user_id -> running sum of answers / running row count. Both are
        mutated in place. They are read with ``[]`` before being written, so
        they must supply a default for unseen users (e.g. ``defaultdict(int)``).

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus 'answered_correctly_sum_u', 'count_u' and
        'answered_correctly_avg_u' (NaN where the count is still 0).
    """
    acsu = np.zeros(len(df), dtype=np.int32)
    cu = np.zeros(len(df), dtype=np.int32)
    for cnt, (user_id, answer) in enumerate(tqdm(df[['user_id', 'answered_correctly']].values)):
        # Record the state *before* this row is counted.
        acsu[cnt] = answered_correctly_sum_u_dict[user_id]
        cu[cnt] = count_u_dict[user_id]
        answered_correctly_sum_u_dict[user_id] += answer
        count_u_dict[user_id] += 1
    # Attach the arrays positionally. Concatenating a fresh 0..n-1 indexed
    # frame would align on index labels and misplace (or add) rows whenever
    # df does not carry a default RangeIndex.
    out = df.assign(answered_correctly_sum_u=acsu, count_u=cu)
    out['answered_correctly_avg_u'] = out['answered_correctly_sum_u'] / out['count_u']
    return out

def add_user_feats_without_update(df, answered_correctly_sum_u_dict, count_u_dict):
    """Attach the current per-user answer statistics without changing the totals.

    Inference-time counterpart of ``add_user_feats``: the labels of the batch
    are not known yet, so the dictionaries are only read.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id'.
    answered_correctly_sum_u_dict, count_u_dict : mapping
        user_id -> running sum of answers / running row count. Read with
        ``[]``, so they must supply a default for unseen users
        (e.g. ``defaultdict(int)``).

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus 'answered_correctly_sum_u', 'count_u' and
        'answered_correctly_avg_u' (NaN where the count is 0).
    """
    acsu = np.zeros(len(df), dtype=np.int32)
    cu = np.zeros(len(df), dtype=np.int32)
    for cnt, user_id in enumerate(df['user_id'].values):
        acsu[cnt] = answered_correctly_sum_u_dict[user_id]
        cu[cnt] = count_u_dict[user_id]
    # Attach the arrays positionally. Concatenating a fresh 0..n-1 indexed
    # frame would align on index labels and misplace (or add) rows whenever
    # df does not carry a default RangeIndex.
    out = df.assign(answered_correctly_sum_u=acsu, count_u=cu)
    out['answered_correctly_avg_u'] = out['answered_correctly_sum_u'] / out['count_u']
    return out

def update_user_feats(df, answered_correctly_sum_u_dict, count_u_dict):
    """Fold the labels of a finished batch into the per-user running totals.

    Only rows with ``content_type_id == 0`` are counted; every other row is
    skipped. Both dictionaries are mutated in place and nothing is returned.
    """
    records = df[['user_id', 'answered_correctly', 'content_type_id']].to_numpy()
    for user_id, answer, content_type in records:
        if content_type != 0:
            continue
        answered_correctly_sum_u_dict[user_id] += answer
        count_u_dict[user_id] += 1

## inference

In [13]:
# Per-user running totals as of the end of training. The loop-based feature
# functions read them for every test batch and update them once the labels of
# that batch are revealed.
# NOTE(review): the functions index these with [] for users they may not have
# seen, so they are presumably defaultdicts -- confirm.
# NOTE: read_pickle unpickles the file; only load artifacts you trust.
answered_correctly_sum_u_dict = pd.read_pickle('../input/dict-model/answered_correctly_sum_u_dict.pkl')
count_u_dict = pd.read_pickle('../input/dict-model/count_u_dict.pkl')
# Fallback values used to fill missing features at inference time
# (the keys are displayed a few cells further down).
fil_na_valid = pd.read_pickle('../input/valid-upload/fil_na_valid.pkl')

In [14]:
# Feature names as stored in the model; the inference loop selects exactly
# these columns, in this order, when calling model.predict.
feats = model.feature_name()

In [15]:
feats

['answered_correctly_avg_u',
 'answered_correctly_sum_u',
 'count_u',
 'answered_correctly_avg_c',
 'content_sum',
 'run_diff_time',
 'diff_pass_rate',
 'prior_question_elapsed_time_mean',
 'part_2',
 'part_5',
 'part_7',
 'diff_win_sos']

In [16]:
# Name of the label column: revealed labels of the previous batch are written
# to it, and predictions for the current batch are submitted under it.
TARGET = 'answered_correctly'

In [17]:
# Fill value for missing 'prior_question_elapsed_time' in the inference loop;
# it is applied after that column has been divided by 100.
# NOTE(review): presumably the training-set mean on that same /100 scale -- confirm.
prior_question_elapsed_time_mean = 254.39410

In [19]:
# The model's feature list uses the name 'answered_correctly_avg_c', so rename
# the per-content answer average to match before it is merged onto test batches.
results_c_r3 = results_c_r3.rename(columns={'answered_correctly_content':'answered_correctly_avg_c'})

In [18]:
# Competition inference API (only importable inside the Kaggle environment).
import riiideducation
env = riiideducation.make_env()
# Iterator of (test_df, sample_prediction_df) batches, created once here.
# NOTE(review): presumably it can only be consumed a single time -- confirm.
iter_test = env.iter_test()
# Called once per batch with a frame of row_id and predicted target.
set_predict = env.predict

In [20]:
feats

['answered_correctly_avg_u',
 'answered_correctly_sum_u',
 'count_u',
 'answered_correctly_avg_c',
 'content_sum',
 'run_diff_time',
 'diff_pass_rate',
 'prior_question_elapsed_time_mean',
 'part_2',
 'part_5',
 'part_7',
 'diff_win_sos']

In [21]:
fil_na_valid

{'tr_answered_correctly_avg_u_avg': 0.6435324918263083,
 'mean_diff_win_sos': 0.23022967212088155,
 'median_run_diff_win_sos': 54.92441085160483,
 'med_run_diff_time': 7235.340028076175}

In [22]:
# Inference loop (attempt 1): per-user answer stats are updated batch by batch,
# while 'run_diff_time' and 'diff_win_sos' come from static per-user snapshots.
previous_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if previous_test_df is not None:
        # The first row of the current batch carries the labels of the previous
        # batch as a list literal; attach them and fold them into the totals.
        # NOTE(review): eval executes whatever text is in that cell; a literal
        # parser such as ast.literal_eval would be the safer choice.
        previous_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])
        update_user_feats(previous_test_df, answered_correctly_sum_u_dict, count_u_dict)
    # Keep the unfiltered batch so its labels can be applied on the next iteration.
    previous_test_df = test_df.copy()
    # Predict only rows with content_type_id == 0; the reset index keeps the
    # positional concat inside add_user_feats_without_update aligned.
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = add_user_feats_without_update(test_df, answered_correctly_sum_u_dict, count_u_dict)
    # Users with no history get 0/0 = NaN; replace with the stored fallback.
    test_df['answered_correctly_avg_u'] = test_df['answered_correctly_avg_u'].fillna(fil_na_valid['tr_answered_correctly_avg_u_avg'])
    
    # Merge the latest values of 'run_diff_time'
    test_df = pd.merge(test_df, combo_final[['user_id', 'run_diff_time']], on=['user_id'],  how="left")
    test_df['run_diff_time'] = test_df['run_diff_time'].fillna(fil_na_valid['med_run_diff_time'])
    # Static sos
    test_df = pd.merge(test_df, combo_sos_fin[['user_id', 'diff_win_sos']], on=['user_id'],  how="left")
    test_df['diff_win_sos'] = test_df['diff_win_sos'].fillna(fil_na_valid['mean_diff_win_sos'])
    
    # Per-content features, joined on content_id.
    test_df = pd.merge(test_df, results_c_r3[['content_id','diff_pass_rate','part_2','part_5','part_7',   \
                                              'answered_correctly_avg_c', 'content_sum']], on='content_id',  how="left")
    # Rescale elapsed time by 1/100, then fill gaps with the constant defined above;
    # the result is stored under the feature name the model expects.
    test_df.prior_question_elapsed_time = test_df.prior_question_elapsed_time/100
    test_df['prior_question_elapsed_time_mean'] = test_df.prior_question_elapsed_time.fillna(prior_question_elapsed_time_mean)
    
    # Score the batch and hand row_id + prediction to the competition API.
    test_df[TARGET] =  model.predict(test_df[feats])
    set_predict(test_df[['row_id', TARGET]])

In [23]:
previous_test_df.head()

Unnamed: 0_level_0,row_id,timestamp,user_id,content_id,content_type_id,task_container_id,prior_question_elapsed_time,prior_question_had_explanation,prior_group_answers_correct,prior_group_responses
group_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
3,74,75311,275030867,8308,0,3,15000.0,False,"[1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, ...","[0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 3, 0, 0, ..."
3,75,31220886463,1305988022,396,0,4163,19000.0,True,,
3,76,48613916248,1310228392,11869,0,1458,26333.0,True,,
3,77,48613916248,1310228392,11871,0,1458,26333.0,True,,
3,78,48613916248,1310228392,11870,0,1458,26333.0,True,,


## Inference attempt 2
### Continuously update each user's Strength of Schedule (SOS), based on the difficulty of their incoming questions, after every inference batch
#### Sped up `pd.merge` by joining against indexed lookup tables
#### Adapted Tito's loop-based feature code to maintain the SOS totals

In [None]:
# Seed the per-user SOS totals from the 'run_sos' column of the snapshot table.
# If a user_id occurs on several rows, the value from the last row wins.
sos_sum_u_dict = {
    user_id: run_sos
    for user_id, run_sos in zip(combo_sos_fin['user_id'].values, combo_sos_fin['run_sos'].values)
}

In [None]:
# funcs for user stats with loop
def add_sos_feats(df, sos_sum_u_dict):
    """Attach each row's current per-user SOS total as column 'sos_sum_u_dict'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id'.
    sos_sum_u_dict : mapping
        user_id -> running SOS total. Only read, never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus a float64 column 'sos_sum_u_dict'. Users
        missing from the mapping get NaN, so the caller can fill them in.
    """
    # float64 buffer: the totals are fractional, an integer buffer would truncate them.
    # .get(..., nan) instead of [] so unseen users do not raise KeyError.
    sos_sum = np.fromiter(
        (sos_sum_u_dict.get(user_id, np.nan) for user_id in df['user_id'].to_numpy()),
        dtype=np.float64,
        count=len(df),
    )
    # Positional attach: independent of df's index labels.
    return df.assign(sos_sum_u_dict=sos_sum)

In [None]:
def update_sos_feats(df, sos_sum_u_dict, content_stats=None):
    """Add the difficulty of each answered question to its user's SOS total.

    Parameters
    ----------
    df : pandas.DataFrame
        A finished batch with 'user_id', 'content_id' and 'content_type_id'.
        Only rows with ``content_type_id == 0`` are counted.
    sos_sum_u_dict : dict
        user_id -> running SOS total. Mutated in place; users seen for the
        first time start from 0.
    content_stats : pandas.DataFrame, optional
        Per-content table with an 'answered_correctly_avg_c' column, keyed by
        content_id either as a column or as its index (content ids must be
        unique). Defaults to the notebook-level ``results_c_r3``.

    Returns
    -------
    None
    """
    if content_stats is None:
        content_stats = results_c_r3
    # Accept both layouts: the table as loaded from CSV (content_id is a
    # column) or a table that has already been indexed by content_id.
    if 'content_id' in content_stats.columns:
        avg_c_by_content = content_stats.set_index('content_id')['answered_correctly_avg_c']
    else:
        avg_c_by_content = content_stats['answered_correctly_avg_c']

    questions = df.loc[df['content_type_id'] == 0, ['user_id', 'content_id']]
    avg_c = questions['content_id'].map(avg_c_by_content)
    for user_id, value in zip(questions['user_id'].to_numpy(), avg_c.to_numpy()):
        # Content without a known difficulty maps to NaN; adding it would
        # turn the user's whole running total into NaN, so skip it.
        if np.isnan(value):
            continue
        sos_sum_u_dict[user_id] = sos_sum_u_dict.get(user_id, 0.0) + value
In [None]:
# Inference loop (attempt 2): per-user SOS totals are refreshed after every
# batch instead of using the static 'diff_win_sos' snapshot.
# Ran > 9 hrs for inference batch so, unfortunately, Kaggle Inference Timed Out
## Did not add to leaderboard
# NOTE(review): iter_test is created once from env.iter_test(); if the first
# inference loop has already consumed it, nothing is left to iterate here --
# presumably this cell is meant to run instead of that loop. Confirm.

# Lookup tables indexed by their join key, built once outside the loop so the
# cell does not depend on objects created in cells that are no longer present.
combo_final1 = combo_final.set_index('user_id')[['run_diff_time']]
content_feat_cols = ['diff_pass_rate', 'part_2', 'part_5', 'part_7',
                     'answered_correctly_avg_c', 'content_sum']
if 'content_id' in results_c_r3.columns:
    content_feats = results_c_r3.set_index('content_id')[content_feat_cols]
else:
    # Table was already indexed (presumably by content_id) in an earlier cell.
    content_feats = results_c_r3[content_feat_cols]

previous_test_df = None
for (test_df, sample_prediction_df) in iter_test:
    if previous_test_df is not None:
        # The first row of the current batch carries the labels of the previous
        # batch as a list literal; attach them and fold them into the totals.
        # NOTE(review): eval executes whatever text is in that cell; a literal
        # parser such as ast.literal_eval would be the safer choice.
        previous_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])
        update_user_feats(previous_test_df, answered_correctly_sum_u_dict, count_u_dict)
        update_sos_feats(previous_test_df, sos_sum_u_dict)
    # Keep the unfiltered batch so its labels can be applied on the next iteration.
    previous_test_df = test_df.copy()
    test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop=True)
    test_df = add_user_feats_without_update(test_df, answered_correctly_sum_u_dict, count_u_dict)
    test_df['answered_correctly_avg_u'] = test_df['answered_correctly_avg_u'].fillna(fil_na_valid['tr_answered_correctly_avg_u_avg'])

    # Latest 'run_diff_time' per user; pre-filter the lookup table to this batch's users.
    test_df = pd.merge(test_df, combo_final1.loc[combo_final1.index.isin(test_df['user_id'])],
                       how="left", left_on=['user_id'], right_index=True)
    test_df['run_diff_time'] = test_df['run_diff_time'].fillna(fil_na_valid['med_run_diff_time'])
    # Per-content features, joined through the content_id index.
    test_df = pd.merge(test_df, content_feats.loc[content_feats.index.isin(test_df['content_id'])],
                       how="left", left_on='content_id', right_index=True)
    # Rescale elapsed time by 1/100, then fill gaps with the constant defined earlier.
    test_df['prior_question_elapsed_time'] = test_df['prior_question_elapsed_time'] / 100
    test_df['prior_question_elapsed_time_mean'] = test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

    # SOS: running difficulty total divided by (task_container_id + 1), then
    # compared with the user's running accuracy.
    test_df = add_sos_feats(test_df, sos_sum_u_dict)
    test_df['sos_fin'] = test_df['sos_sum_u_dict'] / (test_df['task_container_id'] + 1)
    test_df['diff_win_sos'] = test_df['sos_fin'] - test_df['answered_correctly_avg_u']
    test_df['diff_win_sos'] = test_df['diff_win_sos'].fillna(fil_na_valid['mean_diff_win_sos'])

    # Score the batch and hand row_id + prediction to the competition API.
    test_df[TARGET] = model.predict(test_df[feats])
    set_predict(test_df[['row_id', TARGET]])