# Comments
Thanks to tito for this great script: https://www.kaggle.com/its7171/lgbm-with-loop-feature-engineering

* Creating predictive features is very important; here I used just 14 features and 15M data points to train the model.
* The dataset is large to preprocess in Python with a for loop. There are other tools and frameworks (SQL, Spark, Apache Beam, Dask) that make feature engineering much faster, but if we are smart and build predictive features, plain for loops are good enough.
* Forward feature engineering seems a good technique to try for this problem: create one new feature that you think could be predictive, run the pipeline, and check whether the validation score increases. If it does, the feature is predictive and you should add it. Be careful when you only get a minor improvement; sometimes it is better to discard the feature, because every extra feature makes your experimentation loop slower.

In [1]:
import pandas as pd
import numpy as np
import gc
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from tqdm.notebook import tqdm
import lightgbm as lgb
import riiideducation
import matplotlib.pyplot as plt
import seaborn as sns

import random
import os

In [None]:
# Seed shared by every source of randomness in this notebook
SEED = 123


def seed_everything(seed):
    """Seed `random`, NumPy's global RNG and the PYTHONHASHSEED variable.

    The two RNGs are seeded first (in that order); the environment
    variable is written last, as the string form of `seed`.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ.update(PYTHONHASHSEED=str(seed))


seed_everything(SEED)

In [2]:
def read_and_preprocess(feature_engineering = False):
    """Load the CV split, clean it, and add the loop-based user features.

    Steps:
      1. Read the train/validation pickles and keep only the needed columns.
      2. Drop lecture rows (``content_type_id`` truthy).
      3. Encode ``prior_question_had_explanation`` as int8 (missing -> 0) and
         fill missing ``prior_question_elapsed_time`` with the train mean.
      4. Merge ``part`` and ``tags`` from questions.csv.
      5. Reset the module-level accumulator dictionaries and run
         ``add_features`` on train, then on valid (valid therefore sees the
         state accumulated over train).

    Args:
        feature_engineering: when True, only the last 40,000,000 train rows
            are kept (before lecture filtering) to limit RAM usage.

    Returns:
        Tuple ``(train, valid, questions_df, prior_question_elapsed_time_mean,
        features_dicts)`` where ``features_dicts`` maps each accumulator's
        name to the accumulator itself.
    """
    # add_features() reaches these accumulators through `global` statements,
    # so they have to be module-level names; as plain locals they would be
    # invisible to it. Rebinding them here also resets the state on each call.
    global answered_correctly_u_count_dict, answered_correctly_u_sum_dict
    global answered_correctly_uq_dict
    global elapsed_time_u_sum_dict, explanation_u_sum_dict
    global question_u_count_dict, question_u_last_bundle_count_dict
    global part_user_count_dict, part_user_sum_dict
    global question_correct_last_20_count_dict
    global question_correct_last_20_sum_dict
    global question_correct_last_20_all_dict
    global timestamp_u_correct_dict, timestamp_u_incorrect_dict
    global timestamp_u_dict
    global user_tag_acc_count_dict, user_tag_acc_sum_dict
    global answered_correctly_uq

    train_pickle = '../input/riiid-cross-validation-files/cv1_train.pickle'
    valid_pickle = '../input/riiid-cross-validation-files/cv1_valid.pickle'
    question_file = '../input/riiid-test-answer-prediction/questions.csv'

    # Read data. 'task_container_id' is required by add_features(), which
    # uses it to detect task boundaries.
    fields_needed = [
        'timestamp',
        'user_id',
        'answered_correctly',
        'content_id',
        'content_type_id',
        'task_container_id',
        'prior_question_elapsed_time',
        'prior_question_had_explanation',
    ]
    train = pd.read_pickle(train_pickle)[fields_needed]
    valid = pd.read_pickle(valid_pickle)[fields_needed]
    # Drop the oldest training rows to avoid RAM problems
    if feature_engineering:
        train = train.iloc[-40000000:]

    # Filter by content_type_id to discard lectures
    train = train.loc[train.content_type_id == False].reset_index(drop = True)
    valid = valid.loc[valid.content_type_id == False].reset_index(drop = True)

    # Changing dtype to avoid lightgbm error
    train['prior_question_had_explanation'] = train.prior_question_had_explanation.fillna(False).astype('int8')
    valid['prior_question_had_explanation'] = valid.prior_question_had_explanation.fillna(False).astype('int8')

    # Fill prior question elapsed time with the train mean. Assign the result
    # back instead of calling fillna(inplace=True) on a column selection,
    # which is chained assignment and may silently not modify the frame.
    prior_question_elapsed_time_mean = train['prior_question_elapsed_time'].dropna().mean()
    train['prior_question_elapsed_time'] = train['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)
    valid['prior_question_elapsed_time'] = valid['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean)

    # Merge with question dataframe ('tags' is consumed by add_features())
    questions_df = pd.read_csv(question_file)
    questions_df['part'] = questions_df['part'].astype(np.int32)
    questions_df['bundle_id'] = questions_df['bundle_id'].astype(np.int32)

    question_cols = questions_df[['question_id', 'part', 'tags']]
    train = pd.merge(train, question_cols, left_on = 'content_id', right_on = 'question_id', how = 'left')
    valid = pd.merge(valid, question_cols, left_on = 'content_id', right_on = 'question_id', how = 'left')

    # Per-user answer counters
    answered_correctly_u_count_dict = defaultdict(int)
    answered_correctly_u_sum_dict = defaultdict(int)
    answered_correctly_uq_dict = defaultdict(lambda: defaultdict(int))

    # Per-user elapsed-time / explanation accumulators
    elapsed_time_u_sum_dict = defaultdict(int)
    explanation_u_sum_dict = defaultdict(int)
    question_u_count_dict = defaultdict(int)
    question_u_last_bundle_count_dict = defaultdict(int)

    # Per-user, per-part counters
    part_user_count_dict = defaultdict(lambda: defaultdict(int))
    part_user_sum_dict = defaultdict(lambda: defaultdict(int))

    question_correct_last_20_count_dict = defaultdict(int)
    question_correct_last_20_sum_dict = defaultdict(int)
    question_correct_last_20_all_dict = defaultdict(list)

    # Per-user timestamp histories
    timestamp_u_correct_dict = defaultdict(list)
    timestamp_u_incorrect_dict = defaultdict(list)
    timestamp_u_dict = defaultdict(list)

    # Per-user, per-tag counters
    user_tag_acc_count_dict = defaultdict(lambda: defaultdict(int))
    user_tag_acc_sum_dict = defaultdict(lambda: defaultdict(int))

    # Client Question dictionary
    answered_correctly_uq = defaultdict(lambda: defaultdict(int))

    print('User feature calculation started...')
    print('\n')
    train = add_features(train)
    valid = add_features(valid)
    gc.collect()
    print('User feature calculation completed...')
    print('\n')

    features_dicts = {
        'answered_correctly_u_count_dict': answered_correctly_u_count_dict,
        'answered_correctly_u_sum_dict': answered_correctly_u_sum_dict,
        'answered_correctly_uq_dict': answered_correctly_uq_dict,
        'elapsed_time_u_sum_dict': elapsed_time_u_sum_dict,
        'explanation_u_sum_dict': explanation_u_sum_dict,
        'question_u_count_dict': question_u_count_dict,
        'question_u_last_bundle_count_dict': question_u_last_bundle_count_dict,
        'part_user_count_dict': part_user_count_dict,
        'part_user_sum_dict': part_user_sum_dict,
        'question_correct_last_20_count_dict': question_correct_last_20_count_dict,
        'question_correct_last_20_sum_dict': question_correct_last_20_sum_dict,
        'question_correct_last_20_all_dict': question_correct_last_20_all_dict,
        'timestamp_u_incorrect_dict': timestamp_u_incorrect_dict,
        'timestamp_u_correct_dict': timestamp_u_correct_dict,
        'timestamp_u_dict': timestamp_u_dict,
        'user_tag_acc_count_dict': user_tag_acc_count_dict,
        'user_tag_acc_sum_dict': user_tag_acc_sum_dict,
        'answered_correctly_uq': answered_correctly_uq,
    }

    return train, valid, questions_df, prior_question_elapsed_time_mean, features_dicts


# Function for user stats with loops
def add_features(df, update=True):
    """Add running per-user features to `df`, one row at a time.

    For every row the features are first computed from the state held in the
    module-level accumulator dictionaries, and only afterwards (when `update`
    is true) is that state advanced with the row's own answer, so a row never
    sees its own label.

    `df` must contain the columns 'user_id', 'task_container_id',
    'content_id', 'answered_correctly', 'prior_question_elapsed_time',
    'prior_question_had_explanation', 'timestamp', 'part' and 'tags'
    ('tags' being a whitespace-separated string or null). Rows sharing a
    user_id and task_container_id are expected to be contiguous, because a
    task boundary is detected by comparing each row with the next one.

    Args:
        df: DataFrame to enrich; the feature columns are added in place.
        update: when True, the answer counters, tag counters and timestamp
            histories are updated with each row. The elapsed-time /
            explanation accumulators are advanced regardless of this flag,
            since they are fed by the `prior_question_*` columns rather than
            by the label.

    Returns:
        The same DataFrame with the feature columns added.
    """
    global answered_correctly_u_count_dict
    global answered_correctly_u_sum_dict
    global answered_correctly_uq_dict
    global elapsed_time_u_sum_dict
    global explanation_u_sum_dict
    global question_u_count_dict
    global question_u_last_bundle_count_dict
    global part_user_count_dict
    global part_user_sum_dict
    global question_correct_last_20_count_dict
    global question_correct_last_20_sum_dict
    global question_correct_last_20_all_dict
    global timestamp_u_correct_dict
    global timestamp_u_incorrect_dict
    global timestamp_u_dict
    global user_tag_acc_count_dict
    global user_tag_acc_sum_dict
    global answered_correctly_uq

    n_rows = len(df)
    # Value reported as the accuracy when no history exists yet.
    default_accuracy = 0.67

    # Client features
    answered_correctly_u_avg = np.zeros(n_rows, dtype = np.float32)
    answered_correctly_u_count = np.zeros(n_rows, dtype = np.float32)
    answered_correctly_uq_count = np.zeros(n_rows, dtype = np.int32)

    elapsed_time_u_avg = np.zeros(n_rows, dtype = np.float32)
    explanation_u_avg = np.zeros(n_rows, dtype = np.float32)

    part_user_count = np.zeros(n_rows, dtype = np.float32)
    part_user_mean = np.zeros(n_rows, dtype = np.float32)

    timestamp_u_correct_recency_1 = np.zeros(n_rows, dtype = np.float32)
    timestamp_u_incorrect_recency_1 = np.zeros(n_rows, dtype = np.float32)

    timestamp_u_diff_1 = np.zeros(n_rows, dtype = np.float32)
    timestamp_u_diff_2 = np.zeros(n_rows, dtype = np.float32)
    timestamp_u_diff_3 = np.zeros(n_rows, dtype = np.float32)

    user_tag_acc_count = np.zeros(n_rows, dtype = np.float32)
    user_tag_acc_max = np.zeros(n_rows, dtype = np.float32)
    user_tag_acc_min = np.zeros(n_rows, dtype = np.float32)

    # (answered_correctly, timestamp) of every row of the task currently being
    # read; flushed into the correct/incorrect histories when the task ends.
    current_task_rows = []
    # Number of rows read so far in the current task.
    rows_in_current_task = 0
    # 0 while the next row is the first row of a new task, 1 afterwards.
    flag_current_task = 0

    df_temp = df[['user_id', "task_container_id", 'content_id', 'answered_correctly', 'prior_question_elapsed_time', 'prior_question_had_explanation', 'timestamp', 'part', "tags"]].values
    for num in tqdm(range(n_rows)):
        (user_id, task_id, content_id, answered, prior_elapsed,
         prior_explained, timestamp, part, tags) = df_temp[num]

        # The task ends when the next row belongs to another user or another
        # task container; the last row of `df` always closes its task.
        if num + 1 < n_rows:
            next_row = df_temp[num + 1]
            task_ends = user_id != next_row[0] or task_id != next_row[1]
        else:
            task_ends = True

        # ---- elapsed_time_u_avg / explanation_u_avg ----
        # The prior_* columns of the current task describe the previous task,
        # so on the first row of a task the previous task's size is used to
        # weight them into the running sums.
        if timestamp != 0:
            if flag_current_task == 0:
                last_bundle_size = question_u_last_bundle_count_dict[user_id]
                question_u_count_dict[user_id] += last_bundle_size
                elapsed_time_u_sum_dict[user_id] += prior_elapsed * last_bundle_size
                explanation_u_sum_dict[user_id] += prior_explained * last_bundle_size
            questions_seen = question_u_count_dict[user_id]
            if questions_seen:
                elapsed_time_u_avg[num] = elapsed_time_u_sum_dict[user_id] / questions_seen
                explanation_u_avg[num] = explanation_u_sum_dict[user_id] / questions_seen
            else:
                # No earlier task of this user has been seen (e.g. the frame
                # starts in the middle of the user's history): there is
                # nothing to average, and dividing would raise.
                elapsed_time_u_avg[num] = np.nan
                explanation_u_avg[num] = np.nan
        else:
            # At timestamp 0 there is no previous task to report on.
            elapsed_time_u_avg[num] = np.nan
            explanation_u_avg[num] = np.nan
        flag_current_task = 1

        # Remember the size of the task that is ending so it can weight the
        # prior_* values of the user's next task.
        rows_in_current_task += 1
        if task_ends:
            flag_current_task = 0
            question_u_last_bundle_count_dict[user_id] = rows_in_current_task
            rows_in_current_task = 0

        # ---- answered_correctly_u_avg / _u_count / _uq_count ----
        user_answers = answered_correctly_u_count_dict[user_id]
        if user_answers != 0:
            answered_correctly_u_avg[num] = answered_correctly_u_sum_dict[user_id] / user_answers
            answered_correctly_u_count[num] = user_answers
        else:
            answered_correctly_u_avg[num] = default_accuracy
            answered_correctly_u_count[num] = 0

        answered_correctly_uq_count[num] = answered_correctly_uq_dict[user_id][content_id]

        # ---- part_user_count / part_user_mean ----
        part_answers = part_user_count_dict[user_id][part]
        if part_answers == 0:
            part_user_count[num] = 0
            part_user_mean[num] = default_accuracy
        else:
            part_user_count[num] = part_answers
            part_user_mean[num] = part_user_sum_dict[user_id][part] / part_answers

        # ---- timestamp_u_correct_recency_1 / timestamp_u_incorrect_recency_1 ----
        correct_history = timestamp_u_correct_dict[user_id]
        if len(correct_history) == 0:
            timestamp_u_correct_recency_1[num] = np.nan
        elif len(correct_history) == 1:
            timestamp_u_correct_recency_1[num] = timestamp - correct_history[0]

        incorrect_history = timestamp_u_incorrect_dict[user_id]
        if len(incorrect_history) == 0:
            timestamp_u_incorrect_recency_1[num] = np.nan
        elif len(incorrect_history) == 1:
            timestamp_u_incorrect_recency_1[num] = timestamp - incorrect_history[0]

        # ---- timestamp_u_diff_1_2 / _2_3 / _3_end ----
        # Gaps between this row and the (up to three) most recent task
        # timestamps of the user, newest first.
        task_history = timestamp_u_dict[user_id]
        history_len = len(task_history)
        if history_len == 0:
            timestamp_u_diff_1[num] = np.nan
            timestamp_u_diff_2[num] = np.nan
            timestamp_u_diff_3[num] = np.nan
        elif history_len <= 3:
            timestamp_u_diff_1[num] = timestamp - task_history[-1]
            if history_len >= 2:
                timestamp_u_diff_2[num] = task_history[-1] - task_history[-2]
            else:
                timestamp_u_diff_2[num] = np.nan
            if history_len == 3:
                timestamp_u_diff_3[num] = task_history[-2] - task_history[-3]
            else:
                timestamp_u_diff_3[num] = np.nan

        # ---- user_tag_acc_count / user_tag_acc_max / user_tag_acc_min ----
        has_tags = not pd.isnull(tags)
        if has_tags:
            tag_list = tags.split()
            row_all_tag_count = 0
            row_max_tag_mean = -1    # starts below any possible mean
            row_min_tag_mean = 1000  # starts above any possible mean

            for single_tag in tag_list:
                single_tag_sum = user_tag_acc_sum_dict[user_id][single_tag]
                single_tag_count = user_tag_acc_count_dict[user_id][single_tag]
                row_all_tag_count += single_tag_count
                if single_tag_count == 0:
                    single_tag_mean = default_accuracy
                else:
                    single_tag_mean = single_tag_sum / single_tag_count
                row_max_tag_mean = max(single_tag_mean, row_max_tag_mean)
                row_min_tag_mean = min(single_tag_mean, row_min_tag_mean)

            if row_all_tag_count == 0:
                user_tag_acc_count[num] = 0
                user_tag_acc_max[num] = default_accuracy
                user_tag_acc_min[num] = default_accuracy
            else:
                user_tag_acc_count[num] = row_all_tag_count
                user_tag_acc_max[num] = row_max_tag_mean
                user_tag_acc_min[num] = row_min_tag_mean
        else:
            # No tags: the tag features are undefined for this row. Do NOT
            # skip the rest of the iteration, the state update below must
            # still run or this answer (and possibly a task boundary) is lost.
            tag_list = []
            user_tag_acc_count[num] = np.nan
            user_tag_acc_max[num] = np.nan
            user_tag_acc_min[num] = np.nan

        if not update:
            continue

        # ---- advance the running state with this row's answer ----
        answered_correctly_u_count_dict[user_id] += 1
        answered_correctly_u_sum_dict[user_id] += answered
        answered_correctly_uq_dict[user_id][content_id] += 1
        part_user_count_dict[user_id][part] += 1
        part_user_sum_dict[user_id][part] += answered

        for single_tag in tag_list:
            user_tag_acc_count_dict[user_id][single_tag] += 1
            user_tag_acc_sum_dict[user_id][single_tag] += answered

        # Buffer the row until its task is complete: the timestamp histories
        # are only updated at task boundaries, so rows of one task do not see
        # each other.
        current_task_rows.append((answered, timestamp))
        if task_ends:
            # Keep at most the three most recent task timestamps.
            if len(task_history) == 3:
                task_history.pop(0)
            task_history.append(timestamp)

            # A task may bundle several questions, each with its own answer:
            # replay all of them, keeping only the latest timestamp in each
            # of the correct / incorrect histories.
            for task_answer, task_timestamp in current_task_rows:
                if task_answer == 1:
                    latest = timestamp_u_correct_dict[user_id]
                else:
                    latest = timestamp_u_incorrect_dict[user_id]
                if len(latest) == 1:
                    latest.pop(0)
                latest.append(task_timestamp)
            current_task_rows = []

    df['answered_correctly_u_avg'] = answered_correctly_u_avg
    df['answered_correctly_u_count'] = answered_correctly_u_count
    df['answered_correctly_uq_count'] = answered_correctly_uq_count
    df['elapsed_time_u_avg_xiuzheng'] = elapsed_time_u_avg
    df['explanation_u_avg_xiuzheng'] = explanation_u_avg
    df['part_user_count'] = part_user_count
    df['part_user_mean'] = part_user_mean
    df['timestamp_u_correct_recency_1'] = timestamp_u_correct_recency_1
    df['timestamp_u_incorrect_recency_1'] = timestamp_u_incorrect_recency_1
    df['timestamp_u_diff_1_2'] = timestamp_u_diff_1
    df['timestamp_u_diff_2_3'] = timestamp_u_diff_2
    df['timestamp_u_diff_3_end'] = timestamp_u_diff_3
    df['user_tag_acc_count'] = user_tag_acc_count
    df['user_tag_acc_max'] = user_tag_acc_max
    df['user_tag_acc_min'] = user_tag_acc_min

    return df



# Function for training and evaluation
def train_and_evaluate(train, valid):
    """Fit a LightGBM classifier on `train` and report validation AUC.

    Both frames are trimmed IN PLACE to the feature columns (the target is
    extracted first), a feature-importance bar chart is shown, and the fitted
    model is returned.

    Args:
        train: training frame containing the target and every feature column.
        valid: validation frame with the same columns.

    Returns:
        Tuple ``(TARGET, FEATURES, lgb_model)``: the target column name, the
        list of feature names, and the fitted ``LGBMClassifier``.

    Raises:
        KeyError: if a feature column is missing from `train` or `valid`
            (raised before either frame is modified).
    """
    TARGET = 'answered_correctly'
    # Features to train and predict.
    # "offline": not produced by add_features() in this file, so the column
    # must already be present in the frames. "raw": original dataset column.
    # NOTE(review): 'question_correct_rate_last_20_mean' is not produced by
    # add_features() either (its computation is commented out there).
    FEATURES = [
        'answered_correctly_u_avg',
        'answered_correctly_u_count',
        'answered_correctly_uq_count',
        'elapsed_time_u_avg_xiuzheng',
        'explanation_u_avg_xiuzheng',
        'part_correctly_q_mean',  # offline
        'part_elapsed_time_mean',  # offline
        'part_had_explanation_mean',  # offline
        'part_user_count',
        'part_user_mean',
        'prior_question_elapsed_time',  # raw
        'prior_question_had_explanation',  # raw
        'question_correct_rate_last_20_mean',
        'question_correctly_q_count',  # offline
        'question_correctly_q_mean',  # offline
        'question_elapsed_time_mean',  # offline
        'question_had_explanation_mean',  # offline
        'tag_acc_count',  # offline
        'tag_acc_max',  # offline
        'tag_acc_min',  # offline
        'tags_lsi',  # offline
        'task_container_id',  # raw
        'timestamp',  # raw
        'timestamp_u_correct_recency_1',
        'timestamp_u_diff_1_2',
        'timestamp_u_diff_2_3',
        'timestamp_u_diff_3_end',
        'timestamp_u_incorrect_recency_1',
        'user_tag_acc_count',
        'user_tag_acc_max',
        'user_tag_acc_min'
    ]

    # Fail early, before the frames are destructively trimmed below.
    for frame_name, frame in (('train', train), ('valid', valid)):
        missing = [col for col in FEATURES if col not in frame.columns]
        if missing:
            raise KeyError(f'{frame_name} is missing feature columns: {missing}')

    # Delete some training data to experiment faster (flip to True to enable)
    if False:
        train = train.sample(15000000, random_state = SEED)

    gc.collect()
    print(f'Traning with {train.shape[0]} rows and {len(FEATURES)} features')
    y_train = train[TARGET]
    y_val = valid[TARGET]
    # Drop unnecessary columns (in place, to release memory before training)
    feature_set = set(FEATURES)
    train.drop(columns = [col for col in train.columns if col not in feature_set], inplace = True)
    valid.drop(columns = [col for col in valid.columns if col not in feature_set], inplace = True)
    gc.collect()

    model = lgb.LGBMClassifier(num_leaves=300,
                            max_depth=15,
                            learning_rate=0.1,
                            subsample=0.8,
                            feature_fraction=0.8,
                            random_state=2020,
                            n_estimators=200
                            )
    lgb_model = model.fit(train[FEATURES],
                        y_train,
                        eval_names=['train', 'valid'],
                        eval_set=[(train[FEATURES], y_train), (valid[FEATURES], y_val)],
                        verbose=10,
                        eval_metric='auc',
                        early_stopping_rounds=10,
                        categorical_feature=['tags_lsi'])

    # AUC is a ranking metric: score it on the positive-class probability.
    # predict() returns hard class labels, which collapses the ranking and
    # under-reports the AUC.
    valid_scores = lgb_model.predict_proba(valid[FEATURES])[:, 1]
    print('Our Roc Auc score for the validation data is:', roc_auc_score(y_val, valid_scores))

    # The scikit-learn wrapper exposes importances as an attribute;
    # feature_importance() only exists on the underlying Booster.
    feature_importance = lgb_model.feature_importances_
    feature_importance = pd.DataFrame({'Features': FEATURES, 'Importance': feature_importance}).sort_values('Importance', ascending = False)

    fig = plt.figure(figsize = (10, 10))
    fig.suptitle('Feature Importance', fontsize = 20)
    plt.tick_params(axis = 'x', labelsize = 12)
    plt.tick_params(axis = 'y', labelsize = 12)
    plt.xlabel('Importance', fontsize = 15)
    plt.ylabel('Features', fontsize = 15)
    sns.barplot(x = feature_importance['Importance'], y = feature_importance['Features'], orient = 'h')
    plt.show()

    return TARGET, FEATURES, lgb_model

# Using time series api that simulates production predictions
def inference(TARGET, FEATURES, model, questions_df, prior_question_elapsed_time_mean, features_dicts):
    """Run the competition's time-series API loop and submit predictions.

    For each batch yielded by the API: the previous batch is labelled with the
    answers revealed in the current one and passed to `update_features`; the
    current batch is then filtered to questions, cleaned the same way as the
    training data, enriched with `add_features(update=False)`, scored with
    `model.predict`, and submitted through `env.predict`.

    Args:
        TARGET: name of the target column.
        FEATURES: feature column names passed to the model.
        model: fitted model exposing `predict`.
        questions_df: questions table providing 'question_id' and 'part'.
        prior_question_elapsed_time_mean: fill value for missing
            'prior_question_elapsed_time'.
        features_dicts: mapping from accumulator name to accumulator.
    """
    
    # Get feature dict
    # NOTE(review): these keys do not match the ones built by
    # read_and_preprocess in this file (those end in '_dict', and no
    # '*_q_*' accumulators are created there) -- these lookups look like they
    # would raise KeyError; confirm against the caller.
    answered_correctly_u_count = features_dicts['answered_correctly_u_count']
    answered_correctly_u_sum = features_dicts['answered_correctly_u_sum']
    elapsed_time_u_sum = features_dicts['elapsed_time_u_sum']
    explanation_u_sum = features_dicts['explanation_u_sum']
    answered_correctly_q_count = features_dicts['answered_correctly_q_count']
    answered_correctly_q_sum = features_dicts['answered_correctly_q_sum']
    elapsed_time_q_sum = features_dicts['elapsed_time_q_sum']
    explanation_q_sum = features_dicts['explanation_q_sum']
    answered_correctly_uq = features_dicts['answered_correctly_uq']
    timestamp_u = features_dicts['timestamp_u']
    timestamp_u_incorrect = features_dicts['timestamp_u_incorrect']
    
    # Get api iterator and predictor
    env = riiideducation.make_env()
    iter_test = env.iter_test()
    set_predict = env.predict
    
    previous_test_df = None
    for (test_df, sample_prediction_df) in iter_test:
        if previous_test_df is not None:
            # The first row of each batch carries the previous batch's answers
            # as a string.
            # NOTE(review): eval() executes arbitrary code; if this string is
            # a plain list literal, ast.literal_eval would be the safe choice.
            previous_test_df[TARGET] = eval(test_df["prior_group_answers_correct"].iloc[0])
            # NOTE(review): update_features is not defined in the visible
            # source -- confirm it exists elsewhere.
            update_features(previous_test_df, answered_correctly_u_sum, answered_correctly_q_sum, timestamp_u_incorrect)
        previous_test_df = test_df.copy()
        # Keep questions only (discard lectures)
        test_df = test_df[test_df['content_type_id'] == 0].reset_index(drop = True)
        test_df['prior_question_had_explanation'] = test_df.prior_question_had_explanation.fillna(False).astype('int8')
        test_df['prior_question_elapsed_time'].fillna(prior_question_elapsed_time_mean, inplace = True)
        # NOTE(review): only 'part' is merged here, while add_features also
        # selects a 'tags' column -- verify 'tags' is available.
        test_df = pd.merge(test_df, questions_df[['question_id', 'part']], left_on = 'content_id', right_on = 'question_id', how = 'left')
        # Placeholder label: add_features reads the target column even though
        # update=False means it is not accumulated into the answer counters.
        test_df[TARGET] = 0
        test_df = add_features(test_df, update = False)
        # NOTE(review): if `model` is the LGBMClassifier returned by
        # train_and_evaluate, predict() presumably yields class labels rather
        # than probabilities -- confirm which one the submission expects.
        test_df[TARGET] =  model.predict(test_df[FEATURES])
        set_predict(test_df[['row_id', TARGET]])
        
    print('Job Done')
    

User feature calculation started...




KeyboardInterrupt: 

In [None]:
# Load the CV split and build the features, keeping only the last 40M train
# rows (feature_engineering=True); also returns the accumulator dictionaries.
train, valid, questions_df, prior_question_elapsed_time_mean, features_dicts = read_and_preprocess(feature_engineering = True)

In [None]:
# Train the model and print the validation AUC; note that train and valid are
# trimmed in place to the feature columns by this call.
TARGET, FEATURES, model = train_and_evaluate(train, valid)

In [None]:
# inference(TARGET, FEATURES, model, questions_df, prior_question_elapsed_time_mean, features_dicts)