In [1]:
import pandas as pd
import numpy as np
import gc
from scipy.stats.mstats import hmean
from tqdm import tqdm
import featuretools as ft
import swifter


pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in tqdm(props.columns):
        if (props[col].dtype != object) & (col not in ['type_of','tags']):  # Exclude strings
            
            # Print current column type
#             print("******************************")
            #print("Column: ",col)
#             print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
#             print("dtype after: ",props[col].dtype)
#             print("******************************")
    
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [3]:
# conda install -c conda-forge featuretools

In [4]:
# train_set = pd.read_pickle('train_set_cv1.pkl').iloc[:,:9].sample(frac = 0.10, random_state = 42)
train_set = pd.read_pickle('train_set_cv1.pkl').iloc[-10000000:,:9]
valid_set = pd.read_pickle('valid_set_cv1.pkl').iloc[:,:9]

print(train_set.shape)
print(valid_set.shape)

(10000000, 9)
(2500000, 9)


In [5]:
train_set = train_set[train_set.answered_correctly != -1]
valid_set = valid_set[valid_set.answered_correctly != -1]

In [6]:
del train_set['user_answer']
del valid_set['user_answer']

In [7]:
train_set.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation
88730332,13269,883629033,4366,0,1,1,19000,0
88730333,0,303940910,7900,0,0,1,0,0
88730334,2156345777,1505339832,9259,0,731,1,33000,1
88730335,5109397244,1534072,6093,0,910,0,18000,1
88730336,1874427519,1919151627,1578,0,627,1,2000,1


In [8]:
questions = pd.read_csv('questions.csv')
questions = questions[questions.tags.notnull()]
questions = questions.join(questions['tags'].str.split(' ', expand=True).add_prefix('tag'))
questions.iloc[:,1:] = questions.iloc[:,1:].astype('category')
del questions['tags']
questions = questions.astype('str')
questions[questions == 'nan'] = 0
questions = questions.astype(np.int)
questions = pd.DataFrame(reduce_mem_usage(questions)[0])
questions.head()

100%|██████████| 10/10 [00:00<00:00, 771.18it/s]

Memory usage of properties dataframe is : 1.243988037109375  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.8829116821289062  MB
This is  70.97429041042122 % of the initial size





Unnamed: 0,question_id,bundle_id,correct_answer,part,tag0,tag1,tag2,tag3,tag4,tag5
0,0,0,0,1,51,131,162,38,0,0
1,1,1,1,1,131,36,81,0,0,0
2,2,2,0,1,131,101,162,92,0,0
3,3,3,0,1,131,149,162,29,0,0
4,4,4,3,1,131,5,162,38,0,0


In [9]:
train_set['prev_answer'] = train_set.groupby('user_id')['answered_correctly'].shift(1)
valid_set['prev_answer'] = valid_set.groupby('user_id')['answered_correctly'].shift(1)

### Setting up Train Data

In [10]:
# train_set = train_set.sort_values(['user_id','timestamp','content_id'])
# valid_set = valid_set.sort_values(['user_id','timestamp','content_id'])
train_set.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,prev_answer
88730332,13269,883629033,4366,0,1,1,19000,0,
88730333,0,303940910,7900,0,0,1,0,0,
88730334,2156345777,1505339832,9259,0,731,1,33000,1,
88730335,5109397244,1534072,6093,0,910,0,18000,1,
88730336,1874427519,1919151627,1578,0,627,1,2000,1,


In [11]:
train_set = train_set.merge(questions, left_on = 'content_id', right_on = 'question_id', how = 'left')
valid_set = valid_set.merge(questions, left_on = 'content_id', right_on = 'question_id', how = 'left')

In [12]:
train_set['timestamp'].dtype

dtype('uint64')

In [13]:
cols_for_lag = ['timestamp',
                'task_container_id',
                'prior_question_elapsed_time',
                'prior_question_had_explanation',
                'question_id',
                'bundle_id',
               'correct_answer',
               'tag0',
               'tag1',
               'tag2',
               'tag3',
               'tag4',
               'tag5']

for i in cols_for_lag:
    print(f"Calculating Lag for {i}")
    train_set[f"{i}_lag"] = train_set.groupby('user_id')[i].shift(1)#.fillna(0).astype(np.int)
    valid_set[f"{i}_lag"] = valid_set.groupby('user_id')[i].shift(1)#.fillna(0).astype(np.int)

Calculating Lag for timestamp
Calculating Lag for task_container_id
Calculating Lag for prior_question_elapsed_time
Calculating Lag for prior_question_had_explanation
Calculating Lag for question_id
Calculating Lag for bundle_id
Calculating Lag for correct_answer
Calculating Lag for tag0
Calculating Lag for tag1
Calculating Lag for tag2
Calculating Lag for tag3
Calculating Lag for tag4
Calculating Lag for tag5


In [15]:
train_set = train_set.fillna(0)
valid_set = valid_set.fillna(0)

In [17]:
for i in cols_for_lag:
    print(f"Calculating Diff for {i}")
    train_set[f"{i}_diff"] = train_set[i] - train_set[f"{i}_lag"]
    valid_set[f"{i}_diff"] = valid_set[i] - valid_set[f"{i}_lag"]

Calculating Diff for timestamp
Calculating Diff for task_container_id
Calculating Diff for prior_question_elapsed_time
Calculating Diff for prior_question_had_explanation
Calculating Diff for question_id
Calculating Diff for bundle_id
Calculating Diff for correct_answer
Calculating Diff for tag0
Calculating Diff for tag1
Calculating Diff for tag2
Calculating Diff for tag3
Calculating Diff for tag4
Calculating Diff for tag5


In [18]:
float_cols = [col for col in train_set.columns if train_set[col].dtype == 'float64']
train_set[float_cols] = reduce_mem_usage(train_set[float_cols])[0]
valid_set[float_cols] = reduce_mem_usage(valid_set[float_cols])[0]

  0%|          | 0/37 [00:00<?, ?it/s]

Memory usage of properties dataframe is : 2841.4628143310547  MB


100%|██████████| 37/37 [00:50<00:00,  1.37s/it]


___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  738.406455039978  MB
This is  25.986842105263158 % of the initial size


In [20]:
train_set = train_set.fillna(0)

In [21]:
cols_for_accuracy = ['prior_question_had_explanation',
                     'prev_answer',
                     'question_id',
                    'bundle_id',
                     'correct_answer',
                    'part',
                    'tag0',
                    'tag1',
                    'tag2',
                    'tag3',
                    'tag4',
                    'tag5']

stats_cols = []

for i in cols_for_accuracy:
    print(f"Calculating stats for {i}")
    train_set[f"{i}_mean"] = np.round(train_set.groupby(i)['answered_correctly'].transform('mean') * 100)
    train_set[f"{i}_std"] = np.round(train_set.groupby(i)['answered_correctly'].transform('std') * 100)
    train_set[f"{i}_skew"] = np.round(train_set.groupby(i)['answered_correctly'].transform('skew') * 100)
    train_set[f"{i}_kurtosis"] = np.round(train_set.groupby(i)['answered_correctly'].transform(lambda x: x.kurtosis()) * 100)
    
#     train_set[f"{i}_mean"] = train_set[f"{i}_mean"].fillna(train_set[f"{i}_mean"].mean()).astype(np.int8)
#     train_set[f"{i}_std"] = train_set[f"{i}_std"].fillna(train_set[f"{i}_std"].mean()).astype(np.int8)
#     train_set[f"{i}_skew"] = train_set[f"{i}_skew"].fillna(train_set[f"{i}_skew"].mean()).astype(np.int8)
#     train_set[f"{i}_kurtosis"] = train_set[f"{i}_kurtosis"].fillna(train_set[f"{i}_kurtosis"].mean()).astype(np.int8)
    
    stats_cols.extend([f"{i}_mean",f"{i}_std",f"{i}_skew",f"{i}_kurtosis"])

Calculating stats for prior_question_had_explanation
Calculating stats for prev_answer
Calculating stats for question_id
Calculating stats for bundle_id
Calculating stats for correct_answer
Calculating stats for part
Calculating stats for tag0
Calculating stats for tag1
Calculating stats for tag2
Calculating stats for tag3
Calculating stats for tag4
Calculating stats for tag5


In [22]:
stats_col_means = train_set[stats_cols].mean()
train_set[stats_cols] = train_set[stats_cols].fillna(stats_col_means).astype(np.int8)

In [23]:
question_cols = questions.columns.tolist()
train_set[question_cols] = train_set[question_cols].astype(questions[question_cols].dtypes)
valid_set[question_cols] = valid_set[question_cols].astype(questions[question_cols].dtypes)

In [24]:
stats_df = train_set[cols_for_accuracy + stats_cols].drop_duplicates()

In [25]:
gc.collect()

20

In [26]:
for i in cols_for_accuracy:
    print(f"Mapping stats in valid_set for {i}")
    valid_set[f"{i}_mean"] = valid_set[i].map(stats_df[[i,f"{i}_mean"]].set_index(i).to_dict()[f"{i}_mean"])
    valid_set[f"{i}_std"] = valid_set[i].map(stats_df[[i,f"{i}_std"]].set_index(i).to_dict()[f"{i}_std"])
    valid_set[f"{i}_skew"] = valid_set[i].map(stats_df[[i,f"{i}_skew"]].set_index(i).to_dict()[f"{i}_skew"])
    valid_set[f"{i}_kurtosis"] = valid_set[i].map(stats_df[[i,f"{i}_kurtosis"]].set_index(i).to_dict()[f"{i}_kurtosis"])

Mapping stats in valid_set for prior_question_had_explanation
Mapping stats in valid_set for prev_answer
Mapping stats in valid_set for question_id
Mapping stats in valid_set for bundle_id
Mapping stats in valid_set for correct_answer
Mapping stats in valid_set for part
Mapping stats in valid_set for tag0
Mapping stats in valid_set for tag1
Mapping stats in valid_set for tag2
Mapping stats in valid_set for tag3
Mapping stats in valid_set for tag4
Mapping stats in valid_set for tag5


In [27]:
valid_set[stats_cols] = valid_set[stats_cols].fillna(stats_col_means).astype(np.int8)

In [28]:
train_set['prev_answer'] = train_set['prev_answer'].fillna(0).astype(np.int8)
valid_set['prev_answer'] = valid_set['prev_answer'].fillna(0).astype(np.int8)

In [29]:
#User accuracy
train_set['user_correct'] = train_set.groupby('user_id')['prev_answer'].cumsum()
train_set['user_count'] = train_set.groupby('user_id')['prev_answer'].cumcount() + 1
train_set['user_accuracy'] = np.round(train_set['user_correct'] / train_set['user_count'] * 100)

In [30]:
valid_set['user_correct'] = valid_set.groupby('user_id')['prev_answer'].cumsum()
valid_set['user_count'] = valid_set.groupby('user_id')['prev_answer'].cumcount() + 1
valid_set['user_accuracy'] = np.round(valid_set['user_correct'] / valid_set['user_count'] * 100)

In [31]:
#User-col Combinations
cols_to_bundle = ['question_id',
                  'bundle_id',
                  'prior_question_had_explanation',
                  'correct_answer',
                  'part',
                  'tag0',
                  'tag1',
                  'tag2',
                  'tag3',
                  'tag4',
                  'tag5']

for i in cols_to_bundle:
    print(f"Calculating Combination of user_id and {i}")
    train_set[f'user_{i}_correct'] = train_set.groupby(['user_id',i])['prev_answer'].cumsum()
    train_set[f'user_{i}_count'] = train_set.groupby(['user_id',i])['prev_answer'].cumcount() + 1
    train_set[f'user_{i}_accuracy'] = np.round(train_set[f'user_{i}_correct'] / train_set[f'user_{i}_count'] * 100).astype(np.int8)
    
    valid_set[f'user_{i}_correct'] = valid_set.groupby(['user_id',i])['prev_answer'].cumsum()
    valid_set[f'user_{i}_count'] = valid_set.groupby(['user_id',i])['prev_answer'].cumcount() + 1
    valid_set[f'user_{i}_accuracy'] = np.round(valid_set[f'user_{i}_correct'] / valid_set[f'user_{i}_count'] * 100).astype(np.int8)

Calculating Combination of user_id and question_id
Calculating Combination of user_id and bundle_id
Calculating Combination of user_id and prior_question_had_explanation
Calculating Combination of user_id and correct_answer
Calculating Combination of user_id and part
Calculating Combination of user_id and tag0
Calculating Combination of user_id and tag1
Calculating Combination of user_id and tag2
Calculating Combination of user_id and tag3
Calculating Combination of user_id and tag4
Calculating Combination of user_id and tag5


In [32]:
# #Rolling Means
# windows = [5,10]
# for i in windows:
#     print(f"Calculating rolling mean of window {i}")
#     train_set[f"rolling_mean_window_{i}"] = train_set.groupby('user_id')['prev_answer'].apply(lambda x: x.rolling(i).mean())
#     valid_set[f"rolling_mean_window_{i}"] = valid_set.groupby('user_id')['prev_answer'].apply(lambda x: x.rolling(i).mean())

In [33]:
#Second order Lag
cols_for_lag = ['timestamp',
                'task_container_id',
                'prior_question_elapsed_time',
                'prior_question_had_explanation',
                'question_id',
                'bundle_id',
               'correct_answer',
               'tag0',
               'tag1',
               'tag2',
               'tag3',
               'tag4',
               'tag5']

for i in cols_for_lag:
    print(f"Calculating Lag2 for {i}")
    train_set[f"{i}_lag2"] = train_set.groupby('user_id')[i].shift(2).fillna(0).astype(np.int)
    valid_set[f"{i}_lag2"] = valid_set.groupby('user_id')[i].shift(2).fillna(0).astype(np.int)

Calculating Lag2 for timestamp
Calculating Lag2 for task_container_id
Calculating Lag2 for prior_question_elapsed_time
Calculating Lag2 for prior_question_had_explanation
Calculating Lag2 for question_id
Calculating Lag2 for bundle_id
Calculating Lag2 for correct_answer
Calculating Lag2 for tag0
Calculating Lag2 for tag1
Calculating Lag2 for tag2
Calculating Lag2 for tag3
Calculating Lag2 for tag4
Calculating Lag2 for tag5


In [34]:
#Second order Diff
for i in cols_for_lag:
    print(f"Calculating Diff2 for {i}")
    train_set[f"{i}_diff2"] = train_set[f"{i}_lag"] - train_set[f"{i}_lag2"]
    valid_set[f"{i}_diff2"] = valid_set[f"{i}_lag"] - valid_set[f"{i}_lag2"]

Calculating Diff2 for timestamp
Calculating Diff2 for task_container_id
Calculating Diff2 for prior_question_elapsed_time
Calculating Diff2 for prior_question_had_explanation
Calculating Diff2 for question_id
Calculating Diff2 for bundle_id
Calculating Diff2 for correct_answer
Calculating Diff2 for tag0
Calculating Diff2 for tag1
Calculating Diff2 for tag2
Calculating Diff2 for tag3
Calculating Diff2 for tag4
Calculating Diff2 for tag5


In [35]:
#Getting features based on user_id, timestamp breakup
# train_set['user_task_mean'] = train_set.groupby(['user_id','timestamp'])['answered_correctly'].transform('mean')
# valid_set['user_task_mean'] = valid_set.groupby(['user_id','timestamp'])['answered_correctly'].transform('mean')

In [36]:
user_time_cols = ['timestamp',
                  'user_id',
                  'task_container_id',
                  'answered_correctly',
                  'bundle_id',
                  'prior_question_elapsed_time',
                  'prior_question_had_explanation']

user_task_df = pd.concat([train_set[user_time_cols], valid_set[user_time_cols]])
user_task_df = user_task_df.sort_values(['user_id','task_container_id'])
user_task_df['user_task_mean'] = np.round(user_task_df.groupby(['user_id','task_container_id'])['answered_correctly'].transform('mean') * 100)
user_task_df = user_task_df.drop('answered_correctly', axis = 1).drop_duplicates()
user_task_df['user_task_mean_lag'] = np.round(user_task_df.groupby('user_id')['user_task_mean'].shift(1))

In [37]:
get_lags = ['timestamp','task_container_id','bundle_id']
for i in tqdm(get_lags):
    user_task_df[f"prev_{i}"] = user_task_df.groupby('user_id')[i].shift(1)

100%|██████████| 3/3 [00:02<00:00,  1.19it/s]


In [38]:
user_task_df['prev_bundle_id_avg_time'] = np.round(user_task_df.groupby('prev_bundle_id')['prior_question_elapsed_time'].transform('mean') * 100)

In [39]:
user_task_df = user_task_df.dropna()
user_task_df = reduce_mem_usage(user_task_df)[0]

  0%|          | 0/12 [00:00<?, ?it/s]

Memory usage of properties dataframe is : 678.8120250701904  MB


100%|██████████| 12/12 [00:04<00:00,  2.78it/s]

___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  414.339807510376  MB
This is  61.03896103896104 % of the initial size





In [40]:
bundle_explanation_chances = user_task_df[['prior_question_had_explanation','prev_bundle_id']].dropna()
bundle_explanation_chances['total_ones'] = bundle_explanation_chances.groupby('prev_bundle_id')['prior_question_had_explanation'].transform(lambda x: sum(x==1))
bundle_explanation_chances['total_zeroes'] = bundle_explanation_chances.groupby('prev_bundle_id')['prior_question_had_explanation'].transform(lambda x: sum(x==0))
bundle_explanation_chances['explanation_chance'] = np.round(bundle_explanation_chances.total_ones / (bundle_explanation_chances.total_ones + bundle_explanation_chances.total_zeroes) * 100)
bundle_explanation_chances = bundle_explanation_chances[['prev_bundle_id','total_ones','total_zeroes','explanation_chance']].drop_duplicates()
bundle_explanation_chances.columns = ['bundle_id','total_ones','total_zeroes','explanation_chance']

In [41]:
bundle_explanation_chances = reduce_mem_usage(bundle_explanation_chances)[0]

100%|██████████| 4/4 [00:00<00:00, 501.34it/s]

Memory usage of properties dataframe is : 0.3164024353027344  MB
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  0.1395893096923828  MB
This is  44.11764705882353 % of the initial size





In [42]:
user_task_df = user_task_df[['user_id','task_container_id','user_task_mean_lag','prev_timestamp','prev_task_container_id','prev_bundle_id','prev_bundle_id_avg_time']].dropna()

In [43]:
#Merge our data
train_set = train_set.merge(user_task_df, on = ['user_id','task_container_id'], how = 'left')
valid_set = valid_set.merge(user_task_df, on = ['user_id','task_container_id'], how = 'left')

In [44]:
# fill_means = ['user_task_mean_lag','prev_bundle_id_avg_time']
# train_set[fill_means] = np.round(train_set[fill_means].fillna(train_set[fill_means].mean()))
# valid_set[fill_means] = np.round(valid_set[fill_means].fillna(valid_set[fill_means].mean()))

In [45]:
#Merge our data
train_set = train_set.merge(bundle_explanation_chances, on = 'bundle_id', how = 'left')
valid_set = valid_set.merge(bundle_explanation_chances, on = 'bundle_id', how = 'left')

In [46]:
train_set = train_set.fillna(0)
valid_set = valid_set.fillna(0)

In [47]:
print(train_set.shape)
print(valid_set.shape)

(9800953, 163)
(2452472, 163)


In [48]:
del user_task_df, bundle_explanation_chances
gc.collect()

100

In [49]:
# train_set = reduce_mem_usage(train_set)[0]
# valid_set = reduce_mem_usage(valid_set)[0]

In [50]:
gc.collect()

40

In [51]:
# train_set.to_pickle('train_set_lgb.pkl')
# valid_set.to_pickle('valid_set_lgb.pkl')

In [52]:
# temp = [col for col in train_set.columns if train_set[col].dtype == 'float32']
# temp

In [53]:
train_set.head()

Unnamed: 0,timestamp,user_id,content_id,content_type_id,task_container_id,answered_correctly,prior_question_elapsed_time,prior_question_had_explanation,prev_answer,question_id,bundle_id,correct_answer,part,tag0,tag1,tag2,tag3,tag4,tag5,timestamp_lag,task_container_id_lag,prior_question_elapsed_time_lag,prior_question_had_explanation_lag,question_id_lag,bundle_id_lag,correct_answer_lag,tag0_lag,tag1_lag,tag2_lag,tag3_lag,tag4_lag,tag5_lag,timestamp_diff,task_container_id_diff,prior_question_elapsed_time_diff,prior_question_had_explanation_diff,question_id_diff,bundle_id_diff,correct_answer_diff,tag0_diff,tag1_diff,tag2_diff,tag3_diff,tag4_diff,tag5_diff,prior_question_had_explanation_mean,prior_question_had_explanation_std,prior_question_had_explanation_skew,prior_question_had_explanation_kurtosis,prev_answer_mean,prev_answer_std,prev_answer_skew,prev_answer_kurtosis,question_id_mean,question_id_std,question_id_skew,question_id_kurtosis,bundle_id_mean,bundle_id_std,bundle_id_skew,bundle_id_kurtosis,correct_answer_mean,correct_answer_std,correct_answer_skew,correct_answer_kurtosis,part_mean,part_std,part_skew,part_kurtosis,tag0_mean,tag0_std,tag0_skew,tag0_kurtosis,tag1_mean,tag1_std,tag1_skew,tag1_kurtosis,tag2_mean,tag2_std,tag2_skew,tag2_kurtosis,tag3_mean,tag3_std,tag3_skew,tag3_kurtosis,tag4_mean,tag4_std,tag4_skew,tag4_kurtosis,tag5_mean,tag5_std,tag5_skew,tag5_kurtosis,user_correct,user_count,user_accuracy,user_question_id_correct,user_question_id_count,user_question_id_accuracy,user_bundle_id_correct,user_bundle_id_count,user_bundle_id_accuracy,user_prior_question_had_explanation_correct,user_prior_question_had_explanation_count,user_prior_question_had_explanation_accuracy,user_correct_answer_correct,user_correct_answer_count,user_correct_answer_accuracy,user_part_correct,user_part_count,user_part_accuracy,user_tag0_correct,user_tag0_count,user_tag0_accuracy,user_tag1_correct,user_tag1_count,user_tag1_accuracy,user_tag2_correct,user_tag2_count,user_tag2_accuracy,user_tag3_correct,user_tag3_count,user_tag3_accuracy,user_tag4_correct,user_tag4_count,user_tag4_accuracy,user_tag5_correct,user_tag5_count,user_tag5_accuracy,timestamp_lag2,task_container_id_lag2,prior_question_elapsed_time_lag2,prior_question_had_explanation_lag2,question_id_lag2,bundle_id_lag2,correct_answer_lag2,tag0_lag2,tag1_lag2,tag2_lag2,tag3_lag2,tag4_lag2,tag5_lag2,timestamp_diff2,task_container_id_diff2,prior_question_elapsed_time_diff2,prior_question_had_explanation_diff2,question_id_diff2,bundle_id_diff2,correct_answer_diff2,tag0_diff2,tag1_diff2,tag2_diff2,tag3_diff2,tag4_diff2,tag5_diff2,user_task_mean_lag,prev_timestamp,prev_task_container_id,prev_bundle_id,prev_bundle_id_avg_time,total_ones,total_zeroes,explanation_chance
0,13269,883629033,4366,0,1,1,19000,0,0,4366,4366,3,5,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,13269,1,19000,0,4366,4366,3,14,0,0,0,0,0,50,50,-1,56,58,49,-33,67,80,40,103,34,80,40,103,34,65,48,-65,98,61,49,-43,75,69,46,-81,121,61,49,-47,78,62,49,-49,80,64,48,-56,87,65,48,-62,94,65,48,-64,97,0,1,0.0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,474,18,96
1,0,303940910,7900,0,0,1,0,0,0,7900,7900,0,1,131,93,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7900,7900,0,131,93,81,0,0,0,50,50,-1,56,58,49,-33,67,82,38,86,89,82,38,86,89,66,47,-70,104,74,44,-109,-81,79,41,114,3,80,40,104,31,69,46,-85,-128,64,48,-56,87,65,48,-62,94,65,48,-64,97,0,1,0.0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1546,23161,6
2,2156345777,1505339832,9259,0,731,1,33000,1,0,9259,9259,2,5,31,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2156345777,731,33000,1,9259,9259,2,31,0,0,0,0,0,67,47,-72,108,58,49,-33,67,45,50,20,60,45,50,20,60,62,49,-47,78,61,49,-43,75,56,50,-25,62,61,49,-47,78,62,49,-49,80,64,48,-56,87,65,48,-62,94,65,48,-64,97,0,1,0.0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,100.0,2156387000.0,729.0,9264.0,2539948.0,7100,307,96
3,5109397244,1534072,6093,0,910,0,18000,1,0,6093,6093,0,5,91,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5109397244,910,18000,1,6093,6093,0,91,0,0,0,0,0,67,47,-72,108,58,49,-33,67,57,50,-27,63,57,50,-27,63,66,47,-70,104,61,49,-43,75,61,49,-47,78,61,49,-47,78,62,49,-49,80,64,48,-56,87,65,48,-62,94,65,48,-64,97,0,1,0.0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,100.0,5273482000.0,803.0,4439.0,1840903.0,1917,31,98
4,1874427519,1919151627,1578,0,627,1,2000,1,0,1578,1577,0,3,113,38,81,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1874427519,627,2000,1,1578,1577,0,113,38,81,0,0,0,67,47,-72,108,58,49,-33,67,80,40,108,19,75,43,-114,-69,66,47,-70,104,70,46,-86,-125,66,47,-68,102,71,45,-94,-111,69,46,-85,-128,64,48,-56,87,65,48,-62,94,65,48,-64,97,0,1,0.0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,100.0,1931065000.0,616.0,748.0,1830381.0,815,23,97


In [None]:
# del X_train['user_task_mean']
# del X_test['user_task_mean']

### Apply LightGBM Model

In [7]:
# train_set = pd.read_pickle('train_set_lgb.pkl').iloc[-20000000:,:]
# valid_set = pd.read_pickle('valid_set_lgb.pkl')

In [54]:
print(train_set.shape)
print(valid_set.shape)

(9800953, 163)
(2452472, 163)


In [55]:
X_train = train_set.drop('answered_correctly', axis = 1)
y_train = train_set.answered_correctly
X_test = valid_set.drop('answered_correctly', axis = 1)
y_test = valid_set.answered_correctly

# X_train = train_set.drop('answered_correctly', axis = 1).values
# y_train = train_set.answered_correctly
# X_test = valid_set.drop('answered_correctly', axis = 1).values
# y_test = valid_set.answered_correctly

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(9800953, 162)
(9800953,)
(2452472, 162)
(2452472,)


In [56]:
gc.collect()

60

In [57]:
# X_train.to_pickle('X_train_lgb.pkl')
# y_train.to_pickle('y_train_lgb.pkl')
# X_test.to_pickle('X_test_lgb.pkl')
# y_test.to_pickle('y_test_lgb.pkl')

In [58]:
del train_set, valid_set

In [59]:
# X_train.to_pickle('X_train_re.pkl')
# y_train.to_pickle('y_train_re.pkl')

# X_test.to_pickle('X_test_re.pkl')
# y_test.to_pickle('y_test_re.pkl')

#### End_Data

In [8]:
# end_data = pd.read_pickle('train_set_lgb.pkl')

In [85]:
# end_data = pd.concat([X_train, X_test])

In [81]:
# end_data = end_data.groupby('user_id').tail(1)

In [84]:
# end_data.to_pickle('end_data_step8_adv.pkl')

In [90]:
X_train[cols_to_select].tail(50)

Unnamed: 0,timestamp,user_id,timestamp_diff,total_ones,question_id_mean,user_part_accuracy,user_id.1,total_zeroes,prior_question_elapsed_time,timestamp_diff2,user_prior_question_had_explanation_accuracy,user_accuracy,user_part_count,content_id,user_part_correct,prev_bundle_id_avg_time,tag0,bundle_id_mean,user_question_id_count,timestamp.1,question_id_skew,user_tag0_count,question_id_kurtosis,prior_question_elapsed_time_lag2,task_container_id_diff,prior_question_elapsed_time_lag,user_tag2_accuracy,explanation_chance,prev_bundle_id,user_prior_question_had_explanation_correct,tag0_diff,user_prior_question_had_explanation_count,user_correct_answer_accuracy,prev_timestamp,question_id_std,user_tag3_count,tag0_kurtosis,bundle_id_kurtosis,user_tag1_count,user_tag0_correct,bundle_id_skew,user_tag3_accuracy,tag0_skew,user_tag3_correct,user_tag2_count,user_correct_answer_count,user_correct_answer_correct,question_id_diff,user_tag5_accuracy,tag0_diff2,user_tag1_correct,user_tag4_accuracy
9800903,29199815240,1220678999,320132,1528,64,65,1220678999,46,30333,31346980000.0,58,58.0,124,2337,81,2110791.0,157,54,1,29199815240,-60,18,92,20333,1,20333,83,97,2503.0,1201,0,2059,56,29199500000.0,48,1509,104,58,26,9,-15,56,-69,850,23,546,307,-167,58,83,20,58
9800904,29199815240,1220678999,0,1528,72,66,1220678999,46,30333,31347300000.0,58,58.0,125,2335,82,2110791.0,106,54,1,29199815240,-101,21,-99,20333,0,30333,83,97,2503.0,1202,-51,2060,61,29199500000.0,45,1510,-103,58,27,15,-15,56,-98,851,24,579,352,-2,58,0,21,58
9800905,29199815240,1220678999,0,1528,24,65,1220678999,46,30333,31347300000.0,58,58.0,126,2336,82,2110791.0,161,54,1,29199815240,121,9,-53,30333,0,30333,80,97,2503.0,1202,55,2061,56,29199500000.0,43,1511,59,58,28,5,-15,56,-18,851,25,533,297,1,58,-51,21,58
9800906,30353375073,603665538,98416,519,46,71,603665538,3,57000,32500760000.0,76,74.0,114,8666,81,2770087.0,1,46,1,30353375073,16,22,58,17000,1,39000,75,99,6658.0,245,-72,324,70,30353280000.0,50,356,76,58,258,16,16,74,-44,264,341,71,50,2008,74,58,190,74
9800907,62724710913,270648383,70602,476,71,60,270648383,0,30000,64872120000.0,59,59.0,3051,9435,1831,2751774.0,1,71,2,62724710913,-93,238,-114,22000,1,45000,58,100,3940.0,5450,-151,9212,60,62724640000.0,45,6286,76,-114,3340,132,-93,59,-44,3721,3486,2705,1614,5495,59,56,1954,59
9800908,1070047477,537765489,5902,403,71,71,537765489,37,3000,6813.0,71,70.0,376,6274,268,2122769.0,89,71,4,1070047477,-92,13,-116,3000,1,2000,71,92,8169.0,362,81,513,73,1070042000.0,46,376,81,-116,376,9,-92,71,-50,268,376,125,91,-1895,70,-45,268,70
9800909,208024088,2020944029,27187,795,80,93,2020944029,10,23000,31861.0,87,87.0,57,10507,53,2500316.0,10,80,1,208024088,107,16,21,24000,1,20000,91,99,84.0,244,-121,280,86,207996900.0,40,204,101,21,5,16,107,87,-67,177,22,79,68,10423,87,121,5,87
9800910,970972698,195291641,47352,3857,59,68,195291641,42,9000,22975.0,73,72.0,551,6451,375,2072745.0,65,59,1,970972698,-36,5,69,19000,1,17000,68,99,9189.0,874,-31,1195,73,970925300.0,49,878,74,69,575,4,-36,70,-43,615,595,332,243,-2738,72,21,390,72
9800911,1070053968,537765489,6491,6737,36,71,537765489,185,2000,5902.0,71,70.0,377,4257,269,1584356.0,151,36,2,1070053968,57,5,88,2000,1,3000,71,97,6274.0,363,62,514,73,1070047000.0,48,377,56,88,377,2,57,71,6,269,377,157,114,-2017,70,81,269,70
9800912,2007842,1644317136,32775,2579,31,50,1644317136,23,22000,30676.0,46,38.0,6,10447,3,2312187.0,61,31,1,2007842,80,1,120,24000,1,24000,100,99,10675.0,12,52,26,33,1975067.0,46,24,-125,120,1,1,80,29,-87,7,3,12,4,-228,38,-122,1,39


In [11]:
# cols_for_accuracy = ['prior_question_had_explanation',
#                      'prev_answer',
#                      'question_id',
#                     'bundle_id',
#                      'correct_answer',
#                     'part',
#                     'tag0',
#                     'tag1',
#                     'tag2',
#                     'tag3',
#                     'tag4',
#                     'tag5']

# stats_cols = [col for col in end_data if 'mean' in col or 'std' in col or 'skew' in col or 'kurtosis' in col]
# stats_df = end_data[cols_for_accuracy + stats_cols].drop_duplicates()

In [13]:
# stats_df.to_pickle('stats_df_step8.pkl')

In [17]:
# bundle_exp = end_data[['bundle_id','total_ones','total_zeroes','explanation_chance']].drop_duplicates()

In [18]:
# bundle_exp.to_pickle('bundle_exp_step8.pkl')

In [21]:
# end = pd.read_pickle('end_data_step8.pkl')

### LGB on Full Data

In [12]:
# cat_feats = ['prior_question_had_explanation',
#                 'question_id',
#                 'bundle_id',
#                'correct_answer',
#                'tag0',
#                'tag1',
#                'tag2',
#                'tag3',
#                'tag4',
#                'tag5']

# categorical_feature = cat_feats + [f"{i}_lag" for i in cat_feats] + [f"{i}_lag2" for i in cat_feats] + ['prev_answer']

# categorical_feature_idxs = []
# for v in categorical_feature:
#     categorical_feature_idxs.append(all_cols.tolist().index(v))
# print(categorical_feature_idxs)


In [13]:
gc.collect()

100

In [60]:
import lightgbm as lgb

train_data = lgb.Dataset(
    data = X_train.values,
    label = y_train,
    #categorical_feature = categorical_feature_idxs
)

val_data = lgb.Dataset(
    data = X_test.values,
    label = y_test,
    #categorical_feature = categorical_feature_idxs
)

In [61]:
# del X_train, X_test, y_train, y_test

In [62]:
gc.collect()

40

In [63]:
lgbm_params = {
    'objective': 'binary',
    'metric': 'auc',
}

def train():
    evals_result = {}
    model = lgb.train(
        params = lgbm_params,
        train_set = train_data,
        valid_sets = [train_data, val_data],
        num_boost_round = 500,
        verbose_eval = 1,
        evals_result = evals_result,
        early_stopping_rounds = 100,
        #categorical_feature = categorical_feature_idxs,
        #feature_name = features,
    )

    # save model
    model.save_model(f'model_state_myfeatures_10k_1000est_additional_re_step8_adv.lgb')
    
    return model, evals_result

In [64]:
model, evals_result = train()

[1]	training's auc: 0.748271	valid_1's auc: 0.751206
Training until validation scores don't improve for 100 rounds
[2]	training's auc: 0.752135	valid_1's auc: 0.754719
[3]	training's auc: 0.754033	valid_1's auc: 0.756424
[4]	training's auc: 0.754814	valid_1's auc: 0.75708
[5]	training's auc: 0.756114	valid_1's auc: 0.758191
[6]	training's auc: 0.757045	valid_1's auc: 0.759194
[7]	training's auc: 0.757886	valid_1's auc: 0.760207
[8]	training's auc: 0.758814	valid_1's auc: 0.761087
[9]	training's auc: 0.759272	valid_1's auc: 0.761638
[10]	training's auc: 0.759749	valid_1's auc: 0.762237
[11]	training's auc: 0.760186	valid_1's auc: 0.762692
[12]	training's auc: 0.760761	valid_1's auc: 0.763333
[13]	training's auc: 0.761027	valid_1's auc: 0.763557
[14]	training's auc: 0.761503	valid_1's auc: 0.764085
[15]	training's auc: 0.761909	valid_1's auc: 0.764485
[16]	training's auc: 0.762432	valid_1's auc: 0.765058
[17]	training's auc: 0.762763	valid_1's auc: 0.765428
[18]	training's auc: 0.763129	

[152]	training's auc: 0.776771	valid_1's auc: 0.779689
[153]	training's auc: 0.7768	valid_1's auc: 0.779715
[154]	training's auc: 0.776825	valid_1's auc: 0.779612
[155]	training's auc: 0.776845	valid_1's auc: 0.779632
[156]	training's auc: 0.776865	valid_1's auc: 0.779644
[157]	training's auc: 0.776879	valid_1's auc: 0.779637
[158]	training's auc: 0.776909	valid_1's auc: 0.779665
[159]	training's auc: 0.776955	valid_1's auc: 0.779715
[160]	training's auc: 0.77699	valid_1's auc: 0.779749
[161]	training's auc: 0.777013	valid_1's auc: 0.779774
[162]	training's auc: 0.777035	valid_1's auc: 0.779645
[163]	training's auc: 0.777052	valid_1's auc: 0.779651
[164]	training's auc: 0.777073	valid_1's auc: 0.779702
[165]	training's auc: 0.77709	valid_1's auc: 0.779715
[166]	training's auc: 0.777113	valid_1's auc: 0.779737
[167]	training's auc: 0.777128	valid_1's auc: 0.779742
[168]	training's auc: 0.777149	valid_1's auc: 0.77976
[169]	training's auc: 0.777165	valid_1's auc: 0.779772
[170]	training'

[302]	training's auc: 0.779071	valid_1's auc: 0.781187
[303]	training's auc: 0.779081	valid_1's auc: 0.781193
[304]	training's auc: 0.779092	valid_1's auc: 0.7812
[305]	training's auc: 0.779113	valid_1's auc: 0.781221
[306]	training's auc: 0.779127	valid_1's auc: 0.781234
[307]	training's auc: 0.779133	valid_1's auc: 0.781237
[308]	training's auc: 0.779143	valid_1's auc: 0.781244
[309]	training's auc: 0.779163	valid_1's auc: 0.781258
[310]	training's auc: 0.779175	valid_1's auc: 0.781268
[311]	training's auc: 0.779194	valid_1's auc: 0.781283
[312]	training's auc: 0.7792	valid_1's auc: 0.781283
[313]	training's auc: 0.779209	valid_1's auc: 0.78129
[314]	training's auc: 0.779215	valid_1's auc: 0.781292
[315]	training's auc: 0.779225	valid_1's auc: 0.781298
[316]	training's auc: 0.779236	valid_1's auc: 0.781297
[317]	training's auc: 0.779243	valid_1's auc: 0.7813
[318]	training's auc: 0.779251	valid_1's auc: 0.781306
[319]	training's auc: 0.779266	valid_1's auc: 0.781317
[320]	training's 

[452]	training's auc: 0.780469	valid_1's auc: 0.781912
[453]	training's auc: 0.780479	valid_1's auc: 0.781922
[454]	training's auc: 0.780486	valid_1's auc: 0.781901
[455]	training's auc: 0.78049	valid_1's auc: 0.781901
[456]	training's auc: 0.780501	valid_1's auc: 0.78191
[457]	training's auc: 0.780511	valid_1's auc: 0.781918
[458]	training's auc: 0.780517	valid_1's auc: 0.781922
[459]	training's auc: 0.780523	valid_1's auc: 0.781922
[460]	training's auc: 0.780539	valid_1's auc: 0.781923
[461]	training's auc: 0.780543	valid_1's auc: 0.781924
[462]	training's auc: 0.78055	valid_1's auc: 0.781928
[463]	training's auc: 0.780556	valid_1's auc: 0.781932
[464]	training's auc: 0.780565	valid_1's auc: 0.781937
[465]	training's auc: 0.78057	valid_1's auc: 0.781938
[466]	training's auc: 0.780576	valid_1's auc: 0.781936
[467]	training's auc: 0.78058	valid_1's auc: 0.781935
[468]	training's auc: 0.780585	valid_1's auc: 0.781903
[469]	training's auc: 0.780588	valid_1's auc: 0.781902
[470]	training'

In [None]:
# X_train[X_train.user_id == 2146301939].to_excel('sample_user_2146301939.xls')

In [None]:
# y_train[X_train.user_id == 2146301939].to_excel('sample_user_ans_2146301939.xls')

In [None]:
X_train.head(20)

In [13]:
import lightgbm as lgb
model = lgb.Booster(model_file = 'model_state_myfeatures_10k_1000est_additional_re_step8.lgb')

In [70]:
#Run again after correcting cols
feat = pd.concat([pd.DataFrame(model.feature_importance()),pd.DataFrame(X_train.columns)], axis = 1)
feat.columns = ['imp','feat']
feat.to_csv('feat_train_plus_valid_with_myfeatures_lectures_additional_re_Step8_adv.csv')
feat.sort_values('imp', ascending = False).head(60)

Unnamed: 0,imp,feat
31,1333,timestamp_diff
52,564,question_id_mean
109,493,user_part_accuracy
141,450,timestamp_diff2
5,440,prior_question_elapsed_time
32,437,task_container_id_diff
96,398,user_question_id_count
94,387,user_accuracy
159,363,total_ones
160,309,total_zeroes


### LGB on Partial Data

In [92]:
feat = pd.read_csv('feat_train_plus_valid_with_myfeatures_lectures_additional_re_Step8_adv.csv')
cols_to_select = feat.sort_values('imp', ascending = False).feat[:20].values.tolist()
#cols_to_select.remove('task_container_id_diff2')
#cols_to_select.remove('timestamp_diff2')
cols_to_select = ['timestamp','user_id'] + cols_to_select #+ ['tag0','user_task_mean_lag','prev_bundle_id_avg_time']
cols_to_select

['timestamp',
 'user_id',
 'timestamp_diff',
 'question_id_mean',
 'user_part_accuracy',
 'timestamp_diff2',
 'prior_question_elapsed_time',
 'task_container_id_diff',
 'user_question_id_count',
 'user_accuracy',
 'total_ones',
 'total_zeroes',
 'user_id',
 'user_part_count',
 'user_prior_question_had_explanation_accuracy',
 'bundle_id_mean',
 'explanation_chance',
 'question_id_std',
 'user_part_correct',
 'prev_bundle_id_avg_time',
 'tag0',
 'user_prior_question_had_explanation_count']

In [93]:
# cat_feats = [
#              'question_id',
#              'question_id_lag2',
#              'question_id_lag',
#              'bundle_id_lag',
#              'bundle_id',
#              'bundle_id_lag2',
#              'task_container_id_diff',
#              'tag0_lag',
#              'part',
#              'tag1',
#              'tag0']

# categorical_feature = cat_feats

# categorical_feature_idxs = []
# for v in categorical_feature:
#     categorical_feature_idxs.append(all_cols.tolist().index(v))
# print(categorical_feature_idxs)

In [94]:
gc.collect()

40

In [95]:
X_train[cols_to_select].values.shape

(9800953, 22)

In [96]:
X_test[cols_to_select].values.shape

(2452472, 22)

In [97]:
import lightgbm as lgb

train_data = lgb.Dataset(
    data = X_train[cols_to_select].values,
    label = y_train,
)

val_data = lgb.Dataset(
    data = X_test[cols_to_select].values,
    label = y_test,
)

lgbm_params = {
    'objective': 'binary',
    'metric': 'auc',
}

In [98]:
def train():
    evals_result = {}
    model = lgb.train(
        params = lgbm_params,
        train_set = train_data,
        valid_sets = [train_data, val_data],
        num_boost_round = 500,
        verbose_eval = 1,
        evals_result = evals_result,
        early_stopping_rounds = 100,
        #categorical_feature = categorical_feature_idxs,
        #feature_name = features,
    )

    # save model
    model.save_model(f'feat_train_plus_valid_with_myfeatures_lectures_additional_re_Step8_partial_adv2.lgb')
    
    return model, evals_result

In [99]:
model, evals_result = train()

[1]	training's auc: 0.748259	valid_1's auc: 0.751197
Training until validation scores don't improve for 100 rounds
[2]	training's auc: 0.752141	valid_1's auc: 0.754727
[3]	training's auc: 0.754035	valid_1's auc: 0.756414
[4]	training's auc: 0.754849	valid_1's auc: 0.757147
[5]	training's auc: 0.756065	valid_1's auc: 0.758174
[6]	training's auc: 0.757007	valid_1's auc: 0.759191
[7]	training's auc: 0.757833	valid_1's auc: 0.760074
[8]	training's auc: 0.758697	valid_1's auc: 0.760901
[9]	training's auc: 0.759316	valid_1's auc: 0.761663
[10]	training's auc: 0.75964	valid_1's auc: 0.762023
[11]	training's auc: 0.760067	valid_1's auc: 0.762572
[12]	training's auc: 0.760547	valid_1's auc: 0.763137
[13]	training's auc: 0.760727	valid_1's auc: 0.763323
[14]	training's auc: 0.761138	valid_1's auc: 0.76374
[15]	training's auc: 0.761521	valid_1's auc: 0.764162
[16]	training's auc: 0.762084	valid_1's auc: 0.764725
[17]	training's auc: 0.762478	valid_1's auc: 0.765157
[18]	training's auc: 0.762867	v

[152]	training's auc: 0.774269	valid_1's auc: 0.777088
[153]	training's auc: 0.774282	valid_1's auc: 0.777097
[154]	training's auc: 0.774308	valid_1's auc: 0.777118
[155]	training's auc: 0.774333	valid_1's auc: 0.77717
[156]	training's auc: 0.774345	valid_1's auc: 0.77717
[157]	training's auc: 0.774361	valid_1's auc: 0.777186
[158]	training's auc: 0.774376	valid_1's auc: 0.77718
[159]	training's auc: 0.774399	valid_1's auc: 0.777168
[160]	training's auc: 0.774424	valid_1's auc: 0.777192
[161]	training's auc: 0.77445	valid_1's auc: 0.777216
[162]	training's auc: 0.774462	valid_1's auc: 0.77722
[163]	training's auc: 0.774475	valid_1's auc: 0.777234
[164]	training's auc: 0.774496	valid_1's auc: 0.777231
[165]	training's auc: 0.774508	valid_1's auc: 0.777241
[166]	training's auc: 0.774517	valid_1's auc: 0.777246
[167]	training's auc: 0.77453	valid_1's auc: 0.777246
[168]	training's auc: 0.774548	valid_1's auc: 0.777253
[169]	training's auc: 0.774567	valid_1's auc: 0.777296
[170]	training's

[302]	training's auc: 0.776101	valid_1's auc: 0.778259
[303]	training's auc: 0.77611	valid_1's auc: 0.778266
[304]	training's auc: 0.77612	valid_1's auc: 0.778222
[305]	training's auc: 0.776131	valid_1's auc: 0.778232
[306]	training's auc: 0.77614	valid_1's auc: 0.778257
[307]	training's auc: 0.776154	valid_1's auc: 0.778259
[308]	training's auc: 0.776169	valid_1's auc: 0.778268
[309]	training's auc: 0.776175	valid_1's auc: 0.778282
[310]	training's auc: 0.776181	valid_1's auc: 0.778287
[311]	training's auc: 0.776191	valid_1's auc: 0.778295
[312]	training's auc: 0.776205	valid_1's auc: 0.778291
[313]	training's auc: 0.776209	valid_1's auc: 0.778282
[314]	training's auc: 0.776217	valid_1's auc: 0.778288
[315]	training's auc: 0.776222	valid_1's auc: 0.77829
[316]	training's auc: 0.77623	valid_1's auc: 0.778294
[317]	training's auc: 0.77624	valid_1's auc: 0.778302
[318]	training's auc: 0.776256	valid_1's auc: 0.778317
[319]	training's auc: 0.776259	valid_1's auc: 0.778318
[320]	training's

[452]	training's auc: 0.777314	valid_1's auc: 0.778755
[453]	training's auc: 0.77732	valid_1's auc: 0.778703
[454]	training's auc: 0.777325	valid_1's auc: 0.778724
[455]	training's auc: 0.777327	valid_1's auc: 0.778723
[456]	training's auc: 0.777334	valid_1's auc: 0.778726
[457]	training's auc: 0.77734	valid_1's auc: 0.778731
[458]	training's auc: 0.77735	valid_1's auc: 0.778737
[459]	training's auc: 0.777361	valid_1's auc: 0.778699
[460]	training's auc: 0.777371	valid_1's auc: 0.778707
[461]	training's auc: 0.777378	valid_1's auc: 0.77871
[462]	training's auc: 0.777383	valid_1's auc: 0.778718
[463]	training's auc: 0.777389	valid_1's auc: 0.77872
[464]	training's auc: 0.777398	valid_1's auc: 0.77873
[465]	training's auc: 0.777405	valid_1's auc: 0.77875
[466]	training's auc: 0.777413	valid_1's auc: 0.778754
[467]	training's auc: 0.777415	valid_1's auc: 0.778754
[468]	training's auc: 0.777418	valid_1's auc: 0.778756
[469]	training's auc: 0.777422	valid_1's auc: 0.778758
[470]	training's 

In [102]:
X_train[cols_to_select].head(20)

Unnamed: 0,timestamp,user_id,timestamp_diff,question_id_mean,user_part_accuracy,timestamp_diff2,prior_question_elapsed_time,task_container_id_diff,user_question_id_count,user_accuracy,total_ones,total_zeroes,user_id.1,user_part_count,user_prior_question_had_explanation_accuracy,bundle_id_mean,explanation_chance,question_id_std,user_part_correct,prev_bundle_id_avg_time,tag0,user_prior_question_had_explanation_count
0,13269,883629033,13269,80,0,0.0,19000,1,1,0.0,474,18,883629033,1,0,80,96,40,0,0.0,14,1
1,0,303940910,0,82,0,0.0,0,0,1,0.0,1546,23161,303940910,1,0,82,6,38,0,0.0,131,1
2,2156345777,1505339832,2156345777,45,0,0.0,33000,731,1,0.0,7100,307,1505339832,1,0,45,96,50,0,2539948.0,31,1
3,5109397244,1534072,5109397244,57,0,0.0,18000,910,1,0.0,1917,31,1534072,1,0,57,98,50,0,1840903.0,91,1
4,1874427519,1919151627,1874427519,80,0,0.0,2000,627,1,0.0,815,23,1919151627,1,0,75,97,40,0,1830381.0,113,1
5,1874427519,1919151627,0,63,50,1874428000.0,2000,0,1,50.0,815,23,1919151627,2,50,75,97,48,1,1830381.0,82,2
6,1874427519,1919151627,0,82,67,0.0,2000,0,1,67.0,815,23,1919151627,3,67,75,97,39,2,1830381.0,106,3
7,5342710457,1011085097,5342710457,81,0,0.0,41500,579,1,0.0,391,17,1011085097,1,0,76,96,39,0,0.0,53,1
8,5342710457,1011085097,0,78,50,5342710000.0,41500,0,1,50.0,391,17,1011085097,2,50,76,96,42,1,0.0,27,2
9,5342710457,1011085097,0,71,67,7490194000.0,41500,0,1,67.0,391,17,1011085097,3,67,76,96,45,2,0.0,7,3


In [100]:
#Run again after correcting cols
feat = pd.concat([pd.DataFrame(model.feature_importance()),pd.Series(cols_to_select)], axis = 1)
feat.columns = ['imp','feat']
feat.to_csv('feat_train_plus_valid_with_myfeatures_lectures_additional_re_Step8_partial_adv2.csv')
feat.sort_values('imp', ascending = False).head(60)

Unnamed: 0,imp,feat
2,1626,timestamp_diff
21,943,user_prior_question_had_explanation_count
4,881,user_part_accuracy
3,859,question_id_mean
5,858,timestamp_diff2
9,837,user_accuracy
6,818,prior_question_elapsed_time
1,745,user_id
0,745,timestamp
10,729,total_ones


In [None]:
X_train[cols_to_select].head()

In [None]:
X_train.loc[X_train.tag0 < X_train.tag0_lag,['tag0', 'tag0_lag','tag0_diff']]

### Getting End-Data Ready

In [None]:
train_set = pd.read_pickle('train_set_cv1.pkl').iloc[:,:9]
valid_set = pd.read_pickle('valid_set_cv1.pkl').iloc[:,:9]

all_data = pd.concat([train_set,valid_set])

del train_set, valid_set
gc.collect()

In [None]:
all_data = all_data.sort_values(['user_id','timestamp'])

In [None]:
X_train.info()

In [None]:
all_data = all_data.merge(questions[['question_id','bundle_id','part','tag0','tag1']], left_on = 'content_id', right_on = 'question_id', how = 'left')

In [None]:
float_cols = ['question_id','bundle_id','part','tag0','tag1']

for i in tqdm(float_cols):
    all_data[i] = all_data[i].fillna(0).astype(questions[i].dtype)

In [None]:
gc.collect()

In [None]:
all_data.to_pickle('final_data_for_prep_re_step3_partial.pkl')

In [None]:
lags_required = ['bundle_id',
                'question_id',
                'timestamp',
                'tag0',
                'tag1',
                'task_container_id',
                'prior_question_elapsed_time']

for i in tqdm(lags_required):
    all_data[f"{i}_lag"] = all_data.groupby('user_id')[i].shift(1).fillna(0).astype(all_data[i].dtype)

In [None]:
diffs_required = ['timestamp',
                 'task_container_id',
                 'tag0',
                 'question_id']

for i in tqdm(diffs_required):
    all_data[f"{i}_diff"] = (all_data[i] - all_data[f"{i}_lag"]).astype(all_data[i].dtype)

In [None]:
#User accuracy
print("Calculating user_correct")
all_data['user_correct'] = all_data.groupby('user_id')['answered_correctly'].cumsum()

print("Calculating user_count")
all_data['user_count'] = all_data.groupby('user_id')['answered_correctly'].cumcount() + 1

print("Calculating user_accuracy")
all_data['user_accuracy'] = all_data['user_correct'] / all_data['user_count']

In [None]:
all_data.head()

In [None]:
all_data.shape

In [None]:
relevant_cols = ['bundle_id',
                'bundle_id_lag',
                'content_id',
                'part',
                'prior_question_elapsed_time',
                'prior_question_elapsed_time_lag',
                'question_id_diff',
                'question_id_lag',
                 'tag0',
                'tag0_diff',
                'tag0_lag',
                'tag1',
                'tag1_lag',
                'task_container_id',
                'task_container_id_diff',
                'task_container_id_lag',
                'timestamp',
                'timestamp',
                'timestamp_diff',
                'user_accuracy',
                'user_correct',
                'user_count',
                'user_id']

end_data_full_re = all_data[relevant_cols].groupby('user_id').tail(1)

In [None]:
end_data_full_re.head()

In [None]:
end_data_full_re.to_pickle('end_data_full_re1.pkl')

In [None]:
end_data_full_re.loc[end_data_full_re.content_id < end_data_full_re.question_id_lag,['content_id','question_id_lag','question_id_diff']]

In [None]:
# questions.to_pickle('questions_breakup.pkl')

In [None]:
# stats_df.to_pickle('all_stats_df.pkl')

In [None]:
# question_stats = stats_df[['question_id','question_id_mean','question_id_std']]
# question_stats.to_pickle('question_stats_df.pkl')
# question_stats.head()

In [None]:
all_data[all_data.bundle_id != all_data.content_id].head(10)