# LGBM Feature Engineering & Save dataset

In [21]:
import pandas as pd
import os
import random
import math
from tqdm import tqdm
import numpy as np
import time

## 1. 데이터 로딩

In [22]:
# Narrow integer dtypes for the id / label columns read from the CSV.
dtype = dict(userID='int16', answerCode='int8', KnowledgeTag='int16')

# Point this at your own copy of the data!
DATA_PATH = '/opt/ml/input/data/train_dataset/train_data.csv'
# Read the log, then order it per user by time and renumber the rows.
df = (
    pd.read_csv(DATA_PATH, dtype=dtype, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

In [23]:
# Narrow integer dtypes for the id / label columns read from the CSV.
dtype = dict(userID='int16', answerCode='int8', KnowledgeTag='int16')

# Load the cv_train / cv_valid files with identical parsing options.
train, valid = (
    pd.read_csv(csv_path, dtype=dtype, parse_dates=['Timestamp'])
    for csv_path in (
        "/opt/ml/input/data/train_dataset/cv_train_data.csv",
        "/opt/ml/input/data/train_dataset/cv_valid_data.csv",
    )
)

In [24]:
# Narrow integer dtypes for the id / label columns read from the CSV.
dtype = dict(userID='int16', answerCode='int8', KnowledgeTag='int16')

# Point this at your own copy of the data!
test_csv_file_path = '/opt/ml/input/data/train_dataset/test_data.csv'
# Read the log, then order it per user by time and renumber the rows.
test = (
    pd.read_csv(test_csv_file_path, dtype=dtype, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

### 1-1 DB 로딩

In [25]:
# Problem number = integer value of the last three characters of the item id.
df['problem_number'] = df['assessmentItemID'].str[-3:].astype('int64')

In [26]:
def _answer_stats(frame, key, prefix):
    """Mean and sum of ``answerCode`` per ``key``, named ``<prefix>_mean`` / ``<prefix>_sum``."""
    stats = frame.groupby([key])['answerCode'].agg(['mean', 'sum'])
    return stats.rename(columns={'mean': f'{prefix}_mean', 'sum': f'{prefix}_sum'})


# Lookup tables (indexed by the grouping key) built from the full training log.
correct_t = _answer_stats(df, 'testId', 'test')
correct_k = _answer_stats(df, 'KnowledgeTag', 'tag')
correct_a = _answer_stats(df, 'assessmentItemID', 'ass')
correct_p = _answer_stats(df, 'problem_number', 'prb')

## 2. Feature Engineering

In [27]:
def add_last_problem(df):
    """Add a ``last_problem`` column marking the final row of each ``testId`` run.

    A row gets ``-1`` when the next row (in the frame's current row order) has
    a different ``testId``; every other row gets ``0``.  The very last row of
    the frame has no successor and is therefore left at ``0``.
    NOTE(review): the final row is arguably also the last problem of its test;
    it is kept at 0 here so existing outputs do not change.

    The comparison is purely positional, so it works for any index (including
    a permuted or non-zero-based one) and for an empty frame.

    Args:
        df: frame with a ``testId`` column; it is modified in place.

    Returns:
        The same ``df`` object with the new ``last_problem`` column.
    """
    test_ids = df['testId'].to_numpy()
    flags = np.zeros(len(test_ids), dtype=np.int64)
    # Compare every row with its successor; a change marks the end of a run.
    run_ends = test_ids[:-1] != test_ids[1:]
    flags[:-1][run_ends] = -1
    df['last_problem'] = flags
    return df

In [28]:
def is_previous_ordered(row):
    """Tell whether this row's problem directly follows the previous one.

    Reads ``problem_number``, ``q_num_prev`` and ``delta`` from ``row``.

    Returns:
        -1 if ``delta`` is null or longer than one hour,
         1 if ``problem_number == q_num_prev + 1``,
         0 otherwise.
    """
    current, previous, gap = row.problem_number, row.q_num_prev, row.delta
    max_gap = pd.Timedelta(hours=1)

    if pd.isnull(gap) or gap > max_gap:
        return -1
    return 1 if current == previous + 1 else 0

In [29]:
def is_previous_decreasing(row):
    """Tell whether this row's problem number is lower than the previous one.

    Reads ``problem_number``, ``q_num_prev`` and ``delta`` from ``row``.

    Returns:
        -1 if ``delta`` is null or longer than one hour,
         1 if ``problem_number < q_num_prev``,
         0 otherwise.
    """
    current, previous, gap = row.problem_number, row.q_num_prev, row.delta
    max_gap = pd.Timedelta(hours=1)

    if pd.isnull(gap) or gap > max_gap:
        return -1
    return 1 if current < previous else 0

In [30]:
def is_probably_easy(row):
    """Classify a row as "probably easy" from the ordering flags around it.

    Reads ``delta``, ``is_previous_ordered``, ``is_previous_decreasing`` and
    their shifted counterparts ``is_prev_ord_shift`` / ``is_prev_dec_shift``.

    Returns:
        -1 if ``delta`` is null or longer than one hour,
         1 if the four flags match one of the listed patterns,
         0 otherwise.
    """
    gap = row.delta
    observed = (
        row.is_prev_ord_shift,
        row.is_prev_dec_shift,
        row.is_previous_ordered,
        row.is_previous_decreasing,
    )

    # (shifted ordered, shifted decreasing, ordered, decreasing) combinations
    # that count as "probably easy".
    easy_patterns = [
        (np.nan, np.nan, -1, -1),
        (-1, -1, 1, 0),
        (1, 0, 1, 0),
        (1, 0, 0, 0),
    ]

    if pd.isnull(gap) or gap > pd.Timedelta(hours=1):
        return -1
    if observed in easy_patterns:
        return 1
    return 0

In [31]:
def ELO_function(df):
    """Add an Elo-style ``elo_prob`` column to ``df``.

    One ``theta`` per ``userID`` and one ``beta`` per ``assessmentItemID`` are
    updated row by row, in the frame's current row order, from ``answerCode``.
    After the pass, ``elo_prob`` is ``sigmoid(theta - beta)`` using the FINAL
    theta/beta of each row's user and item (so the estimate has seen every
    row of the frame, including the row it is attached to).

    Side effects: sets ``df['left_asymptote'] = 0``, adds ``df['elo_prob']``
    and prints progress information.

    Args:
        df: frame with ``userID``, ``assessmentItemID`` and ``answerCode``.

    Returns:
        The same ``df`` object with the added columns.
    """
    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

    def probability_of_good_answer(theta, beta, left_asymptote):
        # Logistic curve lifted so that it never drops below ``left_asymptote``.
        return left_asymptote + (1 - left_asymptote) * sigmoid(theta - beta)

    def learning_rate_theta(nb_answers):
        # Decays with the number of answers seen, but never below 0.04.
        return max(0.3 / (1 + 0.01 * nb_answers), 0.04)

    def learning_rate_beta(nb_answers):
        return 1 / (1 + 0.05 * nb_answers)

    def estimate_parameters(answers_df, granularity_feature_name='assessmentItemID'):
        item_parameters = {}
        for item_key in np.unique(answers_df[granularity_feature_name]):
            item_parameters[item_key] = {"beta": 0, "nb_answers": 0}
        student_parameters = {}
        for student_key in np.unique(answers_df.userID):
            student_parameters[student_key] = {"theta": 0, "nb_answers": 0}

        print("Parameter estimation is starting...")

        for student_id, item_id, left_asymptote, answered_correctly in tqdm(
            zip(
                answers_df.userID.values,
                answers_df[granularity_feature_name].values,
                answers_df.left_asymptote.values,
                answers_df.answerCode.values,
            )
        ):
            student = student_parameters[student_id]
            item = item_parameters[item_id]
            theta, beta = student["theta"], item["beta"]

            # Both updates use the pre-update theta and beta of this row.
            error = answered_correctly - probability_of_good_answer(theta, beta, left_asymptote)
            item["beta"] = beta - learning_rate_beta(item["nb_answers"]) * error
            student["theta"] = theta + learning_rate_theta(student["nb_answers"]) * error

            item["nb_answers"] += 1
            student["nb_answers"] += 1

        print(f"Theta & beta estimations on {granularity_feature_name} are completed.")
        return student_parameters, item_parameters

    df['left_asymptote'] = 0

    print(f"Dataset of shape {df.shape}")
    print(f"Columns are {list(df.columns)}")

    student_parameters, item_parameters = estimate_parameters(df)

    df['elo_prob'] = [
        sigmoid(student_parameters[student]['theta'] - item_parameters[item]['beta'])
        for student, item in zip(df.userID.values, df.assessmentItemID.values)
    ]

    return df

In [32]:
def feature_engineering(df):
    """Derive the model features for one interaction frame.

    ``df`` must carry ``userID``, ``assessmentItemID``, ``testId``,
    ``answerCode``, ``Timestamp`` and ``KnowledgeTag``.  The module-level
    tables ``correct_t``, ``correct_k``, ``correct_a`` and ``correct_p`` are
    merged in as they are; every other statistic is computed from ``df``
    itself.  Progress and the elapsed wall time are printed.

    Args:
        df: raw interaction frame.  It is not reordered; the function works on
            a sorted copy.

    Returns:
        A new frame, ordered by (userID, Timestamp), with the feature columns
        added and ``testId`` replaced by an integer code.
    """
    print('-'*20, 'Feature Engineering Start', '-'*20)
    start_time = time.time()
    # Order rows per user by time so the sequence features are well defined.
    # A sorted copy with a fresh 0..n-1 index is used instead of an in-place
    # sort: the caller's frame is left alone and label-based lookups such as
    # ``series[0]`` in the helpers refer to the first row.
    df = df.sort_values(by=['userID','Timestamp']).reset_index(drop=True)
    df = add_last_problem(df)
    # Add the Elo feature
    df = ELO_function(df)
    
    df['hour'] = df['Timestamp'].dt.hour
    df['dow'] = df['Timestamp'].dt.dayofweek
    
    # Solving time: seconds since the same user's previous row (0 for a
    # user's first row); values outside [0, 650) are reset to 0.
    diff = df.loc[:, ['userID','Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    elapsed = diff['Timestamp'].dt.total_seconds()
    df['elapsed'] = elapsed.where((elapsed >= 0) & (elapsed < 650), 0)
    
    df['grade']=df['testId'].apply(lambda x : int(x[1:4])//10)
    df['mid'] = df['testId'].apply(lambda x : int(x[-3:]))
    df['problem_number'] = df['assessmentItemID'].apply(lambda x : int(x[-3:]))
    
    # Disabled "is_probably_easy" feature, kept for reference:
#     stu_test_groupby = df.groupby(['userID', 'testId'])
#     df.loc[:, "delta"] = stu_test_groupby['Timestamp'].diff()
#     df['q_num_prev'] = df.problem_number.shift()
#     df['is_previous_ordered'] =  df.apply(lambda row: is_previous_ordered(row), axis=1)
#     df['is_previous_decreasing'] = df.apply(lambda row: is_previous_decreasing(row), axis=1)
#     df['is_prev_ord_shift'] = df.is_previous_ordered.shift()
#     df['is_prev_dec_shift'] = df.is_previous_decreasing.shift()
#     df['is_probably_easy'] = df.apply(lambda row: is_probably_easy(row), axis=1)
#     df.drop(labels=['delta', 'q_num_prev', 'is_previous_ordered',
#                     'is_previous_decreasing', 'is_prev_ord_shift', 'is_prev_dec_shift'], axis=1, inplace=True)
    
    # Answer statistics per hour / day of week, computed from this frame.
    correct_h = df.groupby(['hour'])['answerCode'].agg(['mean', 'sum'])
    correct_h.columns = ["hour_mean", 'hour_sum']
    correct_d = df.groupby(['dow'])['answerCode'].agg(['mean', 'sum'])
    correct_d.columns = ["dow_mean", 'dow_sum'] 
    
    # The first four tables are the module-level ones built outside this
    # function.  From here on ``df`` has a fresh RangeIndex (merge result).
    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, correct_a, on=['assessmentItemID'], how="left")
    df = pd.merge(df, correct_p, on=['problem_number'], how="left")
    df = pd.merge(df, correct_h, on=['hour'], how="left")
    df = pd.merge(df, correct_d, on=['dow'], how="left")

    # Rows answered correctly / incorrectly; rows with any other answerCode
    # value fall into neither subset.
    o_df = df[df['answerCode']==1]
    x_df = df[df['answerCode']==0]
    
    # Mean elapsed time per tag: overall, correct-only, incorrect-only.
    elp_k = df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
    elp_k.columns = ['KnowledgeTag',"tag_elp"]
    elp_k_o = o_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
    elp_k_o.columns = ['KnowledgeTag', "tag_elp_o"]
    elp_k_x = x_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
    elp_k_x.columns = ['KnowledgeTag', "tag_elp_x"]
    
    df = pd.merge(df, elp_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, elp_k_o, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, elp_k_x, on=['KnowledgeTag'], how="left")

    # Same three means per assessment item (left merges leave NaN where an
    # item has no correct / incorrect rows in this frame).
    ass_k = df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
    ass_k.columns = ['assessmentItemID',"ass_elp"]
    ass_k_o = o_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
    ass_k_o.columns = ['assessmentItemID',"ass_elp_o"]
    ass_k_x = x_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
    ass_k_x.columns = ['assessmentItemID',"ass_elp_x"]

    df = pd.merge(df, ass_k, on=['assessmentItemID'], how="left")
    df = pd.merge(df, ass_k_o, on=['assessmentItemID'], how="left")
    df = pd.merge(df, ass_k_x, on=['assessmentItemID'], how="left")

    # Same three means per problem number.
    prb_k = df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
    prb_k.columns = ['problem_number',"prb_elp"]
    prb_k_o = o_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
    prb_k_o.columns = ['problem_number',"prb_elp_o"]
    prb_k_x = x_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
    prb_k_x.columns = ['problem_number',"prb_elp_x"]

    df = pd.merge(df, prb_k, on=['problem_number'], how="left")
    df = pd.merge(df, prb_k_o, on=['problem_number'], how="left")
    df = pd.merge(df, prb_k_x, on=['problem_number'], how="left")
    
    # Running per-user (and per user+grade) history; shift(1) / cumcount
    # exclude the current row from its own correct count and total.
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1)).fillna(0)
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = (df['user_correct_answer']/df['user_total_answer']).fillna(0)
    df['Grade_o'] = df.groupby(['userID','grade'])['answerCode'].transform(lambda x: x.cumsum().shift(1)).fillna(0)
    df['GradeCount'] = df.groupby(['userID','grade']).cumcount()
    df['GradeAcc'] = (df['Grade_o']/df['GradeCount']).fillna(0)
    df['GradeElp'] = df.groupby(['userID','grade'])['elapsed'].transform(lambda x: x.cumsum()).fillna(0)
    # Divide by the count, substituting 1 where the count is 0.
    df['GradeMElp'] = df['GradeElp']/[v if v != 0 else 1 for v in df['GradeCount'].values]
    
    # Per-test summary: highest problem number and number of distinct tags.
    # (Named ``test_stats`` so it does not shadow the notebook's ``test`` frame.)
    f = lambda x : len(set(x))
    test_stats = df.groupby(['testId']).agg({
        'problem_number':'max',
        'KnowledgeTag':f
    })
    test_stats.reset_index(inplace=True)

    test_stats.columns = ['testId','problem_count',"tag_count"]
    
    df = pd.merge(df,test_stats,on='testId',how='left')
    
    # log(1 + seconds since the user's previous row within the same grade).
    # ``first`` is 0 on the first row of each (user, grade) run and 1
    # elsewhere, so it zeroes the gap across run boundaries.  The assignment
    # back to ``df`` aligns on the index, which ``gdf`` inherits from ``df``.
    gdf = df[['userID','testId','problem_number','grade','Timestamp']].sort_values(by=['userID','grade','Timestamp'])
    gdf['buserID'] = gdf['userID'] != gdf['userID'].shift(1)
    gdf['bgrade'] = gdf['grade'] != gdf['grade'].shift(1)
    gdf['first'] = gdf[['buserID','bgrade']].any(axis=1).apply(lambda x : 1- int(x))
    gdf['RepeatedTime'] = gdf['Timestamp'].diff().fillna(pd.Timedelta(seconds=0)) 
    gdf['RepeatedTime'] = gdf['RepeatedTime'].dt.total_seconds() * gdf['first']
    df['RepeatedTime'] = gdf['RepeatedTime'].apply(lambda x : math.log(x+1))
    
    df['prior_KnowledgeTag_frequency'] = df.groupby(['userID','KnowledgeTag']).cumcount()
    
    df['problem_position'] = df['problem_number'] / df["problem_count"]
    # 0-based position of the row among this user's rows for the test.  Once
    # it reaches ``problem_count`` the row is treated as a retest and the
    # order wraps back to 1.  The wrap mask is computed once, before
    # ``solve_order`` is written, so ``retest`` reflects the same condition;
    # ``>=`` is used because the count is still 0-based at this point.
    attempt_idx = df.groupby(['userID','testId']).cumcount()
    wrapped = attempt_idx >= df['problem_count']
    df['solve_order'] = attempt_idx - df['problem_count']*wrapped.astype(int) + 1
    df['retest'] = wrapped.astype(int)
    # 1 on a row where solve order and problem number start to disagree
    # (the row mismatches while the preceding row did not).
    mismatch = df['solve_order'] != df['problem_number']
    prev_mismatch = mismatch.shift(1, fill_value=False)
    df['solved_disorder'] = (mismatch & ~prev_mismatch).astype(int)
    
    # NOTE(review): ``x[-3]`` takes a single character, so tests that differ
    # only in their last two digits share one code; ``x[-3:]`` may have been
    # intended - confirm before changing, since it alters the saved feature.
    df['testId'] = df['testId'].apply(lambda x : int(x[1:4]+x[-3]))
    
    print('-'*20, 'Feature Engineering End', '-'*20)
    print(f"Feature Engineering에 걸린 시간 : {time.time() - start_time}s")
    return df

In [33]:
# Probably not needed?
# df = feature_engineering(df)
# df.head()

## 3. Train/Valid/Test 데이터 셋 feature_engineering 및 저장

In [34]:
# Not needed here either, because the data is already split.
# Train and validation data must be separated by user (no user in both).
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    """Split ``df`` by user into a training part and a validation part.

    Users are shuffled with the module-level ``random`` state and added to the
    training set, in that order, until the next user would push the number of
    training rows above ``ratio * len(df)``.  All remaining users form the
    validation set, reduced to one row per user.

    Args:
        df: frame with a ``userID`` column.  The "last row per user" filter
            compares each row with the next one, so it assumes every user's
            rows are contiguous and in time order.
        ratio: upper bound on the fraction of rows placed in the training set.
        split: unused; kept for backward compatibility.

    Returns:
        ``(train, valid)``: all rows of the training users, and the last
        interaction of each validation user.
    """
    rows_per_user = df['userID'].value_counts()
    users = list(zip(rows_per_user.index, rows_per_user))
    random.shuffle(users)

    max_train_data_len = ratio * len(df)
    sum_of_train_data = 0
    user_ids = []

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)

    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    valid = df[~in_train]

    # Keep only the last interaction of each validation user.
    valid = valid[valid['userID'] != valid['userID'].shift(-1)]
    return train, valid

In [35]:
# Run the feature pipeline on the training split and preview the first rows.
train = feature_engineering(train)
train.head()

-------------------- Feature Engineering Start --------------------
Dataset of shape (2012569, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'last_problem', 'left_asymptote']


3103it [00:00, 31028.18it/s]

Parameter estimation is starting...


2012569it [00:53, 37725.31it/s]


Theta & beta estimations on assessmentItemID are completed.
-------------------- Feature Engineering End --------------------
Feature Engineering에 걸린 시간 : 154.7360076904297s


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,last_problem,left_asymptote,elo_prob,hour,...,GradeElp,GradeMElp,problem_count,tag_count,RepeatedTime,prior_KnowledgeTag_frequency,problem_position,solve_order,retest,solved_disorder
0,0,A060001001,600,1,2020-03-24 00:17:11,7224,0,0,0.978221,0,...,0.0,0.0,7,2,0.0,0,0.142857,1,0,0
1,0,A060001002,600,1,2020-03-24 00:17:14,7225,0,0,0.968121,0,...,3.0,3.0,7,2,1.386294,0,0.285714,2,0,0
2,0,A060001003,600,1,2020-03-24 00:17:22,7225,0,0,0.940058,0,...,11.0,5.5,7,2,2.197225,1,0.428571,3,0,0
3,0,A060001004,600,1,2020-03-24 00:17:29,7225,0,0,0.970276,0,...,18.0,6.0,7,2,2.079442,2,0.571429,4,0,0
4,0,A060001005,600,1,2020-03-24 00:17:36,7225,0,0,0.956241,0,...,25.0,6.25,7,2,2.079442,3,0.714286,5,0,0


In [36]:
# Run the feature pipeline on the validation split and preview the first rows.
valid = feature_engineering(valid)
valid.head()

-------------------- Feature Engineering Start --------------------
Dataset of shape (254017, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'last_problem', 'left_asymptote']


2028it [00:00, 20279.00it/s]

Parameter estimation is starting...


254017it [00:06, 36806.77it/s]


Theta & beta estimations on assessmentItemID are completed.
-------------------- Feature Engineering End --------------------
Feature Engineering에 걸린 시간 : 18.133310556411743s


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,last_problem,left_asymptote,elo_prob,hour,...,GradeElp,GradeMElp,problem_count,tag_count,RepeatedTime,prior_KnowledgeTag_frequency,problem_position,solve_order,retest,solved_disorder
0,35,A040093001,400,1,2020-01-02 23:30:13,2094,0,0,0.95188,23,...,0.0,0.0,5,1,0.0,0,0.2,1,0,0
1,35,A040093002,400,0,2020-01-02 23:31:13,2094,0,0,0.7777,23,...,60.0,60.0,5,1,4.110874,1,0.4,2,0,0
2,35,A040093003,400,0,2020-01-02 23:31:56,2094,0,0,0.470429,23,...,103.0,51.5,5,1,3.78419,2,0.6,3,0,0
3,35,A040093004,400,0,2020-01-02 23:32:15,2094,0,0,0.255394,23,...,122.0,40.666667,5,1,2.995732,3,0.8,4,0,0
4,35,A040093005,400,1,2020-01-02 23:35:06,2094,-1,0,0.530134,23,...,293.0,73.25,5,1,5.147494,4,1.0,5,0,0


In [37]:
# Run the feature pipeline on the test set and preview the first rows.
test = feature_engineering(test)
test.head()

-------------------- Feature Engineering Start --------------------
Dataset of shape (260114, 8)
Columns are ['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp', 'KnowledgeTag', 'last_problem', 'left_asymptote']


3783it [00:00, 37824.81it/s]

Parameter estimation is starting...


260114it [00:06, 39283.23it/s]


Theta & beta estimations on assessmentItemID are completed.
-------------------- Feature Engineering End --------------------
Feature Engineering에 걸린 시간 : 18.322449445724487s


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,last_problem,left_asymptote,elo_prob,hour,...,GradeElp,GradeMElp,problem_count,tag_count,RepeatedTime,prior_KnowledgeTag_frequency,problem_position,solve_order,retest,solved_disorder
0,3,A050023001,500,1,2020-01-09 10:56:31,2626,0,0,0.752296,10,...,0.0,0.0,7,4,0.0,0,0.142857,1,0,0
1,3,A050023002,500,1,2020-01-09 10:56:57,2626,0,0,0.416693,10,...,26.0,26.0,7,4,3.295837,1,0.285714,2,0,0
2,3,A050023003,500,0,2020-01-09 10:58:31,2625,0,0,0.281076,10,...,120.0,60.0,7,4,4.553877,0,0.428571,3,0,0
3,3,A050023004,500,0,2020-01-09 10:58:36,2625,0,0,0.382929,10,...,125.0,41.666667,7,4,1.791759,1,0.571429,4,0,0
4,3,A050023006,500,0,2020-01-09 10:58:43,2623,0,0,0.166432,10,...,132.0,33.0,7,4,2.079442,0,0.857143,5,0,1


In [38]:
# Display the engineered validation frame.
valid

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,last_problem,left_asymptote,elo_prob,hour,...,GradeElp,GradeMElp,problem_count,tag_count,RepeatedTime,prior_KnowledgeTag_frequency,problem_position,solve_order,retest,solved_disorder
0,35,A040093001,400,1,2020-01-02 23:30:13,2094,0,0,0.951880,23,...,0.0,0.000000,5,1,0.000000,0,0.2,1,0,0
1,35,A040093002,400,0,2020-01-02 23:31:13,2094,0,0,0.777700,23,...,60.0,60.000000,5,1,4.110874,1,0.4,2,0,0
2,35,A040093003,400,0,2020-01-02 23:31:56,2094,0,0,0.470429,23,...,103.0,51.500000,5,1,3.784190,2,0.6,3,0,0
3,35,A040093004,400,0,2020-01-02 23:32:15,2094,0,0,0.255394,23,...,122.0,40.666667,5,1,2.995732,3,0.8,4,0,0
4,35,A040093005,400,1,2020-01-02 23:35:06,2094,-1,0,0.530134,23,...,293.0,73.250000,5,1,5.147494,4,1.0,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
254012,7422,A030171001,301,0,2020-10-23 07:12:59,1873,0,0,0.294862,7,...,91.0,18.200000,5,2,15.167286,0,0.2,1,0,0
254013,7422,A030171002,301,0,2020-10-23 07:13:35,1873,0,0,0.353503,7,...,127.0,21.166667,5,2,3.610918,1,0.4,2,0,0
254014,7422,A030171003,301,0,2020-10-23 07:13:37,1876,0,0,0.278829,7,...,129.0,18.428571,5,2,1.098612,0,0.6,3,0,0
254015,7422,A030171004,301,0,2020-10-23 07:13:38,1876,0,0,0.468918,7,...,130.0,16.250000,5,2,0.693147,1,0.8,4,0,0


## 피쳐 엔지니어링 후 모두 저장

In [39]:
# Some engineered columns still contain nulls; fill them with 0 for now.
train, valid, test = (frame.fillna(0) for frame in (train, valid, test))

In [40]:
# Write each engineered frame next to the raw data, without the index column.
for frame, out_path in (
    (train, '/opt/ml/input/data/train_dataset/train_after.csv'),
    (valid, '/opt/ml/input/data/train_dataset/valid_after.csv'),
    (test, '/opt/ml/input/data/train_dataset/test_after.csv'),
):
    frame.to_csv(out_path, index=False)

In [207]:
# The nulls in ass_elp_o / ass_elp_x are not handled properly yet and still need work -> filled with fillna for now.