In [135]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd

import math
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)
import missingno as msno
import os
from data_loader import FeatureEngineering


DATA_PATH = 'data/'

In [136]:
%%time
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16'
}   

df = pd.read_csv(os.path.join(DATA_PATH, 'train_data.csv'), dtype=dtype, parse_dates=['Timestamp'])
df = df.sort_values(by=['userID', 'Timestamp', 'testId']).reset_index(drop=True)
copy_df = df.copy()

test_df = pd.read_csv(os.path.join(DATA_PATH, 'test_data.csv'), dtype=dtype, parse_dates=['Timestamp'])

CPU times: user 4.12 s, sys: 80 ms, total: 4.2 s
Wall time: 4.2 s


In [108]:
"""def feature_engineering(df):
    # 문제별 풀이시간
    from tqdm import tqdm

    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df['diff_Timestamp'] = df['Timestamp'] - df.shift(1)['Timestamp']

    testId_df = df[~df.duplicated(['assessmentItemID'])].groupby('testId')
    testId2len = {}
    for testId, g_df in testId_df:
        testId2len[testId] = len(g_df)

    userID_df = df.groupby('userID')
    start_index_list = []
    second_index_list = []

    for userID, g_df in tqdm(userID_df):
        testId_df = g_df.groupby('testId')
        for testId, gg_df in testId_df:
            index_list = gg_df.index.tolist()
            start_index = 0
            if len(gg_df) <= testId2len[testId]:
                start_index_list += [index_list[start_index]]
                second_index_list += [index_list[start_index + 1]]
            else:
                div = len(gg_df) // testId2len[testId]
                for _ in range(div):
                    start_index_list += [index_list[start_index]]
                    second_index_list += [index_list[start_index + 1]]
                    start_index += testId2len[testId]

    df.loc[start_index_list, 'diff_Timestamp'] = df.loc[second_index_list, 'diff_Timestamp'].values
    df['elapsed'] = df['diff_Timestamp'].apply(lambda x: x.total_seconds() if not pd.isna(x) else np.nan)


    df['hour'] = df['Timestamp'].dt.hour
    df['dow'] = df['Timestamp'].dt.dayofweek # 요일을 숫자로

    diff = df.loc[:, ['userID','Timestamp']].groupby('userID').diff().fillna(pd.Timedelta(seconds=0))
    diff = diff.fillna(pd.Timedelta(seconds=0))
    diff = diff['Timestamp'].apply(lambda x: x.total_seconds())

    # 문제별 풀이시간
    df['elapsed'] = diff
    df['elapsed'] = df['elapsed'].apply(lambda x : x if x <650 and x >=0 else 0)

    df['testcode']=df['testId'].apply(lambda x : int(x[1:4])//10)
    df['problem_number'] = df['assessmentItemID'].apply(lambda x: int(x[7:])) 


    # feature 별 정답여부
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']
    correct_a = df.groupby(['assessmentItemID'])['answerCode'].agg(['mean', 'sum'])
    correct_a.columns = ["ass_mean", 'ass_sum']
    correct_p = df.groupby(['problem_number'])['answerCode'].agg(['mean', 'sum'])
    correct_p.columns = ["prb_mean", 'prb_sum']
    correct_h = df.groupby(['hour'])['answerCode'].agg(['mean', 'sum'])
    correct_h.columns = ["hour_mean", 'hour_sum']
    correct_d = df.groupby(['dow'])['answerCode'].agg(['mean', 'sum'])
    correct_d.columns = ["dow_mean", 'dow_sum'] 

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, correct_a, on=['assessmentItemID'], how="left")
    df = pd.merge(df, correct_p, on=['problem_number'], how="left")
    df = pd.merge(df, correct_h, on=['hour'], how="left")
    df = pd.merge(df, correct_d, on=['dow'], how="left")


    # 정답과 오답 기준으로 나눠서 생각
    o_df = df[df['answerCode']==1]
    x_df = df[df['answerCode']==0]

    elp_k = df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
    elp_k.columns = ['KnowledgeTag',"tag_elp"]
    elp_k_o = o_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
    elp_k_o.columns = ['KnowledgeTag', "tag_elp_o"]
    elp_k_x = x_df.groupby(['KnowledgeTag'])['elapsed'].agg('mean').reset_index()
    elp_k_x.columns = ['KnowledgeTag', "tag_elp_x"]

    df = pd.merge(df, elp_k, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, elp_k_o, on=['KnowledgeTag'], how="left")
    df = pd.merge(df, elp_k_x, on=['KnowledgeTag'], how="left")

    ass_k = df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
    ass_k.columns = ['assessmentItemID',"ass_elp"]
    ass_k_o = o_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
    ass_k_o.columns = ['assessmentItemID',"ass_elp_o"]
    ass_k_x = x_df.groupby(['assessmentItemID'])['elapsed'].agg('mean').reset_index()
    ass_k_x.columns = ['assessmentItemID',"ass_elp_x"]

    df = pd.merge(df, ass_k, on=['assessmentItemID'], how="left")
    df = pd.merge(df, ass_k_o, on=['assessmentItemID'], how="left")
    df = pd.merge(df, ass_k_x, on=['assessmentItemID'], how="left")

    prb_k = df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
    prb_k.columns = ['problem_number',"prb_elp"]
    prb_k_o = o_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
    prb_k_o.columns = ['problem_number',"prb_elp_o"]
    prb_k_x = x_df.groupby(['problem_number'])['elapsed'].agg('mean').reset_index()
    prb_k_x.columns = ['problem_number',"prb_elp_x"]

    df = pd.merge(df, prb_k, on=['problem_number'], how="left")
    df = pd.merge(df, prb_k_o, on=['problem_number'], how="left")
    df = pd.merge(df, prb_k_x, on=['problem_number'], how="left")

    # 누적합 - 주어진 데이터 이전/이후 데이터들을 포함하는 메모리를 feature로 포함시킴: Sequence Model을 사용하지 않고 일반적인 지도 학습 모델에서 사용하기 위함
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']
    df['testcode_o'] = df.groupby(['userID','testcode'])['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['testcodeCount'] = df.groupby(['userID','testcode']).cumcount()
    df['testcodeAcc'] = df['testcode_o']/df['testcodeCount']
    df['tectcodeElp'] = df.groupby(['userID','testcode'])['elapsed'].transform(lambda x: x.cumsum().shift(1))
    df['testcodeMElp'] = df['tectcodeElp']/df['testcodeCount']



    f = lambda x : len(set(x))
    t_df = df.groupby(['testId']).agg({
    'problem_number':'max',
    'KnowledgeTag':f
    })
    t_df.reset_index(inplace=True)

    t_df.columns = ['testId','problem_count',"tag_count"]

    df = pd.merge(df,t_df,on='testId',how='left')

    gdf = df[['userID','testId','problem_number','testcode','Timestamp']].sort_values(by=['userID','testcode','Timestamp'])
    gdf['buserID'] = gdf['userID'] != gdf['userID'].shift(1)
    gdf['btestcode'] = gdf['testcode'] != gdf['testcode'].shift(1)
    gdf['first'] = gdf[['buserID','btestcode']].any(axis=1).apply(lambda x : 1- int(x))
    gdf['RepeatedTime'] = gdf['Timestamp'].diff().fillna(pd.Timedelta(seconds=0)) 
    gdf['RepeatedTime'] = gdf['RepeatedTime'].apply(lambda x: x.total_seconds()) * gdf['first']
    df['RepeatedTime'] = gdf['RepeatedTime'].apply(lambda x : math.log(x+1))

    df['prior_KnowledgeTag_frequency'] = df.groupby(['userID','KnowledgeTag']).cumcount()

    df['problem_position'] = df['problem_number'] / df["problem_count"]
    df['solve_order'] = df.groupby(['userID','testId']).cumcount()
    df['solve_order'] = df['solve_order'] - df['problem_count']*(df['solve_order'] > df['problem_count']).apply(int) + 1
    df['retest'] = (df['solve_order'] > df['problem_count']).apply(int)
    T = df['solve_order'] != df['problem_number']
    TT = T.shift(1)
    TT[0] = False
    df['solved_disorder'] = (TT.apply(lambda x : not x) & T).apply(int)

    df['testId'] = df['testId'].apply(lambda x : int(x[1:4]+x[-3]))
    df['hour'] = df['Timestamp'].dt.hour
    df['dow'] = df['Timestamp'].dt.dayofweek

    return df"""

In [137]:
#df = FeatureEngineering.FE(df)
#df.to_csv(DATA_PATH + 'train_featured.csv', index=False)

100%|██████████| 6698/6698 [00:33<00:00, 201.50it/s]


In [145]:
df = pd.read_csv(DATA_PATH+'train_featured.csv')

In [146]:
import lightgbm as lgb
import numpy as np
import random
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold

In [147]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.8, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

In [148]:
# 유저별 분리
train, test = custom_train_test_split(df)

# 사용할 Feature 설정
FEATS = df.select_dtypes(include=["int", "int8", "int16", "int64", "float", "float16", "float64"]).columns
FEATS = [col for col in FEATS if col not in ['answerCode']]

# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

In [149]:
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(test[FEATS], y_test)

In [150]:
model = lgb.train(
    {'objective': 'binary'}, 
    lgb_train,
    valid_sets=[lgb_train, lgb_test],
    verbose_eval=100,
    num_boost_round=500,
    early_stopping_rounds=100
)

preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')



[LightGBM] [Info] Number of positive: 1187785, number of negative: 624671
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6979
[LightGBM] [Info] Number of data points in the train set: 1812456, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655346 -> initscore=0.642620
[LightGBM] [Info] Start training from score 0.642620
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.429549	valid_1's binary_logloss: 0.467797
[200]	training's binary_logloss: 0.425569	valid_1's binary_logloss: 0.461142
[300]	training's binary_logloss: 0.422893	valid_1's binary_logloss: 0.457344
[400]	training's binary_logloss: 0.420597	valid_1's binary_logloss: 0.453137
[500]	training's binary_logloss: 0.418485	valid_1's binary_logloss: 0.451562
Did not meet early stopping. Best iteration is:
[500]	training's binary_logloss: 0.418485	valid_

In [151]:
# FEATURE ENGINEERING
#test_df = FeatureEngineering.FE(test_df)
#test_df.to_csv(DATA_PATH + 'test_featured.csv', index=False)

100%|██████████| 744/744 [00:03<00:00, 197.45it/s]


In [153]:
# Inference
test_df = pd.read_csv(DATA_PATH+'test_featured.csv')

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)

In [154]:
# MAKE PREDICTION
total_preds = model.predict(test_df[FEATS])

In [156]:
submission = pd.read_csv(DATA_PATH+'sample_submission.csv')
submission['prediction'] = total_preds

In [157]:
submission.to_csv(DATA_PATH+'lgbm_base_submission.csv')

In [None]:
fdfeffe # 현정이가 train-test 다르게 처리 한 부분

In [None]:
# Set hyperparameters for the LightGBM model
params = {
    'objective': 'regression',  # For regression tasks
    'metric': 'rmse',  # Root Mean Squared Error as the evaluation metric
    'num_leaves': 31,  # Maximum number of leaves in one tree
    'learning_rate': 0.05,  # Learning rate for boosting
    'feature_fraction': 0.9,  # Fraction of features to be used per tree
    'bagging_fraction': 0.8,  # Fraction of data to be bagged
    'bagging_freq': 5,  # Frequency of bagging
    'verbose': 0  # Verbosity of output
}

# Initialize a list to store the cross-validation scores
cv_scores = []

In [None]:
# Perform K-fold cross-validation
kf = KFold(n_splits=5, random_state=42, shuffle=True)
for train_index, test_index in kf.split(X):
    # Split the data into training and testing sets for this fold
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Create the LightGBM dataset
    train_data = lgb.Dataset(X_train, label=y_train)

    # Train the LightGBM model
    model = lgb.train(params, train_data, num_boost_round=100)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Convert probabilities to binary predictions
    y_pred_binary = np.round(y_pred)

    # Compute accuracy
    accuracy = accuracy_score(y_test, y_pred_binary)
    accuracy_scores.append(accuracy)

    # Compute AUROC
    auroc = roc_auc_score(y_test, y_pred)
    auroc_scores.append(auroc)

    # Print the evaluation metrics for this fold
    print('Fold Accuracy:', accuracy)
    print('Fold AUROC:', auroc)
    print('---')

# Calculate the mean and standard deviation of the evaluation metrics
mean_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)
mean_auroc = np.mean(auroc_scores)
std_auroc = np.std(auroc_scores)

KeyError: "None of [Int64Index([      0,       2,       3,       4,       5,       6,       7,\n                  8,       9,      10,\n            ...\n            2266574, 2266575, 2266576, 2266578, 2266579, 2266581, 2266582,\n            2266583, 2266584, 2266585],\n           dtype='int64', length=1813268)] are in the [columns]"