In [1]:
import numpy as np
import pandas as pd
import os
import random
import pickle
from pycaret.classification import *
from pycaret.utils import check_metric
from datetime import timedelta, timezone, datetime
from sklearn.model_selection import StratifiedKFold
import torch

## 데이터 가져오기

In [2]:
%%time
# Compact integer dtypes for the id / label columns.
# NOTE(review): this mapping is never passed to `pd.read_csv` below, so it currently has
# no effect on the loaded frames — confirm whether `dtype=dtype` was intended.
dtype = {
    'userID': 'int16',
    'answerCode': 'int8',
    'KnowledgeTag': 'int16',
}   

# Absolute locations of the train/test feature tables and the per-user fold labels.
TRAIN_DATA_PATH = '/opt/ml/input/data/train_dataset/tuning_train_data_2.csv'
TEST_DATA_PATH = '/opt/ml/input/data/train_dataset/tuning_test_data_2.csv'
FOLD_PATH = '/opt/ml/input/data/train_dataset/user_fold_class.csv'

# `Timestamp` is parsed as a datetime column for train and test; the fold table is read as-is.
df_train = pd.read_csv(TRAIN_DATA_PATH, parse_dates=['Timestamp'])
df_test = pd.read_csv(TEST_DATA_PATH, parse_dates=['Timestamp'])
df_fold = pd.read_csv(FOLD_PATH)

CPU times: user 12.8 s, sys: 1.61 s, total: 14.4 s
Wall time: 14.4 s


In [3]:
def feature_engineering(df):
    """Cast the categorical feature columns of ``df`` to pandas ``category`` dtype.

    The conversion is applied in place: the same frame object that was passed in
    is modified and returned.

    Args:
        df: DataFrame containing every column listed in ``cat_features`` below.

    Returns:
        The same ``df`` object, with the listed columns converted to ``category``.

    Raises:
        KeyError: if any expected column is absent. The check runs before any
            column is converted, so a failure never leaves ``df`` half-converted.
    """
    # Categorical features
    cat_features = ['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 'Category', 'Number', 'hour', 'weekday',
                    'prob_mean_cate', 'average_user_correct_cate', 'past_user_prob_count']

    missing = [col for col in cat_features if col not in df.columns]
    if missing:
        raise KeyError(f"feature_engineering: missing categorical columns: {missing}")

    for col in cat_features:
        df[col] = df[col].astype('category')

    return df

In [4]:
# Cast the categorical columns of both frames (feature_engineering returns the frame it was given).
df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)

In [5]:
# Collect the IDs of the items to be predicted: test rows whose answerCode is -1.
set_assessmentItemID = set(df_test.loc[df_test.answerCode == -1, 'assessmentItemID'].values)

In [6]:
# Inspect the column dtypes after the categorical cast.
df_train.dtypes

Unnamed: 0                              int64
userID                               category
assessmentItemID                     category
testId                               category
answerCode                              int64
Timestamp                      datetime64[ns]
KnowledgeTag                         category
Category                             category
Number                               category
Time                                    int64
solTime                                 int64
isTest                                  int64
clipped_solTime                         int64
total_used_time                         int64
hour                                 category
weekday                              category
past_prob_count                         int64
past_prob_correct                     float64
average_prob_correct                  float64
prob_mean                             float64
prob_sum                              float64
prob_mean_cate                    

In [7]:
# Preview the per-user fold labels (userID -> fold_class).
df_fold

Unnamed: 0,userID,fold_class
0,0,9
1,1,10
2,2,6
3,5,10
4,6,10
...,...,...
6693,7436,1
6694,7437,1
6695,7438,1
6696,7440,1


In [8]:
# Attach each user's fold label to the training rows.
# A fold_class column left over from a previous run of this cell is dropped first, so
# re-running does not produce fold_class_x / fold_class_y duplicates.
# pd.merge defaults to an inner join, so only users present in df_fold are kept.
df_train = pd.merge(
    df_train.drop(columns=['fold_class'], errors='ignore'),
    df_fold[['userID', 'fold_class']],
    on='userID',
)

In [9]:
# Preview the training frame after the fold_class merge.
df_train

Unnamed: 0.1,Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,Category,Number,Time,solTime,isTest,clipped_solTime,total_used_time,hour,weekday,past_prob_count,past_prob_correct,average_prob_correct,prob_mean,prob_sum,prob_mean_cate,past_test_count,past_test_correct,average_test_correct,test_mean,test_sum,past_tag_count,past_tag_correct,average_tag_correct,tag_mean,tag_sum,past_user_count,past_user_correct,average_user_correct,average_user_correct_cate,moving_average_user_correct,past_user_prob_count,past_user_prob_correct,average_user_prob_correct,past_user_test_count,past_user_test_correct,average_user_test_correct,past_user_tag_count,past_user_tag_correct,average_user_tag_correct,fold_class
0,0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,6,A001001,1585009031,3,0,3,3,0,1,163,162.0,0.993865,0.984000,246.0,9,978,945.0,0.966258,0.952667,1429.0,365,356.0,0.975342,0.957333,718.0,0,0.0,0.000000,0,0.6,0,0.0,0.0,0,0.0,0.00,0,0.0,0.00,9
1,1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,6,A001002,1585009034,8,0,8,11,0,1,163,159.0,0.975460,0.968000,242.0,9,979,946.0,0.966292,0.952667,1429.0,1743,1587.0,0.910499,0.917067,3439.0,1,1.0,1.000000,10,0.6,0,0.0,0.0,1,1.0,1.00,0,0.0,0.00,9
2,2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,6,A001003,1585009042,7,0,7,18,0,1,163,153.0,0.938650,0.916000,229.0,9,980,947.0,0.966327,0.952667,1429.0,1744,1588.0,0.910550,0.917067,3439.0,2,2.0,1.000000,10,0.9,0,0.0,0.0,2,2.0,1.00,1,1.0,1.00,9
3,3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,6,A001004,1585009049,7,0,7,25,0,1,163,162.0,0.993865,0.972000,243.0,9,981,948.0,0.966361,0.952667,1429.0,1745,1589.0,0.910602,0.917067,3439.0,3,3.0,1.000000,10,0.9,0,0.0,0.0,3,3.0,1.00,2,2.0,1.00,9
4,4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,6,A001005,1585009056,11,0,11,36,0,1,163,156.0,0.957055,0.948000,237.0,9,982,949.0,0.966395,0.952667,1429.0,1746,1590.0,0.910653,0.917067,3439.0,4,4.0,1.000000,10,1.0,0,0.0,0.0,4,4.0,1.00,3,3.0,1.00,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2266581,2266581,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,3,A071005,1591339821,0,0,0,220,6,4,89,45.0,0.505618,0.446667,134.0,4,449,310.0,0.690423,0.666000,999.0,2049,1488.0,0.726208,0.694889,3127.0,4,1.0,0.250000,2,0.7,0,0.0,0.0,4,1.0,0.25,4,1.0,0.25,1
2266582,2266582,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,4,A165001,1597971999,11,0,11,11,1,4,93,72.0,0.774194,0.643333,193.0,6,372,280.0,0.752688,0.653878,784.0,922,690.0,0.748373,0.700029,2413.0,5,1.0,0.200000,2,0.5,0,0.0,0.0,0,0.0,0.00,0,0.0,0.00,1
2266583,2266583,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,4,A165002,1597972010,46,0,46,57,1,4,93,70.0,0.752688,0.640000,192.0,6,373,281.0,0.753351,0.653878,784.0,923,691.0,0.748646,0.700029,2413.0,6,2.0,0.333333,3,0.5,0,0.0,0.0,1,1.0,1.00,1,1.0,1.00,1
2266584,2266584,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,4,A165003,1597972056,73,0,73,130,1,4,93,78.0,0.838710,0.786667,236.0,7,374,282.0,0.754011,0.653878,784.0,924,692.0,0.748918,0.700029,2413.0,7,3.0,0.428571,4,0.6,0,0.0,0.0,2,2.0,1.00,2,2.0,1.00,1


In [10]:
def seed_everything(seed=42):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators with ``seed``.

    Also stores ``seed`` in the ``PYTHONHASHSEED`` environment variable and sets
    ``torch.backends.cudnn.deterministic`` to ``True``.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Every generator receives the same seed value.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)

    torch.backends.cudnn.deterministic = True

In [11]:
def get_fold_score(data):
    """Return the ``fold_class`` entry of ``data`` as a new NumPy array."""
    return np.array(data['fold_class'])

In [12]:
def my_train_vali_split(df, filter_option = None, train_must_exist_leaderboard = False, ratio=0.5, seed = 23, leaderboard_items = None):
    """Build two complementary train/validation splits that mimic the leaderboard setup.

    Validation candidates are the rows whose next row has a different ``userID``
    and whose ``assessmentItemID`` is one of the leaderboard items. The candidates
    are divided into two disjoint parts, and each part is paired with a training
    frame containing only users that do not appear in that part.

    Args:
        df: DataFrame with ``userID``, ``testId`` and ``assessmentItemID`` columns.
            The "next row" test assumes each user's rows are contiguous — verify
            against the caller's ordering.
        filter_option: ``'시험지마지막응답'`` keeps only training rows whose next row has
            a different ``testId``; ``'사용자마지막응답'`` keeps only training rows whose
            next row has a different ``userID``; any other value applies no filter.
        train_must_exist_leaderboard: if true, training rows are restricted to
            leaderboard items as well.
        ratio: share of the validation candidates that ends up in ``vali_2``;
            ``vali_1`` is sampled with ``frac = 1 - ratio``. With ``ratio=1``,
            ``vali_1`` is empty, so ``train_1`` excludes no users.
        seed: seeds Python's ``random`` module (a global side effect) and the
            pandas sampling.
        leaderboard_items: collection of item IDs treated as leaderboard items.
            Defaults to the module-level ``set_assessmentItemID``.

    Returns:
        Tuple ``(train_1, vali_1, train_2, vali_2, vali_full)``.
    """
    random.seed(seed)

    if leaderboard_items is None:
        # Fall back to the notebook-level set, as before the parameter existed.
        leaderboard_items = set_assessmentItemID

    # Rows matching the leaderboard condition
    is_user_last_row = df['userID'] != df['userID'].shift(-1)
    is_leaderboard_item = df['assessmentItemID'].isin(leaderboard_items)
    vali_full = df[is_user_last_row & is_leaderboard_item].copy()

    # Split the candidates into two disjoint parts
    vali_1 = vali_full.sample(frac=1 - ratio, random_state=seed)
    vali_2 = vali_full.drop(vali_1.index)

    def _training_rows(vali_part):
        # Training rows paired with one validation part: users absent from that part.
        held_out_users = set(vali_part['userID'].values)
        train = df[~df['userID'].isin(held_out_users)]

        # Optionally keep only the last response per test / per user
        if filter_option == '시험지마지막응답':
            train = train[train['testId'] != train['testId'].shift(-1)]
        elif filter_option == '사용자마지막응답':
            train = train[train['userID'] != train['userID'].shift(-1)]

        # Optionally restrict training rows to the leaderboard items
        if train_must_exist_leaderboard:
            train = train[train['assessmentItemID'].isin(leaderboard_items)]

        return train.copy()

    return _training_rows(vali_1), vali_1, _training_rows(vali_2), vali_2, vali_full

In [13]:
def exam(datasets ,FEATS, categorical_features=None, numeric_features=None, seed=47):
    """Train, tune and finalize a PyCaret LightGBM classifier and score it on ``vali_full``.

    Args:
        datasets: 5-tuple ``(train_1, vali_1, train_2, vali_2, vali_full)`` as returned
            by ``my_train_vali_split``. Only ``train_1`` and ``vali_full`` are used.
        FEATS: column names passed to PyCaret; must include the target ``'answerCode'``.
        categorical_features: columns forwarded to ``setup`` as categorical
            (``None`` is treated as an empty list).
        numeric_features: columns forwarded to ``setup`` as numeric
            (``None`` is treated as an empty list).
        seed: seeds Python's ``random`` module before ``setup`` is called.

    Returns:
        ``(final_lgbm, log)`` — the finalized model and a list containing one formatted
        line with the Accuracy and AUC measured on ``vali_full``.
    """
    train_1, _, _, _, vali_full = datasets

    # Fresh lists per call: avoids sharing a mutable default with PyCaret across calls.
    categorical_features = [] if categorical_features is None else list(categorical_features)
    numeric_features = [] if numeric_features is None else list(numeric_features)

    random.seed(seed)
    # setup() is called for its effect on PyCaret's global experiment state.
    setup(data=train_1[FEATS], target='answerCode', train_size=0.8,
          categorical_features=categorical_features, numeric_features=numeric_features,
          log_experiment=True, experiment_name='lgbm', log_plots=True, log_profile=True, log_data=True)

    # NOTE(review): `sort` is not obviously a create_model argument — confirm it has the intended effect.
    lgbm = create_model('lightgbm', sort='AUC')
    tuned_lgbm = tune_model(lgbm, optimize = 'AUC', fold = 10)
    final_lgbm = finalize_model(tuned_lgbm)

    # Displays hold-out metrics. NOTE(review): presumably the finalized model has already
    # seen the hold-out rows, which would make this score optimistic — confirm against PyCaret docs.
    predict_model(final_lgbm)

    log = []
    prediction = predict_model(final_lgbm, data=vali_full[FEATS], raw_score = True)
    accuracy = check_metric(prediction['answerCode'], prediction['Label'], metric = 'Accuracy')
    auc = check_metric(prediction['answerCode'], prediction['Score_1'], metric = 'AUC')
    log.append(f"모든 vali 데이터:    {accuracy} ,{auc}")
    return final_lgbm, log

In [45]:
# Keep only each user's last row (the row whose successor has a different userID).
# reset_index() is called without drop=True, so the previous index is kept as an 'index' column.
data = df_train[df_train['userID'] != df_train['userID'].shift(-1)].reset_index()
test_data = df_test[df_test['userID'] != df_test['userID'].shift(-1)].reset_index()

In [20]:
# Shape of the per-user last-row frame.
data.shape

(6698, 48)

In [38]:
# Ground-truth labels of the per-user last rows; compared against the OOF predictions further down.
answerCode = data['answerCode']
answerCode

0       0
1       1
2       0
3       1
4       0
       ..
6693    0
6694    0
6695    1
6696    0
6697    1
Name: answerCode, Length: 6698, dtype: int64

In [33]:
# data['answerCode'].to_csv("./answerCode.csv", mode='w')

## Valid Inference

In [21]:
# Training-row filter passed to my_train_vali_split. Accepted values:
# '시험지마지막응답' (last response per test), '사용자마지막응답' (last response per user), or None.
filter_option = '시험지마지막응답'
# filter_option = None
# Whether training rows are restricted to the leaderboard items (True / False).
train_must_exist_leaderboard = False
# train_must_exist_leaderboard = True

# Columns handed to PyCaret; 'answerCode' is the target.
FEATS = ['average_user_test_correct','prob_mean', 'test_mean', 'answerCode']
FEATS += ['tag_mean', 'total_used_time']

# args.seed = 42
seed_everything(42)

# 5 folds; shuffle is left at its default.
kfold = StratifiedKFold(n_splits=5)

In [22]:
# Out-of-fold score buffer, one slot per row of `data`.
oof = np.zeros(data.shape[0])
# Models trained on each fold, reused for test inference.
fold_models = []
# Stratification labels: the fold_class of each row.
target = get_fold_score(data)
target

array([ 9, 10,  6, ...,  1,  1,  1])

In [23]:
# Out-of-fold loop: train one model per fold and score the held-out rows of that fold.
for i, (train_index, valid_index) in enumerate(kfold.split(data, target)):
    # kfold.split yields positional indices, so rows are selected by position (iloc), not by label.
    train_data, valid_data = data.iloc[train_index], data.iloc[valid_index]
    print(f'Calculating train oof {i + 1}')

    # Build and train the LGBM model.
    # With ratio=1.0, vali_1 is empty, so train_1 excludes no users and vali_full is drawn from the
    # same frame as train_1: the metric logged by exam() is likely optimistic. The scores written
    # to `oof` below come from valid_data, which this fold's model did not train on.
    datasets = my_train_vali_split(train_data, filter_option = filter_option, train_must_exist_leaderboard = train_must_exist_leaderboard, ratio = 1.0)
    trained_model, log = exam(datasets, FEATS)
    print('\n'.join(log))

    # Collect this fold's out-of-fold scores (the model's predictions on valid_data).
    predict = predict_model(trained_model, data=valid_data, raw_score=True)
    # Positional assignment; assumes predict_model keeps the row order of valid_data.
    oof[valid_index] = predict['Score_1'].to_numpy()
    fold_models.append(trained_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.729,0.8107,0.6847,0.7277,0.7056,0.4549,0.4557
1,0.75,0.8202,0.7389,0.7353,0.7371,0.4988,0.4988
2,0.7266,0.8189,0.7241,0.7067,0.7153,0.4525,0.4526
3,0.729,0.8247,0.7192,0.7122,0.7157,0.4568,0.4568
4,0.7173,0.7683,0.7094,0.699,0.7042,0.4335,0.4335
5,0.6893,0.752,0.7079,0.659,0.6826,0.379,0.3799
6,0.7056,0.778,0.6733,0.6939,0.6834,0.4084,0.4086
7,0.7354,0.8221,0.7178,0.7214,0.7196,0.4691,0.4691
8,0.6745,0.766,0.6337,0.6632,0.6481,0.3456,0.3459
9,0.733,0.8005,0.7178,0.7178,0.7178,0.4645,0.4645


Finished loading model, total used 20 iterations
Finished loading model, total used 20 iterations
Finished loading model, total used 20 iterations


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.7748,0.8607,0.7776,0.7614,0.7694,0.5493,0.5495


모든 vali 데이터:    0.7853 ,0.8654


In [24]:
# Inspect the collected out-of-fold scores.
oof

array([0.1347, 0.803 , 0.0299, ..., 0.5838, 0.1853, 0.7426])

In [44]:
# Accuracy of the out-of-fold predictions at a 0.5 threshold.
total = len(oof)
# Scores below 0.5 map to class 0, everything else to class 1.
pred_labels = np.where(oof < 0.5, 0, 1)
# Compared position by position against the ground-truth labels.
count = int((pred_labels == np.asarray(answerCode)).sum())
print(count / total)

0.7243953418931024


In [25]:
# Number of out-of-fold scores.
len(oof)

6698

In [26]:
# Save the out-of-fold scores to a .npy file in the working directory.
oof_val_np_path = './val_infer.npy'
np.save(oof_val_np_path, oof)

## Test Inference

In [27]:
# Average the Score_1 predicted by every fold model over the test rows.
predicts = np.zeros(test_data.shape[0])

for i, model in enumerate(fold_models):
    print(f'Calculating test prediction {i + 1}')
    predict = predict_model(model, data=test_data, raw_score=True)  # predict on the test data
    # Adding a Series to the ndarray rebinds `predicts` to a Series named 'Score_1'
    # (as the displayed output shows); later additions are Series + Series.
    predicts += predict['Score_1']

predict_avg = predicts / len(fold_models)

Calculating train oof 1
Calculating train oof 2
Calculating train oof 3
Calculating train oof 4
Calculating train oof 5


In [28]:
# Summed Score_1 over all fold models (not yet averaged).
predicts

0      3.3350
1      3.8245
2      1.0126
3      3.1373
4      2.0464
        ...  
739    0.2757
740    3.5464
741    4.1042
742    4.0248
743    3.5766
Name: Score_1, Length: 744, dtype: float64

In [29]:
# Fold-averaged Score_1 per test row.
predict_avg

0      0.66700
1      0.76490
2      0.20252
3      0.62746
4      0.40928
        ...   
739    0.05514
740    0.70928
741    0.82084
742    0.80496
743    0.71532
Name: Score_1, Length: 744, dtype: float64

In [30]:
# Save the fold-averaged test scores to a .npy file in the working directory.
oof_predict_np_path = './test_infer.npy'
np.save(oof_predict_np_path, predict_avg)