# LGBM을 활용한 베이스라인

In [None]:
!pip install scikit-learn
!pip install pycaret

In [1]:
import numpy as np
import pandas as pd
import os
import random
import pickle
from pycaret.classification import *
from pycaret.utils import check_metric
from datetime import timedelta, timezone, datetime

## 0. Wandb 연결

In [2]:
import wandb # Wandb 연결을 위해 library 불러오기
from wandb.lightgbm import wandb_callback
import xgboost as xgb

### Config 파일에서 wandb config에 남길 데이터를 추가

In [3]:
config = {
    "seed": 42,
    "split_ratio": 0.7,
    "verbose_eval": 100,
    "num_boost_round": 500,
    "early_stopping_rounds": 100,
    "feature_engineering": []
}

## 1. 데이터 로딩

In [4]:
# Load Train Data
data_dir = '/opt/ml/input/data/train_dataset'
csv_file_path = os.path.join(data_dir, 'train_data.csv')
df = pd.read_csv(csv_file_path, parse_dates=['Timestamp'])

# LOAD TESTDATA
test_csv_file_path = os.path.join(data_dir, 'test_data.csv')
test_df = pd.read_csv(test_csv_file_path, parse_dates=['Timestamp'])

answerCode2bool = {'userID':object,  'answerCode': 'int16', 'KnowledgeTag':object}
df = df.astype(answerCode2bool)
test_df = test_df.astype(answerCode2bool)

## 2. Feature Engineering

In [5]:
# train에서 각 문제 평균 뽑기
testId_mean_sum = df.groupby(['testId'])['answerCode'].agg(['mean','sum']).to_dict()
assessmentItemID_mean_sum = df.groupby(['assessmentItemID'])['answerCode'].agg(['mean', 'sum']).to_dict()
KnowledgeTag_mean_sum = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum']).to_dict()

# 맞춰야하는 문항 ID 파악
set_assessmentItemID = set(test_df.loc[test_df.answerCode == -1, 'assessmentItemID'].values)

In [6]:
def feature_engineering(df):
    
    # 문항이 중간에 비어있는 경우를 파악 (1,2,3,,5)
    def assessmentItemID2item(x):
        return int(x[-3:]) - 1  # 0 부터 시작하도록 
    df['item'] = df.assessmentItemID.map(assessmentItemID2item)

    item_size = df[['assessmentItemID', 'testId']].drop_duplicates().groupby('testId').size()
    testId2maxlen = item_size.to_dict() # 중복해서 풀이할 놈들을 제거하기 위해

    item_max = df.groupby('testId').item.max()
    print(len(item_max[item_max + 1 != item_size]), '개의 시험지가 중간 문항이 빈다. item_order가 올바른 순서') # item_max는 0부터 시작하니까 + 1
    shit_index = item_max[item_max +1 != item_size].index
    shit_df = df.loc[df.testId.isin(shit_index),['assessmentItemID', 'testId']].drop_duplicates().sort_values('assessmentItemID')      
    shit_df_group = shit_df.groupby('testId')

    shitItemID2item = {}
    for key in shit_df_group.groups:
        for i, (k,_) in enumerate(shit_df_group.get_group(key).values):
            shitItemID2item[k] = i
        
    def assessmentItemID2item_order(x):
        if x in shitItemID2item:
            return int(shitItemID2item[x])
        return int(x[-3:]) - 1  # 0 부터 시작하도록 
    df['item_order'] =  df.assessmentItemID.map(assessmentItemID2item_order)

    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    # 유저가 푼 시험지에 대해, 유저의 전체 정답/풀이횟수/정답률 계산 (3번 풀었으면 3배)
    df_group = df.groupby(['userID','testId'])['answerCode']
    df['user_total_correct_cnt'] = df_group.transform(lambda x: x.cumsum().shift(1))
    df['user_total_ans_cnt'] = df_group.cumcount()
    df['user_total_acc'] = df['user_total_correct_cnt'] / df['user_total_ans_cnt']

    # 유저가 푼 시험지에 대해, 유저의 풀이 순서 계산 (시험지를 반복해서 풀었어도, 누적되지 않음)
    # 특정 시험지를 얼마나 반복하여 풀었는지 계산 ( 2번 풀었다면, retest == 1)
    df['test_size'] = df.testId.map(testId2maxlen)
    df['retest'] = df['user_total_ans_cnt'] // df['test_size']
    df['user_test_ans_cnt'] = df['user_total_ans_cnt'] % df['test_size']

    # 각 시험지 당 유저의 정확도를 계산
    df['user_test_correct_cnt'] = df.groupby(['userID','testId','retest'])['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_acc'] = df['user_test_correct_cnt']/df['user_test_ans_cnt']


    # 아래의 피처는 다이나믹 합니다. 학습된 train의 평균값을 사용하지 않고, 새로 들어온 데이터의 평균 값을 사용합니다.
    # # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    # correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    # correct_t.columns = ["test_mean", 'test_sum']
    # correct_a = df.groupby(['assessmentItemID'])['answerCode'].agg(['mean', 'sum'])
    # correct_a.columns = ["ItemID_mean", 'ItemID_sum']
    # correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    # correct_k.columns = ["tag_mean", 'tag_sum']
    # df = pd.merge(df, correct_t, on=['testId'], how="left")
    # df = pd.merge(df, correct_a, on=['assessmentItemID'], how="left")
    # df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")

    # 본 피처는 train에서 얻어진 값을 그대로 유지합니다.
    df["test_mean"] = df.testId.map(testId_mean_sum['mean'])
    df['test_sum'] = df.testId.map(testId_mean_sum['sum'])
    df["ItemID_mean"] = df.assessmentItemID.map(assessmentItemID_mean_sum['mean'])
    df['ItemID_sum'] = df.assessmentItemID.map(assessmentItemID_mean_sum['sum'])
    df["tag_mean"] = df.KnowledgeTag.map(KnowledgeTag_mean_sum['mean'])
    df['tag_sum'] = df.KnowledgeTag.map(KnowledgeTag_mean_sum['sum'])
    '''
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']
    
    # Feature Engineering 추가
    df['assessment_category'] = df.apply(lambda row: int(row.assessmentItemID[2]), axis = 1)

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    '''
    return df

In [7]:
df = feature_engineering(df)
df.head()

31 개의 시험지가 중간 문항이 빈다. item_order가 올바른 순서


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,item,item_order,user_total_correct_cnt,user_total_ans_cnt,user_total_acc,test_size,retest,user_test_ans_cnt,user_test_correct_cnt,user_acc,test_mean,test_sum,ItemID_mean,ItemID_sum,tag_mean,tag_sum
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0,0,,0,,6,0,0,,,0.947683,1268,0.982063,219,0.955022,637
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,1,1,1.0,1,1.0,6,0,1,1.0,1.0,0.947683,1268,0.964126,215,0.913187,3040
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2,2,2.0,2,1.0,6,0,2,2.0,1.0,0.947683,1268,0.910314,203,0.913187,3040
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3,3,3.0,3,1.0,6,0,3,3.0,1.0,0.947683,1268,0.96861,216,0.913187,3040
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4,4,4.0,4,1.0,6,0,4,4.0,1.0,0.947683,1268,0.941704,210,0.913187,3040


In [9]:
test_df = feature_engineering(test_df)
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]
test_df.head()

31 개의 시험지가 중간 문항이 빈다. item_order가 올바른 순서


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,item,item_order,user_total_correct_cnt,user_total_ans_cnt,user_total_acc,test_size,retest,user_test_ans_cnt,user_test_correct_cnt,user_acc,test_mean,test_sum,ItemID_mean,ItemID_sum,tag_mean,tag_sum
1035,3,A050133008,A050000133,-1,2020-10-26 13:13:57,5289,7,7,6.0,7,0.857143,8,0,7,6.0,0.857143,0.65397,1219,0.532189,124,0.560703,1658
1706,4,A070146008,A070000146,-1,2020-12-27 02:47:54,9080,7,7,6.0,7,0.857143,8,0,7,6.0,0.857143,0.635949,697,0.59854,82,0.538664,1226
3023,13,A070111008,A070000111,-1,2020-12-27 04:35:09,9660,7,7,3.0,7,0.428571,8,0,7,3.0,0.428571,0.462209,795,0.376744,81,0.499044,1305
4283,17,A090064006,A090000064,-1,2020-10-30 05:48:37,2611,5,5,5.0,5,1.0,6,0,5,5.0,1.0,0.427536,236,0.26087,24,0.408974,319
4670,26,A060135007,A060000135,-1,2020-10-23 11:44:18,1422,6,6,4.0,6,0.666667,7,0,6,4.0,0.666667,0.634492,986,0.306306,68,0.610038,2589


## 3. Train/Test 데이터 셋 분리

In [10]:
# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(config["seed"])
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

def my_train_vali_split(df, filter_option = None, train_must_exist_leaderboard = False, ratio=0.5, seed = 23):
    random.seed(seed)
    # 리더보드와 동일 조건의 컬럼 수집
    vali_full = df[(df['userID'] != df['userID'].shift(-1)) & (df.assessmentItemID.isin(set_assessmentItemID))].copy()
    
    # 리더보드와 동일 조건의 컬럼을 나누기
    ratio_r = (1 - ratio)
    vali_1 = vali_full.sample(frac=ratio_r, random_state = seed) # ratio가 1이면, ratio_r이 0이 되어, vali_1에 아무것도 할당되지 않는다.
    vali_2 = vali_full.drop(vali_1.index)

    # vali에 포함된 유저 목록 확인하기
    vali_1_userID = set(vali_1.userID.values)
    vali_2_userID = set(vali_2.userID.values)
    
    # vali에 없는 유저들만 train으로 데려오기
    train_1 = df[ df['userID'].isin(vali_1_userID) == False ].copy()
    train_2 = df[ df['userID'].isin(vali_2_userID) == False ].copy()
    
    # 마지막 응답만 가져올지 여부
    if filter_option == '시험지마지막응답':
        train_1 = train_1[train_1['testId'] != train_1['testId'].shift(-1)].copy()
        train_2 = train_2[train_2['testId'] != train_2['testId'].shift(-1)].copy()
    if filter_option == '사용자마지막응답':
        train_1 = train_1[train_1['userID'] != train_1['userID'].shift(-1)].copy()
        train_2 = train_2[train_2['userID'] != train_2['userID'].shift(-1)].copy()

    # train도 리더보드에서 맞춰야하는 문제(444개 문제)만 볼지 여부
    if train_must_exist_leaderboard:
        train_1 = train_1[train_1.assessmentItemID.isin(set_assessmentItemID)].copy()
        train_2 = train_2[train_2.assessmentItemID.isin(set_assessmentItemID)].copy()
    
    return train_1, vali_1, train_2, vali_2 , vali_full


def test_train_vali_split(df, filter_option = None, train_must_exist_leaderboard = False, vali='Full', ratio=0.5, seed = 23):
    random.seed(seed)
    # 리더보드와 동일 조건의 컬럼 수집을 포기했다.
    df = df[df.answerCode != -1].copy()
    vali_full = df[(df['userID'] != df['userID'].shift(-1))].copy()

    # 리더보드와 동일 조건의 컬럼을 나누기
    ratio_r = (1 - ratio)
    vali_1 = vali_full.sample(frac=ratio_r, random_state = seed) # ratio가 1이면, ratio_r이 0이 되어, vali_1에 아무것도 할당되지 않는다.
    vali_2 = vali_full.drop(vali_1.index)

    # vali에 포함된 유저 목록 확인하기
    vali_1_userID = set(vali_1.userID.values)
    vali_2_userID = set(vali_2.userID.values)
    #vali_full_userID = set(vali_full.userID.values)

    # vali 유저의 전 기록을 쓸 경우, 디폴트는 마지막 응답만 사용합니다.
    if vali in ['Full', '시험지마지막응답']:
        vali_1 = df[ df['userID'].isin(vali_1_userID) == True ].copy()
        vali_2 = df[ df['userID'].isin(vali_2_userID) == True ].copy()
        #vali_full = df[ df['userID'].isin(vali_full_userID) == True ].copy()

    # vali 유저의 각 시험지 마지막 기록을 쓸 경우
    if vali == '시험지마지막응답': 
        vali_1 = vali_1[(vali_1['testId'] != vali_1['testId'].shift(-1))].copy()
        vali_2 = vali_2[(vali_2['testId'] != vali_2['testId'].shift(-1))].copy()
        #vali_full = df[ df['userID'].isin(vali_full_userID) == True ].copy()
        #vali_full = vali_full[(vali_full['testId'] != vali_full['testId'].shift(-1))].copy()

    # vali에 없는 유저들만 train으로 데려오기
    train_1 = df[ df['userID'].isin(vali_1_userID) == False ].copy()
    train_2 = df[ df['userID'].isin(vali_2_userID) == False ].copy()
    
    # 마지막 응답만 가져올지 여부
    if filter_option == '시험지마지막응답':
        train_1 = train_1[train_1['testId'] != train_1['testId'].shift(-1)].copy()
        train_2 = train_2[train_2['testId'] != train_2['testId'].shift(-1)].copy()
    if filter_option == '사용자마지막응답':
        train_1 = train_1[train_1['userID'] != train_1['userID'].shift(-1)].copy()
        train_2 = train_2[train_2['userID'] != train_2['userID'].shift(-1)].copy()

    # train도 리더보드에서 맞춰야하는 문제(444개 문제)만 볼지 여부
    if train_must_exist_leaderboard:
        train_1 = train_1[train_1.assessmentItemID.isin(set_assessmentItemID)].copy()
        train_2 = train_2[train_2.assessmentItemID.isin(set_assessmentItemID)].copy()
    
    return train_1, vali_1, train_2, vali_2 , vali_full

In [11]:
# 유저별 분리
train, test = custom_train_test_split(df, ratio=config["split_ratio"])

# 사용할 Feature 설정
#FEATS = config["feature_engineering"]
FEATS = ['user_acc','ItemID_mean']
FEATS += ['test_mean']
config["feature_engineering"] = FEATS

# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

In [12]:
train.columns

Index(['userID', 'assessmentItemID', 'testId', 'Timestamp', 'KnowledgeTag',
       'item', 'item_order', 'user_total_correct_cnt', 'user_total_ans_cnt',
       'user_total_acc', 'test_size', 'retest', 'user_test_ans_cnt',
       'user_test_correct_cnt', 'user_acc', 'test_mean', 'test_sum',
       'ItemID_mean', 'ItemID_sum', 'tag_mean', 'tag_sum'],
      dtype='object')

In [13]:
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

In [14]:
lgb_train = lgb.Dataset(train[FEATS], y_train)
lgb_test = lgb.Dataset(test[FEATS], y_test)

## 4. 훈련 및 검증

In [16]:
def exam_LGBM1(datasets,FEATS, categorical_features=[],numeric_features=[],seed=42):
    train_1, vali_1, train_2, vali_2, vali_full = datasets
    random.seed(seed)
    settings = setup(data=train_1[FEATS], target='answerCode', train_size=0.8, categorical_features=categorical_features, numeric_features=numeric_features)
    
    lgbm = create_model('lightgbm', sort='AUC')
    tuned_lgbm = tune_model(lgbm, optimize = 'AUC', fold = 10)
    final_lgbm = finalize_model(tuned_lgbm)

    predict_model(lgbm)
    predict_model(tuned_lgbm)
    predict_model(final_lgbm)

    log = []
    prediction = predict_model(final_lgbm, data=vali_1[FEATS], raw_score = True)
    log.append(f"학습에 사용안한 데이터: {check_metric(prediction['answerCode'], prediction['Label'], metric = 'Accuracy')} ,{check_metric(prediction['answerCode'], prediction['Score_1'], metric = 'AUC')}")
    prediction = predict_model(final_lgbm, data=vali_2[FEATS], raw_score = True)
    log.append(f"학습에 사용한 데이터:  {check_metric(prediction['answerCode'], prediction['Label'], metric = 'Accuracy')} ,{check_metric(prediction['answerCode'], prediction['Score_1'], metric = 'AUC')}")
    prediction = predict_model(final_lgbm, data=vali_full[FEATS], raw_score = True)
    log.append(f"모든 vali 데이터:    {check_metric(prediction['answerCode'], prediction['Label'], metric = 'Accuracy')} ,{check_metric(prediction['answerCode'], prediction['Score_1'], metric = 'AUC')}")
        
    return final_lgbm, log

def exam_LGBM2( datasets ,FEATS, categorical_features=[],numeric_features=[],seed=47):
    train_1, vali_1, train_2, vali_2, vali_full = datasets
    random.seed(seed)
    settings = setup(data=train_2[FEATS], target='answerCode', train_size=0.8, categorical_features=categorical_features, numeric_features=numeric_features)
    
    lgbm = create_model('lightgbm', sort='AUC')
    tuned_lgbm = tune_model(lgbm, optimize = 'AUC', fold = 10)
    final_lgbm = finalize_model(tuned_lgbm)
    
    predict_model(lgbm)
    predict_model(tuned_lgbm)
    predict_model(final_lgbm)
    
    log = []
    prediction = predict_model(final_lgbm, data=vali_2[FEATS], raw_score = True)
    log.append(f"학습에 사용안한 데이터: {check_metric(prediction['answerCode'], prediction['Label'], metric = 'Accuracy')} ,{check_metric(prediction['answerCode'], prediction['Score_1'], metric = 'AUC')}")
    prediction = predict_model(final_lgbm, data=vali_1[FEATS], raw_score = True)
    log.append(f"학습에 사용한 데이터:  {check_metric(prediction['answerCode'], prediction['Label'], metric = 'Accuracy')} ,{check_metric(prediction['answerCode'], prediction['Score_1'], metric = 'AUC')}")
    prediction = predict_model(final_lgbm, data=vali_full[FEATS], raw_score = True)
    log.append(f"모든 vali 데이터:    {check_metric(prediction['answerCode'], prediction['Label'], metric = 'Accuracy')} ,{check_metric(prediction['answerCode'], prediction['Score_1'], metric = 'AUC')}")    
    
    return final_lgbm, log

def exam_full(datasets ,FEATS, categorical_features=[],numeric_features=[],seed=47):
    train_1, vali_1, train_2, vali_2, vali_full = datasets
    random.seed(seed)
    settings = setup(data=train_1[FEATS], target='answerCode', train_size=0.8, categorical_features=categorical_features, numeric_features=numeric_features)
    
    lgbm = create_model('lightgbm', sort='AUC')
    tuned_lgbm = tune_model(lgbm, optimize = 'AUC', fold = 10)
    final_lgbm = finalize_model(tuned_lgbm)

    predict_model(lgbm)
    predict_model(tuned_lgbm)
    predict_model(final_lgbm)

    log = []
    prediction = predict_model(final_lgbm, data=vali_full[FEATS], raw_score = True)
    log.append(f"모든 vali 데이터:    {check_metric(prediction['answerCode'], prediction['Label'], metric = 'Accuracy')} ,{check_metric(prediction['answerCode'], prediction['Score_1'], metric = 'AUC')}")
    return final_lgbm, log

In [24]:

wandb.login()

wandb.init(project='p-stage-4-boosting', entity='lastffang', config=config)
#wandb.init(project='p-stage-4-dkt', entity='newspring97', config=config) #TODO: 나중에 lastffang으로 바꿀 것
#wandb.init(project='p-stage-4', entity='lastffang', config=config)
wandb.run.name = "sb-lgbm-seoil-features"


[34m[1mwandb[0m: wandb version 0.10.31 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


In [25]:
model = lgb.train(
                    {'objective': 'binary'}, 
                    lgb_train,
                    valid_sets=[lgb_train, lgb_test],
                    verbose_eval=config["verbose_eval"],
                    num_boost_round=config["num_boost_round"],
                    early_stopping_rounds=config["early_stopping_rounds"],
                    callbacks=[wandb_callback()]
                )

preds = model.predict(test[FEATS])
acc = accuracy_score(y_test, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')

[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 557
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 3
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.489476	valid_1's binary_logloss: 0.531166
[200]	training's binary_logloss: 0.488572	valid_1's binary_logloss: 0.53085
Early stopping, best iteration is:
[156]	training's binary_logloss: 0.488933	valid_1's binary_logloss: 0.53066
VALID AUC : 0.8121074339097595 ACC : 0.7448928749377179



In [26]:
# INSTALL MATPLOTLIB IN ADVANCE
# _ = lgb.plot_importance(model)

## 5. Inference

In [27]:
# FEATURE ENGINEERING
test_df = feature_engineering(test_df)

# LEAVE LAST INTERACTION ONLY
test_df = test_df[test_df['userID'] != test_df['userID'].shift(-1)]

# DROP ANSWERCODE
test_df = test_df.drop(['answerCode'], axis=1)

411 개의 시험지가 중간 문항이 빈다. item_order가 올바른 순서


KeyError: 'Column not found: answerCode'

In [28]:
# MAKE PREDICTION
total_preds = model.predict(test_df[FEATS])

In [29]:
# SAVE OUTPUT
output_dir = 'output/'
write_path = os.path.join(output_dir, "output_seoil_features.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)    
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : output/output_seoil_features.csv


In [30]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
training_binary_logloss,0.48805
valid_1_binary_logloss,0.5307
_runtime,21.0
_timestamp,1622707490.0
_step,255.0


0,1
training_binary_logloss,█▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
valid_1_binary_logloss,█▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
_runtime,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
_timestamp,▁▁▁▁▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
