# LGBM을 활용한 베이스라인

In [1]:
import pandas as pd
import os
import random
from tqdm import tqdm
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(color_codes=True)
import pickle
import optuna

## 1. 데이터 로딩

In [2]:
# Directory holding the input data; later cells reuse `data_dir` to locate the
# cross-validation user split files.
data_dir = '/opt/ml/project/data/'
csv_file_path = os.path.join(data_dir, 'total_data_v2.csv')

# Read the interaction log, order the rows by user and then by time, and
# rebuild a fresh 0..n-1 index on the sorted frame.
df = (
    pd.read_csv(csv_file_path, parse_dates=['Timestamp'])
    .sort_values(by=['userID', 'Timestamp'])
    .reset_index(drop=True)
)

## 2. Feature Engineering

In [3]:
def load_agg_data(df):
    """Return ``df`` without the last row of every consecutive ``userID`` run.

    A row is kept only when the row directly below it carries the same
    ``userID``. The final row of the frame is therefore always dropped.
    """
    following_user = df['userID'].shift(-1)
    has_same_user_below = df['userID'] == following_user
    return df[has_same_user_below]


def calculate_elapsdTime(df):
    """Add rolling means and four imputed variants of ``elapsedTime`` to ``df``.

    The frame is modified in place and also returned. Columns added:

    * ``estimated_elapsedTime``: True where ``elapsedTime`` is missing.
    * ``roll_elapsedTime_mean{w}``: per-user rolling mean of ``elapsedTime``
      over the last ``w`` rows (current row included, ``min_periods=1``).
    * ``elapsedTime_v1``: missing values replaced by the global mean.
    * ``elapsedTime_v2``: missing values replaced by the mean of the same
      ``assessmentItemID``, falling back to the global mean.
    * ``elapsedTime_v3``: missing values replaced by the mean of the same
      ``userID``, falling back to the global mean.
    * ``elapsedTime_v4``: missing values replaced by ``roll_elapsedTime_mean3``
      (stays NaN when that window has no observed value).

    Requires columns ``userID``, ``assessmentItemID`` and ``elapsedTime``.
    """
    elapsed = df['elapsedTime']
    cond = elapsed.isna()
    global_elapsedTime_mean = elapsed.mean()
    df['estimated_elapsedTime'] = cond

    # transform() returns a result aligned with the input rows, so the rolling
    # means land on the right rows even if the frame is not grouped by userID
    # (writing grouped-rolling output back with `.values` is only correct for
    # a frame already sorted by userID).
    elapsed_by_user = elapsed.groupby(df['userID'])
    for window_size in [3, 5, 10, 30, 50, 100, 200]:
        df[f'roll_elapsedTime_mean{window_size}'] = elapsed_by_user.transform(
            lambda s, w=window_size: s.rolling(w, min_periods=1).mean()
        )

    # version 1: global mean
    df['elapsedTime_v1'] = elapsed.fillna(global_elapsedTime_mean)

    # version 2: mean of the same question. A question whose times are all
    # missing has a NaN mean, so the global mean is applied as a second step.
    question_mean = elapsed.groupby(df['assessmentItemID']).transform('mean')
    df['elapsedTime_v2'] = elapsed.fillna(question_mean).fillna(global_elapsedTime_mean)

    # version 3: mean of the same user, with the same global-mean fallback.
    user_mean = elapsed.groupby(df['userID']).transform('mean')
    df['elapsedTime_v3'] = elapsed.fillna(user_mean).fillna(global_elapsedTime_mean)

    # version 4: the user's own recent rolling mean
    df['elapsedTime_v4'] = elapsed.fillna(df['roll_elapsedTime_mean3'])

    return df


def calculate_user_question_elapsedTime(df):
    """Merge mean elapsed-time statistics per user and per question into ``df``.

    Statistics are computed on ``load_agg_data(df)`` (each user's last row is
    excluded) and left-merged back, adding in this order:

    * ``user_elapsedTime_mean``, ``user_correct_elapsedTime_mean``,
      ``user_wrong_elapsedTime_mean``: mean of ``elapsedTime_v3`` per
      ``userID`` over all rows, rows with ``answerCode == 1`` and rows with
      ``answerCode == 0``.
    * ``question_elapsedTime_mean``, ``question_correct_elapsedTime_mean``,
      ``question_wrong_elapsedTime_mean``: the same three means of
      ``elapsedTime_v2`` per ``assessmentItemID``.
    * ``randomly_marked``: True when the sum of the raw ``elapsedTime`` over a
      user's current and two preceding rows lies in ``[0, 3)``. It is False
      while fewer than three rows are available or any of them is missing.

    Returns the merged frame (a new object; row order follows the input).
    """
    agg_df = load_agg_data(df)
    subsets = [
        ('', agg_df),
        ('correct_', agg_df[agg_df['answerCode'] == 1]),
        ('wrong_', agg_df[agg_df['answerCode'] == 0]),
    ]
    # (merge key, value column, output prefix): users first, then questions.
    targets = [
        ('userID', 'elapsedTime_v3', 'user'),
        ('assessmentItemID', 'elapsedTime_v2', 'question'),
    ]

    # Compute every table before merging so all of them come from the same
    # aggregation frame, then merge in the documented column order.
    tables = []
    for key, value_col, prefix in targets:
        for label, rows in subsets:
            column = f'{prefix}_{label}elapsedTime_mean'
            tables.append((key, rows.groupby(key)[value_col].mean().to_frame(column)))

    for key, table in tables:
        df = pd.merge(df, table, on=[key], how="left")

    # transform() keeps the result aligned with the rows of `df`, so the flag
    # is correct even when the rows are not grouped by userID.
    recent_time = df.groupby('userID')['elapsedTime'].transform(
        lambda s: s.rolling(window=3).sum()
    )
    df['randomly_marked'] = (recent_time >= 0) & (recent_time < 3)

    return df


def calculate_statistics(df):
    """Merge ``answerCode`` statistics for several grouping keys into ``df``.

    All statistics come from ``load_agg_data(df)`` (each user's last row is
    excluded). For every grouping key below, the mean, sum and standard
    deviation of ``answerCode`` are added as ``<prefix>_mean``,
    ``<prefix>_sum`` and ``<prefix>_std``. In addition, ``corr_min``,
    ``corr_max``, ``corr_mean`` and ``corr_std`` describe, per user, the
    per-question mean ``answerCode`` of the questions that user answered with
    ``answerCode == 0``.
    """
    agg_df = load_agg_data(df)

    # (grouping column, prefix of the generated columns), in merge order.
    grouping_plan = [
        ('assessmentItemID', 'question'),
        ('testId', 'test'),
        ('KnowledgeTag', 'tag'),
        ('testType', 'type'),
        ('questionNumber', 'question_number'),
        ('testNumber', 'test_number'),
    ]

    stat_tables = []
    for key, prefix in grouping_plan:
        table = agg_df.groupby(key)['answerCode'].agg(['mean', 'sum', 'std'])
        table.columns = [f'{prefix}_mean', f'{prefix}_sum', f'{prefix}_std']
        stat_tables.append((key, table))

    # Mean answerCode per question, attached to every wrongly answered row and
    # then summarised per user.
    item_ratio = (
        agg_df.groupby('assessmentItemID')['answerCode']
        .mean()
        .rename('corr_ratio')
        .reset_index()
    )
    wrong_rows = agg_df[agg_df['answerCode'] == 0].merge(item_ratio, on='assessmentItemID')
    corr_df = wrong_rows.groupby('userID')['corr_ratio'].agg(['min', 'max', 'mean', 'std']).reset_index()
    corr_df.columns = ['userID', 'corr_min', 'corr_max', 'corr_mean', 'corr_std']

    for key, table in stat_tables:
        df = pd.merge(df, table, on=[key], how='left')
    df = pd.merge(df, corr_df, on=['userID'], how='left')

    return df


def calculate_user_accuracy(df):
    """Add each user's running totals *before* the current row.

    Adds, in place, ``user_correct_answer`` (sum of the user's earlier
    ``answerCode`` values, NaN on the user's first row), ``user_total_answer``
    (number of earlier rows of the user) and ``user_acc`` (their ratio).
    Returns the same frame.
    """
    user_key = df['userID']
    answers_by_user = df['answerCode'].groupby(user_key)

    # Running sum including the current row, then pushed down one row per user
    # so the current answer never contributes to its own feature.
    running_correct = answers_by_user.cumsum()
    df['user_correct_answer'] = running_correct.groupby(user_key).shift(1)
    df['user_total_answer'] = answers_by_user.cumcount()
    df['user_acc'] = df['user_correct_answer'] / df['user_total_answer']
    return df


def calculate_accuracy_trend(df):
    """Add rolling accuracy features built from each user's *previous* answers.

    For every row the user's ``answerCode`` is shifted down by one row, so the
    current answer never enters its own feature. The frame is modified in
    place and returned. Columns added:

    * ``accuracy_trend{w}`` / ``correct_trend{w}``: rolling mean / sum of the
      shifted answers over the last ``w`` rows of the user.
    * ``normalized_accuracy_trend{w}``: ``accuracy_trend{w}`` minus
      ``question_mean``.
    * ``shift``: the shifted answer itself.
    * ``accuracy_time_trend{t}``, ``normalized_accuracy_time_trend{t}``,
      ``correct_time_trend{t}``: the same statistics over a time window ``t``
      measured on ``Timestamp``.

    Requires columns ``userID``, ``answerCode``, ``Timestamp`` and
    ``question_mean``. The time windows require each user's ``Timestamp``
    values to be in increasing order.
    """
    user_key = df['userID']

    # Shift once, and only the column that is needed, instead of shifting the
    # whole frame again on every window size.
    prev_answer = df.groupby('userID')['answerCode'].shift(1)
    prev_by_user = prev_answer.groupby(user_key)

    for window_size in [3, 5, 10, 30, 50, 100, 200]:
        # transform() keeps results aligned with the rows of `df`, so they do
        # not depend on the frame being grouped by userID.
        accuracy_trend = prev_by_user.transform(
            lambda s, w=window_size: s.rolling(window=w, min_periods=1).mean()
        )
        correct_trend = prev_by_user.transform(
            lambda s, w=window_size: s.rolling(window=w, min_periods=1).sum()
        )

        df[f'accuracy_trend{window_size}'] = accuracy_trend
        df[f'normalized_accuracy_trend{window_size}'] = accuracy_trend - df['question_mean']
        df[f'correct_trend{window_size}'] = correct_trend

    df['shift'] = prev_answer

    # Build each user's time-indexed series once and reuse it for every time
    # window. `.indices` gives row *positions*, so filling the result arrays
    # does not rely on the index labels being 0..n-1.
    timeline_groups = df[['userID', 'Timestamp', 'shift']].groupby('userID')
    row_positions = timeline_groups.indices
    per_user = [
        (row_positions[user_id], user_rows.set_index('Timestamp')['shift'])
        for user_id, user_rows in timeline_groups
    ]

    for window_size in ['10min', '1h', '10h', '1D', '10D']:
        temp_accuracy_arr = np.zeros(len(df))
        temp_correct_arr = np.zeros(len(df))

        for positions, timed_shift in per_user:
            roller = timed_shift.rolling(window_size, min_periods=1)
            temp_accuracy_arr[positions] = roller.mean().to_numpy()
            temp_correct_arr[positions] = roller.sum().to_numpy()

        df[f'accuracy_time_trend{window_size}'] = temp_accuracy_arr
        df[f'normalized_accuracy_time_trend{window_size}'] = temp_accuracy_arr - df['question_mean'].values
        df[f'correct_time_trend{window_size}'] = temp_correct_arr

    return df


def calculate_accuracy_on_past_attempts(df):
    """Add per-user history features for the same question, tag and test type.

    For each of ``assessmentItemID`` ("question"), ``KnowledgeTag`` ("tag")
    and ``testType`` ("type"), grouped together with ``userID``, the frame
    gains (in place):

    * ``past_<name>_count``: number of earlier rows in the group.
    * ``past_<name>_correct``: cumulative sum of the earlier ``answerCode``
      values (the first row of a group counts as 0).
    * ``past_<name>_accuracy``: correct / count, 0 where undefined.
    * ``past_<name>_elapsedTime``: seconds since the previous row of the same
      group, 0 when there is none.

    The helper column ``shift`` is overwritten for every grouping and ends up
    holding the values of the last one (``testType``). Returns the same frame.
    """
    history_groups = [
        ('question', 'assessmentItemID'),
        ('tag', 'KnowledgeTag'),
        ('type', 'testType'),
    ]

    # Past count / correct / accuracy for each grouping.
    for name, column in history_groups:
        keys = ['userID', column]
        count_col = f'past_{name}_count'
        correct_col = f'past_{name}_correct'

        df[count_col] = df.groupby(keys).cumcount()
        df['shift'] = df.groupby(keys)['answerCode'].shift().fillna(0)
        df[correct_col] = df.groupby(keys)['shift'].cumsum()
        df[f'past_{name}_accuracy'] = (df[correct_col] / df[count_col]).fillna(0)

    # Time since the previous row of the same grouping.
    for name, column in history_groups:
        previous_seen = df.groupby(['userID', column])['Timestamp'].shift()
        gap_seconds = (df['Timestamp'].values - previous_seen.values) / np.timedelta64(1, 's')
        # No previous row gives NaT, hence NaN; report that as 0 seconds.
        gap_seconds = np.where(np.isnan(gap_seconds), 0, gap_seconds)
        df[f'past_{name}_elapsedTime'] = gap_seconds

    return df


def calculate_time_slot(df):
    """Add calendar and time-of-day features derived from ``Timestamp``.

    Modifies ``df`` in place and returns it. Columns added:

    * ``day``: day of month.
    * ``time``: the timestamp as whole seconds (nanosecond value // 10**9).
    * ``hour``: hour of day.
    * ``accuracy_per_hour``: mean ``answerCode`` of that hour, computed on
      ``load_agg_data(df)`` (each user's last row is excluded).
    * ``hour_mode``: the user's most frequent hour (the first one returned by
      ``mode`` when several hours tie).
    * ``is_night``: True when ``hour_mode`` is >= 22 or < 4.
    """
    timestamps = df['Timestamp']

    # Date, timestamp in seconds, hour
    df['day'] = timestamps.dt.day
    df['time'] = timestamps.apply(lambda x: x.value // 10**9)
    # Use the vectorised accessor directly. `Series.transform` with a lambda
    # that calls `.dt` only works through its fallback path, after a failed
    # element-wise attempt.
    df['hour'] = timestamps.dt.hour

    agg_df = load_agg_data(df)

    # Accuracy per hour, each user's usual study hour, and whether that is at night
    hour_dict = agg_df.groupby(['hour'])['answerCode'].mean().to_dict()
    mode_dict = df.groupby(['userID'])['hour'].agg(lambda x: pd.Series.mode(x)[0]).to_dict()
    df['accuracy_per_hour'] = df['hour'].map(hour_dict)
    df['hour_mode'] = df['userID'].map(mode_dict)
    df['is_night'] = (df['hour_mode'] >= 22).values | (df['hour_mode'] < 4).values

    # Time taken to complete a test (disabled)
    # df['test_total_time'] = (df['test_end_time'].values - df['test_start_time'].values) / np.timedelta64(1, 's')

    return df


def dimension_reduction(df):
    """Left-merge precomputed SVD and LDA components into ``df``.

    Reads four files under ``./assets`` and adds, in this order:
    ``svd_question1..5`` (by ``assessmentItemID``), ``svd_user1..5`` (by
    ``userID``), ``lda_correct_question1..5`` and ``lda_wrong_question1..5``
    (by ``assessmentItemID``, stored as float32).

    The LDA arrays carry no ids: row ``i`` is assigned to the ``i``-th value
    of the sorted unique ``assessmentItemID`` in ``df``, so each array must
    have exactly one row per unique question in ``df``.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file; only
    load asset files from a trusted source.
    """
    SVD_DIM = 5
    LDA_DIM = 5

    def _svd_table(path, key_column, prefix):
        # Pickled mapping of id -> component vector, turned into one row per id.
        with open(path, 'rb') as f:
            vectors = pickle.load(f)
        table = pd.DataFrame.from_dict(vectors).T.reset_index()
        table.columns = [key_column] + [f'{prefix}{i+1}' for i in range(SVD_DIM)]
        return table

    def _lda_table(path, prefix, frame):
        # Plain array of components; ids are attached by sorted position.
        table = pd.DataFrame(np.load(path))
        table.columns = [f'{prefix}{i+1}' for i in range(LDA_DIM)]
        table = table.astype(np.float32)
        table['assessmentItemID'] = sorted(frame.assessmentItemID.unique())
        return table

    # Truncated SVD
    svd_sources = [
        ('./assets/svd_question.pickle', 'assessmentItemID', 'svd_question'),
        ('./assets/svd_user.pickle', 'userID', 'svd_user'),
    ]
    for path, key_column, prefix in svd_sources:
        df = pd.merge(df, _svd_table(path, key_column, prefix), how='left', on=key_column)

    # LDA of the per-user correct-count matrix, then of the wrong-count matrix
    # (answerCode == 0 filtering), both over questions.
    lda_sources = [
        ('./assets/lda_correct_question.npy', 'lda_correct_question'),
        ('./assets/lda_wrong_question.npy', 'lda_wrong_question'),
    ]
    for path, prefix in lda_sources:
        df = pd.merge(df, _lda_table(path, prefix, df), how='left', on='assessmentItemID')

    return df


def word2vec_embedding(df):
    """Left-merge precomputed word2vec question embeddings into ``df``.

    Reads two pickles under ``./assets`` and adds
    ``word2vec_correct_question1..10`` followed by
    ``word2vec_wrong_question1..10``, both joined on ``assessmentItemID``.
    Questions missing from a pickle get NaN in the corresponding columns.

    NOTE(review): ``pickle.load`` executes arbitrary code from the file; only
    load asset files from a trusted source.
    """
    EMB_DIM = 10

    def _embedding_table(path, prefix):
        # Pickled mapping of question id -> vector, one row per question.
        with open(path, 'rb') as f:
            vectors = pickle.load(f)
        table = pd.DataFrame.from_dict(vectors).T.reset_index()
        table.columns = ['assessmentItemID'] + [f'{prefix}{i+1}' for i in range(EMB_DIM)]
        return table

    # Both files are read before any merge happens:
    # the user's correct question list word2vec, then the wrong question list word2vec.
    emb_correct_df = _embedding_table('./assets/word2vec_correct_question.pickle', 'word2vec_correct_question')
    emb_wrong_df = _embedding_table('./assets/word2vec_wrong_question.pickle', 'word2vec_wrong_question')

    for table in (emb_correct_df, emb_wrong_df):
        df = pd.merge(df, table, how='left', on='assessmentItemID')

    return df


def gcn_embedding(df):
    """Left-merge precomputed GCN user and question embeddings into ``df``.

    Reads ``./assets/gcn_embedding.pickle``, whose ``'user'`` and ``'item'``
    entries are turned into one row per id. Adds ``gcn_user_embedding1..10``
    (joined on ``userID``) followed by ``gcn_question_embedding1..10`` (joined
    on ``assessmentItemID``).

    NOTE(review): ``pickle.load`` executes arbitrary code from the file; only
    load asset files from a trusted source.
    """
    GCN_EMB_DIM = 10
    with open('./assets/gcn_embedding.pickle', 'rb') as f:
        embeddings = pickle.load(f)

    # (entry in the pickle, join column, prefix of the generated columns)
    sources = [
        ('user', 'userID', 'gcn_user_embedding'),
        ('item', 'assessmentItemID', 'gcn_question_embedding'),
    ]

    tables = []
    for entry, key_column, prefix in sources:
        table = pd.DataFrame.from_dict(embeddings[entry]).T.reset_index()
        table.columns = [key_column] + [f'{prefix}{i+1}' for i in range(GCN_EMB_DIM)]
        tables.append((key_column, table))

    for key_column, table in tables:
        df = pd.merge(df, table, how='left', on=key_column)

    return df


def elo_rating(df):
    """Left-merge precomputed Elo parameters for users and questions into ``df``.

    Reads two pickles under ``./assets``. Each is turned into a table with one
    row per id and exactly two value columns, named ``elo_theta`` /
    ``user_nb_answers`` for students (joined on ``userID``) and ``elo_beta`` /
    ``item_nb_answers`` for questions (joined on ``assessmentItemID``).

    NOTE(review): ``pickle.load`` executes arbitrary code from the file; only
    load asset files from a trusted source.
    """
    def _read_pickle(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    # Both files are read before any table is built.
    student_parameters = _read_pickle('./assets/elo_student_parameters.pickle')
    item_parameters = _read_pickle('./assets/elo_item_parameters.pickle')

    def _parameter_table(parameters, value_columns, key_column):
        # One row per id; the id is copied from the index into a join column.
        table = pd.DataFrame.from_dict(parameters).T
        table.columns = value_columns
        table[key_column] = table.index
        return table

    student_df = _parameter_table(student_parameters, ['elo_theta', 'user_nb_answers'], 'userID')
    item_df = _parameter_table(item_parameters, ['elo_beta', 'item_nb_answers'], 'assessmentItemID')

    df = pd.merge(df, student_df, how='left', on='userID')
    df = pd.merge(df, item_df, how='left', on='assessmentItemID')

    return df
    

def feature_engineering(df):
    """Run every feature step on ``df`` in order and fill remaining NaN with 0.

    The order matters: later steps read columns created by earlier ones (for
    example the accuracy-trend step uses ``question_mean`` from the statistics
    step). Returns the resulting frame.
    """
    steps = [
        # elapsed-time imputation and aggregates
        calculate_elapsdTime,
        calculate_user_question_elapsedTime,
        # answer statistics and per-user history
        calculate_statistics,
        calculate_user_accuracy,
        calculate_accuracy_trend,
        calculate_accuracy_on_past_attempts,
        # time of day
        calculate_time_slot,
        # precomputed embeddings / ratings loaded from ./assets
        dimension_reduction,
        word2vec_embedding,
        # gcn_embedding,  # disabled
        elo_rating,
    ]

    for step in steps:
        df = step(df)

    return df.fillna(0)

In [4]:
# Build the full feature table from the loaded data. The first pipeline step
# (calculate_elapsdTime) adds its columns to `df` in place, so `df` itself is
# modified by this call as well.
fe_df = feature_engineering(df)

## 3. Train/Valid 데이터 셋 분리

In [5]:
# The train and validation sets must be split by user, so that all rows of a
# user end up on the same side.
def custom_train_test_split(df, ratio=0.7, user_ids=None):
    """Split ``df`` into train and validation frames by user.

    Parameters
    ----------
    df : DataFrame with a ``userID`` column. Each user's rows are expected to
        be contiguous and in chronological order, because the validation set
        keeps the last row of every run of equal ``userID``.
    ratio : upper bound on the fraction of rows that may go to train. Only
        used when ``user_ids`` is not given.
    user_ids : ids of the training users. When ``None`` or empty, users are
        shuffled with a fixed seed and added to train while the running row
        count stays within ``ratio * len(df)``.

    Returns
    -------
    (train, valid) : all rows of the training users, and the last row of each
        remaining user.

    Note: choosing the users here reseeds the global ``random`` module with 42.
    """
    if user_ids is None or len(user_ids) == 0:
        user_counts = df['userID'].value_counts()
        users = list(zip(user_counts.index, user_counts))

        random.seed(42)
        random.shuffle(users)

        max_train_data_len = ratio * len(df)
        sum_of_train_data = 0
        user_ids = []

        for user_id, count in users:
            sum_of_train_data += count
            # Stop at the first user that would push train over the budget.
            if max_train_data_len < sum_of_train_data:
                break
            user_ids.append(user_id)

    in_train = df['userID'].isin(user_ids)
    train = df[in_train]
    valid = df[~in_train]

    # For validation, keep only each user's last interaction.
    valid = valid[valid['userID'] != valid['userID'].shift(-1)]
    return train, valid

In [6]:
# Rows with answerCode == -1 form the test set; the label column is dropped
# from it.
is_test_row = fe_df['answerCode'] == -1
test_df = fe_df[is_test_row]
test_users = test_df.userID.unique()
test_df = test_df.drop(columns=['answerCode'])

# Every user that owns a test row is excluded from the training pool entirely.
train_df = fe_df[~fe_df['userID'].isin(test_users)]

In [59]:
# Features used by the model.
#
# Disabled in this configuration (add them back to FEATS to use them):
#   corr_min, corr_max, corr_mean, corr_std
#   accuracy_time_trend{1h,10h,1D,10min,10D}
#   normalized_accuracy_time_trend{1h,10h,1D,10min,10D}
#   correct_time_trend{1h,10h,1D,10min,10D}
#   estimated_elapsedTime, elapsedTime_v1, elapsedTime_v2, elapsedTime_v3
#   lda_wrong_question1..5
#   svd_user1..5
#   elo_theta, elo_beta
#   trueSkill_user_mu, trueSkill_user_sigma,
#   trueSkill_question_mu, trueSkill_question_sigma
#
# NOTE(review): 'trueSkill_win_probability' is not created by any
# feature-engineering function in this notebook; presumably it is already a
# column of the loaded CSV. Confirm before running.
_ROLLING_WINDOWS = (3, 5, 10, 30, 50, 100, 200)
_STAT_GROUPS = ('test', 'tag', 'type', 'question', 'question_number', 'test_number')

FEATS = (
    ['KnowledgeTag']
    # time of day
    + ['time', 'hour', 'hour_mode', 'accuracy_per_hour']
    # running per-user totals
    + ['user_correct_answer', 'user_total_answer', 'user_acc']
    # mean / sum / std of answerCode per grouping
    + [f'{group}_{stat}' for group in _STAT_GROUPS for stat in ('mean', 'sum', 'std')]
    # count-based rolling trends
    + [f'accuracy_trend{w}' for w in _ROLLING_WINDOWS]
    + [f'normalized_accuracy_trend{w}' for w in _ROLLING_WINDOWS]
    + [f'correct_trend{w}' for w in _ROLLING_WINDOWS]
    # elapsed time
    + ['elapsedTime_v4']
    + [
        f'{owner}_{subset}elapsedTime_mean'
        for owner in ('user', 'question')
        for subset in ('', 'correct_', 'wrong_')
    ]
    # history on the same test type / tag
    + [
        f'past_{group}_{field}'
        for group in ('type', 'tag')
        for field in ('count', 'correct', 'accuracy', 'elapsedTime')
    ]
    + [f'roll_elapsedTime_mean{w}' for w in _ROLLING_WINDOWS]
    # precomputed embeddings
    + [f'word2vec_wrong_question{i}' for i in range(1, 11)]
    + [f'svd_question{i}' for i in range(1, 6)]
    + ['trueSkill_win_probability']
)
print(f'Number of Features: {len(FEATS)}')

Number of Features: 88


In [60]:
# One LightGBM model is trained per cross-validation fold; the fitted models
# and their validation metrics are collected for the later cells.
model_list = []
auc_list = []
acc_list = []

for cv_num in range(1,6):
    # Each fold has its own pickle containing (at least) the training user ids.
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load split files from a trusted source.
    users_file_path = os.path.join(data_dir, f'cv{cv_num}_users.pickle')
    with open(users_file_path,'rb') as f:
        users = pickle.load(f)
    train_users = users['train_users']

    # Split by user. Because user_ids is given, the ratio argument is not used;
    # `valid` holds only the last interaction of every non-training user.
    train, valid = custom_train_test_split(train_df, user_ids=train_users)

    # Separate the target (y) from the features (X)
    y_train = train['answerCode']
    train = train.drop(['answerCode'], axis=1)

    y_valid = valid['answerCode']
    valid = valid.drop(['answerCode'], axis=1)

    lgb_train = lgb.Dataset(train[FEATS], y_train)
    lgb_valid = lgb.Dataset(valid[FEATS], y_valid)

    # Binary objective with otherwise default parameters; stops after 100
    # rounds without improvement on the validation set.
    # NOTE(review): the `verbose_eval` and `early_stopping_rounds` keyword
    # arguments were removed from lgb.train in LightGBM 4.0 in favour of
    # callbacks. Confirm the installed version before re-running.
    model = lgb.train(
        {'objective':'binary'},
        lgb_train,
        valid_sets=[lgb_train, lgb_valid],
        verbose_eval=100,
        num_boost_round=10000,
        early_stopping_rounds=100,
    )

    # Score the fold: accuracy at a 0.5 threshold, AUC on the raw predictions.
    preds = model.predict(valid[FEATS])
    acc = accuracy_score(y_valid, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_valid, preds)

    print(f'VALID{cv_num} AUC : {auc} ACC : {acc}\n')
    
    model_list.append(model)
    auc_list.append(auc)
    acc_list.append(acc)

[LightGBM] [Info] Number of positive: 1039565, number of negative: 546592
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17387
[LightGBM] [Info] Number of data points in the train set: 1586157, number of used features: 88
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.655399 -> initscore=0.642855
[LightGBM] [Info] Start training from score 0.642855
Training until validation scores don't improve for 100 rounds
[100]	training's binary_logloss: 0.432909	valid_1's binary_logloss: 0.502391
[200]	training's binary_logloss: 0.427852	valid_1's binary_logloss: 0.498853
[300]	training's binary_logloss: 0.424837	valid_1's binary_logloss: 0.498328
[400]	training's binary_logloss: 0.42207	valid_1's binary_logloss: 0.497452
[500]	training's binary_logloss: 0.419572	valid_1's binary_logloss: 0.496351
[600]	training's binary_logloss: 0.417248	valid_1's binary_logloss: 0.495285
[700]	training's bi

In [61]:
# Report AUC and accuracy averaged over the trained folds.
print(f'AUC : {np.mean(auc_list):.4f} ACC : {np.mean(acc_list):.4f}\n')

AUC : 0.8482 ACC : 0.7677



In [62]:
# Average the fold models' predictions on the test rows: every model
# contributes an equal share to the running total.
test_features = test_df[FEATS]
total_preds = np.zeros(len(test_df))
for model in model_list:
    total_preds += model.predict(test_features) / len(model_list)

In [63]:
# SAVE OUTPUT
# Write the averaged predictions as a two-column CSV (row number, prediction).
output_dir = 'output/'
write_path = os.path.join(output_dir, "estimate elapsedTime.csv")
# exist_ok avoids the separate existence check before creating the directory.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    # `row_id` rather than `id`: a loop variable named `id` would rebind the
    # builtin for every later cell of the notebook.
    for row_id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(row_id, p))

writing prediction : output/estimate elapsedTime.csv
