In [9]:
pip install pycaret[full]

Collecting pycaret[full]
[?25l  Downloading https://files.pythonhosted.org/packages/da/99/18f151991b0f06107af9723417c64e304ae2133587f85ea734a90136b4ae/pycaret-2.3.1-py3-none-any.whl (261kB)
[K     |█▎                              | 10kB 26.4MB/s eta 0:00:01[K     |██▌                             | 20kB 18.4MB/s eta 0:00:01[K     |███▊                            | 30kB 14.6MB/s eta 0:00:01[K     |█████                           | 40kB 13.8MB/s eta 0:00:01[K     |██████▎                         | 51kB 6.6MB/s eta 0:00:01[K     |███████▌                        | 61kB 7.7MB/s eta 0:00:01[K     |████████▊                       | 71kB 8.1MB/s eta 0:00:01[K     |██████████                      | 81kB 8.5MB/s eta 0:00:01[K     |███████████▎                    | 92kB 8.9MB/s eta 0:00:01[K     |████████████▌                   | 102kB 7.2MB/s eta 0:00:01[K     |█████████████▊                  | 112kB 7.2MB/s eta 0:00:01[K     |███████████████                 | 122kB 7.2MB

In [31]:
# uninstall lightgbm CPU
!pip uninstall lightgbm -y
# install lightgbm GPU
!pip install lightgbm --install-option=--gpu --install-option="--opencl-include-dir=/usr/local/cuda/include/" --install-option="--opencl-library=/usr/local/cuda/lib64/libOpenCL.so"

Uninstalling lightgbm-3.2.1:
  Successfully uninstalled lightgbm-3.2.1
  cmdoptions.check_install_build_global(options)
Collecting lightgbm
[?25l  Downloading https://files.pythonhosted.org/packages/7a/6d/db0f5effd3f7982632111f37fcd2fa386b8407f1ff58ef30b71d65e1a444/lightgbm-3.2.1.tar.gz (1.5MB)
[K     |████████████████████████████████| 1.5MB 9.1MB/s 
Skipping wheel build for lightgbm, due to binaries being disabled for it.
Installing collected packages: lightgbm
    Running setup.py install for lightgbm ... [?25l[?25hdone
Successfully installed lightgbm-3.2.1


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import numpy as np
import pandas as pd
import os
import random
import pickle
from pycaret.classification import *
from pycaret.utils import check_metric
from datetime import timedelta, timezone, datetime
import torch
import time
from datetime import datetime

In [2]:
def setSeeds(seed = 42):
    # 랜덤 시드를 설정하여 매 코드를 실행할 때마다 동일한 결과를 얻게 합니다.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)    
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
setSeeds(42)

In [3]:
def feature_split_user(df):
    new_df = df.copy()
    new_df["newUserID"] = df['assessmentItemID'].apply(lambda x:x[:3]) + df['userID'].astype(str)
    return new_df

def get_remain_test_data(df_test):
    get_new_id = set(df_test.loc[df_test.answerCode == -1, 'newUserID'])    
    test_data = df_test[df_test.newUserID.isin(get_new_id)]
    remain_data = df_test.drop(test_data.index)
    return test_data, remain_data
# time convert
def convert_time(s):
    timestamp = time.mktime(datetime.strptime(s, '%Y-%m-%d %H:%M:%S').timetuple())
    return int(timestamp)

In [4]:
data_path = '/content/drive/MyDrive/stage4/input/train_dataset/'
train_path = os.path.join(data_path, 'train_data.csv')
test_path = os.path.join(data_path, 'test_data.csv')

df_train_ori = pd.read_csv(train_path) 
df_test_ori = pd.read_csv(test_path)

answerCode2bool = {'userID':object,  'answerCode': 'int16', 'KnowledgeTag':object}
df_train_ori = df_train_ori.astype(answerCode2bool)
df_test_ori = df_test_ori.astype(answerCode2bool)
df_train = feature_split_user(df_train_ori)
df_test = feature_split_user(df_test_ori)

df_test_shift = df_test[df_test['userID'] != df_test['userID'].shift(-1)] # 맞춰야하는 row만 모아놓은 것
df_test, remain_data = get_remain_test_data(df_test)
df_train = pd.concat([df_train, remain_data])

df_train['Timestamp'] = df_train['Timestamp'].apply(convert_time).astype('int32')
df_test['Timestamp'] = df_test['Timestamp'].apply(convert_time).astype('int32')

df_train_test = pd.concat([df_train, df_test['answerCode'], df_test[df_test['answerCode'] != -1]])

In [5]:
# trian에서 각 문제 평균 뽑기
testId_mean_sum = df_train_test.groupby(['testId'])['answerCode'].agg(['mean','sum']).to_dict()
assessmentItemID_mean_sum = df_train_test.groupby(['assessmentItemID'])['answerCode'].agg(['mean', 'sum']).to_dict()
KnowledgeTag_mean_sum = df_train_test.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum']).to_dict()

In [6]:
# new user id 겹치는거 없음 확인
s1 = set(df_test.loc[:, 'newUserID'])
s2 = set(df_train.loc[:, 'newUserID'])
s1 & s2

set()

## Feature engineering
### 한번만 실행되도록 설정

In [7]:
def feature_engineering(df_ori):
    df = df_ori.copy()
    def assessmentItemID2item(x):
        return int(x[-3:]) - 1
    df['item'] = df.assessmentItemID.map(assessmentItemID2item)
    # testId별로 나오는 문제 갯수
    item_size = df[['assessmentItemID', 'testId']].drop_duplicates().groupby('testId').size()
    testId2maxlen = item_size.to_dict() # 중복해서 풀이할 놈들을 제거하기 위해
    
    item_max = df.groupby('testId')['item'].max()

    # 순서대로 풀지 않은 문제의 index
    not_order_index = item_max[item_max +1 != item_size].index    
    not_order_test = df.loc[df.testId.isin(not_order_index)][['assessmentItemID','testId']].drop_duplicates().sort_values('assessmentItemID')
    not_order_group = not_order_test.groupby('testId')

    not_order_ID2item = {}
    for key in not_order_group.groups:
        for i, (k, _) in enumerate(not_order_group.get_group(key).values):
            not_order_ID2item[k] = i
    
    def assessmentItemID2item_order(x):
        if x in not_order_ID2item:
            return int(not_order_ID2item[x])
        return int(x[-3:]) -1
    df['orded_item'] = df.assessmentItemID.map(assessmentItemID2item_order)
    df_group = df.groupby(['newUserID','testId'])['answerCode']
    #user 별 이전까지 맞춘개수, 현재까지 맞춘개수, 현재 정답률
    df['user_correct_answer'] = df_group.transform(lambda x: x.cumsum().shift(1)).fillna(0)
    df['user_total_answer'] = df_group.cumcount()
    df['user_acc'] = (df['user_correct_answer'] / df['user_total_answer'])
    
    #한 test안에서 마지막으로 푼 문제으로 부터 지난 시간
    df['prev_timestamp'] = (df.groupby('testId')['Timestamp'].shift(0) - df.groupby('testId')['Timestamp'].shift(1)).fillna(-999)
    
    # user 별 마지막으로 푼 tag로부터 지난 시간, NaN값은 300으로 한다.
    prev_timestamp_ac = df.groupby(['newUserID', 'KnowledgeTag'])[['Timestamp']].shift()
    df['diff_time_btw_KnowledgeTag_ids'] = (df['Timestamp'] - prev_timestamp_ac['Timestamp']).fillna(300)
    
    # 각 tag 별 마지막으로 풀었을때 정답 여부
    prev_correct_ac = df.groupby(['newUserID', 'KnowledgeTag'])[['answerCode']].shift()        
    df['prev_answered_correctly'] = prev_correct_ac['answerCode'].fillna(0)
    
    #test, item, tag 별 평균 정답률
    df["test_mean"] = df.testId.map(testId_mean_sum['mean'])
    df["ItemID_mean"] = df.assessmentItemID.map(assessmentItemID_mean_sum['mean'])
    df["tag_mean"] = df.KnowledgeTag.map(KnowledgeTag_mean_sum['mean'])
    
    #test, Item, tag 별 상대적 정답률 -> 너무 무작위적이라 feature로 못씀
    df['relative_test_answer'] = df['answerCode'] - df['test_mean']
    df['relative_ItemID_answer'] = df['answerCode'] - df['ItemID_mean']
    df['relative_tag_answer'] = df['answerCode'] - df['tag_mean']
    #test, Item, tag 별 상대적 정답률 누적 점수 -> linear 한 경향을 지님
    df['accumulate_relative_test_answer'] = df.groupby('newUserID')['relative_test_answer'].transform(lambda x: x.cumsum().shift(1)).fillna(0).astype('float16')
    df['accumulate_relative_ItemID_answer'] = df.groupby('newUserID')['relative_ItemID_answer'].transform(lambda x: x.cumsum().shift(1)).fillna(0).astype('float16')
    df['accumulate_relative_tag_answer'] = df.groupby('newUserID')['relative_tag_answer'].transform(lambda x: x.cumsum().shift(1)).fillna(0).astype('float16')
    
    
    #이동평균선 5, 10, 15, 20, 25, 30, 2~30
    for i in range(5, 31, 5):
        column = 'ma' + str(i)
        df[column] = df['user_acc'].fillna(0).rolling(window=i).mean()
    
    #MACD
    df['MACD'] = df['ma15'] - df['ma25']
    
    #Standard Deviation 5,10, 15, 20, 25, 30, fillna(-999)
    for i in range(5, 31, 5):
        column = 'sd' + str(i)
        df[column] = df['user_acc'].fillna(0).rolling(window=i).std().fillna(-999)
    
    #볼린저 밴드
    df['Upper BollingerBand'] = (df['ma10'] + (df['sd10'] * 3)).fillna(-999)
    df['Lower BollingerBand'] = (df['ma10'] - (df['sd10'] * 3)).fillna(-999)
    
    #이전에 같은 item, tag 몇 번 풀었는지
    df['prior_ItemID_frequency'] = df.groupby(['newUserID', 'assessmentItemID']).cumcount('int16').fillna(0)
    df['prior_tag_frequency'] = df.groupby(['newUserID', 'KnowledgeTag']).cumcount('int8').fillna(0)
    
    #망각 곡선TOD
    return df

In [8]:
fe_train_url = os.path.join(data_path, 'fe_train.csv')
fe_test_url = os.path.join(data_path, 'fe_test.csv')

if os.path.exists(fe_train_url) & os.path.exists(fe_test_url):
    FE_train = pd.read_csv(fe_train_url)
    FE_test = pd.read_csv(fe_test_url)
    FE_train.drop('Unnamed: 0', axis=1, inplace=True)
    FE_test.drop('Unnamed: 0', axis=1, inplace=True)
    print("cache dataset complete")
else:
    print("working on feature engineering.")
    FE_train = feature_engineering(df_train)
    FE_test = feature_engineering(df_test)
    FE_train.to_csv(fe_train_url)
    FE_test.to_csv(fe_test_url)
    print("save dataset")

cache dataset complete


## catergorical data preprocessing

In [9]:
a = set(FE_train.assessmentItemID)
b = set(FE_test.assessmentItemID)
print(len(a))
print(len(b))
print(len(a & b))
print('*' * 50)

a = set(FE_train.testId)
b = set(FE_test.testId)
print(len(a))
print(len(b))
print(len(a & b))
print('*' * 50)

a = set(FE_train.KnowledgeTag)
b = set(FE_test.KnowledgeTag)
print(len(a))
print(len(b))
print(len(a & b))
print('*' * 50)

a = set(FE_train.newUserID)
b = set(FE_test.newUserID)
print(len(a))
print(len(b))
print(len(a & b))
print('*' * 50)

9454
9399
9399
**************************************************
1537
1526
1526
**************************************************
912
912
912
**************************************************
18995
744
0
**************************************************


In [10]:
from sklearn.preprocessing import LabelEncoder
def get_ideal_dtypes(df, df_test):
    ideal_dtypes = dict()
    
    for column in df.columns:
        dtype = df[column].dtype
        
        if dtype != object:
            c_min = df[column].min()
            c_max = df[column].max()

            # 숫자형 데이터 형식 최적화
            if str(dtype)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    ideal_dtypes[column] = 'int8'
                elif c_min > np.iinfo(np.uint8).min and c_max < np.iinfo(np.uint8).max:
                    ideal_dtypes[column] = 'uint8'
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    ideal_dtypes[column] = 'int16'
                elif c_min > np.iinfo(np.uint16).min and c_max < np.iinfo(np.uint16).max:
                    ideal_dtypes[column] = 'uint16'
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    ideal_dtypes[column] = 'int32'
                elif c_min > np.iinfo(np.uint32).min and c_max < np.iinfo(np.uint32).max:
                    ideal_dtypes[column] = 'uint32'
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    ideal_dtypes[column] = 'int64'
                elif c_min > np.iinfo(np.uint64).min and c_max < np.iinfo(np.uint64).max:
                    ideal_dtypes[column] = 'uint64'
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    ideal_dtypes[column] = 'float16'
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    ideal_dtypes[column] = 'float32'
                else:
                    ideal_dtypes[column] = 'float64'
        else:
            df_full = pd.concat([df, df_test])
    
            label_encoder = LabelEncoder()
            #For UNKNOWN class
            a = df_full[column].unique().tolist() + ['unknown']
            label_encoder.fit(a)

            #모든 컬럼이 범주형이라고 가정
            df_full[column] = df_full[column].astype(str)
            test = label_encoder.transform(df_full[column])
            df_full[column] = test
            ideal_dtypes[column] = 'category'
    ideal_dtypes['KnowledgeTag'] = 'category'   
    df_full = df_full.astype(ideal_dtypes)
    return df_full[:len(df)], df_full[len(df):]

In [11]:
# 데이터 사이즈 줄이기
PP_train, PP_test = get_ideal_dtypes(FE_train, FE_test)

In [12]:
PP_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2406738 entries, 0 to 2406737
Data columns (total 41 columns):
 #   Column                             Dtype   
---  ------                             -----   
 0   userID                             int16   
 1   assessmentItemID                   category
 2   testId                             category
 3   answerCode                         int8    
 4   Timestamp                          int32   
 5   KnowledgeTag                       category
 6   newUserID                          category
 7   item                               int8    
 8   orded_item                         int8    
 9   user_correct_answer                float16 
 10  user_total_answer                  int8    
 11  user_acc                           float16 
 12  prev_timestamp                     float32 
 13  diff_time_btw_KnowledgeTag_ids     float32 
 14  prev_answered_correctly            float16 
 15  test_mean                          float16 
 16  

In [13]:
a = set(PP_train.assessmentItemID)
b = set(PP_test.assessmentItemID)
print(len(a))
print(len(b))
print(len(a & b))
print('*' * 50)

a = set(PP_train.testId)
b = set(PP_test.testId)
print(len(a))
print(len(b))
print(len(a & b))
print('*' * 50)

a = set(PP_train.KnowledgeTag)
b = set(PP_test.KnowledgeTag)
print(len(a))
print(len(b))
print(len(a & b))
print('*' * 50)

a = set(PP_train.newUserID)
b = set(PP_test.newUserID)
print(len(a))
print(len(b))
print(len(a & b))
print('*' * 50)

9454
9399
9399
**************************************************
1537
1526
1526
**************************************************
912
912
912
**************************************************
18995
744
0
**************************************************


In [14]:
# 문제 없다면 다시 train test로 분배!
df_train, df_test = PP_train, PP_test
df_train.sample(5)
df_test.sample(5)

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,newUserID,item,orded_item,user_correct_answer,user_total_answer,user_acc,prev_timestamp,diff_time_btw_KnowledgeTag_ids,prev_answered_correctly,test_mean,ItemID_mean,tag_mean,relative_test_answer,relative_ItemID_answer,relative_tag_answer,accumulate_relative_test_answer,accumulate_relative_ItemID_answer,accumulate_relative_tag_answer,ma5,ma10,ma15,ma20,ma25,ma30,MACD,sd5,sd10,sd15,sd20,sd25,sd30,Upper BollingerBand,Lower BollingerBand,prior_ItemID_frequency,prior_tag_frequency
54490,1557,A040170005,A040000170,1,1603085546,2119,7730,4,4,3.0,4,0.75,56.0,56.0,1.0,0.708008,0.662109,0.727051,0.292236,0.337891,0.272949,-3.753906,-3.794922,-3.835938,0.583496,0.483398,0.533203,0.558105,0.583496,0.563965,-0.049988,0.372559,0.361816,0.372314,0.374756,0.376465,0.370605,1.568359,-0.602051,0,4
30246,792,A060070005,A060000070,0,1588749812,9557,15352,4,4,4.0,4,1.0,6.0,6.0,1.0,0.771973,0.744141,0.741211,-0.771973,-0.744141,-0.741211,-18.015625,-18.140625,-17.0,0.799805,0.498291,0.373291,0.304932,0.29541,0.361572,0.078003,0.447266,0.453369,0.411621,0.379883,0.34668,0.381836,1.858398,-0.861816,0,3
22358,568,A070089005,A070000089,0,1597981658,8885,16796,4,4,3.0,4,0.75,64.0,300.0,0.0,0.474121,0.543945,0.494141,-0.474121,-0.543945,-0.494141,50.0,49.6875,38.625,0.383301,0.660645,0.64502,0.595703,0.614258,0.678223,0.031143,0.361328,0.383057,0.379639,0.339355,0.344482,0.345703,1.80957,-0.488525,0,0
102957,3789,A080065006,A080000065,0,1597287437,2870,18093,5,5,1.0,5,0.199951,6.0,300.0,0.0,0.492188,0.522461,0.515625,-0.492188,-0.522461,-0.515625,-20.6875,-20.828125,-26.109375,0.456787,0.456299,0.448486,0.336426,0.328369,0.299072,0.120117,0.324463,0.286377,0.326904,0.344238,0.316162,0.297363,1.31543,-0.40332,0,0
5109,79,A070121005,A070000121,1,1599698536,9660,17234,4,4,3.0,4,0.75,40.0,40.0,1.0,0.462646,0.464111,0.494385,0.537109,0.536133,0.505859,58.0625,58.15625,66.8125,0.383301,0.691895,0.697754,0.773438,0.758301,0.765137,-0.060577,0.361328,0.404541,0.392822,0.363037,0.362793,0.369873,1.905273,-0.521973,0,4


# validation split

In [15]:
# 맞춰야하는 문항 ID 파악
set_assessmentItemID = set(df_test.loc[df_test.answerCode == -1, 'assessmentItemID'].values) # 문제별 ID
set_testId = set(df_test.loc[df_test.answerCode == -1, 'testId'].values) # 시험지별 ID
set_tag = set(df_test.loc[df_test.answerCode == -1, 'KnowledgeTag'].values) # 시험지별 ID

In [16]:
def get_full_valid_split(df_ori, filter_option=None):
    random.seed(0)
    df = df_ori.copy()
    df_val = df[(df['newUserID'] != df['newUserID'].shift(-1)) & (df.assessmentItemID.isin(set_assessmentItemID))]
    val_newUserId = set(df_val.newUserID.values)

    if filter_option == "test":
        df_train = df[df['testId'] != df['testId'].shift(-1)]
    elif filter_option == "user":
        df_train = df[df['newUserID'] != df['newUserID'].shift(-1)]
    else:
        df_train = df
    return df_train, df_val

In [17]:
filter_option = 'test' #user, none

FEATS = ['answerCode', 'item', 'orded_item','assessmentItemID','KnowledgeTag','testId',
       'user_correct_answer', 'user_total_answer', 'user_acc',
       'prev_timestamp', 'diff_time_btw_KnowledgeTag_ids',
       'prev_answered_correctly', 'test_mean', 'ItemID_mean', 'tag_mean',
       'accumulate_relative_test_answer', 'accumulate_relative_ItemID_answer',
       'accumulate_relative_tag_answer', 'ma5', 'ma10', 'ma15', 'ma20', 'ma25',
       'ma30', 'MACD', 'sd5', 'sd10', 'sd15', 'sd20',
       'sd25', 'sd30', 'Upper BollingerBand', 'Lower BollingerBand', 'prior_ItemID_frequency',
       'prior_tag_frequency']
         
numeric_features = ['item', 'orded_item',
       'user_correct_answer', 'user_total_answer', 'user_acc',
       'prev_timestamp', 'diff_time_btw_KnowledgeTag_ids',
       'prev_answered_correctly', 'test_mean', 'ItemID_mean', 'tag_mean',
       'accumulate_relative_test_answer', 'accumulate_relative_ItemID_answer',
       'accumulate_relative_tag_answer', 'ma5', 'ma10', 'ma15', 'ma20', 'ma25',
       'ma30', 'MACD', 'sd5', 'sd10', 'sd15', 'sd20',
       'sd25', 'sd30', 'Upper BollingerBand', 'Lower BollingerBand', 'prior_ItemID_frequency',
       'prior_tag_frequency']
categorical_features = ['assessmentItemID','KnowledgeTag','testId']

df_train, df_val = get_full_valid_split(df_train, filter_option) # test filter 기준 398628 , 35404 rows면 잘 분배된거에요
print(len(df_train))
print(len(df_val))

398628
35404


In [18]:
df_train[FEATS]

Unnamed: 0,answerCode,item,orded_item,assessmentItemID,KnowledgeTag,testId,user_correct_answer,user_total_answer,user_acc,prev_timestamp,diff_time_btw_KnowledgeTag_ids,prev_answered_correctly,test_mean,ItemID_mean,tag_mean,accumulate_relative_test_answer,accumulate_relative_ItemID_answer,accumulate_relative_tag_answer,ma5,ma10,ma15,ma20,ma25,ma30,MACD,sd5,sd10,sd15,sd20,sd25,sd30,Upper BollingerBand,Lower BollingerBand,prior_ItemID_frequency,prior_tag_frequency
5,1,6,5,A060001007,7225,A060000001,5.0,5,1.000000,11.0,11.0,1.0,0.952637,0.928223,0.916992,0.236694,0.212036,0.374512,1.000000,,,,,,,0.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,-999.000000,0,4
12,1,6,6,A060003007,7226,A060000003,5.0,6,0.833496,16.0,16.0,1.0,0.793945,0.863770,0.803711,0.521484,0.591797,0.635742,0.709961,0.654785,,,,,,0.133179,0.380127,-999.000000,-999.000000,-999.000000,-999.000000,1.795898,-0.485596,0,6
19,1,6,6,A060005007,7228,A060000005,5.0,6,0.833496,34.0,132.0,1.0,0.852539,0.779785,0.831543,0.612793,0.540039,0.857422,0.966797,0.821777,0.692383,0.719238,,,,0.074524,0.305420,0.387207,0.393555,-999.000000,-999.000000,1.738281,-0.094849,0,4
26,1,6,6,A060007007,7229,A060000007,6.0,6,1.000000,183.0,183.0,1.0,0.804199,0.828125,0.726074,1.935547,1.959961,2.667969,1.000000,0.883301,0.844238,0.769043,0.775391,,0.069092,0.000000,0.314697,0.347656,0.359375,0.368408,-999.000000,1.827148,-0.060944,0,6
32,1,5,5,A080002006,1395,A080000002,4.0,5,0.799805,4.0,300.0,0.0,0.759277,0.580078,0.513184,0.203369,0.024002,0.334717,0.959961,0.879883,0.842285,0.831543,0.807129,0.772949,0.034882,0.089417,0.315430,0.347900,0.362793,0.331055,0.371826,1.826172,-0.066589,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2406715,0,4,4,A040109005,8238,A040000109,1.0,4,0.250000,7.0,7.0,1.0,0.765137,0.759766,0.779297,-2.060547,-2.066406,-2.117188,0.049988,0.366699,0.321289,0.278564,0.296143,0.355713,0.025177,0.111816,0.437744,0.358154,0.352051,0.344727,0.350098,1.679688,-0.946777,0,4
2406721,0,5,5,A080002006,1395,A080000002,3.0,5,0.600098,3.0,300.0,0.0,0.759277,0.580078,0.513184,-0.796875,-0.976074,-0.665039,0.803223,0.426758,0.512207,0.429199,0.353516,0.358398,0.158813,0.187256,0.422852,0.422607,0.394775,0.386719,0.376709,1.695312,-0.841797,0,0
2406726,0,4,4,A030160005,1726,A030000160,2.0,4,0.500000,29.0,29.0,0.0,0.462891,0.234131,0.553711,0.148071,-0.080017,-0.214722,0.633301,0.718262,0.495605,0.542480,0.470215,0.400146,0.025467,0.415039,0.316406,0.417480,0.413330,0.398926,0.398438,1.667969,-0.231079,0,4
2406731,0,4,4,A010093005,6557,A010000093,0.0,4,0.000000,113.0,113.0,0.0,0.782715,0.765625,0.790039,-3.130859,-3.148438,-3.160156,0.000000,0.316650,0.479004,0.371582,0.434082,0.391846,0.044891,0.000002,0.433594,0.432617,0.420654,0.429199,0.404297,1.617188,-0.983887,0,4


# ♻ Pycaret 시작

In [19]:
###setup
tr = setup(data=df_train[FEATS], target='answerCode', train_size=0.8, high_cardinality_features=categorical_features, numeric_features=numeric_features, normalize=True, normalize_method='robust', use_gpu=True)
models (internal = True) [[ 'Name', 'GPU Enabled']]

Unnamed: 0,Description,Value
0,session_id,6461
1,Target,answerCode
2,Target Type,Binary
3,Label Encoded,
4,Original Data,"(398628, 35)"
5,Missing Values,True
6,Numeric Features,31
7,Categorical Features,3
8,Ordinal Features,False
9,High Cardinality Features,True


Unnamed: 0_level_0,Name,GPU Enabled
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
lr,Logistic Regression,False
knn,K Neighbors Classifier,False
nb,Naive Bayes,False
dt,Decision Tree Classifier,False
svm,SVM - Linear Kernel,False
rbfsvm,SVM - Radial Kernel,False
gpc,Gaussian Process Classifier,False
mlp,MLP Classifier,False
ridge,Ridge Classifier,False
rf,Random Forest Classifier,False


In [None]:
#gpu test
# import dataset
# from pycaret.datasets import get_data
# data = get_data('poker')
# # initialize the setup
# from pycaret.classification import *
# clf = setup(data, target = 'CLASS', use_gpu = True)
# models (internal = True) [[ 'Name', 'GPU Enabled']]

In [None]:
# Install RAPIDS
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!bash rapidsai-csp-utils/colab/rapids-colab.sh stable

# import sys, os, shutil

# sys.path.append('/usr/local/lib/python3.7/site-packages/')
# os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
# os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'
# os.environ["CONDA_PREFIX"] = "/usr/local"
# for so in ['cudf', 'rmm', 'nccl', 'cuml', 'cugraph', 'xgboost', 'cuspatial']:
#   fn = 'lib'+so+'.so'
#   source_fn = '/usr/local/lib/'+fn
#   dest_fn = '/usr/lib/'+fn
#   if os.path.exists(source_fn):
#     print(f'Copying {source_fn} to {dest_fn}')
#     shutil.copyfile(source_fn, dest_fn)
# # fix for BlazingSQL import issue
# # ImportError: /usr/lib/x86_64-linux-gnu/libstdc++.so.6: version `GLIBCXX_3.4.26' not found (required by /usr/local/lib/python3.7/site-packages/../../libblazingsql-engine.so)
# if not os.path.exists('/usr/lib64'):
#     os.makedirs('/usr/lib64')
# for so_file in os.listdir('/usr/local/lib'):
#   if 'libstdc' in so_file:
#     shutil.copyfile('/usr/local/lib/'+so_file, '/usr/lib64/'+so_file)
#     shutil.copyfile('/usr/local/lib/'+so_file, '/usr/lib/x86_64-linux-gnu/'+so_file)

In [32]:
models (internal = True) [[ 'Name', 'GPU Enabled']]

Unnamed: 0_level_0,Name,GPU Enabled
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
lr,Logistic Regression,False
knn,K Neighbors Classifier,False
nb,Naive Bayes,False
dt,Decision Tree Classifier,False
svm,SVM - Linear Kernel,False
rbfsvm,SVM - Radial Kernel,False
gpc,Gaussian Process Classifier,False
mlp,MLP Classifier,False
ridge,Ridge Classifier,False
rf,Random Forest Classifier,False


In [20]:
from time import time
#top 5 모델을 뽑을 겁니다.
cell_start_time = time()
xgboost = create_model('xgboost')
lightgbm = create_model('lightgbm')
catboost = create_model('catboost')
cell_end_time = time()
print("CELL RUN TIME : ",cell_end_time - cell_start_time)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7713,0.8428,0.8385,0.7729,0.8044,0.5301,0.5327
1,0.7757,0.8464,0.8413,0.7771,0.808,0.5394,0.5419
2,0.7745,0.8468,0.8466,0.7729,0.8081,0.5361,0.5393
3,0.7707,0.8422,0.8366,0.7732,0.8037,0.5293,0.5316
4,0.7758,0.8474,0.8412,0.7773,0.808,0.5395,0.542
5,0.7769,0.8462,0.8443,0.7771,0.8093,0.5416,0.5443
6,0.7726,0.8454,0.8377,0.775,0.8051,0.5331,0.5354
7,0.7725,0.8449,0.84,0.7737,0.8055,0.5326,0.5352
8,0.775,0.8465,0.841,0.7764,0.8074,0.5379,0.5404
9,0.7769,0.8472,0.8457,0.7764,0.8096,0.5414,0.5443


CELL RUN TIME :  172.65796780586243


In [21]:
top3_models = [xgboost,lightgbm,catboost]

In [22]:
#모델 하이퍼 파라미터 튜닝
models = []
for model in top3_models:
    tuned_model = tune_model(model, optimize = 'AUC')
    models.append(tuned_model)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7696,0.8411,0.8363,0.7719,0.8028,0.5267,0.5292
1,0.7755,0.845,0.8412,0.777,0.8078,0.539,0.5415
2,0.7742,0.8451,0.8455,0.7731,0.8077,0.5355,0.5387
3,0.7701,0.841,0.8343,0.7736,0.8028,0.5283,0.5304
4,0.7748,0.8461,0.8404,0.7765,0.8071,0.5375,0.54
5,0.7752,0.8448,0.8422,0.776,0.8078,0.5382,0.5408
6,0.7725,0.8442,0.8367,0.7753,0.8049,0.533,0.5352
7,0.7721,0.8434,0.839,0.7737,0.805,0.5318,0.5343
8,0.7736,0.8451,0.8386,0.7759,0.806,0.5353,0.5376
9,0.7758,0.8456,0.8447,0.7756,0.8087,0.5392,0.5421


In [None]:
#soft 앙상블 후 성능평가
blended = blend_models(estimator_list = models, method = 'soft')
pred_holdout = predict_model(blended)

In [None]:
#전체 데이터에 대해서 다시 학습하는 구간
final_model = finalize_model(blended)

IntProgress(value=0, description='Processing: ', max=4)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC


KeyboardInterrupt: 

In [None]:
#save model
save_model(final_model,'31_3_features')

# inference

In [None]:
# MAKE PREDICTION
df_test_shift = df_test[df_test['userID'] != df_test['userID'].shift(-1)] # 맞춰야하는 row만 모아놓은 것

predictions = predict_model(final_model, data = df_test_shift[FEATS])

# predictions는 test data에 'Score' column이 추가된 df입니다.
print(predictions.sample(5))
total_preds = predictions['Score']

# SAVE OUTPUT
output_dir = './'
prediction_name = datetime.now(timezone(timedelta(hours=9))).strftime('%m%d_%H%M')

write_path = os.path.join(output_dir, f"{prediction_name}.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)    
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("id,prediction\n")
    for id, p in enumerate(total_preds):
        w.write('{},{}\n'.format(id,p))

writing prediction : ./0608_2215.csv
