In [3]:
import os
import time

import numpy as np
import pandas as pd

from args import parse_args
from data_loader import Dataset, Preprocess
from utils import set_seeds
from trainer import boosting_model

import warnings
warnings.filterwarnings("ignore")

# python main.py --model    # CAT, XG, LGBM   default="CAT", 
# python main.py --model CAT --trials 1 --cat_feats userID assessmentItemID testId KnowledgeTag Month DayOfWeek TimeOfDay categorize_solvingTime categorize_ProblemAnswerRate categorize_TagAnswerRate categorize_TestAnswerRate ProblemNumber --n_window 2

# Boosting 계열, 수정할수 있는 파라미터
# 1. FEATURE 선택
# 2. model 선택, default: CAT, args.py에 있음
# 3. train시 valid set 쓸건지 안쓸건지, default: N, args.py에 있음
# 4. optuna 시도 횟수, default: n_trials=50, 보통 100번이상이면 수렴됨, args.py에 있음
# 5. optuna params, trainer.py에 있음



def main(args):
    ######################## SELECT FEATURE
    FEATURE = ['userID','assessmentItemID','testId','KnowledgeTag',
               'SolvingTime','CumulativeTime',
               'Month','DayOfWeek','TimeOfDay',
               'problems_cumulative','problems_last7days','problems_last30days',
               'CumulativeUserProblemAnswerRate','CumulativeProblemCount',
               'ProblemAnswerRate','TagAnswerRate','CumulativeUserTagAnswerRate','TestAnswerRate',
               'categorize_solvingTime','categorize_ProblemAnswerRate','categorize_TagAnswerRate','categorize_TestAnswerRate',
               'CumulativeUserTagExponentialAverage','UserTagCount',
    ]

In [25]:
params = {'iterations': 2267, 'od_wait': 1948, 'learning_rate': 0.07062651890243886, 'min_data_in_leaf': 11, 'leaf_estimation_iterations': 13, 'l2_leaf_reg': 0.4292148678534886, 'border_count': 80, 'random_strength': 1, 'bagging_temperature': 0.4279553208777898}

In [31]:
data_dir = '../../data/'
file_name = 'FE_v4.1.csv'
test_file_name = 'test_data.csv'

cat_feats = ['userID', 'assessmentItemID','testId', 'KnowledgeTag', 'Month', 'DayOfWeek', 'TimeOfDay', 'categorize_solvingTime', 'categorize_ProblemAnswerRate', 'categorize_TagAnswerRate', 'categorize_TestAnswerRate']
feats = ['userID','assessmentItemID','testId','KnowledgeTag',
               'SolvingTime','CumulativeTime',
               'Month','DayOfWeek','TimeOfDay',
               'problems_cumulative','problems_last7days','problems_last30days',
               'CumulativeUserProblemAnswerRate','CumulativeProblemCount',
               'ProblemAnswerRate','TagAnswerRate','CumulativeUserTagAnswerRate','TestAnswerRate',
               'categorize_solvingTime','categorize_ProblemAnswerRate','categorize_TagAnswerRate','categorize_TestAnswerRate']

In [32]:
 # Time
now = time.localtime()
now_date = time.strftime('%Y%m%d', now)
now_hour = time.strftime('%X', now)
save_time = now_date + '_' + now_hour.replace(':', '')

######################## DATA LOAD
print("### DATA LOAD ###")
df = pd.read_csv(data_dir + file_name, parse_dates=["Timestamp"])
# test = pd.read_csv(data_dir + test_file_name, parse_dates=["Timestamp"])


### DATA LOAD ###


In [33]:
for feature in cat_feats:
        df[feature] = df[feature].astype('category')

In [34]:
train = df[df["answerCode"] >= 0]
test = df[df["answerCode"] == -1]

In [35]:
train_x = train[feats]
train_y = train['answerCode']

In [46]:
train_x.columns, test_x.columns

(Index(['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 'SolvingTime',
        'CumulativeTime', 'Month', 'DayOfWeek', 'TimeOfDay',
        'problems_cumulative', 'problems_last7days', 'problems_last30days',
        'CumulativeUserProblemAnswerRate', 'CumulativeProblemCount',
        'ProblemAnswerRate', 'TagAnswerRate', 'CumulativeUserTagAnswerRate',
        'TestAnswerRate', 'categorize_solvingTime',
        'categorize_ProblemAnswerRate', 'categorize_TagAnswerRate',
        'categorize_TestAnswerRate'],
       dtype='object'),
 Index(['userID', 'assessmentItemID', 'testId', 'KnowledgeTag', 'SolvingTime',
        'CumulativeTime', 'Month', 'DayOfWeek', 'TimeOfDay',
        'problems_cumulative', 'problems_last7days', 'problems_last30days',
        'CumulativeUserProblemAnswerRate', 'CumulativeProblemCount',
        'ProblemAnswerRate', 'TagAnswerRate', 'CumulativeUserTagAnswerRate',
        'TestAnswerRate', 'categorize_solvingTime',
        'categorize_ProblemAnswerRate', 'c

In [39]:
test_x = test[feats]

In [36]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(
                **params, task_type='GPU', devices='cuda',  
                custom_metric = 'AUC', eval_metric = 'AUC',
                objective= 'Logloss'              
            )

In [37]:
model.fit(train_x, train_y, cat_features=cat_feats)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 343ms	remaining: 12m 57s
1:	total: 685ms	remaining: 12m 55s
2:	total: 1.01s	remaining: 12m 45s
3:	total: 1.39s	remaining: 13m 6s
4:	total: 1.69s	remaining: 12m 46s
5:	total: 2s	remaining: 12m 32s
6:	total: 2.32s	remaining: 12m 27s
7:	total: 2.65s	remaining: 12m 29s
8:	total: 2.98s	remaining: 12m 27s
9:	total: 3.39s	remaining: 12m 44s
10:	total: 3.71s	remaining: 12m 40s
11:	total: 4.03s	remaining: 12m 36s
12:	total: 4.37s	remaining: 12m 37s
13:	total: 4.69s	remaining: 12m 35s
14:	total: 5.01s	remaining: 12m 32s
15:	total: 5.38s	remaining: 12m 37s
16:	total: 5.7s	remaining: 12m 34s
17:	total: 6.08s	remaining: 12m 40s
18:	total: 6.44s	remaining: 12m 41s
19:	total: 6.8s	remaining: 12m 43s
20:	total: 7.19s	remaining: 12m 48s
21:	total: 7.57s	remaining: 12m 52s
22:	total: 7.93s	remaining: 12m 53s
23:	total: 8.29s	remaining: 12m 55s
24:	total: 8.63s	remaining: 12m 53s
25:	total: 9.03s	remaining: 12m 58s
26:	total: 9.37s	remaining: 12m 57s
27:	total: 9.69s	remaining: 12m 55s
28:	tota

<catboost.core.CatBoostClassifier at 0x7f1ba7a66050>

In [47]:
pred = model.predict_proba(test_x)
pred

array([[0.36516278, 0.63483722],
       [0.50260002, 0.49739998],
       [0.86136593, 0.13863407],
       ...,
       [0.02437989, 0.97562011],
       [0.01410869, 0.98589131],
       [0.9160535 , 0.0839465 ]])

In [51]:
result = pred[:,1]

In [56]:
test_x['prediction'] = result
submission = test_x["prediction"].reset_index(drop=True).reset_index()

submission.rename(columns={"index": "id"}, inplace=True)

submission_filename = f"cat_{save_time}_submission.csv"
submission.to_csv(submission_filename, index=False)

In [76]:
a = pd.read_csv('submit/output-10.csv') # 나 78 73
b = pd.read_csv('submit/output-11.csv') # 욱
c = pd.read_csv('submit/output-12.csv') # 최고점
d = pd.read_csv('submit/output-13.csv') # 나 79 72
a.head()

Unnamed: 0,id,prediction
0,0,0.690254
1,1,0.653753
2,2,0.238448
3,3,0.785384
4,4,0.271341


In [77]:
c.head()

Unnamed: 0,id,prediction
0,0,0.743928
1,1,0.778528
2,2,0.237336
3,3,0.84887
4,4,0.240068


In [78]:
c.prediction = 0.8*c.prediction + 0.2*d.prediction

In [79]:
c.head()

Unnamed: 0,id,prediction
0,0,0.729398
1,1,0.758722
2,2,0.236018
3,3,0.822441
4,4,0.238703


In [80]:
c.to_csv('ensemble highest+me79.csv', index=False)

In [None]:
c.