In [36]:
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

0          0
1          1
2          1
3          1
4          1
          ..
2525951    0
2525952    0
2525953    1
2525954    1
2525955    1
Name: inter, Length: 2525956, dtype: int64

In [79]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
import random

FEATS = [
   'KnowledgeTag', 'month', 'hour', 'week', 'elapsed', 'elapsed_cate',
   'assessmentItemID0', 'assessmentItemID1', 'assessmentItemID2',
   'testId0', 'testId1', 'test0_mean', 'test0_std', 'test1_mean',
   'test1_std', 'tag_mean', 'tag_std', 'ass0_mean', 'ass0_std',
   'ass1_mean', 'ass1_std', 'ass2_mean', 'ass2_std',]

cate = ['KnowledgeTag', 'month', 'hour', 'week', 'elapsed_cate', 'testId0', 'testId1',
       'assessmentItemID0', 'assessmentItemID1', 'assessmentItemID2']
# cate = ['KnowledgeTag', 'month', 'hour', 'week', 'week_hour', 'elapsed_cate',

# 'assessmentItemID0', 'assessmentItemID1', 'assessmentItemID2',]

# conti = ['tag_mean', 'tag_std', 'ass0_mean', 'ass0_std', 'elapsed',

# 'ass1_mean', 'ass1_std', 'ass2_mean', 'ass2_std', 'user_total_answer', 'solve_order']

FEATS = cate + conti
df = pd.read_csv('../../data/pkj.csv')

train, test = custom_train_test_split(df)

y_train_cat = train['answerCode']
X_train_cat = train.drop(['answerCode'], axis=1)[FEATS]

y_test_cat = test['answerCode']
X_test_cat = test.drop(['answerCode'], axis=1)[FEATS]

In [80]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

# Best trial:
#   Value: 0.8161062067008433
#   Params: 
#     objective: Logloss
#     colsample_bylevel: 0.09974535513468831
#     depth: 12
#     boosting_type: Ordered
#     bootstrap_type: MVS
catboost_cl = CatBoostClassifier(cat_features=cate, n_estimators=5000, eval_metric ='AUC', 
                                 use_best_model=True, od_type = "Iter", od_wait = 500)

catboost_cl.fit(X_train_cat, y_train_cat, verbose=100, eval_set=(X_test_cat, y_test_cat))

preds = catboost_cl.predict_proba(X_test_cat)[:,1]
acc = accuracy_score(y_test_cat, np.where(preds >= 0.5, 1, 0))
auc = roc_auc_score(y_test_cat, preds)

print(f'VALID AUC : {auc} ACC : {acc}\n')
# 0.801

Learning rate set to 0.099769
0:	test: 0.6941157	best: 0.6941157 (0)	total: 715ms	remaining: 59m 34s
100:	test: 0.7782130	best: 0.7787668 (60)	total: 1m 7s	remaining: 54m 17s
200:	test: 0.7830181	best: 0.7830181 (200)	total: 2m 32s	remaining: 1h 50s
300:	test: 0.7841188	best: 0.7844676 (292)	total: 3m 39s	remaining: 57m 12s
400:	test: 0.7852745	best: 0.7854757 (395)	total: 5m 8s	remaining: 58m 55s
500:	test: 0.7848538	best: 0.7854757 (395)	total: 6m 20s	remaining: 56m 59s
600:	test: 0.7857579	best: 0.7857579 (600)	total: 7m 29s	remaining: 54m 49s
700:	test: 0.7855537	best: 0.7857893 (602)	total: 8m 45s	remaining: 53m 42s
800:	test: 0.7853365	best: 0.7857893 (602)	total: 9m 55s	remaining: 52m 1s
900:	test: 0.7854818	best: 0.7857893 (602)	total: 11m 10s	remaining: 50m 48s
1000:	test: 0.7854084	best: 0.7857893 (602)	total: 12m 22s	remaining: 49m 25s
1100:	test: 0.7858375	best: 0.7858551 (1097)	total: 13m 33s	remaining: 48m
1200:	test: 0.7862146	best: 0.7862436 (1197)	total: 14m 45s	remain

In [48]:
def custom_train_test_split_fold_5(df, ratio=0.2, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data, k = 0, 0
    user_ids =[[],[],[],[],[]]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            k += 1
            max_train_data_len += ratio*len(df)
        user_ids[k].append(user_id)

    return user_ids


In [55]:
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import numpy as np

user_id = custom_train_test_split_fold_5(df)

for i in tqdm(range(5)):
    u = []
    for j in range(5):
        if j != i:
            u += user_id[j]
        
    train = df[df['userID'].isin(u)]
    test = df[df['userID'].isin(u) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    
    y_train_cat = train['answerCode']
    X_train_cat = train.drop(['answerCode'], axis=1)[FEATS]

    y_test_cat = test['answerCode']
    X_test_cat = test.drop(['answerCode'], axis=1)[FEATS]

   
    # catboost_cl = CatBoostClassifier(cat_features=cate, n_estimators=5000, use_best_model=True,
    # eval_metric ='AUC')
    catboost_cl = CatBoostClassifier(cat_features=cate, n_estimators=5000, eval_metric ='AUC', 
                                     use_best_model=True, od_type = "Iter", od_wait = 100)
    
    catboost_cl.fit(X_train_cat, y_train_cat, verbose=500, eval_set=(X_test_cat, y_test_cat))
    preds = catboost_cl.predict_proba(X_test_cat)[:,1]
    acc = accuracy_score(y_test_cat, np.where(preds >= 0.5, 1, 0))
    auc = roc_auc_score(y_test_cat, preds)
    
    print(f'VALID AUC : {auc} ACC : {acc}\n')
    
    catboost_cl.save_model(f'catboost_info/model/catboost_{i}')

  0%|          | 0/5 [00:00<?, ?it/s]

Learning rate set to 0.103118
0:	test: 0.6934543	best: 0.6934543 (0)	total: 868ms	remaining: 1h 12m 20s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8048703488
bestIteration = 273

Shrink model to first 274 iterations.
VALID AUC : 0.8048703488455524 ACC : 0.7230769230769231



 20%|██        | 1/5 [04:58<19:52, 298.20s/it]

Learning rate set to 0.103122
0:	test: 0.7138384	best: 0.7138384 (0)	total: 771ms	remaining: 1h 4m 15s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8081097349
bestIteration = 219

Shrink model to first 220 iterations.


 40%|████      | 2/5 [09:11<14:14, 284.83s/it]

VALID AUC : 0.8081097349315728 ACC : 0.7336956521739131

Learning rate set to 0.103104
0:	test: 0.6965904	best: 0.6965904 (0)	total: 757ms	remaining: 1h 3m 5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7934909084
bestIteration = 189

Shrink model to first 190 iterations.


 60%|██████    | 3/5 [12:58<08:54, 267.45s/it]

VALID AUC : 0.7934909084277142 ACC : 0.7191087103308575

Learning rate set to 0.103119
0:	test: 0.7250497	best: 0.7250497 (0)	total: 844ms	remaining: 1h 10m 20s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8075652673
bestIteration = 119

Shrink model to first 120 iterations.


 80%|████████  | 4/5 [15:53<03:59, 239.74s/it]

VALID AUC : 0.8075652673294167 ACC : 0.7318212141427618

Learning rate set to 0.103109
0:	test: 0.6907348	best: 0.6907348 (0)	total: 843ms	remaining: 1h 10m 13s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.7925107527
bestIteration = 217

Shrink model to first 218 iterations.


100%|██████████| 5/5 [20:07<00:00, 241.51s/it]

VALID AUC : 0.792510752688172 ACC : 0.7163879598662207






In [51]:
test = pd.read_csv('../../data/test.csv')
sub = test[test['answerCode'] == -1]

p = []

for i in tqdm(range(5)):
    catboost_cl = CatBoostClassifier(cat_features=cate, n_estimators=2000)
    catboost_cl.load_model(f'catboost_info/model/catboost_{i}')
    preds = catboost_cl.predict_proba(sub[FEATS])[:,1]
    p.append(preds)
    
s = pd.read_csv('output/submission.csv')
m = (p[0] + p[1] + p[2] + p[3] + p[4])/5
s['prediction'] = m

s.to_csv('output/submission_cat_hyp.csv', index=False)

100%|██████████| 5/5 [00:00<00:00,  6.32it/s]


In [6]:
test = pd.read_csv('../../data/test.csv')
sub = test[test['answerCode'] == -1]
sub['userID'] = sub['userID'].map(cate2label)

import time
from datetime import datetime
def convert_time(s):
    timestamp = time.mktime(
        datetime.strptime(s, "%Y-%m-%d %H:%M:%S").timetuple()
    )
    return int(timestamp)

sub['Timestamp'] = sub['Timestamp'].apply(convert_time)

preds = catboost_cl.predict_proba(sub[FEATS])[:,1]

s = pd.read_csv('output/submission.csv')
s['prediction'] = preds

s.to_csv('output/submission_cat_2.0.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['userID'] = sub['userID'].map(cate2label)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub['Timestamp'] = sub['Timestamp'].apply(convert_time)


In [3]:
test = pd.read_csv('../../data/test.csv')
test.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'month', 'hour', 'week', 'elapsed', 'elapsed_cate',
       'assessmentItemID0', 'assessmentItemID1', 'assessmentItemID2',
       'testId0', 'testId1', 'test0_mean', 'test0_std', 'test1_mean',
       'test1_std', 'tag_mean', 'tag_std', 'ass0_mean', 'ass0_std',
       'ass1_mean', 'ass1_std', 'ass2_mean', 'ass2_std', 'user_correct_answer',
       'user_total_answer', 'user_acc', 'recAccuracy', 'recCount'],
      dtype='object')

In [None]:
FEATS = ['Timestamp', 'userID',
       'KnowledgeTag', 'assessmentItemID0', 'assessmentItemID1',
       'assessmentItemID2', 'month', 'hour', 'week', 'elapsed', 'elapsed_cate',
       'tag_mean', 'tag_std', 'ass0_mean', 'ass0_std', 'ass1_mean', 'ass1_std',
       'ass2_mean', 'ass2_std', 'user_total_answer']

In [54]:
a = pd.read_csv('output/output (1).csv')
b = pd.read_csv('output/output (2).csv')

c = (a['prediction'] + b['prediction'])/2
b['prediction'] = c
b.to_csv('output/p.csv', index=False)