In [2]:
import random
import pandas as pd
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score


In [3]:
# Read the train and test CSV files (paths are relative to this notebook's folder).
train, test = (
    pd.read_csv('../../data/train_data.csv'),
    pd.read_csv('../../data/test_data.csv'),
)

In [4]:
# Test rows with answerCode == -1 form `test_set` (used later as the prediction
# input); every other test row is appended to the training rows to form `df`.
df = pd.concat([train, test.loc[test['answerCode'].ne(-1)]], ignore_index=True)
test_set = test.loc[test['answerCode'].eq(-1)]

In [5]:
def k_fold_split(df, fold, seed=None):
    """Split ``df`` into ``fold`` user-disjoint train/validation pairs.

    Users are shuffled and handed out fold by fold. A fold stops taking users
    as soon as the next user would push its row count above ``len(df) / fold``;
    that user becomes the first user of the following fold. The last fold
    receives every user that is still unassigned.

    For each fold the training frame holds all rows of the users *not* in the
    fold, and the validation frame holds, for the users in the fold, only the
    rows whose ``userID`` differs from the next row's ``userID`` (the last row
    of each contiguous run of a user).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``userID`` column.
    fold : int
        Number of folds.
    seed : int, optional
        When given, the user shuffle uses a private ``random.Random(seed)`` so
        the split is reproducible. When ``None`` (default) the module-level
        ``random.shuffle`` is used, exactly as before.

    Returns
    -------
    tuple[list[pandas.DataFrame], list[pandas.DataFrame]]
        ``(train_set, val_set)``, each of length ``fold``.
    """
    user_counts = df['userID'].value_counts()
    users = list(zip(user_counts.index, user_counts))  # (user ID, number of rows of that user)
    if seed is None:
        random.shuffle(users)
    else:
        random.Random(seed).shuffle(users)

    max_rows_per_fold = len(df) / fold
    val_idx = [[] for _ in range(fold)]

    idx = 0
    for k in range(fold - 1):
        rows_in_fold = 0
        for user_id, count in users[idx:]:
            rows_in_fold += count
            # Always accept the first user of a fold: a user with more rows than
            # the quota would otherwise never be consumed and would leave this
            # fold and every later non-final fold empty.
            if val_idx[k] and max_rows_per_fold < rows_in_fold:
                break
            val_idx[k].append(user_id)
            idx += 1

    # Whatever is left belongs to the final fold.
    val_idx[fold - 1].extend(user_id for user_id, _ in users[idx:])

    train_set = []
    val_set = []
    for fold_users in val_idx:
        in_fold = df['userID'].isin(fold_users)
        train_set.append(df[~in_fold])
        val = df[in_fold]
        # Keep only the rows where the next row belongs to a different user.
        val_set.append(val[val['userID'] != val['userID'].shift(-1)])
    return train_set, val_set

In [36]:
fold = 5
# Bind the folds to their own names. Assigning them to `train` / `test` would
# overwrite the raw DataFrames read from CSV, and the cell that builds
# `df` / `test_set` from them could then no longer be re-run.
check_train, check_valid = k_fold_split(df, fold)
print(f"총 user 수 : {df['userID'].nunique()}")
for i in range(fold):
    print(f"train_{i} user 수 : {check_train[i]['userID'].nunique()}")
    print(f"test_{i} user 수 : {check_valid[i]['userID'].nunique()}")
    # An empty set means no user appears on both sides of the fold.
    print(f"겹치는 유저가 있나?? : {set(check_train[i]['userID'])&set(check_valid[i]['userID'])}")

총 user 수 : 7442
train_0 user 수 : 5884
test_0 user 수 : 1558
겹치는 유저가 있나?? : set()
train_1 user 수 : 5956
test_1 user 수 : 1486
겹치는 유저가 있나?? : set()
train_2 user 수 : 5929
test_2 user 수 : 1513
겹치는 유저가 있나?? : set()
train_3 user 수 : 6028
test_3 user 수 : 1414
겹치는 유저가 있나?? : set()
train_4 user 수 : 5971
test_4 user 수 : 1471
겹치는 유저가 있나?? : set()


In [9]:
def get_stacking_base_datasets(model, FEATS, X_train, X_valid, X_test, n_folds):
    """Produce out-of-fold and test predictions of one base model for stacking.

    For every ``(train, valid)`` pair the same ``model`` object is refit on the
    training fold, then used to predict the positive-class probability of the
    validation fold and of ``X_test``. Progress and AUC values are printed.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
    FEATS : list of str
        Columns to select from each frame. ``'answerCode'`` (the label) is
        removed before the data reaches the model; it may be listed or omitted.
    X_train, X_valid : sequences of pandas.DataFrame
        Per-fold training / validation frames; each needs an ``answerCode``
        column plus the columns in ``FEATS``.
    X_test : pandas.DataFrame
        Frame to predict; needs the columns in ``FEATS``.
    n_folds : int
        Number of columns allocated for the per-fold test predictions; the
        returned test prediction is the mean over these columns.

    Returns
    -------
    train_fold_pred : numpy.ndarray, shape (total validation rows, 1)
        Validation predictions, stacked in fold order.
    test_pred_mean : numpy.ndarray, shape (len(X_test), 1)
        Mean of the per-fold test predictions.
    """
    def _features(frame):
        # The label must never be fed to the model; tolerate FEATS without it.
        return frame[FEATS].drop(columns='answerCode', errors='ignore')

    valid_len = [len(v) for v in X_valid]
    y_valid = pd.concat([v['answerCode'] for v in X_valid])

    train_fold_pred = np.zeros((sum(valid_len), 1))
    test_pred = np.zeros((X_test.shape[0], n_folds))
    # The test features are identical for every fold, so build them once.
    X_test_feats = _features(X_test)
    print(model.__class__.__name__ , ' model 시작 ')

    start = 0
    for folder_counter, (train, valid) in enumerate(zip(X_train, X_valid)):

        print('\t 폴드 세트: ',folder_counter,' 시작 ')
        model.fit(_features(train), train['answerCode'])

        # Predict the validation fold once and reuse it for both the
        # out-of-fold matrix and the per-fold AUC.
        valid_proba = model.predict_proba(_features(valid))[:, 1]
        stop = start + valid_len[folder_counter]
        train_fold_pred[start:stop, 0] = valid_proba
        start = stop

        print('\t',folder_counter,"fold auc : ",roc_auc_score(valid['answerCode'],valid_proba))

        test_pred[:, folder_counter] = model.predict_proba(X_test_feats)[:, 1]

    print("\tvalid auc : ",roc_auc_score(y_valid,train_fold_pred))

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [15]:
# Base learners, constructed with no arguments.
lgbm, catboost = LGBMClassifier(), CatBoostClassifier()

# Train/validation folds consumed by the stacking calls in later cells.
fold = 5
fold_train, fold_valid = k_fold_split(df, fold)

In [10]:
# Columns selected for the base model. 'answerCode' is the label;
# get_stacking_base_datasets drops it from the feature frame before fitting.
FEATS = ['userID','KnowledgeTag','answerCode']
train_fold_pred, test_pred_mean = get_stacking_base_datasets(
    model=lgbm,
    FEATS=FEATS,
    X_train=fold_train,
    X_valid=fold_valid,
    X_test=test_set,
    n_folds=fold,
)

LGBMClassifier  model 시작 
	 폴드 세트:  0  시작 
	 0 fold auc :  0.6242959449404761
	 폴드 세트:  1  시작 
	 1 fold auc :  0.633947202601849
	 폴드 세트:  2  시작 
	 2 fold auc :  0.640838438248134
	 폴드 세트:  3  시작 
	 3 fold auc :  0.6221708899589136
	 폴드 세트:  4  시작 
	 4 fold auc :  0.5898191083074341
	valid auc :  0.6220098221712724


In [88]:
# Same experiment with CatBoost. k_fold_split shuffles the users, so this call
# replaces fold_train / fold_valid with a fresh split.
fold = 5
fold_train, fold_valid = k_fold_split(df, fold)
FEATS = ['userID','KnowledgeTag','answerCode']
train_fold_pred, test_pred_mean = get_stacking_base_datasets(
    model=catboost,
    FEATS=FEATS,
    X_train=fold_train,
    X_valid=fold_valid,
    X_test=test_set,
    n_folds=fold,
)

CatBoostClassifier  model 시작 
	 폴드 세트:  0  시작 
Learning rate set to 0.265702
0:	learn: 0.6627127	total: 219ms	remaining: 3m 38s
1:	learn: 0.6460414	total: 373ms	remaining: 3m 6s
2:	learn: 0.6375350	total: 509ms	remaining: 2m 49s
3:	learn: 0.6323491	total: 647ms	remaining: 2m 41s
4:	learn: 0.6292678	total: 769ms	remaining: 2m 32s
5:	learn: 0.6275728	total: 886ms	remaining: 2m 26s
6:	learn: 0.6263297	total: 1.01s	remaining: 2m 23s
7:	learn: 0.6252297	total: 1.14s	remaining: 2m 21s
8:	learn: 0.6242095	total: 1.26s	remaining: 2m 19s
9:	learn: 0.6236789	total: 1.38s	remaining: 2m 16s
10:	learn: 0.6229650	total: 1.5s	remaining: 2m 14s
11:	learn: 0.6226077	total: 1.6s	remaining: 2m 11s
12:	learn: 0.6222268	total: 1.71s	remaining: 2m 9s
13:	learn: 0.6217244	total: 1.83s	remaining: 2m 9s
14:	learn: 0.6214816	total: 1.96s	remaining: 2m 8s
15:	learn: 0.6212179	total: 2.06s	remaining: 2m 6s
16:	learn: 0.6208464	total: 2.2s	remaining: 2m 7s
17:	learn: 0.6205934	total: 2.34s	remaining: 2m 7s
18:	lea

KeyboardInterrupt: 

In [32]:
# stacking_pred is defined in a later cell of this notebook; that cell has to
# be executed before this one.
fold = 5
# NOTE(review): both entries are the same `lgbm` object with the same feature
# list, so the two stacked columns come from one identical model - confirm
# whether a second, different model was intended.
models = [lgbm] * 2
FEATS = [['userID','KnowledgeTag','answerCode'] for _ in models]
test_pred = stacking_pred(models, FEATS, fold_train, fold_valid, test_set, fold)


LGBMClassifier  model 시작 
	 폴드 세트:  0  시작 
	 0 fold auc :  0.6176656635525212
	 폴드 세트:  1  시작 
	 1 fold auc :  0.6043386497094793
	 폴드 세트:  2  시작 
	 2 fold auc :  0.6385521903220042
	 폴드 세트:  3  시작 
	 3 fold auc :  0.6131782723414019
	 폴드 세트:  4  시작 
	 4 fold auc :  0.6342386642547156
	valid auc :  0.6213462195756658
LGBMClassifier  model 시작 
	 폴드 세트:  0  시작 
	 0 fold auc :  0.6176656635525212
	 폴드 세트:  1  시작 
	 1 fold auc :  0.6043386497094793
	 폴드 세트:  2  시작 
	 2 fold auc :  0.6385521903220042
	 폴드 세트:  3  시작 
	 3 fold auc :  0.6131782723414019
	 폴드 세트:  4  시작 
	 4 fold auc :  0.6342386642547156
	valid auc :  0.6213462195756658
final train auc :  0.6582658762846436


In [31]:
def stacking_pred(models,FEATS,X_train, X_valid, X_test, n_folds):
    """Stack several base models and return the meta-model's test predictions.

    Each ``(model, feature list)`` pair is run through
    ``get_stacking_base_datasets``; its out-of-fold predictions become one
    column of the meta training matrix and its averaged test predictions become
    the matching column of the meta test matrix. An ``LGBMClassifier`` is then
    fit on the meta training matrix against the concatenated ``answerCode``
    values of the validation folds. The AUC of the meta-model on that same
    training matrix is printed.

    Parameters
    ----------
    models : sequence of estimators
    FEATS : sequence of feature-name lists, paired with ``models`` by position
    X_train, X_valid : sequences of per-fold DataFrames
    X_test : DataFrame to predict
    n_folds : int, forwarded to ``get_stacking_base_datasets``

    Returns
    -------
    numpy.ndarray
        Positive-class probability for every row of ``X_test``.
    """
    # Meta-level target: validation labels in fold order.
    fold_targets = [part['answerCode'] for part in X_valid]
    stack_y_train = fold_targets[0]
    if len(fold_targets) > 1:
        stack_y_train = pd.concat(fold_targets)
    valid_len = sum(len(part) for part in X_valid)

    n_models = len(models)
    stack_X_train = np.zeros((valid_len, n_models))
    stack_X_test = np.zeros((len(X_test), n_models))

    # One column per base model.
    for col, (base_model, base_feats) in enumerate(zip(models, FEATS)):
        oof_pred, test_mean = get_stacking_base_datasets(
            base_model, base_feats, X_train, X_valid, X_test, n_folds
        )
        stack_X_train[:, col] = oof_pred.reshape(-1)
        stack_X_test[:, col] = test_mean.reshape(-1)

    final_model = LGBMClassifier()
    final_model.fit(stack_X_train, stack_y_train)

    # Score on the data the meta-model was fit on (a training score).
    train_preds = final_model.predict_proba(stack_X_train)[:, 1]
    print("final train auc : ",roc_auc_score(stack_y_train,train_preds))

    return final_model.predict_proba(stack_X_test)[:, 1]


In [None]:
import os
from datetime import datetime
from pytz import timezone

# Timestamp (Asia/Seoul) appended to the output file name.
now = datetime.now(tz = timezone('Asia/Seoul'))
date_str = now.strftime('%m-%d-%H:%M:%S')

# Start from the sample file and fill in its 'prediction' column with the stacked predictions.
submission = pd.read_csv('../../data/sample_submission.csv')
submission['prediction'] = test_pred

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs('./submission', exist_ok=True)
# index=False keeps the columns of the sample file; the default would
# prepend the DataFrame index as an extra unnamed column.
submission.to_csv('./submission/stacking'+date_str+'.csv', index=False)