In [1]:
# https://www.kaggle.com/code/sarmat/lgbm-stacking-example/notebook

In [2]:
import numpy as np # linear algebra
import pandas as pd
import random
from scipy.sparse.linalg import svds

from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF

import torch
from tqdm import tqdm
from dataset import custom_train_test_split, make_dataset

from sklearn.metrics import RocCurveDisplay, accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve, auc


def get_metric(targets, preds):
    auc = roc_auc_score(targets, preds)
    acc = accuracy_score(targets, np.where(preds >= 0.5, 1, 0))
    precsion = precision_score(targets, np.where(preds >= 0.5, 1, 0))
    recall = recall_score(targets, np.where(preds >= 0.5, 1, 0))
    F1_score = f1_score(targets, np.where(preds >= 0.5, 1, 0))

    print('auc :',auc)
    print('acc :',acc)
    print('precision :',precsion)
    print('recall :',recall)

def test_to_csv(preds, name:str):
    
    result = []
    for n,i in enumerate(preds):
        row = {}    
        row['id'] = n
        row['prediction'] = i
        result.append(row)
    pd.DataFrame(result).to_csv(f'output/{name}.csv', index=None)
    


In [3]:
train_data = pd.read_csv('/opt/ml/input/data/train_data.csv')
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

In [4]:
train_data.drop_duplicates(subset = ["userID", "assessmentItemID"], keep = "last", inplace = True)
train_data.drop(['Timestamp','testId','KnowledgeTag'], axis=1, inplace=True, errors='ignore')

In [5]:
matrix_train = train_data.pivot_table('answerCode', index='userID', columns='assessmentItemID')
matrix_train.fillna(0.5, inplace=True)

In [6]:
user_id2idx = {v:i for i,v in enumerate(matrix_train.index)}
user_idx2id = {i:v for i,v in enumerate(matrix_train.index)}

item_id2idx = {v:i for i,v in enumerate(matrix_train.columns)}
item_idx2id = {i:v for i,v in enumerate(matrix_train.columns)}

In [124]:
def predict(matrix, userid, itemid, user_id2idx, item_id2idx,n=12):
    nmf = NMF(n_components=n, max_iter=1000)
    X = matrix
    nmf.fit(X)
    X_pred = nmf.inverse_transform(nmf.transform(X))

    ret = [X_pred[user_id2idx[u], item_id2idx[i]] for u,i in zip(userid, itemid)]
    return ret

In [132]:
valid_user  = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv').userID.unique()
all_train = pd.read_csv('/opt/ml/input/data/all.csv')
valid_data = all_train[all_train.userID.isin(valid_user)]
userid = sorted(list(set([u for u in valid_data.userID])))
user_id2idx_valid = {v:i for i,v in enumerate(userid)}

matrix_valid = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(valid_data.userID, valid_data.assessmentItemID, valid_data.answerCode):
    user,item = user_id2idx_valid[user],item_id2idx[item]
    matrix_valid[user,item] = a

valid_predict1 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 20)
valid_predict2 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 22)
valid_predict3 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 24)
valid_predict4 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 26)
valid_predict5 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 28)
valid_predict6 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 30)
valid_predict7 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 32)
valid_predict8 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 34)
valid_predict9 = predict(matrix_valid, valid_data.userID, valid_data.assessmentItemID, user_id2idx_valid, item_id2idx, 36)

# item_id2idx는 train에서 사용한 것을 다시 사용한다.
test_data  = pd.read_csv('/opt/ml/input/data/test_data.csv')

userid = sorted(list(set([u for u in test_data.userID])))
user_id2idx_test = {v:i for i,v in enumerate(userid)}

matrix_test = 0.5*np.ones((len(userid), len(item_id2idx)))
for user,item,a in zip(test_data.userID, test_data.assessmentItemID, test_data.answerCode):
    user,item = user_id2idx_test[user],item_id2idx[item]
    if a<0:a=0.5
    matrix_test[user,item] = a

test_data = test_data[test_data.answerCode==-1]

test_predict1 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 20)
test_predict2 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 22)
test_predict3 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 24)
test_predict4 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 26)
test_predict5 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 28)
test_predict6 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 30)
test_predict7 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 32)
test_predict8 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 34)
test_predict9 = predict(matrix_test, test_data.userID, test_data.assessmentItemID, user_id2idx_test, item_id2idx, 36)

# print('Fold no: {}'.format(fold_))
print("AUC NMF1:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict1))))
print("AUC NMF2:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict2))))
print("AUC NMF3:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict3))))
print("AUC NMF4:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict4))))
print("AUC NMF5:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict5)))) 
print("AUC NMF6:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict6)))) 
print("AUC NMF7:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict7)))) 
print("AUC NMF8:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict8)))) 
print("AUC NMF9:{} ".format(get_metric(valid_data.answerCode.to_numpy(), np.array(valid_predict9)))) 



auc : 0.854055734166637
acc : 0.8118275548486912
precision : 0.8239003669083602
recall : 0.9065997777043467
AUC NMF1:None 
auc : 0.8589932043500832
acc : 0.815870591338375
precision : 0.827833509568762
recall : 0.9077893598485987
AUC NMF2:None 
auc : 0.8646303977703766
acc : 0.8194963329225997
precision : 0.8308695222730714
recall : 0.9097059088587822
AUC NMF3:None 
auc : 0.8698873724865015
acc : 0.8232204931166024
precision : 0.8338313978400115
recall : 0.9119468893628526
AUC NMF4:None 
auc : 0.8746891790692226
acc : 0.826873791911565
precision : 0.8365948043667066
recall : 0.9143861335576317
AUC NMF5:None 
auc : 0.8796837377871889
acc : 0.8313577437730546
precision : 0.8403632538646665
recall : 0.9167833218180179
AUC NMF6:None 
auc : 0.8836672438619616
acc : 0.8345425699854734
precision : 0.8430499950368925
recall : 0.9184835831656103
AUC NMF7:None 
auc : 0.8879641983171652
acc : 0.8372156194270463
precision : 0.8446883903428285
recall : 0.9208927874072517
AUC NMF8:None 
auc : 0.8920

In [133]:
new_valid = np.array([valid_predict1, valid_predict2, valid_predict3, valid_predict4, valid_predict5, valid_predict6, valid_predict7, valid_predict8, valid_predict9]).T

new_test = np.array([test_predict1, test_predict2, test_predict3, test_predict4, test_predict5, test_predict6, test_predict7, test_predict8, test_predict9]).T

In [134]:
val = pd.read_csv('/opt/ml/input/data/cv_valid_data.csv')
tail_idx = val.index[val.answerCode==-1].to_numpy()

In [135]:
y_valid = valid_data.answerCode.to_numpy()

valid_tail = [new_valid[i] for i in range(len(new_valid)) if i in tail_idx]
y_tail = [y_valid[i] for i in range(len(y_valid)) if i in tail_idx]

new_valid = [new_valid[i] for i in range(len(new_valid)) if not i in tail_idx]
y_new_valid = [y_valid[i] for i in range(len(y_valid)) if not i in tail_idx]

In [138]:
from catboost import CatBoostClassifier, Pool

train_pool = Pool(new_valid, y_new_valid)
# eval_pool = Pool(valid_tail , y_tail)
eval_pool = Pool(valid_tail, y_tail)

Final_cat = CatBoostClassifier(
            iterations = 500,
            random_seed = 42,
            learning_rate = 0.01,
            loss_function = 'Logloss', 
            custom_metric = ['Logloss','AUC'],
            early_stopping_rounds = 30,
            use_best_model =  True,
            task_type = "GPU",
            bagging_temperature = 1,
            verbose = False)

Final_cat.fit(train_pool, eval_set=eval_pool, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7fbb2908c910>

In [139]:
preds = Final_cat.predict(new_test , prediction_type='Probability')[:,1]
val_preds = Final_cat.predict(valid_tail , prediction_type='Probability')[:,1]

get_metric(y_tail, val_preds)

from datetime import date, datetime, timezone, timedelta

KST = timezone(timedelta(hours=9))
time_record = datetime.now(KST)
_day = str(time_record)[:10]
_time = str(time_record.time())[:8]
now_time = _day+'_'+_time

test_to_csv(new_test.mean(axis=1),f'Stacking_NMF_{now_time}')

auc : 0.8681234100370027
acc : 0.7970430107526881
precision : 0.7956403269754768
recall : 0.7934782608695652


In [91]:
test_to_csv( new_test[:,0], 'NMF_38')
test_to_csv( new_test[:,1], 'NMF_40')
test_to_csv( new_test[:,2], 'NMF_42')
test_to_csv( new_test[:,3], 'NMF_44')
test_to_csv( new_test[:,4], 'NMF_46')
test_to_csv( new_test[:,5], 'NMF_48')
test_to_csv( new_test[:,6], 'NMF_50')
test_to_csv( new_test[:,7], 'NMF_52')


In [32]:
new_test[:,-1]

744