In [94]:
import sys
import os

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


from tabpfn import TabPFNClassifier

sys.path.append(r"../")
from feature_engineering import feature_engineering


In [95]:
def load_raw_data(basepath="../../data/",IS_CUSTOM=True):
    """Load data for xgboost"""
    path1 = os.path.join(basepath, "train_data.csv")
    path2 = os.path.join(basepath, "test_data.csv")
    if IS_CUSTOM:
        print("Load custom...")
        path2 = os.path.join(basepath, "custom_test_data.csv")
    data1 = pd.read_csv(path1)
    data2 = pd.read_csv(path2)
    data = pd.concat([data1, data2])
    data = data.sort_values(["userID", "Timestamp"])
    data.drop_duplicates(
        subset=["userID", "assessmentItemID"], keep="last", inplace=True
    )
    data = feature_engineering(data)
    return data


data = load_raw_data()
data

Load custom...


Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,userAnswerRate,userSolvedLen,testAnswerRate,testSolvedLen,...,mid3,last3,timeConcentrationRate,timeConcentrationCount,timeConcentrationLevel,user_correct_answer,user_total_answer,user_acc,monthAnswerRate,monthSolvedCount
0,0,A060001001,A060000001,1,2020-03-24 00:17:11,7224,0.630872,745,0.951701,1470,...,001,001,0.650422,109984,2,,0,,0.681427,189115
1,0,A060001002,A060000001,1,2020-03-24 00:17:14,7225,0.630872,745,0.951701,1470,...,001,002,0.650422,109984,2,1.0,1,1.000000,0.681427,189115
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,0.630872,745,0.951701,1470,...,001,003,0.650422,109984,2,2.0,2,1.000000,0.681427,189115
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,0.630872,745,0.951701,1470,...,001,004,0.650422,109984,2,3.0,3,1.000000,0.681427,189115
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,0.630872,745,0.951701,1470,...,001,005,0.650422,109984,2,4.0,4,1.000000,0.681427,189115
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2475969,7441,A030071005,A030000071,0,2020-06-05 06:50:21,438,0.555556,9,0.666892,1480,...,071,005,0.628544,182506,0,1.0,4,0.250000,0.651950,248806
2475970,7441,A040165001,A040000165,1,2020-08-21 01:06:39,8836,0.555556,9,0.649529,1167,...,165,001,0.644426,149454,1,1.0,5,0.200000,0.656889,347095
2475971,7441,A040165002,A040000165,1,2020-08-21 01:06:50,8836,0.555556,9,0.649529,1167,...,165,002,0.644426,149454,1,2.0,6,0.333333,0.656889,347095
2475972,7441,A040165003,A040000165,1,2020-08-21 01:07:36,8836,0.555556,9,0.649529,1167,...,165,003,0.644426,149454,1,3.0,7,0.428571,0.656889,347095


In [96]:
data.columns

Index(['userID', 'assessmentItemID', 'testId', 'answerCode', 'Timestamp',
       'KnowledgeTag', 'userAnswerRate', 'userSolvedLen', 'testAnswerRate',
       'testSolvedLen', 'tagAnswerRate', 'tagSolvedLen', 'itemAnswerRate',
       'itemSolvedLen', 'year', 'month', 'day', 'hour', 'minute', 'second',
       'first3', 'mid3', 'last3', 'timeConcentrationRate',
       'timeConcentrationCount', 'timeConcentrationLevel',
       'user_correct_answer', 'user_total_answer', 'user_acc',
       'monthAnswerRate', 'monthSolvedCount'],
      dtype='object')

In [97]:
data["userID"].nunique()

7442

In [99]:
def preprocess_tabpfn_data(
        data,
        drops = [
                'userID', 'assessmentItemID', 'testId', 'Timestamp',
                'KnowledgeTag', 'userSolvedLen', 
                'testSolvedLen', 'tagSolvedLen', 
                'itemSolvedLen', 'year', 'day', 'minute', 'second',
                'mid3', 'last3', 
                'timeConcentrationCount', 'monthSolvedCount'
            ]
    ):
    data = data.drop(drops, axis=1)
    train = data[data["answerCode"]!=-1]
    test = data[data["answerCode"]==-1]
    return train, test

train, test = preprocess_tabpfn_data(data.loc[s:s+step])

In [100]:
test

Unnamed: 0,answerCode,userAnswerRate,testAnswerRate,tagAnswerRate,itemAnswerRate,month,hour,first3,timeConcentrationRate,timeConcentrationLevel,user_correct_answer,user_total_answer,user_acc,monthAnswerRate
10559,-1,0.693769,0.454545,0.491056,0.457831,12,4,70,0.633334,1,914.0,1315,0.695057,0.493275
14851,-1,0.817315,0.436116,0.413959,0.367347,10,5,90,0.624263,0,1030.0,1258,0.81876,0.668669


In [81]:
for col in train.columns:
    train[col]=train[col].astype(float)
    test[col]=test[col].astype(float)

In [82]:
# X, y = train.drop("answerCode",axis=1), train["answerCode"]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# classifier = TabPFNClassifier(device='cuda', N_ensemble_configurations=4)
# classifier.fit(X_train, y_train, overwrite_warning=True)

In [83]:



from sklearn.model_selection import KFold

X = np.array(df.iloc[:, :-1]) # class 열 제외한 feature 열들 모음 -> array 변환
y = df['class']

# split 개수, 셔플 여부 및 seed 설정
kf = KFold(n_splits = 5, shuffle = True, random_state = 50)

# split 개수 스텝 만큼 train, test 데이터셋을 매번 분할
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

accuracy_history = []

# K-fold 검증 과정으로 실제 랜덤 포레스트 모델을 학습하여 정확도 평균을 내는 방법
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    model = TabPFNClassifier(device='cuda', N_ensemble_configurations=4)
    model.fit(X_train, y_train) # 모델 학습
    y_pred = model.predict(X_test) # 예측 라벨
    accuracy_history.append(accuracy_score(y_pred, y_test)) # 정확도 측정 및 기록

print("각 분할의 정확도 기록 :", accuracy_history)
print("평균 정확도 :", np.mean(accuracy_history))




Loading model that can be used for inference only
Using a Transformer with 25.82 M parameters


In [84]:
y_eval, p_eval = classifier.predict(X_test, return_winning_probability=True)
print('Accuracy', accuracy_score(y_test, y_eval))

Accuracy 0.8103030303030303


In [85]:
y_eval, p_eval

(array([1., 1., 1., ..., 1., 1., 1.]),
 array([0.9117074 , 0.96245426, 0.9601697 , ..., 0.8045784 , 0.8854853 ,
        0.8367717 ], dtype=float32))

In [98]:
test.drop(["answerCode"],axis=1)

Unnamed: 0,userAnswerRate,testAnswerRate,tagAnswerRate,itemAnswerRate,month,hour,first3,timeConcentrationRate,timeConcentrationLevel,user_correct_answer,user_total_answer,user_acc,monthAnswerRate
10559,0.693769,0.454545,0.491056,0.457831,12.0,4.0,70.0,0.633334,1.0,914.0,1315.0,0.695057,0.493275
14851,0.817315,0.436116,0.413959,0.367347,10.0,5.0,90.0,0.624263,0.0,1030.0,1258.0,0.81876,0.668669


In [86]:
y_eval, p_eval = classifier.predict(test.drop("answerCode",axis=1), return_winning_probability=True)
print('Accuracy', accuracy_score(y_test, y_eval))

ValueError: Found input variables with inconsistent numbers of samples: [1650, 2]

In [101]:
data = load_raw_data()
train, test = preprocess_tabpfn_data(data)
for col in train.columns:
    train[col]=train[col].astype(float)
    test[col]=test[col].astype(float)


Load custom...


In [110]:
sub = pd.read_csv("/opt/ml/input/data/sample_submission.csv")
sub["prediction"]=p_eval
sub.to_csv("tabpfn.csv")

In [111]:
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score

def get_accuracy(PRED_PATH = "/opt/ml/input/code/TabPfn/tabpfn.csv"):
    threshold=0.5
    ANSWER_PATH = "../../data/custom_answer.csv"

    submission_result = pd.read_csv(PRED_PATH)
    answer = pd.read_csv(ANSWER_PATH)

    y_pred, y = submission_result["prediction"], answer["prediction"]

    return f"accuracy_score: {accuracy_score(y,y_pred.apply(lambda x: 1 if x > threshold else 0))}\nroc  auc_score: {roc_auc_score(y,y_pred)}"

if __name__ == "__main__":
    print(get_accuracy())


accuracy_score: 0.5551075268817204
roc  auc_score: 0.5804993306657498
