# Tabular Models(DKT)

In [1]:
# Import Modules
import pandas as pd
import numpy as np
import os
import random

# Modeling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score

In [14]:
# 파일 불러오기 

data_dir = '/opt/ml/input/DKT/data'
csv_file_path = os.path.join(data_dir,'train_data.csv')
df = pd.read_csv(csv_file_path)

### Train/Valid 데이터 전처리

In [15]:
def feature_engineering(df):
    
    #유저별 시퀀스를 고려하기 위해 아래와 같이 정렬
    df.sort_values(by=['userID','Timestamp'], inplace=True)
    
    #유저들의 문제 풀이수, 정답 수, 정답률을 시간순으로 누적해서 계산
    df['user_correct_answer'] = df.groupby('userID')['answerCode'].transform(lambda x: x.cumsum().shift(1))
    df['user_total_answer'] = df.groupby('userID')['answerCode'].cumcount()
    df['user_acc'] = df['user_correct_answer']/df['user_total_answer']

    # => Null 값 생김
    df = df.fillna(0)

    # testId와 KnowledgeTag의 전체 정답률은 한번에 계산
    # 아래 데이터는 제출용 데이터셋에 대해서도 재사용
    correct_t = df.groupby(['testId'])['answerCode'].agg(['mean', 'sum'])
    correct_t.columns = ["test_mean", 'test_sum']
    correct_k = df.groupby(['KnowledgeTag'])['answerCode'].agg(['mean', 'sum'])
    correct_k.columns = ["tag_mean", 'tag_sum']

    df = pd.merge(df, correct_t, on=['testId'], how="left")
    df = pd.merge(df, correct_k, on=['KnowledgeTag'], how="left")
    
    return df

# train과 test 데이터셋은 사용자 별로 묶어서 분리를 해주어야함
random.seed(42)
def custom_train_test_split(df, ratio=0.7, split=True):
    
    users = list(zip(df['userID'].value_counts().index, df['userID'].value_counts()))
    random.shuffle(users)
    
    max_train_data_len = ratio*len(df)
    sum_of_train_data = 0
    user_ids =[]

    for user_id, count in users:
        sum_of_train_data += count
        if max_train_data_len < sum_of_train_data:
            break
        user_ids.append(user_id)


    train = df[df['userID'].isin(user_ids)]
    test = df[df['userID'].isin(user_ids) == False]

    #test데이터셋은 각 유저의 마지막 interaction만 추출
    test = test[test['userID'] != test['userID'].shift(-1)]
    return train, test

def scaling(df, num_cols, target):
    # 사용할 변수만 남기기
    df = df.drop(drop_cols, axis=1)

    # 독립변수, 종속변수 나누기
    y = df[target]
    df = df.drop(target, axis=1)

    # 수치형이랑 범주형 칼럼 나누기
    X_num = df[num_cols]
    X_cat = df.drop(num_cols, axis=1)

    # 수치형 변수 스케일링 후 다시 합치기
    scaler = StandardScaler()
    scaler.fit(X_num)
    X_scaled = scaler.transform(X_num)
    X_scaled = pd.DataFrame(data=X_scaled, index=X_num.index, columns=X_num.columns)
    X = pd.concat([X_scaled, X_cat], axis=1)

    return X, y    
    

In [16]:
# 사용할 수치형 변수, 카테고리 변수, target, 사용 안 할 변수
num_cols = ['user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum','KnowledgeTag']
cat_cols = []
target = 'answerCode'
use_cols = num_cols + cat_cols + [target]
drop_cols = [x for x in df.columns if x not in use_cols]

# feature engineering
df = feature_engineering(df)

# 카테고리 변수 더미변수화
df = pd.get_dummies(df, columns=cat_cols ,drop_first=True)

# 유저별 분리
train, test = custom_train_test_split(df, ratio=0.7)

# 사용 안 할 변수 drop하고 Train, Test 스케일링 후 X, y 값 분리
X_train, y_train = scaling(train, num_cols=num_cols, target=target)
X_test, y_test = scaling(test, num_cols=num_cols, target=target)

### 사용할 모델들

In [17]:
def grid_searching(model, params, cv, is_grid=True): # grid search 하거나 안 하는 함수
    model = model
    if not is_grid: 
        model.fit(X_train, y_train)
        predict = model.predict(X_test)
        print(f"ACC:{accuracy_score(y_test, predict)}, AUC:{roc_auc_score(y_test,predict)}")

    else:
        grid_model = GridSearchCV(model, param_grid=params, cv=cv, refit=True)
        grid_model.fit(X_train, y_train)
        print('best parameters : ', grid_model.best_params_)
        print('best score : ', grid_model.best_score_)
        best_model = grid_model.best_estimator_
        predict = best_model.predict(X_test)        
        print(f"ACC:{accuracy_score(y_test, predict)}, AUC:{roc_auc_score(y_test,predict)}")

#### Logistic Regression

In [2]:
data_dir = '/opt/ml/input/DKT/data'

own_df = pd.read_csv(os.path.join(data_dir, 'own_df.csv'))

In [None]:
# 사용할 수치형 변수, 카테고리 변수, target, 사용 안 할 변수
num_cols = ['user_correct_answer', 'user_total_answer', 
         'user_acc', 'test_mean', 'test_sum', 'tag_mean','tag_sum','KnowledgeTag']
cat_cols = []
target = 'answerCode'
use_cols = num_cols + cat_cols + [target]
drop_cols = [x for x in df.columns if x not in use_cols]

# 카테고리 변수 더미변수화
df = pd.get_dummies(df, columns=cat_cols ,drop_first=True)

# 유저별 분리
train, test = custom_train_test_split(df, ratio=0.7)

# 사용 안 할 변수 drop하고 Train, Test 스케일링 후 X, y 값 분리
X_train, y_train = scaling(train, num_cols=num_cols, target=target)
X_test, y_test = scaling(test, num_cols=num_cols, target=target)

In [6]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(C=0.01, penalty='l2', max_iter=500, solver='saga')
params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}
grid_searching(model=model, params=params, cv=3, is_grid=False)

ACC:0.5779770802192327, AUC:0.5864594514013118


#### Decision Tree

In [20]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier().fit(X_train, y_train)
params = {
    'max_depth': [2, 3],
    'min_samples_split': [2, 3]
}
grid_searching(model=model, params=params, cv=3, is_grid=True)

best parameters :  {'max_depth': 2, 'min_samples_split': 2}
best score :  0.7025370124142819
ACC:0.5854509217737918, AUC:0.5928488372093024


DecisionTreeClassifier(max_depth=2)

#### Naive Bayes

In [24]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB().fit(X_train, y_train)
grid_searching(model=model, params=params, cv=3, is_grid=False)

ACC:0.6128550074738416, AUC:0.6174850924269529


#### LDA

In [26]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis().fit(X_train, y_train)
params = {
    'solver': ['svd', 'lsqr', 'eigen']
}
grid_searching(model=model, params=params, cv=3, is_grid=True)

best parameters :  {'solver': 'svd'}
best score :  0.710780836953719
ACC:0.57847533632287, AUC:0.5866607036374478


#### QDA

In [28]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
params = {
    'reg_param': (0.01, 0.1), 
    'store_covariance': (True, False),
    'tol': (0.01, 0.1), 
}
grid_searching(model=model, params=params, cv=3, is_grid=True)

best parameters :  {'reg_param': 0.1, 'store_covariance': True, 'tol': 0.01}
best score :  0.7072849661162168
ACC:0.5834578973592427, AUC:0.5909108527131783


#### SVM

In [30]:
from sklearn.svm import SVC

model = SVC(kernel='linear').fit(X_train, y_train)

#### Gaussian Process

In [None]:
from sklearn.gaussian_process import GaussianProcessClassifier

model = GaussianProcessClassifier().fit(X_train, y_train)
grid_searching(model=model, params=params, cv=3, is_grid=False)

#### TabNet

In [None]:
from pytorch_tabnet.tab_model import TabNetClassifier

model = TabNetClassifier().fit(X_train.values, y_train.values)
grid_searching(model=model, params=params, cv=3, is_grid=False)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier().fit(X_train, y_train)
grid_searching(model=model, params=params, cv=3, is_grid=False)

#### Extra Tree

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier().fit(X_train, y_train)
grid_searching(model=model, params=params, cv=3, is_grid=False)

#### AdaBoost

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(n_estimators=100).fit(X_train, y_train)
params = {
    'base_estimators': [DecisionTreeClassifier(max_depth=1), DecisionTreeClassifier(max_depth=2)],
    'n_estimators' : [100, 200], 
    'learning_rate' : [0.05, 0.1]
}
grid_searching(model=model, params=params, cv=3, is_grid=True)

#### Gradient Boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100).fit(X_train, y_train)
params = {
    'n_estimators' : [100, 200], 
    'learning_rate' : [0.05, 0.1]
}
grid_searching(model=model, params=params, cv=3, is_grid=False)

#### XGBoost

In [None]:
from xgboost import XGBClassifier

model = XGBClassifier().fit(X_train, y_train)
params = {
    'n_estimators' : [100, 200], 
    'learning_rate' : [0.05, 0.1]
}
grid_searching(model=model, params=params, cv=3, is_grid=True)

#### LGBM

In [20]:
from lightgbm import LGBMClassifier


model = LGBMClassifier(num_iterations=50).fit(X_train, y_train)
param = {
    'n_estimators' : [100, 200], 
    'learning_rate' : [0.05, 0.1]
}
grid_searching(model=model, params=params, cv=3, is_grid=False)



ACC:0.5829596412556054, AUC:0.5908228980322003


#### CatBoost

In [12]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(verbose=False).fit(X_train, y_train)
grid_searching(model=model, params=params, cv=3, is_grid=False)

ACC:0.5864474339810662, AUC:0.5943843172331544


In [None]:
#### Voting Classifier

# from sklearn.ensemble import VotingClassifier

# model = VotingClassifier(
#     estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
#     voting='hard')
