# Machine Learning Model

In [1]:
#!pip install -q -U PyYAML
#!pip install -q -U scikit-learn
#!pip install -q shap eli5 cairosvg jupyter_contrib_nbextensions mglearn pip install category_encoders

In [2]:
import os
from abc import *

# 설정
from datetime import datetime

# Log
import logging

# I/O
from PIL import Image
from io import BytesIO
from tqdm import tqdm

# 공학용 연산
import numpy as np
import pandas as pd

# 시각화
import matplotlib.pyplot as plt
import seaborn as sns
import missingno

import pprint
pp = pprint.PrettyPrinter(indent=1)

# 머신러닝
import lightgbm as lgb

import mglearn

# 딥러닝
import torch
from torch.utils.data import DataLoader, TensorDataset

import torch.nn as nn
import torch.nn.functional as F

In [3]:
import warnings

# warning을 보고 싶지 않을 경우 아래 코드 활성화
warnings.filterwarnings('ignore')

## 데이터 로딩

In [4]:
# load data
data_dir = '/opt/ml/input/data' # data 경로 지정
csv_file_path = os.path.join(data_dir, 'train_data.csv')
df = pd.read_csv(csv_file_path, parse_dates=["Timestamp"]) 

## Feature Engineering

In [5]:
from feature_engineering import get_features

df = get_features(df)
# NaN value가 있는 데이터 제거
df = df.dropna(axis=0)
df.head()

Unnamed: 0,userID,assessmentItemID,testId,answerCode,Timestamp,KnowledgeTag,user_correct_answer,user_total_answer,user_acc,correct_shift_-2,...,relative_time,test_mean,test_sum,tag_mean,tag_sum,assess_mean,assess_sum,user_mean,user_sum,correct_per_hour
2,0,A060001003,A060000001,1,2020-03-24 00:17:22,7225,2.0,2,1.0,1.0,...,-19.0,0.947683,1268,0.913187,3040,0.910314,203,0.630872,470,0.649446
3,0,A060001004,A060000001,1,2020-03-24 00:17:29,7225,3.0,3,1.0,1.0,...,-20.0,0.947683,1268,0.913187,3040,0.96861,216,0.630872,470,0.649446
4,0,A060001005,A060000001,1,2020-03-24 00:17:36,7225,4.0,4,1.0,0.0,...,-20.0,0.947683,1268,0.913187,3040,0.941704,210,0.630872,470,0.649446
5,0,A060001007,A060000001,1,2020-03-24 00:17:47,7225,5.0,5,1.0,1.0,...,-16.0,0.947683,1268,0.913187,3040,0.919283,205,0.630872,470,0.649446
6,0,A060003001,A060000003,0,2020-03-26 05:52:03,7226,6.0,6,1.0,1.0,...,192829.0,0.790562,1223,0.799552,3570,0.882353,195,0.630872,470,0.624776


In [6]:
# 사용할 Feature 설정

cate_cols=[#"assessmentItemID", 
           #"testId", 
           "KnowledgeTag", 
           "correct_shift_-2", 
           "correct_shift_-1", 
           "correct_shift_1", 
           "correct_shift_2",
           #"hour", 
           "hour_mode",                 
           ]
    
cont_cols=["user_acc", 
           #"user_correct_answer", 
           #"user_total_answer", 
           #"future_correct", 
           "average_content_correct", 
           "mean_time", 
           #"time_median",
           #"correct_per_hour", 
           "time", 
           #"normalized_time",
           #"relative_time", 
           "assess_mean", 
           "assess_sum", 
           "tag_mean", 
           "tag_sum", 
           "test_mean", 
           "test_sum",
           "user_mean",
           #"user_sum",
           ]
           
FEATS = cate_cols + cont_cols

## Train/Test 데이터셋 분리

In [7]:
from util import setSeeds, custom_train_test_split, get_eval

setSeeds(42)

# 유저별 분리
train, test = custom_train_test_split(df, ratio=0.99)

In [8]:
# X, y 값 분리
y_train = train['answerCode']
train = train.drop(['answerCode'], axis=1)

y_test = test['answerCode']
test = test.drop(['answerCode'], axis=1)

In [9]:
X_train = train[FEATS]
X_test = test[FEATS]

# Modeling

##### 로지스틱 회귀 (Logistic Regression)

In [10]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression().fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

정확도: 0.6333
정밀도: 0.6250
재현율: 0.8824
AUC: 0.5950


##### 의사결정나무 (Decision Tree)

In [11]:
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier().fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

정확도: 0.7000
정밀도: 0.7000
재현율: 0.8235
AUC: 0.6810


##### 나이브베이즈 (Naive Bayes)

In [12]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB().fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

정확도: 0.5833
정밀도: 0.6286
재현율: 0.6471
AUC: 0.5735


##### 선형판별분석법 (LDA, Linear Discriminant Analysis)

In [13]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

model = LinearDiscriminantAnalysis().fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

정확도: 0.7500
정밀도: 0.7317
재현율: 0.8824
AUC: 0.7296


##### 이차판별분석법 (QDA, Quadratic Discriminant Analysis)

In [14]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

model = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

정확도: 0.6500
정밀도: 0.6757
재현율: 0.7353
AUC: 0.6369


##### 서포트 벡터 머신 (SVM, Support Vector Machine)

In [None]:
from sklearn.svm import SVC

model = SVC(kernel='linear').fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

##### KNeighborsClassifier

In [15]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier().fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))


정확도: 0.6333
정밀도: 0.6429
재현율: 0.7941
AUC: 0.6086



# 앙상블 (Ensemble)

##### 하드 보팅 (Hard Voting)

In [16]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# model 1 - 로지스틱 회귀
model = LogisticRegression().fit(X_train, y_train)
predict = model.predict(X_test)
logistic_score = accuracy_score(y_test, predict)

# model 2 - 의사결정나무
model = DecisionTreeClassifier().fit(X_train, y_train)
predict = model.predict(X_test)
tree_score = accuracy_score(y_test, predict)

# model 3 - 가우시안 나이브 베이즈
model = GaussianNB().fit(X_train, y_train)
predict = model.predict(X_test)
naive_score = accuracy_score(y_test, predict)

# 하드 보팅
logistic = LogisticRegression()
tree = DecisionTreeClassifier()
naive = GaussianNB()

estimators = [
    ('logistic', logistic),
    ('tree', tree),
    ('naive', naive)
]
hard_vote = VotingClassifier(estimators=estimators, voting='hard')
hard_vote.fit(X_train, y_train)
predict = hard_vote.predict(X_test)
hard_vote_score = accuracy_score(y_test, predict)

print(f"Logistic Regression accuracy : {logistic_score}")
print(f"Decision Tree accuracy : {tree_score}")
print(f"naive accuracy : {naive_score}")
print(f"Hard Voting of 3 model accuracy : {hard_vote_score}")

get_eval(y_test, np.where(predict>=0.5, 1, 0))

Logistic Regression accuracy : 0.6333333333333333
Decision Tree accuracy : 0.6833333333333333
naive accuracy : 0.5833333333333334
Hard Voting of 3 model accuracy : 0.6833333333333333
정확도: 0.6833
정밀도: 0.6667
재현율: 0.8824
AUC: 0.6527


##### 소프트 보팅 (Soft Voting)

In [17]:
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

# model 1 - 로지스틱 회귀
model = LogisticRegression().fit(X_train, y_train)
predict = model.predict(X_test)
logistic_score = accuracy_score(y_test, predict)

# model 2 - 의사결정나무
model = DecisionTreeClassifier().fit(X_train, y_train)
predict = model.predict(X_test)
tree_score = accuracy_score(y_test, predict)

# model 3 - 가우시안 나이브 베이즈
model = GaussianNB().fit(X_train, y_train)
predict = model.predict(X_test)
naive_score = accuracy_score(y_test, predict)

# 소프트 보팅
logistic = LogisticRegression()
tree = DecisionTreeClassifier()
naive = GaussianNB()

estimators = [
    ('logistic', logistic),
    ('tree', tree),
    ('naive', naive)
]
soft_vote = VotingClassifier(estimators=estimators, voting='soft')
soft_vote.fit(X_train, y_train)
predict = soft_vote.predict(X_test)
soft_vote_score = accuracy_score(y_test, predict)

print(f"Logistic Regression accuracy : {logistic_score}")
print(f"Decision Tree accuracy : {tree_score}")
print(f"naive accuracy : {naive_score}")
print(f"Soft Voting of 3 model accuracy : {soft_vote_score}")

get_eval(y_test, np.where(predict>=0.5, 1, 0))

Logistic Regression accuracy : 0.6333333333333333
Decision Tree accuracy : 0.6666666666666666
naive accuracy : 0.5833333333333334
Soft Voting of 3 model accuracy : 0.7
정확도: 0.7000
정밀도: 0.7105
재현율: 0.7941
AUC: 0.6855


##### 배깅 분류기 (Bagging Classifier)
- 배깅 (Bagging)
- 패스팅 (Pasting)
- 랜덤 부분공간 (Random Subspace)
- 랜덤 패치 (Random Patches)

In [14]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# 랜덤 seed
random_seed = 42

# 배깅 (Bagging) : 같은 데이터 샘플 중복사용(replacement)
bagging = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=50),
    n_estimators=100,
    bootstrap=True,
    max_samples=0.5,
    bootstrap_features=False,
    max_features=1.0,
    random_state=random_seed
)

# 패스팅 (Pasting) : 같은 데이터 샘플 중복사용(replacement) 안함
pasting = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=50),
    n_estimators=100,
    bootstrap=False,
    max_samples=0.5,
    bootstrap_features=False,
    max_features=1.0,
    random_state=random_seed
)

# 랜덤 부분공간 (Random Subspace) : feature들 중 일부만 사용
subspace = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=50),
    n_estimators=100,
    bootstrap=False,
    max_samples=1.0,
    bootstrap_features=True,
    max_features=0.5,
    random_state=random_seed
)

# 랜덤 패치 (Random Patches) : 데이터 & feature 둘다 일부만 사용
patch = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=50),
    n_estimators=100,
    bootstrap=True,
    max_samples=0.5,
    bootstrap_features=True,
    max_features=0.5,
    random_state=random_seed
)

model_types = ['bagging', 'pasting', 'random subspace', 'random patches']
models = [bagging, pasting, subspace, patch]

predicts = []
for model in models:

    model.fit(X_train, y_train)
    predicts.append(model.predict(X_test))


for model_type, predict in zip(model_types, predicts):
    print(model_type)
    get_eval(y_test, np.where(predict>=0.5, 1, 0))
    

##### 랜덤 포레스트 (Random Forest)

In [10]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier().fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

정확도: 0.7000
정밀도: 0.7105
재현율: 0.7941
AUC: 0.6855


##### 엑스트라 트리 (Extra Tree)

In [11]:
from sklearn.ensemble import ExtraTreesClassifier

model = ExtraTreesClassifier().fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

정확도: 0.7000
정밀도: 0.7105
재현율: 0.7941
AUC: 0.6855


##### 에이다 부스트 (AdaBoost)

In [12]:
from sklearn.ensemble import AdaBoostClassifier

model = AdaBoostClassifier(n_estimators=100).fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

정확도: 0.7167
정밀도: 0.6977
재현율: 0.8824
AUC: 0.6912


##### 그래디언트 부스트 (Gradient Boost)

In [13]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(n_estimators=100).fit(X_train, y_train)
predict = model.predict(X_test)

get_eval(y_test, np.where(predict>=0.5, 1, 0))

정확도: 0.7167
정밀도: 0.6889
재현율: 0.9118
AUC: 0.6867
