In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the dataset from CSV and display its (rows, columns) shape.
df = pd.read_csv("imputed_final_deleted.csv")
df.shape

(6835, 31)

In [4]:
# The 'class' column is the target; every other column is kept as a feature.
X = df.loc[:, df.columns != 'class']
y = df['class']

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified on y -- confirm this is intended, given that the
# models in this notebook are fitted with class_weight='balanced'.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=5)

In [6]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only and keep the transformed features.
# A single fit_transform call is enough: it both fits `scaler` and returns the
# scaled array, so there is no need to run it twice.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [7]:
import joblib
import pickle

# Persist the fitted scaler to "scaler.pkl".
joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [8]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Shared cross-validation scheme: 5 stratified folds repeated 3 times, seeded.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=5)

# 0. Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Grid-search the regularisation strength C and the penalty type, scored by F1
# with the shared repeated stratified CV scheme.
grid = {"C": np.logspace(-3, 3, 7), "penalty": ["l1", "l2"]}  # l1 = lasso, l2 = ridge

# The grid contains an L1 penalty, which the default solver does not support:
# those candidates would fail to fit and be scored as NaN (and the warning is
# suppressed in this notebook). 'liblinear' handles both l1 and l2.
logreg = LogisticRegression(class_weight='balanced', solver='liblinear')
logreg_cv = GridSearchCV(logreg, grid, cv=cv, scoring='f1')
logreg_cv.fit(X_train_scaled, y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("f1 score :",logreg_cv.best_score_)

In [None]:
from sklearn.base import clone

# Take an unfitted copy of the best estimator found by the grid search
# and fit it on the scaled training data.
lr_clf = clone(logreg_cv.best_estimator_)
lr_clf.fit(X_train_scaled, y_train)

In [None]:
# Mean F1 of the logistic-regression model across all folds of the shared CV scheme.
lr_clf_score = np.mean(
    cross_val_score(lr_clf, X_train_scaled, y_train, cv=cv, scoring='f1')
)

In [None]:
# Display the mean cross-validated F1 of the logistic-regression model.
lr_clf_score

In [None]:
# Persist the fitted logistic-regression model to "logistic.pkl".
joblib.dump(lr_clf, "logistic.pkl")

# 1. SVM
## 1-1. Linear SVM

In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM with a fixed C, balanced class weights, and probability
# estimates enabled; fitted on the scaled training data.
svm_linear_clf = SVC(
    kernel="linear",
    C=10,
    class_weight='balanced',
    probability=True,
)
svm_linear_clf.fit(X_train_scaled, y_train)

In [None]:
# Mean F1 of the linear SVM across all folds of the shared CV scheme.
svm_linear_clf_score = np.mean(
    cross_val_score(svm_linear_clf, X_train_scaled, y_train, cv=cv, scoring='f1')
)

In [None]:
# Display the mean cross-validated F1 of the linear SVM.
svm_linear_clf_score

## 1-2. Kernel SVM

In [None]:
from sklearn.svm import SVC

# Grid-search C and gamma (five log-spaced values each, 1e-2 .. 1e2) for an SVC
# left on its default kernel, scored by F1 with the shared CV scheme.
grid = dict(C=np.logspace(-2, 2, 5), gamma=np.logspace(-2, 2, 5))
svm_rbf = SVC(class_weight='balanced')
svm_rbf_cv = GridSearchCV(estimator=svm_rbf, param_grid=grid, scoring='f1', cv=cv)
svm_rbf_cv.fit(X_train_scaled, y_train)

print("tuned hpyerparameters :(best parameters) ",svm_rbf_cv.best_params_)
print("f1 score :",svm_rbf_cv.best_score_)

In [None]:
# Take an unfitted copy of the best estimator found by the grid search
# and fit it on the scaled training data.
svm_rbf_clf = clone(svm_rbf_cv.best_estimator_)
svm_rbf_clf.fit(X_train_scaled, y_train)

In [None]:
# Mean F1 of the tuned kernel SVM across all folds of the shared CV scheme.
svm_rbf_clf_score = np.mean(
    cross_val_score(svm_rbf_clf, X_train_scaled, y_train, cv=cv, scoring='f1')
)

In [None]:
# Display the mean cross-validated F1 of the tuned kernel SVM.
svm_rbf_clf_score

In [None]:
# Persist both fitted SVM models to disk.
joblib.dump(svm_linear_clf, "SVM_linear.pkl")
joblib.dump(svm_rbf_clf, "SVM_rbf.pkl")

# 2. RandomForest

In [None]:
# Load a random-forest model from disk.
# NOTE(review): "forest_01.pkl" is not produced by any cell visible in this
# notebook -- confirm where it comes from. joblib.load unpickles the file, so it
# must only be loaded from a trusted source.
rf_clf = joblib.load("forest_01.pkl")

In [None]:
# Mean F1 of the random forest across all folds of the shared CV scheme.
rf_clf_score = np.mean(
    cross_val_score(rf_clf, X_train_scaled, y_train, cv=cv, scoring='f1')
)

In [None]:
# Display the mean cross-validated F1 of the random forest.
rf_clf_score

# 3. XGBoost

In [9]:
import xgboost
from xgboost import XGBClassifier

# Record the installed xgboost version alongside the results.
print(xgboost.__version__)

# XGBoost classifier with fixed hyperparameters, fitted on the scaled training data.
xgb_clf = XGBClassifier(
    learning_rate=0.3,
    max_depth=10,
    min_child_weight=0.1,
    gamma=0.2,
    colsample_bytree=0.7,
)
xgb_clf.fit(X_train_scaled, y_train)

1.1.0


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.2, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=10,
              min_child_weight=0.1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [10]:
# Persist the fitted XGBoost model to 'xgb.xgb'.
xgb_clf.save_model('xgb.xgb')

In [11]:
from xgboost import Booster

# Reload the booster saved to 'xgb.xgb' and attach it to a fresh classifier that
# is configured with the same hyperparameters as xgb_clf.
booster = Booster()
booster.load_model('xgb.xgb')
new_xgb_clf = XGBClassifier(learning_rate=0.3, max_depth=10, min_child_weight=0.1, gamma=0.2, colsample_bytree=0.7)
# This writes a private attribute of the wrapper.
# NOTE(review): scikit-learn's cross-validation helpers presumably clone and refit
# the estimator for each fold, in which case the attached booster is not what gets
# scored by cross_val_score -- confirm.
new_xgb_clf._Booster = booster

In [12]:
# Mean F1 of the XGBoost classifier across all folds of the shared CV scheme.
xgb_clf_score = np.mean(
    cross_val_score(new_xgb_clf, X_train_scaled, y_train, cv=cv, scoring='f1')
)

In [13]:
# Display the mean cross-validated F1 of the XGBoost classifier.
xgb_clf_score

0.43240846338085576

# 4. FA XGBoost

In [None]:
! pip install factor_analyzer
from factor_analyzer import FactorAnalyzer

fa = FactorAnalyzer(n_factors = 6, rotation = 'varimax', bounds = (0, 1), method = 'principal')
X_fa = fa.fit_transform(X_train_scaled)

In [None]:
from xgboost import XGBClassifier

# XGBoost classifier with fixed hyperparameters, fitted on the factor scores.
model = XGBClassifier(
    eta=0.05,
    max_depth=4,
    min_child_weight=1,
    gamma=0.0,
    colsample_bytree=0.5,
)
model.fit(X_fa, y_train)

In [None]:
# Mean F1 of the factor-based XGBoost model across all folds of the shared CV scheme.
xgb_fa_clf_score = np.mean(
    cross_val_score(model, X_fa, y_train, cv=cv, scoring='f1')
)

In [None]:
xgb_fa_clf_score # performed too poorly, so it is left out

In [None]:
# Turn the four mean F1 scores into weights that sum to 1, in the order
# logistic regression, linear SVM, random forest, XGBoost.
total = lr_clf_score + svm_linear_clf_score + rf_clf_score + xgb_clf_score
scores = [
    score / total
    for score in (lr_clf_score, svm_linear_clf_score, rf_clf_score, xgb_clf_score)
]

# Persist the weights to "scores.pkl".
with open("scores.pkl", "wb") as f:
    pickle.dump(scores, f)

In [None]:
# Display the normalised score weights.
scores

In [None]:
# Put the held-out features and target side by side in one frame
# (target column last) and show its shape.
test = pd.concat(objs=[X_test, y_test], axis="columns")
test.shape

In [None]:
# Put the held-out frame's columns in the same order as df by selecting on
# labels. Assigning df.columns directly would only relabel by position, which
# mislabels the data whenever 'class' is not df's last column (the concat that
# builds `test` always places the target last).
test = test[df.columns]
test.columns

In [None]:
# Write the held-out set to "df_test.csv" without the row index.
test.to_csv("df_test.csv", index=False)

# 5. Evaluation
## 5-1. Train set