In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from google.colab import drive

drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
%cd /content/gdrive/My Drive/Colab Notebooks/Classification_Bankruptcy/5조

/content/gdrive/My Drive/Colab Notebooks/Classification_Bankruptcy/5조


In [4]:
df = pd.read_csv("imputed_final_deleted.csv")
df.shape

(6835, 31)

In [0]:
y = df['class']
X = df[df.columns[df.columns != 'class']]

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=5)

In [0]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit_transform(X_train)
X_train_scaled = scaler.fit_transform(X_train)

In [8]:
import joblib
import pickle

joblib.dump(scaler, "scaler.pkl")

['scaler.pkl']

In [0]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
import warnings

cv = RepeatedStratifiedKFold(n_splits = 5, n_repeats = 3, random_state = 5)
warnings.filterwarnings('ignore')

# 1. Logistic Regression

In [10]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

grid= {"C":np.logspace(-3,3,7), "penalty":["l1","l2"]}# l1 lasso l2 ridge
logreg = LogisticRegression(class_weight='balanced')
logreg_cv = GridSearchCV(logreg, grid, cv=cv, scoring='f1')
logreg_cv.fit(X_train_scaled, y_train)

print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
print("f1 score :",logreg_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 100.0, 'penalty': 'l2'}
f1 score : 0.2122141991594421


In [11]:
from sklearn.base import clone

lr_clf = clone(logreg_cv.best_estimator_)
lr_clf.fit(X_train_scaled, y_train)

LogisticRegression(C=100.0, class_weight='balanced', dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [0]:
lr_clf_score = cross_val_score(lr_clf, X_train_scaled, y_train, scoring='f1', cv=cv).mean()

In [13]:
lr_clf_score

0.2122141991594421

In [14]:
joblib.dump(lr_clf, "logistic.pkl")

['logistic.pkl']

# 1. SVM
## 1-1. Linear SVM

In [15]:
from sklearn.svm import SVC

svm_linear_clf = SVC(C=10, kernel="linear", class_weight='balanced', probability=True)
svm_linear_clf.fit(X_train_scaled, y_train)

SVC(C=10, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [0]:
svm_linear_clf_score = cross_val_score(svm_linear_clf, X_train_scaled, y_train, scoring='f1', cv=cv).mean()

In [17]:
svm_linear_clf_score

0.22057820338161865

## 1-2. Kernel SVM

In [18]:
from sklearn.svm import SVC

svm_rbf = SVC(class_weight='balanced')
grid= {"C":np.logspace(-2,2,5), "gamma": np.logspace(-2, 2, 5)}
svm_rbf_cv = GridSearchCV(svm_rbf, grid, cv=cv, scoring='f1')
svm_rbf_cv.fit(X_train_scaled, y_train)

print("tuned hpyerparameters :(best parameters) ",svm_rbf_cv.best_params_)
print("f1 score :",svm_rbf_cv.best_score_)

tuned hpyerparameters :(best parameters)  {'C': 10.0, 'gamma': 0.1}
f1 score : 0.2120952250188968


In [19]:
svm_rbf_clf = clone(svm_rbf_cv.best_estimator_)
svm_rbf_clf.fit(X_train_scaled, y_train)

SVC(C=10.0, break_ties=False, cache_size=200, class_weight='balanced',
    coef0=0.0, decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [0]:
svm_rbf_clf_score = cross_val_score(svm_rbf_clf, X_train_scaled, y_train, scoring='f1', cv=cv).mean()

In [21]:
svm_rbf_clf_score

0.2120952250188968

In [22]:
joblib.dump(svm_linear_clf, "SVM_linear.pkl")
joblib.dump(svm_rbf_clf, "SVM_rbf.pkl")

['SVM_rbf.pkl']

#2. RandomForest

In [0]:
rf_clf = joblib.load("forest_01.pkl")

In [0]:
rf_clf_score = cross_val_score(rf_clf, X_train_scaled, y_train, scoring='f1', cv=cv).mean()

In [25]:
rf_clf_score

0.20035503699688448

# 3. XGBoost

In [26]:
from xgboost import XGBClassifier


xgb_clf = XGBClassifier(learning_rate=0.3, max_depth=10, min_child_weight=0.1, gamma=0.2, colsample_bytree=0.7)
xgb_clf.fit(X_train_scaled, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, gamma=0.2,
              learning_rate=0.3, max_delta_step=0, max_depth=10,
              min_child_weight=0.1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [27]:
joblib.dump(xgb_clf, "xgb.pkl")

['xgb.pkl']

In [0]:
xgb_clf = joblib.load("xgb.pkl")

In [0]:
xgb_clf_score = cross_val_score(xgb_clf, X_train_scaled, y_train, scoring='f1', cv=cv).mean()

In [30]:
xgb_clf_score

0.4164212814644637

#4. FA XGBoost

In [31]:
! pip install factor_analyzer
from factor_analyzer import FactorAnalyzer

fa = FactorAnalyzer(n_factors = 6, rotation = 'varimax', bounds = (0, 1), method = 'principal')
X_fa = fa.fit_transform(X_train_scaled)



In [32]:
from xgboost import XGBClassifier

model = XGBClassifier(colsample_bytree= 0.5, eta= 0.05, gamma= 0.0, max_depth= 4, min_child_weight= 1)
model.fit(X_fa, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, eta=0.05, gamma=0.0,
              learning_rate=0.1, max_delta_step=0, max_depth=4,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
xgb_fa_clf_score = cross_val_score(model, X_fa, y_train, scoring='f1', cv=cv).mean()

In [34]:
xgb_fa_clf_score

0.0020512820512820513

In [35]:
joblib.dump(fa, "fa.pkl")
joblib.dump(model, "xgb_fa.pkl")

['xgb_fa.pkl']

In [0]:
total = sum((lr_clf_score, svm_linear_clf_score, rf_clf_score, xgb_clf_score))
scores = list(map(lambda x: x / total, [lr_clf_score, svm_linear_clf_score, rf_clf_score, xgb_clf_score]))

with open("scores.pkl", "wb") as f:
    pickle.dump(scores, f)

In [39]:
scores

[0.20219180975283182,
 0.2101608012583984,
 0.19089272859192288,
 0.3967546603968469]

In [40]:
test = pd.concat([X_test, y_test], axis=1)
test.shape

(1367, 31)

In [41]:
test.columns = df.columns
test.columns

Index(['Attr4', 'Attr5', 'Attr9', 'Attr12', 'Attr18', 'Attr21', 'Attr23',
       'Attr24', 'Attr27', 'Attr29', 'Attr33', 'Attr35', 'Attr36', 'Attr38',
       'Attr40', 'Attr43', 'Attr45', 'Attr47', 'Attr48', 'Attr49', 'Attr50',
       'Attr51', 'Attr52', 'Attr56', 'Attr58', 'Attr59', 'Attr60', 'Attr61',
       'Attr63', 'Attr64', 'class'],
      dtype='object')

In [0]:
test.to_csv("df_test.csv", index=False)