# Drive Mount

In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Library Definition

In [33]:
import numpy as np
import pandas as pd

from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# Data Upload

In [34]:
# preprocess type
PCA = 1
FEATURE_REDUC = 2
UNDER_SAMPLE = 3

HYPERPARAMETER_TUNNING = True

preprocess_type = PCA

In [35]:

if preprocess_type == PCA:
  X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_pca.csv"
  Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model.csv"
  X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess_pca.csv"

elif preprocess_type == FEATURE_REDUC:
  X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_feat_reduc.csv"
  Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model.csv"
  X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess_feat_reduc.csv"

elif UNDER_SAMPLE:
  X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_undersampled.csv"
  Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model_undersampled.csv"
  X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess.csv"


In [36]:
X_model_all = pd.read_csv(X_model_file_path, index_col = 0)

In [37]:
Y_model_all = pd.read_csv(Y_model_file_path)

# Model 훈련

In [38]:
model = LGBMClassifier(n_estimators = 100)

In [39]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X_model_all, Y_model_all, test_size= 0.2, shuffle = True, random_state= 100)

model.fit(X_train, Y_train)

Yhat = model.predict(X_valid)
roc_auc_valid_score = roc_auc_score(Y_valid, Yhat)
cv_valid_score = cross_val_score(model, X_model_all, Y_model_all, scoring = 'roc_auc', cv = 5)

print(f"roc_auc_valid_score: {roc_auc_valid_score}")
print("%0.4f accuracy with a standard deviation of %0.4f" % (cv_valid_score.mean(), cv_valid_score.std()))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


roc_auc_valid_score: 0.5638758585052646
0.8878 accuracy with a standard deviation of 0.0012


# Hyperparameter Tuning

In [40]:
model.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [None]:
if HYPERPARAMETER_TUNNING:
  params = {
    "learning_rate" : [0.1, 0.11, 0.12],
    "min_child_samples" : list(np.arange(16, 24, 2)),
    "n_estimators" : list(np.arange(80, 120, 10)),
  }

  random_search = RandomizedSearchCV(
    model,
    param_distributions = params, # 파라미터 입력
    n_iter = 25,   # random search 탐색 횟수
    cv = 5,        # cv 검증을 위한 분할 검증 횟수
    scoring = 'roc_auc',  # 오차 평가방법
    verbose = 0,     # 진행상황
    random_state = 100
  )

  random_search.fit(X_model_all, Y_model_all)

  print(f"params: {random_search.best_params_}")
  print(f"estimator: {random_search.best_estimator_}")

  best_model = random_search.best_estimator_
else:
  best_model = model

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

KeyboardInterrupt: ignored

In [None]:
best_model.fit(X_train, Y_train)

Yhat = best_model.predict(X_valid)
roc_auc_valid_score = roc_auc_score(Y_valid, Yhat)
cv_valid_score = cross_val_score(best_model, X_model_all, Y_model_all, scoring = 'roc_auc', cv = 5)

print(f"roc_auc_valid_score: {roc_auc_valid_score}")
print("%0.4f accuracy with a standard deviation of %0.4f" % (cv_valid_score.mean(), cv_valid_score.std()))

# Probability Predication

In [None]:
X_exam = pd.read_csv(X_exam_file_path, index_col = 0)

In [None]:
Y_exam_prob_numpy = best_model.predict_proba(X_exam)
Y_exam_prob = pd.DataFrame(Y_exam_prob_numpy[:, 1])

Y_exam_prob.columns = ['business_prob']

In [None]:
Y_exam_prob.head()

In [None]:
threshold = 0.5
Y_exam_prob[Y_exam_prob["business_prob"] >= threshold]

In [None]:
Y_exam_prob.to_csv("/content/drive/Shareddrives/Intro-data-science/data/Y_exam_prob.csv")