# Drive Mount

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Library Definition

In [2]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# Data Upload

In [3]:
# preprocess type
PCA = 1
FEATURE_REDUC = 2
UNDER_SAMPLE = 3
HYPERPARAMETER_TUNNING = False

preprocess_type = FEATURE_REDUC

In [4]:

if preprocess_type == PCA:
  X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_pca.csv"
  Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model.csv"
  X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess_pca.csv"

elif preprocess_type == FEATURE_REDUC:
  X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_feat_reduc.csv"
  Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model.csv"
  X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess_feat_reduc.csv"

elif UNDER_SAMPLE:
  X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_undersampled.csv"
  Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model_undersampled.csv"
  X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess.csv"


In [5]:
X_model_all = pd.read_csv(X_model_file_path, index_col = 0)

In [6]:
Y_model_all = pd.read_csv(Y_model_file_path)

# Model 훈련

In [7]:
model = LogisticRegression(random_state = 100)

In [8]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X_model_all, Y_model_all, test_size= 0.2, shuffle = True, random_state= 100)

model.fit(X_train, Y_train)

Yhat = model.predict(X_valid)
roc_auc_valid_score = roc_auc_score(Y_valid, Yhat)
cv_valid_score = cross_val_score(model, X_model_all, Y_model_all, scoring = 'roc_auc', cv = 5)

print(f"roc_auc_valid_score: {roc_auc_valid_score}")
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_valid_score.mean(), cv_valid_score.std()))

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

roc_auc_valid_score: 0.5421041199431033
0.81 accuracy with a standard deviation of 0.00


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Hyperparameter Tuning

In [9]:
model.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': 100,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [10]:

if HYPERPARAMETER_TUNNING:
  params = {
    "learning_rate" : [0.08, 0.09, 0.1, 0.11, 0.12],
    "min_child_samples" : list(np.arange(15, 25, 1)),
    "n_estimators" : list(np.arange(80, 120, 10)),
  }

  random_search = RandomizedSearchCV(
    model,
    param_distributions = params, # 파라미터 입력
    n_iter = 50,   # random search 탐색 횟수
    cv = 5,        # cv 검증을 위한 분할 검증 횟수
    scoring = 'roc_auc',  # 오차 평가방법
    verbose = 0,     # 진행상황
    random_state = 100
  )

  random_search.fit(X_model_all, Y_model_all)

  print(f"params: {random_search.best_params_}")
  print(f"estimator: {random_search.best_estimator_}")

  best_model = random_search.best_estimator_

else:
  best_model = model


In [11]:
best_model.fit(X_train, Y_train)

Yhat = best_model.predict(X_valid)
roc_auc_valid_score = roc_auc_score(Y_valid, Yhat)
cv_valid_score = cross_val_score(best_model, X_model_all, Y_model_all, scoring = 'roc_auc', cv = 5)

print(f"roc_auc_valid_score: {roc_auc_valid_score}")
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_valid_score.mean(), cv_valid_score.std()))

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

roc_auc_valid_score: 0.5421041199431033
0.81 accuracy with a standard deviation of 0.00


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Probability Predication

In [12]:
X_exam = pd.read_csv(X_exam_file_path, index_col = 0)

In [13]:
Y_exam_prob_numpy = best_model.predict_proba(X_exam)
Y_exam_prob = pd.DataFrame(Y_exam_prob_numpy[:, 1])

Y_exam_prob.columns = ['business_prob']

In [14]:
Y_exam_prob.head()

Unnamed: 0,business_prob
0,0.018819
1,0.075337
2,0.020031
3,0.062094
4,0.062094


In [15]:
Y_exam_prob[Y_exam_prob["business_prob"] >= 0.5]

Unnamed: 0,business_prob
32,0.692587
200,0.919209
263,0.791136
308,0.580993
310,0.542541
...,...
199782,0.786932
199801,0.701794
199928,0.958260
199952,0.519666


In [16]:
Y_exam_prob.to_csv("/content/drive/Shareddrives/Intro-data-science/data/Y_exam_prob.csv")