# Path

In [6]:
import os
# Switch the working directory so the relative file names used later in this
# notebook (the training CSV and the pickled search object) resolve against it.
# NOTE(review): this is an absolute, machine-specific path; os.chdir raises if
# the folder does not exist, so the notebook will not run unchanged elsewhere.
os.chdir(r'C:\Users\user\Desktop\調參')

# Package

In [2]:
import pickle

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# Data

In [3]:
# Load the training table from the working directory; the first CSV column
# is used as the row index.
train_v1 = pd.read_csv(
    "train_cleaned_version1_proc_nontree.csv",
    index_col=0,
    low_memory=False,
)

In [5]:
# Show the (rows, columns) dimensions of the loaded frame.
train_v1.shape

(99758, 142)

# Train_test_split

In [7]:
# Separate the target column "Y1" from the remaining columns, which become the features.
y = train_v1["Y1"]
X = train_v1.drop(columns="Y1")

In [9]:
# Hold out 33% of the rows as a test set. Stratifying on y keeps the class
# ratio of the target the same in both parts; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [10]:
# Verify that the stratified split kept the class ratio of the target in both subsets.
# The percentages are derived from the actual counts instead of hand-typed totals,
# so they stay correct if the data, test_size or random_state ever changes.
train_counts = y_train.value_counts()
test_counts = y_test.value_counts()
print(train_counts)
print(test_counts)
for split_name, counts in (("y_train", train_counts), ("y_test", test_counts)):
    total = int(counts.sum())
    for label in (0, 1):
        # .get(label, 0) keeps the check working even if a class is missing from a split.
        print("{} {}比例:".format(split_name, label), int(counts.get(label, 0)) * 100 / total)

0    65543
1     1294
Name: Y1, dtype: int64
0    32284
1      637
Name: Y1, dtype: int64
y_train 0比例: 98.06394661639511
y_train 1比例: 1.9360533836048894
y_test 0比例: 98.06506485222198
y_test 1比例: 1.9349351477780141


# Logistic Regression

In [11]:
# Grid-search setup for logistic regression.
#
# The estimator is wrapped in a single-step pipeline, so grid keys are
# addressed as 'classifier__<parameter>'.
#
# solver='liblinear' is set explicitly because the grid searches both 'l1' and
# 'l2' penalties: liblinear supports both, whereas the default solver of newer
# scikit-learn releases (lbfgs) rejects penalty='l1'. Older releases already
# fell back to liblinear when no solver was given, so the searched models are
# unchanged there.
pipeline = Pipeline([
    ('classifier', LogisticRegression(solver='liblinear'))
])

# Candidate hyper-parameters: 2 penalties x 4 regularisation strengths = 8 candidates.
parameters = {
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [0.01, 0.1, 1, 10]
}

# Model-selection metric; also reused later when labelling the test score.
scoring = 'roc_auc'

# Shuffled K-fold with a fixed seed so the fold assignment is repeatable.
n_splits = 3
cv = KFold(n_splits=n_splits, shuffle=True, random_state=123457)

SearchCV = GridSearchCV(estimator=pipeline,
                        param_grid=parameters,
                        scoring=scoring,
                        cv=cv,
                        return_train_score=True,  # needed for the train-score summary printed later
                        verbose=1,
                        n_jobs=2)

In [12]:
# Run the grid search with cross-validation on the training split only;
# the held-out test split is not touched here.
SearchCV.fit(X_train, y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  24 out of  24 | elapsed:  3.6min finished


GridSearchCV(cv=KFold(n_splits=3, random_state=123457, shuffle=True),
       error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=2,
       param_grid={'classifier__penalty': ['l1', 'l2'], 'classifier__C': [0.01, 0.1, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=1)

In [17]:
# Pull the winning hyper-parameter combination and the corresponding
# estimator out of the finished search, then show the parameters.
best_params = SearchCV.best_params_
best_estimator = SearchCV.best_estimator_
print(best_params)

{'classifier__C': 1, 'classifier__penalty': 'l1'}


In [14]:
# Evaluate the search object on the held-out test split.
# SearchCV.score applies the scorer the search was configured with (see `scoring`).
test_score = SearchCV.score(X_test, y_test)
# Class predictions for the test rows; not used by the score above.
y_test_pred = SearchCV.predict(X_test)
score_label = " test score (" + scoring + "):"
print(score_label, test_score)

 test score (roc_auc): 0.8013657780525932


In [15]:
# Keep the cross-validation result table of the logistic-regression search under a short name.
r_lr = SearchCV.cv_results_

In [16]:
# Print the grid-search summary: the candidate list first, then the mean and
# standard deviation of the validation ("test") and training scores, one
# entry per candidate in the same order as 'params'.
for key in ('params',
            'mean_test_score', 'std_test_score',
            'mean_train_score', 'std_train_score'):
    print(key + ':\n', r_lr[key], '\n')

params:
 [{'classifier__C': 0.01, 'classifier__penalty': 'l1'}, {'classifier__C': 0.01, 'classifier__penalty': 'l2'}, {'classifier__C': 0.1, 'classifier__penalty': 'l1'}, {'classifier__C': 0.1, 'classifier__penalty': 'l2'}, {'classifier__C': 1, 'classifier__penalty': 'l1'}, {'classifier__C': 1, 'classifier__penalty': 'l2'}, {'classifier__C': 10, 'classifier__penalty': 'l1'}, {'classifier__C': 10, 'classifier__penalty': 'l2'}] 

mean_test_score:
 [0.74689363 0.77268657 0.79624918 0.7923745  0.79774573 0.79529274
 0.79387598 0.79413973] 

std_test_score:
 [0.02193695 0.01612713 0.01142928 0.01429723 0.01236487 0.01354423
 0.01508336 0.01421541] 

mean_train_score:
 [0.7529634  0.78653792 0.80864928 0.81335619 0.82097879 0.82112332
 0.82215089 0.82204285] 

std_train_score:
 [0.00837872 0.00688056 0.00544448 0.00560852 0.00498166 0.00496416
 0.00515812 0.00505623] 



In [27]:
# Persist the search object to the working directory for later reuse.
# The context manager guarantees the file handle is flushed and closed even
# if pickling fails part-way (the previous bare open() was never closed).
# NOTE(review): only unpickle this file from a trusted location, since
# pickle.load can execute arbitrary code.
filename = 'SearchCV_logistic'
with open(filename, 'wb') as model_file:
    pickle.dump(SearchCV, model_file)