# Sprint1 課題 機械学習フロー

In [29]:
"""
Home Credit Default Riskの分類問題
標準化まで。データは5万件使用
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

df_train = pd.read_csv('../application_train.csv').sample(frac=1)[:50000]
df_X = df_train.drop("TARGET", axis=1)
df_Y = df_train[["TARGET"]]

#カテゴリーを数値変換
categorical_feats = [
    f for f in df_X.columns if df_X[f].dtype == 'object'
]
categorical_feats_ = categorical_feats.copy()

for f in categorical_feats_:
    df_X[f], _ = pd.factorize(df_X[f])
    df_X[f] = df_X[f].astype('int')
    
#残りの欠損値を平均で埋める
df_X = df_X.fillna(df_X.mean())

#データ分割
(X_train, X_test, 
     y_train, y_test) = train_test_split(df_X.values, df_Y.values.flatten(), test_size=0.3)

#標準化
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## 【問題1】クロスバリデーション

In [30]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
kf.get_n_splits(X_train)

5

In [31]:
for a, b in kf.split(X_train):
    print('分割データA:{}\n分割データB{}'.format(a, b))

分割データA:[ 7000  7001  7002 ... 34997 34998 34999]
分割データB[   0    1    2 ... 6997 6998 6999]
分割データA:[    0     1     2 ... 34997 34998 34999]
分割データB[ 7000  7001  7002 ... 13997 13998 13999]
分割データA:[    0     1     2 ... 34997 34998 34999]
分割データB[14000 14001 14002 ... 20997 20998 20999]
分割データA:[    0     1     2 ... 34997 34998 34999]
分割データB[21000 21001 21002 ... 27997 27998 27999]
分割データA:[    0     1     2 ... 27997 27998 27999]
分割データB[28000 28001 28002 ... 34997 34998 34999]


## 【問題2】グリッドサーチ

### GridSearchによるパラメータ探索および学習

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

"""GridSearchCVの使い方
GridSearchCV(estimator,
             param_grid,
             scoring=None,
             fit_params=None,
             n_jobs=1,
             iid=True,
             refit=True,
             cv=None,
             verbose=0,
             pre_dispatch=‘2*n_jobs’,
             error_score=’raise’,
             return_train_score=’warn’
            )
estimator: チューニングを行うモデル
param_grid: パラメータ候補値を「パラメータ名, 候補値リスト」の辞書で与える
n_jobs: 同時実行数(-1にするとコア数で同時実行)
refit: Trueだと最良だったパラメータを使い学習データ全体で再学習する
cv: Cross validationの分割数(default: 3分割)
verbose: ログ出力レベル
"""


#LogisticRegressionモデルのパラメータを羅列
diparameter = {
    "C": [10**i for i in range(-2,4)],
    "random_state": [123],
    "class_weight": ['balanced', None],
}

grid_search = GridSearchCV(LogisticRegression(),
                           param_grid=diparameter,
                           cv=5,
                           scoring='roc_auc',
                          )
grid_search.fit(X_train, y_train)





GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000], 'random_state': [123], 'class_weight': ['balanced', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [33]:
#最良の点数
grid_search.best_score_

0.7388844955761332

In [34]:
#　最良のパラメータの組み合わせ
grid_search.best_params_

{'C': 0.01, 'class_weight': None, 'random_state': 123}

In [35]:
grid_search.best_estimator_

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=123, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

### GridSearchCVで最良とでたパラメータで予測

In [36]:
#最良のパラメータ
# clf_logistic_gridsearched = LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
#           intercept_scaling=1, max_iter=100, multi_class='warn',
#           n_jobs=None, penalty='l2', random_state=123, solver='warn',
#           tol=0.0001, verbose=0, warm_start=False).fit(X_train, y_train)

y_predict_logistic_gridsearched = grid_search.best_estimator_.predict(X_test)
y_predict_proba_logistic_gridsearched = grid_search.best_estimator_.predict_proba(X_test)

In [37]:
estimation_logistic_dict = {
    "accuracy": accuracy_score(y_test, y_predict_logistic_gridsearched),
    "precision": precision_score(y_test, y_predict_logistic_gridsearched),
    "recall": recall_score(y_test, y_predict_logistic_gridsearched),
    "f1":f1_score(y_test, y_predict_logistic_gridsearched)
}

estimation_logistic = pd.DataFrame(estimation_logistic_dict, index=['LogisticRegression'])
estimation_logistic

Unnamed: 0,accuracy,precision,recall,f1
LogisticRegression,0.9184,0.411765,0.005733,0.011309


In [39]:
"""
テストデータを読み込み、処理
"""

df_apptest = pd.read_csv('../application_test.csv')

#カテゴリーを数値変換
categorical_feats = [
    f for f in df_apptest.columns if df_apptest[f].dtype == 'object'
]
categorical_feats_ = categorical_feats.copy()

for f in categorical_feats_:
    df_apptest[f], _ = pd.factorize(df_apptest[f])
    df_apptest[f] = df_apptest[f].astype('int')
    
#残りの欠損値を平均で埋める
df_apptest = df_apptest.fillna(df_apptest.mean())

X_apptest = df_apptest.values

#標準化
scaler.fit(X_apptest)
X_apptest = scaler.transform(X_apptest)

In [40]:
#予測
y_apt_predict_proba_logistic_gridsearched = grid_search.best_estimator_.predict_proba(X_apptest)

In [41]:
"""
csvファイルへ書き込み
"""

import csv as csv

submit_file = open("home_credit_submit_190202_2.csv", "w")
file_object = csv.writer(submit_file)
file_object.writerow(["SK_ID_CURR", "TARGET"])
file_object.writerows(zip(df_apptest["SK_ID_CURR"].values, y_apt_predict_proba_logistic_gridsearched[:, 1]))
submit_file.close()

#### kaggle　AUCは0.73450

In [42]:
from sklearn.model_selection  import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    "C": uniform(loc=0, scale=4),
    "penalty": ['l2'],
    "class_weight": ['balanced', None],
}

random_search = RandomizedSearchCV(
    LogisticRegression(),
   param_distributions=param_dist,
   cv=5,
   scoring='roc_auc',
)
random_search.fit(X_train, y_train)





RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a35c907b8>, 'penalty': ['l2'], 'class_weight': ['balanced', None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [43]:
random_search.best_estimator_

LogisticRegression(C=0.028018634499291206, class_weight='balanced',
          dual=False, fit_intercept=True, intercept_scaling=1,
          max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
          random_state=None, solver='warn', tol=0.0001, verbose=0,
          warm_start=False)

In [44]:
random_search.best_score_

0.7375063706146369

In [45]:
#テストデータで予測
y_predict_proba_logistic_random = random_search.best_estimator_.predict_proba(X_test)
y_predict_proba_logistic_random

array([[0.5871687 , 0.4128313 ],
       [0.73007828, 0.26992172],
       [0.52633612, 0.47366388],
       ...,
       [0.69158633, 0.30841367],
       [0.32884981, 0.67115019],
       [0.68211309, 0.31788691]])

In [46]:
#kaggleのテストデータを予測
y_apt_predict_proba_logistic_random = random_search.best_estimator_.predict_proba(X_apptest)

In [47]:
"""
csvファイルへ書き込み
"""

submit_file = open("home_credit_submit_190202_3.csv", "w")
file_object = csv.writer(submit_file)
file_object.writerow(["SK_ID_CURR", "TARGET"])
file_object.writerows(zip(df_apptest["SK_ID_CURR"].values, y_apt_predict_proba_logistic_random[:, 1]))
submit_file.close()

kaggleでAUCは0.73448

## 【問題3】Kernelからの調査

kernelから特徴量のデータにマイナス、異常値を変更するというものがあったため試すことにする。

## 【問題4】高い汎化性能のモデル

In [48]:
pd.set_option("display.max_columns", 120)
df_train.describe()

Unnamed: 0,SK_ID_CURR,TARGET,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,DAYS_BIRTH,DAYS_EMPLOYED,DAYS_REGISTRATION,DAYS_ID_PUBLISH,OWN_CAR_AGE,FLAG_MOBIL,FLAG_EMP_PHONE,FLAG_WORK_PHONE,FLAG_CONT_MOBILE,FLAG_PHONE,FLAG_EMAIL,CNT_FAM_MEMBERS,REGION_RATING_CLIENT,REGION_RATING_CLIENT_W_CITY,HOUR_APPR_PROCESS_START,REG_REGION_NOT_LIVE_REGION,REG_REGION_NOT_WORK_REGION,LIVE_REGION_NOT_WORK_REGION,REG_CITY_NOT_LIVE_CITY,REG_CITY_NOT_WORK_CITY,LIVE_CITY_NOT_WORK_CITY,EXT_SOURCE_1,EXT_SOURCE_2,EXT_SOURCE_3,APARTMENTS_AVG,BASEMENTAREA_AVG,YEARS_BEGINEXPLUATATION_AVG,YEARS_BUILD_AVG,COMMONAREA_AVG,ELEVATORS_AVG,ENTRANCES_AVG,FLOORSMAX_AVG,FLOORSMIN_AVG,LANDAREA_AVG,LIVINGAPARTMENTS_AVG,LIVINGAREA_AVG,NONLIVINGAPARTMENTS_AVG,NONLIVINGAREA_AVG,APARTMENTS_MODE,BASEMENTAREA_MODE,YEARS_BEGINEXPLUATATION_MODE,YEARS_BUILD_MODE,COMMONAREA_MODE,ELEVATORS_MODE,ENTRANCES_MODE,FLOORSMAX_MODE,FLOORSMIN_MODE,LANDAREA_MODE,LIVINGAPARTMENTS_MODE,LIVINGAREA_MODE,NONLIVINGAPARTMENTS_MODE,NONLIVINGAREA_MODE,APARTMENTS_MEDI,BASEMENTAREA_MEDI,YEARS_BEGINEXPLUATATION_MEDI,YEARS_BUILD_MEDI,COMMONAREA_MEDI,ELEVATORS_MEDI,ENTRANCES_MEDI,FLOORSMAX_MEDI,FLOORSMIN_MEDI,LANDAREA_MEDI,LIVINGAPARTMENTS_MEDI,LIVINGAREA_MEDI,NONLIVINGAPARTMENTS_MEDI,NONLIVINGAREA_MEDI,TOTALAREA_MODE,OBS_30_CNT_SOCIAL_CIRCLE,DEF_30_CNT_SOCIAL_CIRCLE,OBS_60_CNT_SOCIAL_CIRCLE,DEF_60_CNT_SOCIAL_CIRCLE,DAYS_LAST_PHONE_CHANGE,FLAG_DOCUMENT_2,FLAG_DOCUMENT_3,FLAG_DOCUMENT_4,FLAG_DOCUMENT_5,FLAG_DOCUMENT_6,FLAG_DOCUMENT_7,FLAG_DOCUMENT_8,FLAG_DOCUMENT_9,FLAG_DOCUMENT_10,FLAG_DOCUMENT_11,FLAG_DOCUMENT_12,FLAG_DOCUMENT_13,FLAG_DOCUMENT_14,FLAG_DOCUMENT_15,FLAG_DOCUMENT_16,FLAG_DOCUMENT_17,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
count,50000.0,50000.0,50000.0,50000.0,50000.0,49999.0,49966.0,50000.0,50000.0,50000.0,50000.0,50000.0,16925.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,21885.0,49893.0,40076.0,24673.0,20799.0,25679.0,16814.0,15079.0,23427.0,24892.0,25172.0,16172.0,20383.0,15902.0,24948.0,15368.0,22449.0,24673.0,20799.0,25679.0,16814.0,15079.0,23427.0,24892.0,25172.0,16172.0,20383.0,15902.0,24948.0,15368.0,22449.0,24673.0,20799.0,25679.0,16814.0,15079.0,23427.0,24892.0,25172.0,16172.0,20383.0,15902.0,24948.0,15368.0,22449.0,25937.0,49847.0,49847.0,49847.0,49847.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,43198.0,43198.0,43198.0,43198.0,43198.0,43198.0
mean,278884.0002,0.07982,0.41788,170255.8,600005.8,27194.199044,539149.8,0.020789,-16036.54284,63363.8106,-4978.11946,-2991.54968,12.099025,1.0,0.82112,0.20214,0.99826,0.28224,0.05944,2.15314,2.05504,2.03474,12.04874,0.01448,0.0493,0.04016,0.07664,0.22876,0.17814,0.500997,0.514076,0.510049,0.117054,0.088272,0.978064,0.751369,0.043817,0.077927,0.149791,0.225735,0.230515,0.065809,0.09953,0.106807,0.008455,0.028745,0.113815,0.087358,0.977465,0.758584,0.041765,0.073522,0.145238,0.221612,0.226593,0.064455,0.104308,0.105391,0.007884,0.027393,0.117567,0.087878,0.978092,0.754688,0.043901,0.077154,0.149343,0.225355,0.230208,0.066706,0.100734,0.107995,0.008351,0.028657,0.101756,1.43449,0.144903,1.417819,0.100046,-966.94388,2e-05,0.71288,0.00014,0.01438,0.08732,0.00016,0.08044,0.00384,2e-05,0.00398,2e-05,0.00376,0.003,0.00118,0.0101,0.0002,0.00824,0.00056,0.00036,0.0003,0.00625,0.006945,0.034816,0.274619,0.263994,1.899694
std,102843.461831,0.271017,0.724532,531607.6,403576.0,14654.089919,370939.3,0.013731,4365.267145,140907.651156,3527.234751,1504.831478,12.117292,0.0,0.383256,0.4016,0.041677,0.450094,0.236449,0.913534,0.507637,0.501257,3.278783,0.11946,0.216496,0.196336,0.266022,0.420039,0.382634,0.211253,0.190515,0.194745,0.106914,0.082107,0.056433,0.112262,0.074713,0.132859,0.098998,0.143612,0.16089,0.079968,0.090352,0.108795,0.042115,0.072682,0.106568,0.084302,0.061407,0.10905,0.073227,0.130491,0.100009,0.142467,0.160685,0.080499,0.095996,0.110116,0.0417,0.073453,0.107822,0.082102,0.057097,0.111065,0.07517,0.132869,0.099323,0.143942,0.161341,0.081183,0.091534,0.110327,0.042122,0.073447,0.105596,2.348349,0.446881,2.329148,0.36078,828.380987,0.004472,0.452423,0.011831,0.119053,0.282306,0.012648,0.271976,0.061849,0.004472,0.062962,0.004472,0.061204,0.054691,0.034331,0.099991,0.014141,0.090401,0.023658,0.01897,0.017318,0.082262,0.118435,0.208376,0.946361,0.623398,1.871267
min,100006.0,0.0,0.0,26460.0,45000.0,1980.0,45000.0,0.000533,-25196.0,-17170.0,-21865.0,-6551.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.014691,5e-06,0.000527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-4185.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189991.0,0.0,0.0,112500.0,270000.0,16542.0,238500.0,0.010006,-19696.0,-2775.0,-7491.0,-4293.0,5.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,2.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333107,0.393668,0.37065,0.0588,0.0442,0.9767,0.6872,0.0078,0.0,0.069,0.1667,0.0833,0.0185,0.0504,0.0458,0.0,0.0,0.0536,0.0403,0.9767,0.6994,0.0071,0.0,0.069,0.1667,0.0833,0.0163,0.0542,0.0429,0.0,0.0,0.0583,0.0436,0.9767,0.6914,0.0078,0.0,0.069,0.1667,0.0833,0.0184,0.0513,0.0462,0.0,0.0,0.0417,0.0,0.0,0.0,0.0,-1580.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,279319.5,0.0,0.0,144000.0,514777.5,24939.0,450000.0,0.01885,-15757.0,-1223.5,-4504.0,-3256.0,9.0,1.0,1.0,0.0,1.0,0.0,0.0,2.0,2.0,2.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.504858,0.564806,0.533482,0.0876,0.0763,0.9816,0.7552,0.0207,0.0,0.1379,0.1667,0.2083,0.0482,0.0748,0.07435,0.0,0.0035,0.084,0.0749,0.9816,0.7583,0.0187,0.0,0.1379,0.1667,0.2083,0.0461,0.0744,0.0731,0.0,0.001,0.0874,0.076,0.9816,0.7585,0.0206,0.0,0.1379,0.1667,0.2083,0.0487,0.0761,0.0748,0.0,0.0029,0.0686,0.0,0.0,0.0,0.0,-765.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
75%,367971.5,0.0,1.0,202500.0,808650.0,34650.0,679500.0,0.028663,-12395.0,-296.0,-1960.0,-1723.75,15.0,1.0,1.0,0.0,1.0,1.0,0.0,3.0,2.0,2.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,0.673628,0.662679,0.667458,0.1474,0.1121,0.9866,0.8164,0.0499,0.12,0.2069,0.3333,0.375,0.0849,0.121,0.129125,0.0039,0.0274,0.1429,0.1125,0.9866,0.8236,0.04735,0.1208,0.2069,0.3333,0.375,0.08345,0.1295,0.1246,0.0039,0.0229,0.1489,0.1116,0.9866,0.8189,0.04985,0.12,0.2069,0.3333,0.375,0.0862,0.1231,0.1295,0.0039,0.0265,0.1258,2.0,0.0,2.0,0.0,-274.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
max,456255.0,1.0,19.0,117000000.0,4050000.0,225000.0,4050000.0,0.072508,-7680.0,365243.0,0.0,0.0,65.0,1.0,1.0,1.0,1.0,1.0,1.0,20.0,3.0,3.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0,0.946098,0.855,0.88253,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,47.0,6.0,47.0,5.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,9.0,6.0,19.0,19.0,23.0


１、DAYS_EMPLOYEDが異常値（365243）を取っている。それにより、平均も上がっている。  
　　　　→365243と０の割合を調べて、0がなければ365243が0であると推測し置き換える。  
２、objectは数値変換する  
３、DAYS_BIRTH、DAYS_REGISTRATION、DAYS_ID_PUBLISHの値がマイナスなので絶対値を取ってプラスに直す。

再度実行することにする。データは１０万件とする。

In [50]:
df_train_former = pd.read_csv('../application_train.csv')
df_train_former['DAYS_EMPLOYED'].where(df_train_former['DAYS_EMPLOYED']!=365243, 0, inplace=True)
df_train_former[['DAYS_BIRTH', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_EMPLOYED']] = np.fabs(df_train_former[['DAYS_BIRTH', 'DAYS_REGISTRATION', 'DAYS_ID_PUBLISH', 'DAYS_EMPLOYED']])
df_train = df_train_former.sample(frac=1)[:100000]
df_X = df_train.drop("TARGET", axis=1)
df_Y = df_train[["TARGET"]]

#カテゴリーを数値変換
categorical_feats = [
    f for f in df_X.columns if df_X[f].dtype == 'object'
]
categorical_feats_ = categorical_feats.copy()

for f in categorical_feats_:
    df_X[f], _ = pd.factorize(df_X[f])
    df_X[f] = df_X[f].astype('int')
    
#残りの欠損値を平均で埋める
df_X = df_X.fillna(df_X.mean())

#データ分割
(X_train, X_test, 
     y_train, y_test) = train_test_split(df_X.values, df_Y.values.flatten(), test_size=0.3)

#標準化
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [51]:
diparameter = {
    "C": [10**i for i in range(-2,4)],
    "random_state": [123],
    "class_weight": ['balanced', None],
}

grid_search = GridSearchCV(LogisticRegression(),
                           param_grid=diparameter,
                           cv=5,
                           scoring='roc_auc',
                          )
grid_search.fit(X_train, y_train)





GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.01, 0.1, 1, 10, 100, 1000], 'random_state': [123], 'class_weight': ['balanced', None]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [52]:
#最良の点数
grid_search.best_score_

0.7441432736796232

In [61]:
# 最良の方法で予測
y_predict_proba_logistic_gridsearched = grid_search.best_estimator_.predict_proba(X_test)

In [62]:
"""
テストデータを読み込み、処理
"""

df_apptest = pd.read_csv('../application_test.csv')

#カテゴリーを数値変換
categorical_feats = [
    f for f in df_apptest.columns if df_apptest[f].dtype == 'object'
]
categorical_feats_ = categorical_feats.copy()

for f in categorical_feats_:
    df_apptest[f], _ = pd.factorize(df_apptest[f])
    df_apptest[f] = df_apptest[f].astype('int')
    
#残りの欠損値を平均で埋める
df_apptest = df_apptest.fillna(df_apptest.mean())

X_apptest = df_apptest.values

#標準化
scaler.fit(X_apptest)
X_apptest = scaler.transform(X_apptest)

y_apt_predict_proba_logistic_gridsearched = grid_search.best_estimator_.predict_proba(X_apptest)

In [63]:
"""
csvファイルへ書き込み
"""

import csv as csv

submit_file = open("home_credit_submit_190203_1.csv", "w")
file_object = csv.writer(submit_file)
file_object.writerow(["SK_ID_CURR", "TARGET"])
file_object.writerows(zip(df_apptest["SK_ID_CURR"].values, y_apt_predict_proba_logistic_gridsearched[:, 1]))
submit_file.close()

In [56]:
#ランダムサーチで最良の方法を確認
param_dist = {
    "C": uniform(loc=0, scale=4),
    "penalty": ['l2'],
    "class_weight": ['balanced', None],
}

random_search = RandomizedSearchCV(
    LogisticRegression(),
   param_distributions=param_dist,
   cv=5,
   scoring='roc_auc',
)
random_search.fit(X_train, y_train)





RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          fit_params=None, iid='warn', n_iter=10, n_jobs=None,
          param_distributions={'C': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1a35e53358>, 'penalty': ['l2'], 'class_weight': ['balanced', None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [57]:
y_predict_proba_logistic_random = random_search.best_estimator_.predict_proba(X_test)

In [59]:
"""
csvファイルへ書き込み
"""

submit_file = open("home_credit_submit_190203_2.csv", "w")
file_object = csv.writer(submit_file)
file_object.writerow(["SK_ID_CURR", "TARGET"])
file_object.writerows(zip(df_apptest["SK_ID_CURR"].values, y_apt_predict_proba_logistic_random[:, 1]))
submit_file.close()