In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBClassifier
import warnings

warnings.filterwarnings('ignore')

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# 處理訓練資料的國家資訊
geo = pd.get_dummies(train["Geography"])
train = pd.concat([train, geo], axis = 1)

# 處理訓練資料的性別資訊
gen = OneHotEncoder(sparse = False)
train["Gender"] = gen.fit_transform(train[["Gender"]])

# 選取訓練資料的變數
x = train[["CreditScore", "Gender", "Tenure", "Balance", "NumOfProducts",
           "HasCrCard", "IsActiveMember", "EstimatedSalary", "France", "Germany", "Spain"]].values
y = train["Exited"].values

# 處理測試資料的國家資訊
geo_test = pd.get_dummies(test["Geography"])
test = pd.concat([test, geo_test], axis = 1)

# 處理測試資料的性別資訊
gen_test = OneHotEncoder(sparse = False)
test["Gender"] = gen_test.fit_transform(test[["Gender"]])

# 選取測試資料的變數
x_test = test[["CreditScore", "Gender", "Tenure", "Balance", "NumOfProducts",
           "HasCrCard", "IsActiveMember", "EstimatedSalary", "France", "Germany", "Spain"]].values

# X_train, X_test, Y_train, Y_test = train_test_split(x, y,
#                                                     test_size = 0.45,
#                                                     random_state = 1234,
#                                                     stratify = y)

classifier = XGBClassifier()

# 設定random search的參數範圍
eta = [x for x in np.linspace(0, 1, 11)]
gamma = [x for x in np.linspace(0, 10, 19)]
max_depth = [x for x in np.linspace(1, 8, 8, dtype = int)]
min_child_weight = [x for x in np.linspace(0, 10, 11, dtype = int)]
subsample = [x for x in np.linspace(0, 1, 11)]

random_grid = {"eta": eta,
               "gamma": gamma,
               "max_depth": max_depth,
               "min_child_weight": min_child_weight,
               "subsample": subsample}

# 執行random search
classifier_random = RandomizedSearchCV(classifier,
                                       param_distributions = random_grid,
                                       n_iter = 50,
                                       cv = 5)
classifier_random = classifier_random.fit(x, y)

# 預測並儲存結果
result = classifier_random.predict(x_test)

result = pd.Series(result, name = "Exited")
result = result.to_frame()

out = pd.concat([test[["RowNumber"]], result], axis = 1)
out.to_csv("xgboost.csv")

# 印出best params
print("best params:", classifier_random.best_params_)

# 評估訓練資料的準確度
classifier_random.score(x, y)

















best params: {'subsample': 0.7000000000000001, 'min_child_weight': 7, 'max_depth': 3, 'gamma': 5.555555555555555, 'eta': 0.1}


0.834875