In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)  #設定可顯示欄位的上限
pd.options.display.float_format = '{:,.6f}'.format  #設定浮點數的格式
import warnings
warnings.filterwarnings("ignore")

#######################################################################################
import matplotlib
import matplotlib.pyplot as plt
#matplotlib內建不支援中文，解決辦法：每次預先指定字體
matplotlib.rcParams.update({'font.size': 36})
matplotlib.use('qt4agg')
myfont = matplotlib.font_manager.FontProperties(fname='C:\\Windows\\Fonts\\msjh.ttc')
#指定字體
matplotlib.rcParams['axes.unicode_minus']=False

from matplotlib.ticker import FormatStrFormatter
majorFormatter = FormatStrFormatter('%0.f') #設定圖表浮點數的格式
%matplotlib inline 
#繪圖完就直接顯示該圖，省略每次繪圖完都要輸入plt.show指令的動作
#######################################################################################


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
from catboost import CatBoostClassifier, Pool, cv

### 由於Hyperopt不支援networkx2.0版本，因此必須將networkx降為1.11版本，才能正常使用Hyperopt
### 指令：pip install networkx==1.11
### 資料來源：https://blog.csdn.net/FontThrone/article/details/79012616

## 讀取資料

In [None]:
# Load the workbook into a DataFrame. No `encoding` argument is passed: Excel files
# carry their own text encoding, and current pandas rejects that keyword on read_excel.
data = pd.read_excel('GF_RFMmodelExample_20180709_W2.xlsx')

In [None]:
# Peek at the first row to check the column layout.
data.iloc[:1]

In [None]:
# Keep one row per game account; when an account repeats, its last occurrence is kept.
data = data.drop_duplicates(subset=['遊戲帳號'], keep='last')
data.info()

In [None]:
# Discard every row that contains at least one missing value, then re-check the summary.
data = data.dropna(axis='index', how='any')
data.info()

## 建立樣本

In [None]:
# Binary target: 0 for rows whose RFM type is one of the three "N_" categories
# listed below, 1 for every other row.
inactive_types = ['N_停儲', 'N_停滯', 'N_退坑']
data['isvalue'] = (~data['RFM中文類型'].isin(inactive_types)).astype('int64')

In [None]:
# 15 features: 10 continuous and 5 categorical (per the original author's note).
feature_columns = [
    '平均儲值日期區間', '儲值次數', '平均儲值金額', '累積儲值金額區間',
    '角色等級', '角色職業', '修真', '轉生級別', '境界', '魔書經驗', '星運',
    '魂力', '傷害力指數', '生存指數', '聲望',
]

# Work on copies so later edits to x / y never touch `data`.
y = data['isvalue'].copy()
x = data[feature_columns].copy()

In [None]:
# Peek at the first feature row.
x.iloc[:1]

In [None]:
# Cast both columns to float64; the categorical-feature detection further down
# treats every non-float64 column as categorical.
x = x.astype({'儲值次數': np.float64, '平均儲值金額': np.float64})

In [None]:
# Locate the categorical features: every column whose dtype is not float64.
print(x.dtypes)

is_categorical = (x.dtypes != np.float64).to_numpy()
# Positional column indices only -- that is the form passed to CatBoost's cat_features.
categorical_features_indices = np.flatnonzero(is_categorical)
print(categorical_features_indices)

## 將資料分割成訓練、測試樣本

In [None]:
# A fixed random_state makes the 80/20 train/test partition identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

print('訓練樣本數目：', len(x_train))
print('測試樣本數目：', len(x_test))

In [None]:
# Peek at the first training row.
x_train.iloc[:1]

## 建立模型
### 使用CatBoost的預設參數

In [None]:
# custom_loss reports additional metrics during training: the default metric is
# Logloss, and 'Accuracy' is tracked alongside it.
# metric_period prints training information once every n iterations.
default_params = {
    'custom_loss': ['Accuracy'],
    'iterations': 500,
    'random_seed': 42,
    'metric_period': 50,
}

model = CatBoostClassifier(**default_params)

In [None]:
# cat_features tells CatBoost which column positions hold categorical features.
# eval_set is the validation/test sample the model is evaluated on while training.
# plot controls whether CatBoost draws the metric curves.
# NOTE(review): the test split doubles as eval_set with use_best_model=True, so the
# retained iteration is chosen on the same rows that are scored afterwards.
model.fit(
    x_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(x_test, y_test),
    use_best_model=True,
    plot=True,
)

## 參數調整前的表現

In [None]:
# Accuracy on the test sample.
y_test_pred = model.predict(x_test)
score = accuracy_score(y_true=y_test, y_pred=y_test_pred)

score

In [None]:
# Accuracy on all samples (the full feature table, training and test rows together).
y_all_pred = model.predict(x)
score = accuracy_score(y_true=y, y_pred=y_all_pred)

score

## 參數調整

### 使用CatBoost推薦的參數調整套件 — Hyperopt
### Hyperopt的操作流程
### 1. 定義目標函數，讓Hyperopt去最小化它
### 2. 定義超參數空間，供Hyperopt選擇空間內最佳的超參數組合
### 3. 定義Hyperopt搜尋的演算法

In [None]:
import hyperopt

In [None]:
# cv為catboost提供的交叉驗證函數

def hyperopt_objective(params):
    """Objective minimised by Hyperopt: 1 - best mean cross-validated accuracy.

    Args:
        params: dict sampled from the search space; only params['learning_rate']
            is read.

    Returns:
        1 minus the highest 'test-Accuracy-mean' value reported by catboost.cv
        over the boosting iterations.
    """
    model = CatBoostClassifier(
        learning_rate=params['learning_rate'],
        iterations=500,
        # cv() takes the objective from the params dict, and get_params() only
        # returns explicitly set values, so the loss is stated here.
        loss_function='Logloss',
        custom_loss=['Accuracy'],
        random_seed=42,
        # Same as skopt_objective: keep per-iteration logs out of the search output.
        verbose=False
    )

    # NOTE(review): cross-validation runs on the full x / y, which includes the rows
    # later used as the test split.
    cv_data = cv(
        Pool(x, y, cat_features=categorical_features_indices),
        model.get_params()
    )
    best_accuracy = np.max(cv_data['test-Accuracy-mean'])

    # Hyperopt minimises, so turn the accuracy into an error rate.
    return 1 - best_accuracy

In [None]:
# trials    keeps every piece of information produced during the search
# fmin      returns the parameters that minimise the objective function
# algo      selects the search algorithm
# max_evals is the number of parameter combinations to try

params_space = dict(
    learning_rate=hyperopt.hp.uniform('learning_rate', 1e-3, 1),
)

trials = hyperopt.Trials()

best = hyperopt.fmin(
    fn=hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
)

print(best)

In [None]:
# Rebuild the classifier with the learning rate found by Hyperopt.
tuned_params = dict(
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    logging_level='Silent',
)
model = CatBoostClassifier(**tuned_params)

model.fit(
    x_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(x_test, y_test),
    use_best_model=True,
    plot=True,
)

## 參數調整後的表現

In [None]:
# Accuracy on the test sample.
y_test_pred = model.predict(x_test)
score = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print('accuracy score: ', score)

In [None]:
# Accuracy on all samples (the full feature table).
y_all_pred = model.predict(x)
score = accuracy_score(y_true=y, y_pred=y_all_pred)

print('accuracy score: ', score)

## 使用skopt調參

In [None]:
from skopt import gp_minimize  
from skopt.space import Real

In [None]:
# Classifier with a fixed learning rate of 0.1.
# NOTE(review): in the cells shown, `model` is reassigned after the skopt search
# before this instance is ever fitted -- confirm whether this cell is still needed.
model = CatBoostClassifier(**{
    'learning_rate': 0.1,
    'iterations': 500,
    'custom_loss': ['Accuracy'],
    'random_seed': 42,
})

In [None]:
def skopt_objective(params):
    """Objective minimised by skopt: 1 - best mean cross-validated accuracy.

    Args:
        params: sequence of sampled values in search-space order; params[0] is
            the learning rate.

    Returns:
        1 minus the highest 'test-Accuracy-mean' value reported by catboost.cv
        over the boosting iterations.
    """
    learning_rate = params[0]

    model = CatBoostClassifier(
        learning_rate=learning_rate,
        iterations=500,
        # cv() takes the objective from the params dict, and get_params() only
        # returns explicitly set values, so the loss is stated here.
        loss_function='Logloss',
        custom_loss=['Accuracy'],
        random_seed=42,
        verbose=False
    )

    # NOTE(review): cross-validation runs on the full x / y, which includes the rows
    # later used as the test split.
    cv_data = cv(
        Pool(x, y, cat_features=categorical_features_indices),
        model.get_params()
    )

    best_accuracy = np.max(cv_data['test-Accuracy-mean'])

    # Progress line: accuracy reached by this candidate learning rate.
    print('\nACC.....',best_accuracy,".....param.....", learning_rate)

    # skopt minimises, so turn the accuracy into an error rate.
    return 1 - best_accuracy

In [None]:
# One-dimensional search space: learning rate in [1e-3, 1].
search_params = [Real(1e-3, 1)]


# random_state pins the optimiser's sampling so the search is repeatable, in line
# with the seed of 42 used for the data split and the models.
# Note: `best` is rebound here, replacing the dict returned by hyperopt.fmin.
best = gp_minimize(
    skopt_objective,
    search_params,
    n_calls=100,
    random_state=42,
)

In [None]:
# Show the optimised parameter values found by skopt.
print(best.x)

skopt_learning_rate = best.x[0]
model = CatBoostClassifier(
    learning_rate=skopt_learning_rate,
    iterations=500,
    custom_loss=['Accuracy'],
    random_seed=42,
)

In [None]:
# Fit the skopt-tuned classifier; the test split is used as the evaluation set.
model.fit(
    x_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(x_test, y_test),
    use_best_model=True,
    plot=True,
)

## Skopt調整參數後的表現

In [None]:
# Accuracy on the test sample.
y_test_pred = model.predict(x_test)
score = accuracy_score(y_true=y_test, y_pred=y_test_pred)

print('accuracy score: ', score)

In [None]:
# Accuracy on all samples (the full feature table).
y_all_pred = model.predict(x)
score = accuracy_score(y_true=y, y_pred=y_all_pred)

print('accuracy score: ', score)

## Hyperopt參數調整後的表現

In [None]:
# Test sample.
# At this point `model` is the skopt-tuned classifier, so scoring it here would
# repeat the skopt numbers under the Hyperopt heading. Re-fit with the learning
# rate Hyperopt selected; it is read from `trials` because `best` now holds the
# gp_minimize result.
model = CatBoostClassifier(
    learning_rate=trials.argmin['learning_rate'],
    iterations=500,
    eval_metric='Accuracy',
    random_seed=42,
    logging_level='Silent')

model.fit(x_train, y_train, cat_features=categorical_features_indices,
          eval_set=(x_test, y_test), use_best_model=True)

y_test_pred = model.predict(x_test)

score = accuracy_score(y_test, y_test_pred)
print('accuracy score: ', score)

In [None]:
# Accuracy on all samples (the full feature table).
y_all_pred = model.predict(x)
score = accuracy_score(y_true=y, y_pred=y_all_pred)

print('accuracy score: ', score)

## 小結
### 在同樣搜尋空間內，hyperopt得到的搜尋結果稍優於skopt。