# アンダーサンプリング＆バギング
### FGの有無を予測する（閾値1600m）

#### データの前処理

In [1]:
import numpy as np
np.random.seed(443)
import pandas as pd
pd.options.display.max_columns = None

from scipy.stats import spearmanr
import re

In [2]:
## Training data
DATA = pd.read_csv("train_data.csv")

# Columns kept for modelling.
# The tendency columns "D_PRES_SFC", "D_TMP_SFC", "D_TD_SFC" are currently left out.
use_valiable = [
    "DateTime", "VIS", "VIS_CAT", "FG", "PRCP_P24HR",
    "RH_SFC", "TMP_SFC", "TD_SFC", "PRES_SFC", "LCDC_SFC", "MCDC_SFC", "HCDC_SFC",
    "WSPD_SFC", "WDIR_SFC", "APCP_SFC", "TimeRange", "MONTH",
    "LL_VWS1", "LL_VWS2", "LL_STBL1", "LL_STBL2", "WARMER_RA",
    "RH_1000", "VVEL_1000", "WSPD_1000", "RH_975", "VVEL_975", "WSPD_975",
    "RH_950", "VVEL_950", "WSPD_950", "RH_850", "RH_700", "RH_500", "RH_300",
]
DATA = DATA[use_valiable].drop("DateTime", axis=1)

# One-hot encode the categorical variables
cat_val = ['WDIR_SFC', 'TimeRange', 'MONTH']
DATA = pd.get_dummies(data=DATA, columns=cat_val)

# Interaction terms: past-24h precipitation times a time-slot dummy.
# The idea is that this may capture fog being more likely at night after rain.
interaction_pairs = [
    ("Time_12-14_RAp24hr", "TimeRange_12-14"),
    ("Time_15-17_RAp24hr", "TimeRange_15-17"),
    ("Time_18-20_RAp24hr", "TimeRange_18-20"),
    ("Time_21-23_RAp24hr", "TimeRange_21-23"),
]
for new_col, dummy_col in interaction_pairs:
    DATA[new_col] = DATA["PRCP_P24HR"] * DATA[dummy_col]

In [3]:
## Data for the prediction test: same column selection and dummy encoding as the training frame
TEST = pd.read_csv("test_data.csv")
TEST = pd.get_dummies(
    data=TEST[use_valiable].drop("DateTime", axis=1),
    columns=cat_val,
)

# Past-24h precipitation times a time-slot dummy, mirroring the training features
for new_col, dummy_col in [
    ("Time_12-14_RAp24hr", "TimeRange_12-14"),
    ("Time_15-17_RAp24hr", "TimeRange_15-17"),
    ("Time_18-20_RAp24hr", "TimeRange_18-20"),
    ("Time_21-23_RAp24hr", "TimeRange_21-23"),
]:
    TEST[new_col] = TEST["PRCP_P24HR"] * TEST[dummy_col]

In [4]:
target = 'FG'
exclude = ['VIS','VIS_CAT','FG','PRCP_P24HR']
features = [col for col in DATA.columns if col not in exclude]

# Build ratio features: for every pair of base features whose Spearman rank
# correlation exceeds the cutoff in absolute value, add first / (second + 0.001).
cutoff_r = 0.5
new_added_col = []
for i, first_col_name in enumerate(features[:-1]):
    for second_col_name in features[i + 1:]:
        r = spearmanr(DATA[first_col_name], DATA[second_col_name]).correlation
        # Keep the comparison in this form: a NaN correlation compares False and is skipped.
        if abs(r) > cutoff_r:
            new_colname = first_col_name + "_div_" + second_col_name
            denominator = DATA[second_col_name] + 0.001
            DATA[new_colname] = DATA[first_col_name] / denominator
            new_added_col.append(new_colname)
features = features + new_added_col

# Re-create the added ratio features on the test frame, recovering the two
# source columns from the "_div_" separator in the feature name.
for feature in features:
    if '_div_' in feature:
        feature1, feature2 = feature.split("_div_")
        TEST[feature] = TEST[feature1] / (TEST[feature2] + 0.001)

In [5]:
# Display (rows, columns) of the training and test frames after feature engineering.
DATA.shape, TEST.shape

((26181, 125), (8137, 125))

### RandomForestのハイパーパラメータを最適化

In [6]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score

% matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams

In [7]:
# Undersampling関数
# https://qiita.com/ryouta0506/items/619d9ac0d80f8c0aed92
# Undersamplingするとき、まずクラスタリングし、
# 各クラスターの標本数と同じ割合で、各クラスターからサンプリングする

# Under-sampling function (X: data to under-sample, num: number of rows to draw, label: label of the majority class)
def Undersampling_Kmeans(X, num, label, n_clusters=8):
    """Draw roughly ``num`` rows from ``X``, stratified by K-means cluster.

    ``X`` is clustered with K-means; each cluster then contributes a number of
    rows proportional to its share of ``X`` (sampled with replacement), so the
    under-sampled data keeps the cluster composition of the original.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows to under-sample. The frame is not modified.
    num : int
        Target number of rows. Because each cluster quota is rounded
        independently, the returned row count can differ slightly from ``num``.
    label : object
        Unused; kept so existing callers keep working.
    n_clusters : int, default 8
        Number of K-means clusters.

    Returns
    -------
    pandas.DataFrame
        The sampled rows with the same columns as ``X``.
    """
    # Local import, as in the original implementation.
    from sklearn.cluster import KMeans

    km = KMeans(n_clusters=n_clusters, init="k-means++")
    km.fit(X)

    # Keep the cluster ids outside of X so the caller's frame never gains a
    # 'Cluster' column.
    cluster_ids = np.asarray(km.predict(X))

    # Rows per cluster, indexed by cluster id. Reindexing guarantees one entry
    # per id even if a cluster received no rows.
    counts = (
        pd.Series(cluster_ids)
        .value_counts()
        .reindex(np.arange(n_clusters), fill_value=0)
        .values
    )
    ratio = counts / counts.sum()
    samp_num = np.round(ratio * num, 0).astype(np.int32)

    # Sample each cluster according to its quota; clusters with a zero quota are skipped.
    sampled = [
        X[cluster_ids == i].sample(int(samp_num[i]), replace=True)
        for i in range(n_clusters)
        if samp_num[i] > 0
    ]
    if not sampled:
        # Nothing requested (e.g. num == 0): return an empty frame with X's columns.
        return X.iloc[0:0].copy()
    return pd.concat(sampled)

### クラスタリングも合わせたUndersampling
def Clustered_Undersampling( data, col_name ):
    """Balance a binary data set by cluster-stratified under-sampling of class 0.

    Rows where ``col_name`` equals 0 are reduced with ``Undersampling_Kmeans``
    to about as many rows as there are rows where ``col_name`` equals 1; the
    two groups are then concatenated with a fresh index.
    """
    minority = data[data[col_name] == 1]
    majority_features = data[data[col_name] == 0].drop(col_name, axis=1)

    # Ask for as many majority rows as there are minority rows.
    sampled_majority = Undersampling_Kmeans(majority_features, minority.shape[0], 0)
    # The label column was dropped before clustering; put it back.
    sampled_majority[col_name] = 0

    return pd.concat([sampled_majority, minority], ignore_index=True)

### 通常のUndersampling
def Normal_Undersampling( data, target ):
    """Randomly under-sample every class except the minority class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature columns and the label column.
    target : str
        Name of the label column.

    Returns
    -------
    pandas.DataFrame
        Resampled rows; feature columns first, ``target`` as the last column.
    """
    # Local import, as in the original implementation.
    from imblearn.under_sampling import RandomUnderSampler

    # Compare column names against an explicit list: `val not in target` with a
    # str target is a substring test and would also drop columns such as "F".
    target_names = [target] if isinstance(target, str) else list(target)
    features = [val for val in data.columns if val not in target_names]
    y = np.array( data[target] )
    X = np.array( data[features] )

    # Newer imbalanced-learn releases use `sampling_strategy` / `fit_resample`;
    # fall back to the legacy `ratio` / `fit_sample` spelling on old installs.
    try:
        rus = RandomUnderSampler(sampling_strategy='not minority')
    except TypeError:
        rus = RandomUnderSampler(ratio='not minority')
    resample = getattr(rus, 'fit_resample', None)
    if resample is None:
        resample = rus.fit_sample
    X_rus, y_rus = resample(X, y)

    new_df = pd.DataFrame( X_rus, columns=features )
    new_df[target] = y_rus
    return new_df

In [51]:
# The estimator and the grid are the same in every round; GridSearchCV works on
# clones of `rf`, so they can be created once up front.
rf = RandomForestClassifier(class_weight='balanced', random_state=443)
params = {
    'n_estimators': [1000, 5000],
    'max_depth': [3],
    'max_features': [10, 20],
}

for i in range(10):
    print("Loop {}".format(i))

    ## Undersampling: draw a fresh random under-sample each round
    ## (Clustered_Undersampling(DATA, "FG") is the alternative sampler)
    DATA_us = Normal_Undersampling(DATA, "FG")
    X = np.array(DATA_us[features])
    y = np.array(DATA_us[target])

    ## Hyperparameter tuning by grid search; 'roc_auc' and 'f1' were tried as
    ## alternative scoring choices, 'accuracy' is the one in use.
    gcv = GridSearchCV(
        rf,
        param_grid=params,
        n_jobs=-1,
        cv=3,
        scoring='accuracy',
        verbose=0,
    )
    gcv.fit(X, y)
    print(gcv.best_params_)
    print(gcv.best_score_)

Loop 0
{'max_depth': 3, 'max_features': 10, 'n_estimators': 1000}
0.8452380952380952
Loop 1
{'max_depth': 3, 'max_features': 20, 'n_estimators': 1000}
0.8544973544973545
Loop 2
{'max_depth': 3, 'max_features': 10, 'n_estimators': 5000}
0.8492063492063492
Loop 3
{'max_depth': 3, 'max_features': 20, 'n_estimators': 1000}
0.8505291005291006
Loop 4
{'max_depth': 3, 'max_features': 10, 'n_estimators': 1000}
0.8465608465608465
Loop 5
{'max_depth': 3, 'max_features': 20, 'n_estimators': 1000}
0.8518518518518519
Loop 6
{'max_depth': 3, 'max_features': 20, 'n_estimators': 1000}
0.8386243386243386
Loop 7
{'max_depth': 3, 'max_features': 20, 'n_estimators': 1000}
0.8584656084656085
Loop 8
{'max_depth': 3, 'max_features': 20, 'n_estimators': 5000}
0.8412698412698413
Loop 9
{'max_depth': 3, 'max_features': 10, 'n_estimators': 1000}
0.832010582010582


In [22]:
# Hyperparameters used for the base random forest
# (presumably picked from the repeated grid-search output above — confirm):
# 'n_estimators': 1000
# 'max_depth': 3
# 'max_features': 20

# RandomForest settings
rf = RandomForestClassifier(n_estimators=1000, max_depth=3, max_features=20, random_state=443)

### Balanced Bagging Classifier

In [9]:
# スレットスコアが最も高くなる閾値を求める
def Best_TS_Cutoff(y_test, y_proba):
    """Return the probability cutoff (0.01 .. 0.99) with the highest threat score.

    Threat score = hits / (hits + false alarms + misses), where an event is
    forecast when the class-1 probability is strictly greater than the cutoff.

    Parameters
    ----------
    y_test : array-like of 0/1
        Observed labels; 1 marks the event.
    y_proba : array-like, shape (n_samples, 2)
        Class probabilities; column 1 is the probability of the event.

    Returns
    -------
    float
        The cutoff, a multiple of 0.01.
    """
    observed = np.asarray(y_test) == 1
    event_proba = np.asarray(y_proba)[:, 1]

    accuracy = []
    for i in range(1, 100):
        forecast = event_proba > i / 100
        # Count the contingency cells directly, so the result does not depend
        # on which classes happen to be present in the data.
        hits = np.sum(forecast & observed)
        false_alarms = np.sum(forecast & ~observed)
        misses = np.sum(~forecast & observed)
        denom = hits + false_alarms + misses
        # An undefined score (0/0) must never win: NaN would be ranked last by
        # argsort and therefore be selected as the "best" cutoff.
        accuracy.append(hits / denom if denom > 0 else -np.inf)
    return (np.argsort(accuracy)[-1] + 1) / 100

# Equitable Threat Scoreを使う
def Best_ETS_Cutoff(y_test, y_proba):
    """Return the probability cutoff (0.01 .. 0.99) with the highest Equitable Threat Score.

    ETS = (hits - Sf) / (hits + false alarms + misses - Sf), where
    Sf = Pc * (hits + false alarms) is the number of hits expected by chance
    and Pc is the observed event frequency in ``y_test``. An event is forecast
    when the class-1 probability is strictly greater than the cutoff.

    Parameters
    ----------
    y_test : array-like of 0/1
        Observed labels; 1 marks the event.
    y_proba : array-like, shape (n_samples, 2)
        Class probabilities; column 1 is the probability of the event.

    Returns
    -------
    float
        The cutoff, a multiple of 0.01.
    """
    observed = np.asarray(y_test) == 1
    event_proba = np.asarray(y_proba)[:, 1]
    Pc = np.sum(observed) / len(observed)  # climatological event frequency

    accuracy = []
    for i in range(1, 100):
        forecast = event_proba > i / 100
        # Count the contingency cells directly, so the result does not depend
        # on which classes happen to be present in the data.
        hits = np.sum(forecast & observed)
        false_alarms = np.sum(forecast & ~observed)
        misses = np.sum(~forecast & observed)
        Sf = Pc * (hits + false_alarms)  # hits expected from a random forecast
        denom = false_alarms + misses + hits - Sf
        # An undefined score (zero denominator) must never win: NaN would be
        # ranked last by argsort and therefore be selected as the "best" cutoff.
        accuracy.append((hits - Sf) / denom if denom != 0 else -np.inf)
    return (np.argsort(accuracy)[-1] + 1) / 100

# 評価計算を出力
def Evaluation(y_test, y_proba):
    """Pick the ETS-optimal cutoff on the given data and print the scores at that cutoff.

    Prints AUC, the chosen cutoff, the classification report, the confusion
    matrix, threat score, ETS and bias index (BI: forecast events / observed events).

    Parameters
    ----------
    y_test : numpy.ndarray of 0/1
        Observed labels; 1 marks the event.
    y_proba : numpy.ndarray, shape (n_samples, 2)
        Class probabilities; column 1 is the probability of the event.

    Returns
    -------
    float
        The cutoff chosen by ``Best_ETS_Cutoff``.
    """
    # Best_TS_Cutoff( y_test, y_proba ) is the alternative selection rule.
    cutoff = Best_ETS_Cutoff( y_test, y_proba )
    # Builtin int instead of np.int, which was removed from NumPy.
    y_fcst = (y_proba[:,1] > cutoff).astype(int)

    print( "AUC : {}" . format( roc_auc_score(y_test, y_proba[:,1]) ) )
    print( "" )
    print( "cutoff : {}" . format(cutoff) )
    print( classification_report(y_true=y_test, y_pred=y_fcst) )
    # Fix the label order so the matrix is always 2x2, even when one class is
    # missing from both the observations and the forecasts.
    matrix = confusion_matrix(y_true=y_test, y_pred=y_fcst, labels=[0, 1])
    print( "Confusion Matrix" )
    print( matrix )
    print( "" )
    print( "Threat Score : {}" . format( matrix[1,1]/(matrix[0,1]+matrix[1,0]+matrix[1,1]) ) )
    Pc = len(y_test[ y_test == 1 ]) / len(y_test)  # observed event frequency
    Sf = Pc * ( matrix[1,1] + matrix[0,1] )  # hits expected from a random forecast
    print( "ETS : {}" . format( (matrix[1,1]-Sf)/(matrix[0,1]+matrix[1,0]+matrix[1,1]-Sf) ) )
    print( "BI : {}" . format( matrix[:,1].sum() / matrix[1,:].sum() ) )
    return cutoff

In [10]:
# データセット
# Full training matrix / labels, then a 70/30 train/validation split with a fixed seed
X = np.array( DATA[features] )
Y = np.array( DATA[target] )
X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, test_size=0.3, random_state=443)

# Test set built from the separately loaded TEST frame, using the same feature columns
X_test = np.array( TEST[features] )
Y_test = np.array( TEST[target] )

In [13]:
# Undersampling & Bagging
# Wrap the random forest `rf` in a BalancedBaggingClassifier and grid-search its n_estimators
bbc = BalancedBaggingClassifier(base_estimator=rf, ratio='not minority', random_state=443)

params = {'n_estimators' :[100, 250, 500]}
gcv = GridSearchCV(bbc, param_grid=params, n_jobs=-1, cv=3, scoring='f1', verbose=2)
# NOTE(review): the search is fitted on the full X, Y, which includes the rows
# split off as X_valid / Y_valid — confirm this is intended.
gcv.fit(X, Y)

print( gcv.best_params_ )
print( gcv.best_score_ )

Fitting 3 folds for each of 3 candidates, totalling 9 fits
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=100 ................................................
[CV] n_estimators=250 ................................................
[CV] ................................. n_estimators=100, total= 6.1min
[CV] n_estimators=250 ................................................
[CV] ................................. n_estimators=100, total= 6.1min
[CV] n_estimators=250 ................................................
[CV] ................................. n_estimators=100, total= 6.2min
[CV] n_estimators=500 ................................................
[CV] ................................. n_estimators=250, total=15.3min
[CV] n_estimators=500 ................................................
[CV] ................................. n_estimators=250, total=15.3min
[CV] n_estimators=

[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed: 45.0min remaining: 12.8min


[CV] ................................. n_estimators=500, total=26.9min
[CV] ................................. n_estimators=500, total=24.5min


[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 58.2min finished


{'n_estimators': 500}
0.14103767607738887


In [23]:
# Fit the balanced bagging ensemble on the training split and choose the cutoff on the validation split.
# NOTE(review): the grid-search output recorded in this notebook reports
# n_estimators=500 as best, while 100 is used here — confirm this is deliberate.
bbc = BalancedBaggingClassifier(base_estimator=rf, n_estimators=100, ratio='not minority', n_jobs=-1, random_state=443)
bbc.fit(X_train, Y_train)
y_proba = bbc.predict_proba(X_valid)
# Evaluation prints the validation scores and returns the ETS-optimal cutoff
cutoff = Evaluation( Y_valid, y_proba )

AUC : 0.9293301727479715

cutoff : 0.82
             precision    recall  f1-score   support

          0       0.99      0.97      0.98      7739
          1       0.21      0.47      0.29       116

avg / total       0.98      0.97      0.97      7855

Confusion Matrix
[[7537  202]
 [  61   55]]

Threat Score : 0.17295597484276728
ETS : 0.1629660812988893
BI : 2.2155172413793105


### 予測実験

In [15]:
def Final_Accuracy(y_test, y_proba, cutoff):
    """Print the verification scores for a fixed, previously chosen cutoff.

    Prints AUC, the cutoff, the classification report, the confusion matrix,
    threat score, ETS and bias index (BI: forecast events / observed events).

    Parameters
    ----------
    y_test : numpy.ndarray of 0/1
        Observed labels; 1 marks the event.
    y_proba : numpy.ndarray, shape (n_samples, 2)
        Class probabilities; column 1 is the probability of the event.
    cutoff : float
        An event is forecast when the class-1 probability is strictly greater
        than this value.
    """
    # Builtin int instead of np.int, which was removed from NumPy.
    y_fcst = (y_proba[:,1] > cutoff).astype(int)

    print( "AUC : {}" . format( roc_auc_score(y_test, y_proba[:,1]) ) )
    print( "" )
    print( "cutoff : {}" . format(cutoff) )
    print( classification_report(y_true=y_test, y_pred=y_fcst) )
    # Fix the label order so the matrix is always 2x2, even when one class is
    # missing from both the observations and the forecasts.
    matrix = confusion_matrix(y_true=y_test, y_pred=y_fcst, labels=[0, 1])
    print( "Confusion Matrix" )
    print( matrix )
    print( "" )
    print( "Threat Score : {}" . format( matrix[1,1]/(matrix[0,1]+matrix[1,0]+matrix[1,1]) ) )
    Pc = len(y_test[ y_test == 1 ]) / len(y_test)  # observed event frequency
    Sf = Pc * ( matrix[1,1] + matrix[0,1] )  # hits expected from a random forecast
    print( "ETS : {}" . format( (matrix[1,1]-Sf)/(matrix[0,1]+matrix[1,0]+matrix[1,1]-Sf) ) )
    print( "BI : {}" . format( matrix[:,1].sum() / matrix[1,:].sum() ) )

In [24]:
# 予測テスト
# Score the test set with the fitted ensemble, reusing the cutoff returned by Evaluation on the validation split
y_proba = bbc.predict_proba(X_test)
Final_Accuracy( Y_test, y_proba, cutoff )

AUC : 0.9022949072712695

cutoff : 0.82
             precision    recall  f1-score   support

          0       0.99      0.97      0.98      8038
          1       0.16      0.37      0.22        99

avg / total       0.98      0.97      0.97      8137

Confusion Matrix
[[7837  201]
 [  62   37]]

Threat Score : 0.12333333333333334
ETS : 0.11478909535237915
BI : 2.404040404040404
