In [None]:
%run /home/jovyan/work/utils/pipelines.ipynb
X, y = data_tieandwin()
feature_name_list = ['X'+str(i+1) for i in range(X.shape[1])]
from sklearn.preprocessing import StandardScaler
scal = StandardScaler()
scal.fit(X)
X = scal.transform(X) 
raw_data = change_result_from_raw2data()

## Methods

In [None]:
import json
def get_configs(path='/root/work/4_data_modelling/configs.json'):
    """Load the saved per-model tuning results from the JSON config file.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file. Defaults to the notebook's shared config file.

    Returns
    -------
    dict
        The decoded JSON content. Entries written by ``append_to_config_file``
        have the shape ``{model_name: {'f1_score': ..., 'params': {...}}}``.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    json.JSONDecodeError
        If the file is not valid JSON.
    """
    # The context manager closes the handle even if decoding fails.
    with open(path) as input_file:
        return json.load(input_file)

def append_to_config_file(model_name, score, params,
                          path='/root/work/4_data_modelling/configs.json'):
    """Store (or overwrite) one model's tuning result in the JSON config file.

    Parameters
    ----------
    model_name : str
        Key under which the result is stored.
    score : float
        Score to record; saved under the ``'f1_score'`` key.
    params : dict
        Hyperparameters to record. Values that carry a ``__dict__`` (e.g. nested
        estimator objects) are skipped; NumPy scalars are converted to the
        equivalent built-in Python value so that JSON can encode them.
    path : str, optional
        Location of the JSON file. Defaults to the notebook's shared config file.

    Raises
    ------
    TypeError
        If a remaining value is not JSON-serialisable. The existing file is
        left untouched in that case.
    """
    import os

    serializable = {}
    for key, value in params.items():
        if hasattr(value, '__dict__'):
            # Objects such as estimators cannot be written to JSON; drop them.
            continue
        if isinstance(value, np.generic):
            # Covers np.int64 as well as np.int32, np.bool_, np.float32, ...
            value = value.item()
        serializable[key] = value
    if isinstance(score, np.generic):
        score = score.item()

    # Start from an empty mapping on the very first run instead of crashing.
    if os.path.exists(path):
        with open(path) as input_file:
            content = json.load(input_file)
    else:
        content = {}
    content[model_name] = {'f1_score': score, 'params': serializable}

    # Serialise BEFORE opening for writing: opening with 'w' truncates the file,
    # so a serialisation error afterwards would wipe every stored config.
    payload = json.dumps(content)
    with open(path, 'w') as output_file:
        output_file.write(payload)

import time
def format_time(sec):
    """Render a duration given in seconds as a compact ``'1d 2h 3m 4.5s'`` string.

    Day, hour and minute components are omitted when they are zero; the seconds
    component is always present and rounded to two decimals.

    Parameters
    ----------
    sec : int or float
        Duration in seconds. Negative durations are rendered with a leading '-'.

    Returns
    -------
    str
        For example ``'1m 24.35s'`` for ``84.35`` and ``'1h 0s'`` for ``3600``.
    """
    if sec < 0:
        return '-' + format_time(-sec)

    # divmod carries exact multiples into the larger unit, so 3600 becomes
    # '1h 0s' (strict '>' comparisons previously produced '60m 0s').
    days, rest = divmod(sec, 24 * 60 * 60)
    hours, rest = divmod(rest, 60 * 60)
    minutes, rest = divmod(rest, 60)

    prefix = ''.join(
        '{}{} '.format(int(amount), unit)
        for amount, unit in ((days, 'd'), (hours, 'h'), (minutes, 'm'))
        if amount != 0
    )
    return '{}{}s'.format(prefix, round(rest, 2))

## Hyperparameter tuning

In [None]:
from sklearn.model_selection import cross_val_predict, RandomizedSearchCV, GridSearchCV, RepeatedStratifiedKFold

def tunning_hyperparameter(X, y, model, random_hyperparameters={}, grid_hyperparameters={},  scoring='f1', 
    kFoldparams={}, random_params = {}, grid_params = {}
):
    """Two-stage hyperparameter search: randomized search, then an optional grid refinement.

    Stage 1 runs ``RandomizedSearchCV`` over ``random_hyperparameters``.
    Stage 2 (only when ``grid_hyperparameters`` is non-empty) runs ``GridSearchCV``
    on a copy of ``model`` that already carries the stage-1 winners, so the grid
    score describes the same configuration that is returned.

    Parameters
    ----------
    X, y : array-like
        Training features and labels, passed to ``fit`` of both searches.
    model : estimator
        Estimator to tune. It is never modified; both searches work on clones.
    random_hyperparameters : dict
        Parameter distributions/lists for the randomized stage.
    grid_hyperparameters : dict
        Grid for the refinement stage. A value may be a list of candidates or a
        callable that receives the stage-1 best value for that key and returns
        the candidates.
    scoring : str
        Scoring passed to both searches.
    kFoldparams : dict
        When non-empty, both stages use a ``RepeatedStratifiedKFold`` built from
        ``n_splits=10, n_repeats=10, random_state=66`` overridden by these
        entries. When empty, the searches keep scikit-learn's default CV.
        NOTE(review): the original built this splitter unconditionally but never
        passed it to either search; it is opt-in here so the default runtime
        does not change.
    random_params, grid_params : dict
        Extra keyword arguments for the respective search object; they take
        precedence over the defaults set here (``n_iter=20`` for the random
        stage, ``error_score='raise'`` for the grid stage).

    Returns
    -------
    (best_score, best_params) : tuple
        Score and merged parameters of the last stage that ran.

    Raises
    ------
    KeyError
        If a callable grid entry refers to a key the random stage did not tune.
    """
    from sklearn.base import clone

    print()
    start = time.time()

    cv = None
    if kFoldparams:
        # Merge instead of passing both explicit and ** arguments, which raised
        # TypeError as soon as the caller overrode n_splits/n_repeats/random_state.
        cv_settings = {'n_splits': 10, 'n_repeats': 10, 'random_state': 66}
        cv_settings.update(kFoldparams)
        cv = RepeatedStratifiedKFold(**cv_settings)

    random_kwargs = {'scoring': scoring, 'n_iter': 20, 'cv': cv}
    random_kwargs.update(random_params)
    random_search = RandomizedSearchCV(model, random_hyperparameters, **random_kwargs)
    random_search.fit(X, y)
    best_score = random_search.best_score_
    best_params = random_search.best_params_
    print('random:', best_score, best_params)

    if len(grid_hyperparameters) != 0:
        inner_grid_params = {}
        for key, candidates in grid_hyperparameters.items():
            if callable(candidates):
                if key not in best_params:
                    raise KeyError(
                        "grid_hyperparameters[{!r}] is callable but the random "
                        "search did not tune that key".format(key))
                inner_grid_params[key] = candidates(best_params[key])
            else:
                inner_grid_params[key] = candidates

        # Refine around the stage-1 winner, not around the estimator's defaults;
        # otherwise the reported score belongs to a different configuration
        # than the merged parameter dict returned below.
        tuned_model = clone(model).set_params(**best_params)
        grid_kwargs = {'scoring': scoring, 'error_score': 'raise', 'cv': cv}
        grid_kwargs.update(grid_params)
        grid_search = GridSearchCV(tuned_model, inner_grid_params, **grid_kwargs)
        grid_search.fit(X, y)
        best_params = {**best_params, **grid_search.best_params_}
        best_score = grid_search.best_score_
        print('grid:', best_score, best_params)

    end = time.time()
    print('time:', format_time(end - start))
    return best_score, best_params

In [None]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn_random_hyperparameters = {
    'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm used to compute the nearest neighbors
    'weights': ['uniform','distance'], # Weight function used in prediction
    'leaf_size':np.arange(20, 200, 5),# Leaf size passed to BallTree or KDTree
    'n_neighbors':np.arange(5,200,5) # Number of neighbors 
}
knn_grid_hyperparameters = {
    # 'leaf_size':lambda v:np.arange(v-5, v+5, 1),
    # Refine n_neighbors in a window around the random-search winner. The lower
    # bound is clamped to 1: with v == 5 the unclamped window starts at 0, which
    # is not a valid n_neighbors and aborts the grid search (error_score='raise').
    'n_neighbors':lambda v:np.arange(max(1, v-5), v+5, 1)
}
knn_score, knn_params = tunning_hyperparameter(X, y, knn, knn_random_hyperparameters, knn_grid_hyperparameters)
append_to_config_file('knn', knn_score, knn_params)


random: 0.8350680605680584 {'weights': 'distance', 'n_neighbors': 195, 'leaf_size': 135, 'algorithm': 'ball_tree'}
grid: 0.835571633222479 {'weights': 'distance', 'n_neighbors': 199, 'leaf_size': 135, 'algorithm': 'ball_tree'}
time: 1m 24.35s


In [None]:
# ## Support vector machine classifiers are non-probabilistic!
from sklearn.linear_model import SGDClassifier

sgdc = SGDClassifier()
# Nothing is sampled in the random stage; it only scores the default estimator.
sgdc_random_hyperparameters = {

}
sgdc_grid_hyperparameters = {
    # 'learning_rate':['constant','optimal','invscaling','adaptive'],
    # warm_start is a boolean flag. The previous candidates were the strings
    # 'Ture' and 'False', which are both truthy and therefore never tested
    # warm_start=False (and a misspelt value ended up in the saved config).
    'warm_start':[True, False],
    'loss':['modified_huber']
}
sgdc_score, sgdc_params = tunning_hyperparameter(X, y, sgdc, sgdc_random_hyperparameters, sgdc_grid_hyperparameters)
append_to_config_file('sgdc', sgdc_score, sgdc_params)


random: 0.6856822945068367 {}
grid: 0.7548988849147135 {'loss': 'modified_huber', 'warm_start': 'Ture'}
time: 9.21s


In [None]:
from sklearn.naive_bayes import GaussianNB
gb = GaussianNB()
# Both search spaces are empty: the random stage scores the default estimator
# and the grid stage is skipped.
gb_random_hyperparameters = dict()
gb_grid_hyperparameters = dict()
gb_score, gb_params = tunning_hyperparameter(
    X, y, gb,
    random_hyperparameters=gb_random_hyperparameters,
    grid_hyperparameters=gb_grid_hyperparameters,
)
append_to_config_file('gb', gb_score, gb_params)


random: 0.7664893896816973 {}
time: 0.11s


In [None]:
# # GB --0.67  0.76 bag 0.8349   -- no parameters
from sklearn.ensemble import BaggingClassifier

GB = GaussianNB()
# Bagging ensemble whose base estimator is a fresh GaussianNB.
GBbag = BaggingClassifier(GaussianNB())
# Nothing to sample in the random stage; only the bagging ratios are searched.
GBbag_random_hyperparameters = dict()
GBbag_grid_hyperparameters = dict(
    max_samples=[.7, 0.8],
    max_features=[.3, 0.2],
)
GBbag_score, GBbag_params = tunning_hyperparameter(
    X, y, GBbag,
    random_hyperparameters=GBbag_random_hyperparameters,
    grid_hyperparameters=GBbag_grid_hyperparameters,
)
append_to_config_file('GBbag', GBbag_score, GBbag_params)



random: 0.7678094315683565 {}
grid: 0.8355755504951647 {'max_features': 0.3, 'max_samples': 0.8}
time: 3.62s


In [None]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier()
# Show the estimator defaults for reference.
print(mlp.get_params())

# Random stage: every entry lists a single value, so one fixed configuration is scored.
mlp_random_hyperparameters = dict(
    hidden_layer_sizes=[(128,64,80,10,1)],
    activation=['relu'],
    # solver=['lbfgs', 'sgd', 'adam'],
    # learning_rate=['constant', 'invscaling', 'adaptive'],
)
# Empty grid: the refinement stage is skipped for the MLP.
mlp_grid_hyperparameters = dict(
    # hidden_layer=[-1,1,1]
)
mlp_score, mlp_params = tunning_hyperparameter(
    X, y, mlp,
    random_hyperparameters=mlp_random_hyperparameters,
    grid_hyperparameters=mlp_grid_hyperparameters,
)
append_to_config_file('mlp', mlp_score, mlp_params)

{'activation': 'relu', 'alpha': 0.0001, 'batch_size': 'auto', 'beta_1': 0.9, 'beta_2': 0.999, 'early_stopping': False, 'epsilon': 1e-08, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant', 'learning_rate_init': 0.001, 'max_fun': 15000, 'max_iter': 200, 'momentum': 0.9, 'n_iter_no_change': 10, 'nesterovs_momentum': True, 'power_t': 0.5, 'random_state': None, 'shuffle': True, 'solver': 'adam', 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': False, 'warm_start': False}

random: 0.807334554809783 {'hidden_layer_sizes': (128, 64, 80, 10, 1), 'activation': 'relu'}
time: 9m 4.1s


In [None]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
# Show the estimator defaults for reference.
print(dt.get_params())

# Nothing is sampled in the random stage; the grid covers both split criteria
# and both splitter strategies.
dt_random_hyperparameters = dict()
dt_grid_hyperparameters = dict(
    criterion=['gini','entropy'],
    splitter=['best','random'],
)
dt_score, dt_params = tunning_hyperparameter(
    X, y, dt,
    random_hyperparameters=dt_random_hyperparameters,
    grid_hyperparameters=dt_grid_hyperparameters,
)
append_to_config_file('dt', dt_score, dt_params)

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': None, 'splitter': 'best'}

random: 0.7209701297331708 {}
grid: 0.730299014920599 {'criterion': 'entropy', 'splitter': 'best'}
time: 4.11s


In [None]:
from sklearn.ensemble import ExtraTreesClassifier

et = ExtraTreesClassifier()

et_random_hyperparameters = {
    # 'criterion':['gini','entropy'],
    'n_estimators':np.arange(1,80),
    'min_samples_split':[2],
    'max_depth':[None]
}
et_grid_hyperparameters = {
    # Refine n_estimators around the random-search winner. The lower bound is
    # clamped to 1: for winners <= 3 the unclamped window reaches 0 or below,
    # which is not a valid n_estimators and aborts the grid search.
    'n_estimators':lambda v:np.arange(max(1, v-3), v+3)
}
et_score, et_params = tunning_hyperparameter(X, y, et, et_random_hyperparameters, et_grid_hyperparameters)
append_to_config_file('et', et_score, et_params)


random: 0.8323653747292126 {'n_estimators': 64, 'min_samples_split': 2, 'max_depth': None}
grid: 0.8325942166442328 {'n_estimators': 61, 'min_samples_split': 2, 'max_depth': None}
time: 1m 6.59s


In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_random_hyperparameters = {
    'criterion':['gini','entropy'],
    'n_estimators':np.arange(1,80)
}
rf_grid_hyperparameters = {
    # Refine n_estimators around the random-search winner. The lower bound is
    # clamped to 1: for winners <= 3 the unclamped window reaches 0 or below,
    # which is not a valid n_estimators and aborts the grid search.
    'n_estimators':lambda v:np.arange(max(1, v-3), v+3)
}
rf_score, rf_params = tunning_hyperparameter(X, y, rf, rf_random_hyperparameters, rf_grid_hyperparameters)
append_to_config_file('rf', rf_score, rf_params)


random: 0.8307855557513586 {'n_estimators': 73, 'criterion': 'entropy'}
grid: 0.8302442203617231 {'n_estimators': 75, 'criterion': 'entropy'}
time: 1m 22.93s


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Tune the gradient-boosting model itself. The previous version immediately
# rebound `gbc` to a RandomForestClassifier, so the parameters stored under
# 'gbc' were tuned on a different model than the GradientBoostingClassifier
# that later consumes them in the ensemble.
gbc = GradientBoostingClassifier()

gbc_random_hyperparameters = {
    # 'loss':['deviance','exponential'],
    'n_estimators':np.arange(1,500,50),
    # 'criterion':['friedman_mse', 'squared_error', 'mse', 'mae'],
    'max_depth' : np.arange(1,5)
}
gbc_grid_hyperparameters = {
    # Both windows are clamped to 1: a winner of n_estimators == 1 or
    # max_depth == 1 would otherwise put zero or negative values in the grid,
    # which are invalid and abort the grid search.
    'n_estimators':lambda v:np.arange(max(1, v-10), v+10, 3),
    'max_depth':lambda v:np.arange(max(1, v-1), v+1)
}
gbc_score, gbc_params = tunning_hyperparameter(X, y, gbc, gbc_random_hyperparameters, gbc_grid_hyperparameters)
append_to_config_file('gbc', gbc_score, gbc_params)


random: 0.8369333496819105 {'n_estimators': 101, 'max_depth': 4}
grid: 0.8373716746906494 {'n_estimators': 106, 'max_depth': 4}
time: 2m 0.36s


## Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier, BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression



%run /home/jovyan/work/utils/pipelines-splited.ipynb
X, y = data_tieandwin()
feature_name_list = ['X'+str(i+1) for i in range(X.shape[1])]
scal = StandardScaler()
X = scal.fit_transform(X)
# X = preprocessor.fit_transform(X)

def get_stacking():
    """Build the (base estimators, final estimator) pair for a stacking model.

    Reads the module-level ``configs`` mapping at call time and instantiates
    every estimator with its stored ``'params'``.

    Returns
    -------
    tuple
        ``(level0, level1)``: a list of ``(name, estimator)`` pairs for the
        bagged GaussianNB, random forest and logistic regression, and an
        ``MLPClassifier`` as the second-level model.
    """
    base_learners = [
        # ('sgdc', SGDClassifier(**configs['sgdc']['params'])),
        ('GBbag', BaggingClassifier(GaussianNB(), **configs['GBbag']['params'])),
        ('rf', RandomForestClassifier(**configs['rf']['params'])),
        ('lr', LogisticRegression(**configs['lr']['params'])),
    ]
    meta_learner = MLPClassifier(**configs['mlp']['params'])
    return base_learners, meta_learner

configs = get_configs()
# model_names is the single source of truth for which models vote and in which
# order; both the estimator list and the weight list are derived from it.
model_names = ['knn', 'mlp', 'GBbag', 'et', 'rf', 'lr', 'gbc']
# model_names = ['knn', 'mlp', 'GBbag', 'gb', 'dt', 'et', 'rf', 'lr', 'stacking', 'gbc']
model_configs = {name: configs[name]['params'] for name in model_names}
model_scores = {name: configs[name]['f1_score'] for name in model_names}

# Each model's vote is weighted by its share of the summed stored f1 scores.
score_total = sum(model_scores.values())
model_weights = [model_scores[name] / score_total for name in model_names]

models = {
    'knn':KNeighborsClassifier(**model_configs['knn']),
    # 'sgdc':SGDClassifier(**model_configs['sgdc']),  # SVM cannot output probabilities
    'mlp':MLPClassifier(**model_configs['mlp']),
    'GBbag':BaggingClassifier(GaussianNB(), **model_configs['GBbag']),
    # 'gb':GaussianNB(**model_configs['gb']) ,
    # 'dt':DecisionTreeClassifier(**model_configs['dt']) ,
    'et':ExtraTreesClassifier(**model_configs['et']) ,
    'rf':RandomForestClassifier(**model_configs['rf']) ,
    'lr':LogisticRegression(**model_configs['lr']) ,
    # 'stacking':StackingClassifier(*get_stacking(), **model_configs['stacking']) ,
    'gbc':GradientBoostingClassifier(**model_configs['gbc'])
}

# Pair estimators with weights by name. Previously the weights followed
# model_names while the estimators followed the dict's insertion order, so the
# two were aligned only as long as both listings were edited in lockstep.
eclf = VotingClassifier(
    estimators=[(name, models[name]) for name in model_names],
    voting='soft',
    weights=model_weights,
)

### Test Ensemble performance

In [None]:
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
# Seeded so the fold assignment is the same on every run; without a
# random_state the shuffled folds change on each execution. The individual
# estimators are still unseeded, so small run-to-run differences remain.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=66)
f1s = []
accuracys = []
for train_index, test_index in cv.split(X, y):
    x_train, x_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # eclf is refitted in place on each training fold.
    eclf.fit(x_train,y_train)
    # `results` keeps the predictions of the most recent fold after the loop.
    results = eclf.predict(x_test)
    f1 = f1_score(y_test, results)
    f1s.append(f1)
    accuracy = accuracy_score(y_test, results)
    accuracys.append(accuracy)

print('f1 score',np.mean(f1s))
print('accuracy', np.mean(accuracys))

f1 score 0.8360422297225091
accuracy 0.7279610774803962


In [None]:
# One row per fold: column 0 is the f1 score, column 1 the accuracy.
np.column_stack((f1s, accuracys))

array([[0.83429672, 0.72509323],
       [0.83879173, 0.72974414],
       [0.83572568, 0.72547974],
       [0.84137056, 0.73347548],
       [0.83002646, 0.72601279]])

In [None]:
# `results` holds the predictions of the last fold from the evaluation loop,
# so this is the mean predicted label for that fold only.
results.mean()

0.8960554371002132

In [None]:
import joblib

# Refit the voting ensemble on the complete scaled data set before persisting it.
eclf.fit(X, y)
# NOTE(review): the recorded cell output shows a different file name than the
# one written here, so the saved output looks stale -- confirm the intended name.
# NOTE(review): the fitted scaler `scal` is not saved by this cell; consumers of
# the pickle presumably need the same scaling applied -- confirm.
joblib.dump(eclf, 'ensemble_model.pkl')

['splited-scal-ensemble_model.pkl']