## 使用xgboost训练功能预测分类模型

使用xgboost提升树模型训练分类算法，基于树模型的特征分裂特性，便于在ensemble leanrning中筛选重要程度高的特征。

## xgboost是如何处理缺失值的呢？

在寻找split point的时候，不会对该特征为missing的样本进行遍历统计，只对该列特征值为non-missing的样本上对应的特征值进行遍历，通过这个技巧来减少了为稀疏离散特征寻找split point的时间开销。

在逻辑实现上，为了保证完备性，会分别处理将missing该特征值的样本分配到左叶子结点和右叶子结点的两种情形，计算增益后选择增益大的方向进行分裂即可。

如果在训练中没有缺失值而在预测中出现缺失，那么会自动将缺失值的划分方向放到右子树。


In [1]:
import pandas as pd
from xgboost import XGBClassifier
from collections import defaultdict
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.metrics import (f1_score, precision_score, recall_score, 
balanced_accuracy_score, roc_auc_score, auc, precision_recall_curve)
from sklearn.metrics import confusion_matrix

In [2]:
import os
os.chdir("D:/functional-prediction/")

In [3]:
df_variant = pd.read_csv("output/variant_feat_clean.csv")

In [4]:
df_feat = df_variant

In [5]:
seed1 = 43987334
seed2 = 39623

In [6]:
# 5 fold split
# good seeds: (43987334, 39623)
# 划分数据集
function_dict = {}
fold = 6 # 1 test set + 5 train-validation set 

for function, content in df_variant.groupby(["function"]):
    content = content.sample(frac=1, random_state=seed1).reset_index(drop=True)

    fold_size = int(len(content) / fold) + 1

    content_list = []
    for i in range(fold):
        content_part = content.iloc[i * fold_size: (i + 1) * fold_size]
        content_part.index = range(len(content_part))
        content_list.append(content_part)
    
    function_dict[function] = content_list

In [7]:
# 5 fold training with dataset balancing

# (2207261730, 2983789) 0.784
# (43987334, 39623)  0.809

fold_clf_list = []

for i in range(1, fold):
    # 当前i下标的datafarme为validation set，其余为train set
    # fold的最后一个下标为test set
    
    df_val = pd.DataFrame(columns=df_variant.columns)
    part_dict = defaultdict(list)
    
    for key, value in function_dict.items():    
        for j in range(1, fold):
            if i == j:
                df_val = pd.concat([df_val, value[j]], axis=0)
            else:
                part_dict[key].append(value[j])
    
    # balancing traning set
    for key in part_dict.keys():
        part_dict[key] = pd.concat(part_dict[key], axis=0)
    
    # upsampling with 2 * max_len
    max_len = max([len(x) for x in part_dict.values()])
    for key in part_dict.keys():
        part_dict[key] = part_dict[key].sample(
            n=max_len*2, replace=True, random_state=seed2)
        
    df_train = pd.concat(list(part_dict.values()), axis=0)

    train_label = df_train["function"].values
    val_label = df_val["function"].values
    
    for col in ["gene", "haplotype_name", "chr", "variant_start", 
                "reference_allele", "variant_allele", "function", 
                "variant", "type"]:
        df_train.pop(col)
        df_val.pop(col)
        
    train_data = df_train.values
    val_data = df_val.values
    
    clf = XGBClassifier(
        learning_rate=0.005,
        subsample=0.6,
        max_depth=5,
        n_estimators=100,
    )
    
    clf.fit(
        train_data, train_label,
        eval_metric='auc',
        eval_set=[(val_data, val_label)],
        early_stopping_rounds=30
    )
        
    fold_clf_list.append(clf)

[0]	validation_0-auc:0.78486
[1]	validation_0-auc:0.81285
[2]	validation_0-auc:0.81533
[3]	validation_0-auc:0.82127
[4]	validation_0-auc:0.81444
[5]	validation_0-auc:0.82488
[6]	validation_0-auc:0.82181
[7]	validation_0-auc:0.82590
[8]	validation_0-auc:0.82991
[9]	validation_0-auc:0.83200




[10]	validation_0-auc:0.82421
[11]	validation_0-auc:0.82209
[12]	validation_0-auc:0.82718
[13]	validation_0-auc:0.82859
[14]	validation_0-auc:0.82786
[15]	validation_0-auc:0.82905
[16]	validation_0-auc:0.83045
[17]	validation_0-auc:0.83486
[18]	validation_0-auc:0.83533
[19]	validation_0-auc:0.83405
[20]	validation_0-auc:0.83191
[21]	validation_0-auc:0.83526
[22]	validation_0-auc:0.83062
[23]	validation_0-auc:0.83074
[24]	validation_0-auc:0.82994
[25]	validation_0-auc:0.82835
[26]	validation_0-auc:0.82786
[27]	validation_0-auc:0.82994
[28]	validation_0-auc:0.83210
[29]	validation_0-auc:0.83118
[30]	validation_0-auc:0.83258
[31]	validation_0-auc:0.82903
[32]	validation_0-auc:0.82715
[33]	validation_0-auc:0.82756
[34]	validation_0-auc:0.82713
[35]	validation_0-auc:0.82792
[36]	validation_0-auc:0.82646
[37]	validation_0-auc:0.82882
[38]	validation_0-auc:0.82784
[39]	validation_0-auc:0.82717
[40]	validation_0-auc:0.82826
[41]	validation_0-auc:0.83170
[42]	validation_0-auc:0.83329
[43]	valid

In [8]:
def performance(true_label, pred_label, classes):
    con_mat = confusion_matrix(true_label, pred_label)

    sensitivity_list = []
    for i in range(con_mat.shape[0]):
        tp = con_mat[i][i]
        fn = np.sum(con_mat[i,:]) - tp
        sensitivity = round(tp / (tp + fn), 4)
        sensitivity_list.append(sensitivity)

    class_sensitivity = dict(zip(classes, sensitivity_list))
    print("sensitivity: ", class_sensitivity)
    print("avg sensitivity: ", round(sum(sensitivity_list) / con_mat.shape[0], 4))
    print()
        
    precision_list = []
    for i in range(con_mat.shape[0]):
        tp = con_mat[i][i]
        fp = np.sum(con_mat[:,i]) - tp
        precision = round(tp / (tp + fp), 4)
        precision_list.append(precision)
        
    class_precision = dict(zip(classes, precision_list))
    print("precision: ", class_precision)
    print("avg precision: ", round(sum(precision_list) / con_mat.shape[0], 4))
    print()
    
    f1_list = [round((x[0] * x[1] * 2) / (x[0] + x[1]), 4) for x in zip(sensitivity_list, precision_list)]
    class_f1 = dict(zip(classes, f1_list))
    print("f1: ", class_f1)
    print("avg f1: ", round(sum(f1_list) / con_mat.shape[0], 4))
    print()
    
    specificity_list = []
    for i in range(con_mat.shape[0]):
        number = np.sum(con_mat[:,:])
        tp = con_mat[i][i]
        fn = np.sum(con_mat[i,:]) - tp
        fp = np.sum(con_mat[:,i]) - tp
        tn = number - tp - fn - fp
        specificity = round(tn / (tn + fp), 4)
        specificity_list.append(specificity)
        
    class_specificity = dict(zip(classes, specificity_list))
    print("specificity: ", class_specificity)
    print("avg specificity: ", round(sum(specificity_list) / con_mat.shape[0], 4))

In [9]:
test_list = []
for key, value in function_dict.items():
    test_list.append(value[0])
    
df_test = pd.concat(test_list, axis=0)
df_test_ = df_test.copy(deep=True)

test_label = df_test["function"].values
for col in ["gene", "haplotype_name", "chr", "variant_start", 
                "reference_allele", "variant_allele", "function", 
                "variant", "type"]:
    df_test_.pop(col)

test_data = df_test_.values
    
average_precision = 0
for clf in fold_clf_list:
    result = list(clf.predict(test_data))
    performance(test_label, result, classes=clf.classes_)
    precision = sum([1 if test_label[i] == result[i] else 0 for i in range(len(result))]) / len(result)
    average_precision += precision
    print("precision: {}".format(precision))
    
print("average precision: {}".format(average_precision / len(fold_clf_list)))

sensitivity:  {'decreased function': 0.6154, 'increased function': 1.0, 'no function': 0.8438, 'normal function': 0.8182}
avg sensitivity:  0.8194

precision:  {'decreased function': 0.7273, 'increased function': 1.0, 'no function': 0.871, 'normal function': 0.6429}
avg precision:  0.8103

f1:  {'decreased function': 0.6667, 'increased function': 1.0, 'no function': 0.8572, 'normal function': 0.72}
avg f1:  0.811

specificity:  {'decreased function': 0.9412, 'increased function': 1.0, 'no function': 0.875, 'normal function': 0.9057}
avg specificity:  0.9305
precision: 0.8125
sensitivity:  {'decreased function': 0.6923, 'increased function': 0.875, 'no function': 0.7812, 'normal function': 0.9091}
avg sensitivity:  0.8144

precision:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.8621, 'normal function': 0.6667}
avg precision:  0.8053

f1:  {'decreased function': 0.6923, 'increased function': 0.9333, 'no function': 0.8197, 'normal function': 0.7693}
avg f1: 

In [10]:
# columns = df_test_.columns

# # 模型特征分相加
# feature_score = np.zeros(len(columns))
# for clf in fold_clf_list:
#     feature_score = feature_score + clf.feature_importances_

# sorted_idx = np.argsort(feature_score)[::-1]

# for index in sorted_idx[:40]:
#     print([columns[index], feature_score[index]]) 

In [11]:
# total set training
part_dict = defaultdict(list)
for key, value in function_dict.items():    
    for j in range(1, fold):
        part_dict[key].append(value[j])
        
# balancing traning set
for key in part_dict.keys():
    part_dict[key] = pd.concat(part_dict[key], axis=0)

# upsampling with 2 * max_len
max_len = max([len(x) for x in part_dict.values()])
for key in part_dict.keys():
    part_dict[key] = part_dict[key].sample(
        n=max_len*2, replace=True, random_state=seed2)

df_train = pd.concat(list(part_dict.values()), axis=0)

train_label = df_train["function"].values

for col in ["gene", "haplotype_name", "chr", "variant_start", 
                "reference_allele", "variant_allele", "function", 
                "variant", "type"]:
    df_train.pop(col)
    
df_train_ = df_train.copy(deep=True)

In [64]:
df_train_key = df_train_.copy(deep=True)
col_list = [
        'DEOGEN2_score', 'M-CAP_score', 'MPC_score', 'MutationAssessor_score',
        'LRT_score', 'FATHMM_score', 'PROVEAN_score',
        'Polyphen2_HVAR_score', 'integrated_fitCons_score', 'VEST4_score',
        'SIFT4G_score', 'LoFtool', 'GenoCanyon_score', 'CADD_raw', 'APF_score'
    ]
for col in df_train_key.columns:
    if "_variant" in col or "_gained" in col or "_lost" in col:
        col_list.append(col)
        
df_train_all_ = df_train_key[col_list]

# 使用空值数据
# df_train_all_ = pd.concat([df_train_, df_train_key], axis=0)

# 不使用空值数据
# df_train_all_ = df_train_

In [65]:
imputer_dict = {}
normalizer_dict = {}
for col in df_train.columns:
    if "_variant" in col or "_gained" in col or "_lost" in col:
        continue
    imp = SimpleImputer(missing_values=np.nan, strategy='mean').fit(df_train[col].values.reshape(-1, 1))
    imputer_dict[col] = imp
    df_train[col] = imp.transform(df_train[col].values.reshape(-1, 1))
    
    normalizer = StandardScaler().fit(df_train[col].values.reshape(-1, 1))
    normalizer_dict[col] = normalizer
    df_train[col] = normalizer.transform(df_train[col].values.reshape(-1, 1))
    
with open("model/imputer_dict_multi.pkl", "wb") as f:
    pickle.dump(imputer_dict, f)
    
with open("model/normalizer_dict_multi.pkl", "wb") as f:
    pickle.dump(normalizer_dict, f)

In [66]:
df_train_key = df_train.copy(deep=True)
for col in df_train_key.columns:
    if "_variant" in col or "_gained" in col or "_lost" in col:
        continue
        
    if col not in [
        'DEOGEN2_score', 'M-CAP_score', 'MPC_score', 'MutationAssessor_score',
        'LRT_score', 'FATHMM_score', 'PROVEAN_score',
        'Polyphen2_HVAR_score', 'integrated_fitCons_score', 'VEST4_score',
        'SIFT4G_score', 'LoFtool', 'GenoCanyon_score', 'CADD_raw', 'APF_score'
    ]:
        df_train_key[col] = [np.nan] * len(df_train_key)
        df_train_key[col] = imputer_dict[col].transform(df_train_key[col].values.reshape(-1, 1))
        df_train_key[col] = normalizer_dict[col].transform(df_train_key[col].values.reshape(-1, 1))
        
# 使用空值数据
# df_train_all = pd.concat([df_train, df_train_key], axis=0)

# 不使用空值数据
df_train_all = df_train_key

In [67]:
# test data generation
test_list = []
for key, value in function_dict.items():
    test_list.append(value[0])
    
df_test = pd.concat(test_list, axis=0)
df_test_ = df_test.copy(deep=True)

test_label = df_test["function"].values
for col in ["gene", "haplotype_name", "chr", "variant_start", 
                "reference_allele", "variant_allele", "function", 
                "variant", "type"]:
    df_test_.pop(col)

df_test_fillna = df_test_.copy(deep=True)

df_test_ = df_test_[col_list]

with open("model/imputer_dict_multi.pkl", "rb") as f:
    imputer_dict = pickle.load(f)
    
with open("model/normalizer_dict_multi.pkl", "rb") as f:
    normalizer_dict = pickle.load(f)
    

    
for key in imputer_dict.keys():
    df_test_fillna[key] = imputer_dict[key].transform(df_test_fillna[key].values.reshape(-1, 1))
    df_test_fillna[key] = normalizer_dict[key].transform(df_test_fillna[key].values.reshape(-1, 1))


In [68]:
train_data = df_train_all_.values
test_data = df_test_.values

train_data_fillna = df_train_all.values
test_data_fillna = df_test_fillna.values

In [69]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier

In [70]:
# xgboost model
xgb_model = XGBClassifier(
    learning_rate=0.01,
        subsample=0.5,
        max_depth=4,
        n_estimators=100,
)

xgb_model.fit(
    train_data, list(train_label),
    eval_metric='auc'
)

classes = xgb_model.classes_
y_pred = xgb_model.predict_proba(test_data)
result = xgb_model.predict(test_data)

xgb_model.voting = {
    "classes": classes,
    "y_pred": y_pred,
    "result": result
}

performance(test_label, result, classes)



sensitivity:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.7188, 'normal function': 0.9091}
avg sensitivity:  0.83

precision:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.92, 'normal function': 0.5556}
avg precision:  0.792

f1:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.807, 'normal function': 0.6897}
avg f1:  0.7972

specificity:  {'decreased function': 0.9216, 'increased function': 1.0, 'no function': 0.9375, 'normal function': 0.8491}
avg specificity:  0.927


In [71]:
# lightgbm model
lgb_model = LGBMClassifier(
    max_depth=4, 
    learning_rate=0.09, 
    n_estimators=100,
    subsample=0.6, 
    reg_lambda=0.1
)

lgb_model.fit(
   train_data, list(train_label),
)
y_pred = lgb_model.predict_proba(test_data)
classes = lgb_model.classes_
result = lgb_model.predict(test_data)

lgb_model.voting = {
    "classes": classes,
    "y_pred": y_pred,
    "result": result
}

performance(test_label, result, classes)

sensitivity:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.8438, 'normal function': 0.9091}
avg sensitivity:  0.8613

precision:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.871, 'normal function': 0.8333}
avg precision:  0.8492

f1:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.8572, 'normal function': 0.8696}
avg f1:  0.8548

specificity:  {'decreased function': 0.9216, 'increased function': 1.0, 'no function': 0.875, 'normal function': 0.9623}
avg specificity:  0.9397


In [72]:
# rf model 
rf_model = RandomForestClassifier(
    max_depth=5,
    random_state=220
)

rf_model.fit(
   train_data_fillna, list(train_label),
)

classes = rf_model.classes_
y_pred = rf_model.predict_proba(test_data_fillna)
result = rf_model.predict(test_data_fillna)

rf_model.voting = {
    "classes": classes,
    "y_pred": y_pred,
    "result": result
}

performance(test_label, result, classes)

sensitivity:  {'decreased function': 0.2308, 'increased function': 1.0, 'no function': 0.75, 'normal function': 0.7273}
avg sensitivity:  0.677

precision:  {'decreased function': 0.6, 'increased function': 0.8889, 'no function': 0.6857, 'normal function': 0.5333}
avg precision:  0.677

f1:  {'decreased function': 0.3334, 'increased function': 0.9412, 'no function': 0.7164, 'normal function': 0.6154}
avg f1:  0.6516

specificity:  {'decreased function': 0.9608, 'increased function': 0.9821, 'no function': 0.6562, 'normal function': 0.8679}
avg specificity:  0.8668


In [73]:
# lr model
lr_model = LogisticRegression(
    max_iter=200,
    C=0.05
)

lr_model.fit(
   train_data_fillna, list(train_label),
)

classes = lr_model.classes_
y_pred = lr_model.predict_proba(test_data_fillna)
result = lr_model.predict(test_data_fillna)

lr_model.voting = {
    "classes": classes,
    "y_pred": y_pred,
    "result": result
}

performance(test_label, result, classes)

sensitivity:  {'decreased function': 0.5385, 'increased function': 0.0, 'no function': 0.5625, 'normal function': 0.0909}
avg sensitivity:  0.298

precision:  {'decreased function': 0.25, 'increased function': 0.0, 'no function': 0.5806, 'normal function': 0.5}
avg precision:  0.3326

f1:  {'decreased function': 0.3415, 'increased function': nan, 'no function': 0.5714, 'normal function': 0.1538}
avg f1:  nan

specificity:  {'decreased function': 0.5882, 'increased function': 0.9464, 'no function': 0.5938, 'normal function': 0.9811}
avg specificity:  0.7774




In [74]:
# hard voting

def most_common(lst):
    return max(lst, key=lst.count)

hard_vote = [
    most_common(x) for x in list(zip(
        xgb_model.voting["result"],
        lgb_model.voting["result"],
        rf_model.voting["result"],
#         lr_model.voting["result"]
    ))
]

performance(test_label, hard_vote, xgb_model.voting["classes"])

sensitivity:  {'decreased function': 0.6154, 'increased function': 1.0, 'no function': 0.7812, 'normal function': 0.9091}
avg sensitivity:  0.8264

precision:  {'decreased function': 0.7273, 'increased function': 1.0, 'no function': 0.8929, 'normal function': 0.5882}
avg precision:  0.8021

f1:  {'decreased function': 0.6667, 'increased function': 1.0, 'no function': 0.8333, 'normal function': 0.7143}
avg f1:  0.8036

specificity:  {'decreased function': 0.9412, 'increased function': 1.0, 'no function': 0.9062, 'normal function': 0.8679}
avg specificity:  0.9288


In [75]:
# 软投票
y_pred = (
    xgb_model.voting["y_pred"] 
    + lgb_model.voting["y_pred"] 
#     + rf_model.voting["y_pred"] 
#     + lr_model.voting["y_pred"] 
)
soft_vote = [xgb_model.voting["classes"][x] for x in np.argmax(y_pred, axis=1)]

performance(test_label, soft_vote, xgb_model.voting["classes"])

sensitivity:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.8125, 'normal function': 0.9091}
avg sensitivity:  0.8535

precision:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.8667, 'normal function': 0.7692}
avg precision:  0.832

f1:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.8387, 'normal function': 0.8333}
avg f1:  0.8411

specificity:  {'decreased function': 0.9216, 'increased function': 1.0, 'no function': 0.875, 'normal function': 0.9434}
avg specificity:  0.935


In [76]:
from datetime import datetime

today_str = datetime.now().strftime("%m%d")

def save_model():
    with open("model/xgb_multi_{}.pkl".format(today_str), "wb") as f:
        pickle.dump(xgb_model, f)
    
    with open("model/lgb_multi_{}.pkl".format(today_str), "wb") as f:
        pickle.dump(lgb_model, f)
    
    with open("model/rf_multi_{}.pkl".format(today_str), "wb") as f:
        pickle.dump(rf_model, f)
        
    with open("model/lr_multi_{}.pkl".format(today_str), "wb") as f:
        pickle.dump(lr_model, f)
        
    with open("model/col_list_multi_{}.pkl".format(today_str), "wb") as f:
        pickle.dump(col_list, f)
        
save_model()

In [26]:
# df_test_show = df_test[["gene", "haplotype_name", "chr", "variant_start", "reference_allele", "variant_allele", "function"]]
# df_test_show.index = range(len(df_test_show))
# df_test_show["function_prediction"] = result

# test missing feature

In [37]:
# with open("model/imputer_dict_multi.pkl", "rb") as f:
#     imputer_dict = pickle.load(f)

# with open("model/normalizer_dict_multi.pkl", "rb") as f:
#     normalizer_dict = pickle.load(f)

# with open("model/xgb_multi_0823.pkl", "rb") as f:
#     xgb_model = pickle.load(f)
    
# with open("model/lgb_multi_0823.pkl", "rb") as f:
#     lgb_model = pickle.load(f)

# with open("model/rf_multi_0823.pkl", "rb") as f:
#     rf_model = pickle.load( f)

# with open("model/lr_multi_0823.pkl", "rb") as f:
#     lr_model = pickle.load( f)
    

df_test_key = df_test_.copy(deep=True)
for col in df_test_key.columns:
    if "_variant" in col or "_gained" in col or "_lost" in col:
        continue
        
    if col not in [
        'DEOGEN2_score', 'M-CAP_score', 'MPC_score', 'MutationAssessor_score',
        'LRT_score', 'FATHMM_score', 'PROVEAN_score',
        'Polyphen2_HVAR_score', 'integrated_fitCons_score', 'VEST4_score',
        'SIFT4G_score', 'LoFtool', 'GenoCanyon_score', 'CADD_raw', 'APF_score'
    ]:
        df_test_key[col] = [np.nan] * len(df_test_key)

In [38]:
df_test_key_fillna = df_test_key.copy(deep=True)
    
for key in imputer_dict.keys():
    df_test_key_fillna[key] = imputer_dict[key].transform(df_test_key_fillna[key].values.reshape(-1, 1))
    df_test_key_fillna[key] = normalizer_dict[key].transform(df_test_key_fillna[key].values.reshape(-1, 1))


In [39]:
key_test_data = df_test_key.values
key_test_data_fillna = df_test_key_fillna.values

In [40]:
y_pred = lgb_model.predict_proba(key_test_data)
classes = lgb_model.classes_
result = lgb_model.predict(key_test_data)

lgb_model.voting = {
    "classes": classes,
    "y_pred": y_pred,
    "result": result
}

performance(test_label, result, classes)

sensitivity:  {'decreased function': 0.6154, 'increased function': 1.0, 'no function': 0.8438, 'normal function': 0.9091}
avg sensitivity:  0.8421

precision:  {'decreased function': 0.6667, 'increased function': 1.0, 'no function': 0.871, 'normal function': 0.7692}
avg precision:  0.8267

f1:  {'decreased function': 0.64, 'increased function': 1.0, 'no function': 0.8572, 'normal function': 0.8333}
avg f1:  0.8326

specificity:  {'decreased function': 0.9216, 'increased function': 1.0, 'no function': 0.875, 'normal function': 0.9434}
avg specificity:  0.935


In [41]:
y_pred = xgb_model.predict_proba(key_test_data)
classes = xgb_model.classes_
result = xgb_model.predict(key_test_data)

xgb_model.voting = {
    "classes": classes,
    "y_pred": y_pred,
    "result": result
}

performance(test_label, result, classes)

sensitivity:  {'decreased function': 0.7692, 'increased function': 1.0, 'no function': 0.7188, 'normal function': 0.9091}
avg sensitivity:  0.8493

precision:  {'decreased function': 0.6667, 'increased function': 1.0, 'no function': 0.92, 'normal function': 0.625}
avg precision:  0.8029

f1:  {'decreased function': 0.7143, 'increased function': 1.0, 'no function': 0.807, 'normal function': 0.7407}
avg f1:  0.8155

specificity:  {'decreased function': 0.902, 'increased function': 1.0, 'no function': 0.9375, 'normal function': 0.8868}
avg specificity:  0.9316


In [42]:
classes = lr_model.classes_
y_pred = lr_model.predict_proba(key_test_data_fillna)
result = lr_model.predict(key_test_data_fillna)

lr_model.voting = {
    "classes": classes,
    "y_pred": y_pred,
    "result": result
}

performance(test_label, result, classes)

sensitivity:  {'decreased function': 0.6923, 'increased function': 1.0, 'no function': 0.6562, 'normal function': 0.7273}
avg sensitivity:  0.769

precision:  {'decreased function': 0.6923, 'increased function': 0.5, 'no function': 0.875, 'normal function': 0.7273}
avg precision:  0.6986

f1:  {'decreased function': 0.6923, 'increased function': 0.6667, 'no function': 0.75, 'normal function': 0.7273}
avg f1:  0.7091

specificity:  {'decreased function': 0.9216, 'increased function': 0.8571, 'no function': 0.9062, 'normal function': 0.9434}
avg specificity:  0.9071


In [43]:
classes = rf_model.classes_
y_pred = rf_model.predict_proba(key_test_data_fillna)
result = rf_model.predict(key_test_data_fillna)

rf_model.voting = {
    "classes": classes,
    "y_pred": y_pred,
    "result": result
}

performance(test_label, result, classes)

sensitivity:  {'decreased function': 0.5385, 'increased function': 1.0, 'no function': 0.8125, 'normal function': 0.8182}
avg sensitivity:  0.7923

precision:  {'decreased function': 0.7, 'increased function': 0.7273, 'no function': 0.8387, 'normal function': 0.75}
avg precision:  0.754

f1:  {'decreased function': 0.6087, 'increased function': 0.8421, 'no function': 0.8254, 'normal function': 0.7826}
avg f1:  0.7647

specificity:  {'decreased function': 0.9412, 'increased function': 0.9464, 'no function': 0.8438, 'normal function': 0.9434}
avg specificity:  0.9187


In [44]:
# hard voting

def most_common(lst):
    return max(lst, key=lst.count)

hard_vote = [
    most_common(x) for x in list(zip(
        lgb_model.voting["result"],
        xgb_model.voting["result"],
    ))
]

performance(test_label, hard_vote, lgb_model.voting["classes"])

sensitivity:  {'decreased function': 0.6154, 'increased function': 1.0, 'no function': 0.8438, 'normal function': 0.9091}
avg sensitivity:  0.8421

precision:  {'decreased function': 0.6667, 'increased function': 1.0, 'no function': 0.871, 'normal function': 0.7692}
avg precision:  0.8267

f1:  {'decreased function': 0.64, 'increased function': 1.0, 'no function': 0.8572, 'normal function': 0.8333}
avg f1:  0.8326

specificity:  {'decreased function': 0.9216, 'increased function': 1.0, 'no function': 0.875, 'normal function': 0.9434}
avg specificity:  0.935


In [46]:
y_pred = (
    xgb_model.voting["y_pred"] 
    + lgb_model.voting["y_pred"] 
    + rf_model.voting["y_pred"] 
    + lr_model.voting["y_pred"] 
)
soft_vote = [xgb_model.voting["classes"][x] for x in np.argmax(y_pred, axis=1)]

performance(test_label, soft_vote, xgb_model.voting["classes"])

sensitivity:  {'decreased function': 0.6154, 'increased function': 1.0, 'no function': 0.8125, 'normal function': 0.9091}
avg sensitivity:  0.8342

precision:  {'decreased function': 0.6667, 'increased function': 1.0, 'no function': 0.8667, 'normal function': 0.7143}
avg precision:  0.8119

f1:  {'decreased function': 0.64, 'increased function': 1.0, 'no function': 0.8387, 'normal function': 0.8}
avg f1:  0.8197

specificity:  {'decreased function': 0.9216, 'increased function': 1.0, 'no function': 0.875, 'normal function': 0.9245}
avg specificity:  0.9303
