In [1]:
import numpy as np
import os
import time
from sklearn import metrics
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import time
import warnings
warnings.filterwarnings("ignore") 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import joblib
from tqdm import tqdm_notebook as tqdm
import time
from itertools import combinations,permutations

In [2]:
# Source workbook. The "ai" frame is read from the same path, so both frames
# start out with identical contents but are independent objects.
ori_csv_path = "拟合.xlsx"
ai_csv_path = ori_csv_path
df, df_ai = (pd.read_excel(_path) for _path in (ori_csv_path, ai_csv_path))

# 数据预处理

In [3]:
# Cast the Tone and Location columns to strings in both frames before the
# one-hot encoding below (presumably so get_dummies treats them as
# categorical rather than numeric -- confirm against the workbook).
for _frame in (df, df_ai):
    for _col in ('Tone', 'Location'):
        _frame[_col] = _frame[_col].astype(str)

# The first three columns are kept untouched ("head"); every remaining column
# goes through pd.get_dummies ("mid"); the two parts are then glued back
# together side by side.
head = df[df.columns[:3]]
mid = pd.get_dummies(df[df.columns[3:]])
new_df = pd.concat([head, mid], axis=1)

head_ai = df_ai[df_ai.columns[:3]]
mid_ai = pd.get_dummies(df_ai[df_ai.columns[3:]])
new_df_ai = pd.concat([head_ai, mid_ai], axis=1)

In [4]:
new_df.columns, len(new_df.columns)

# Rows flagged 'Training': column 2 is the label, columns 3+ are the features.
_train_rows = new_df.loc[new_df['Training/Testing'] == 'Training']
issmic_train_features = _train_rows.iloc[:, 3:].values
issmic_train_labels = _train_rows.iloc[:, 2].values
issmic_train_features.shape, issmic_train_labels.shape

# Output folder name: <month-day>_<n features>features_<n training rows>trained
_n_trained, _n_features = issmic_train_features.shape
_date_tag = time.strftime('%m-%d', time.localtime())
result_name = '{}_{}features_{}trained'.format(_date_tag, _n_features, _n_trained)
result_path = os.path.join("model", result_name)

In [3]:
# Create the output folder. exist_ok=True replaces the separate
# exists()-then-makedirs() check, which could fail if the folder appeared
# between the two calls.
os.makedirs(result_path, exist_ok=True)

# Rows flagged 'Testing' in the second frame: column 2 is the label,
# columns 3+ are the features.
_test_rows = new_df_ai.loc[new_df_ai['Training/Testing'] == 'Testing']
issmic_test_features = _test_rows.iloc[:, 3:].values
issmic_test_labels = _test_rows.iloc[:, 2].values
issmic_test_features.shape, issmic_test_labels.shape

In [6]:
def naive_bayes_classifier(train_x, train_y):
    """Fit a multinomial naive Bayes model with smoothing ``alpha=0.01``.

    Args:
        train_x: Training feature matrix, passed unchanged to ``fit``.
        train_y: Training labels, passed unchanged to ``fit``.

    Returns:
        The fitted ``MultinomialNB`` estimator.
    """
    from sklearn.naive_bayes import MultinomialNB
    model = MultinomialNB(alpha=0.01)
    # MultinomialNB.fit only accepts (X, y, sample_weight); the previous
    # ``verbose=1`` keyword made every call raise TypeError.
    model.fit(train_x, train_y)
    return model

def gaussian_classifier(train_x, train_y):
    """Fit and return a ``GaussianNB`` estimator built with default settings.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
    """
    from sklearn.naive_bayes import GaussianNB
    estimator = GaussianNB()
    estimator.fit(train_x, train_y)
    return estimator
def knn_classifier(train_x, train_y):
    """Fit and return a ``KNeighborsClassifier`` built with default settings.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
    """
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier()
    knn.fit(train_x, train_y)
    return knn
def logistic_regression_classifier(train_x, train_y):
    """Fit and return an L2-penalised logistic regression (lbfgs, 100 iterations max).

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
    """
    from sklearn.linear_model import LogisticRegression
    settings = dict(penalty='l2', solver='lbfgs', max_iter=100)
    log_reg = LogisticRegression(**settings)
    log_reg.fit(train_x, train_y)
    return log_reg
def random_forest_classifier(train_x, train_y):
    """Fit and return an 8-tree random forest seeded with ``random_state=66``.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
    """
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(n_estimators=8, random_state=66)
    forest.fit(train_x, train_y)
    return forest
def decision_tree_classifier(train_x, train_y):
    """Fit and return a ``DecisionTreeClassifier`` built with default settings.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
    """
    from sklearn import tree
    decision_tree = tree.DecisionTreeClassifier()
    decision_tree.fit(train_x, train_y)
    return decision_tree
def gradient_boosting_classifier(train_x, train_y):
    """Fit and return a gradient-boosting classifier with 200 boosting stages.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
    """
    from sklearn.ensemble import GradientBoostingClassifier
    booster = GradientBoostingClassifier(n_estimators=200)
    booster.fit(train_x, train_y)
    return booster
def svm_classifier(train_x, train_y):
    """Fit and return an RBF-kernel ``SVC`` with probability estimates enabled.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.
    """
    from sklearn.svm import SVC
    svc = SVC(kernel='rbf', probability=True)
    svc.fit(train_x, train_y)
    return svc
def svm_cross_validation(train_x, train_y):
    """Grid-search ``C`` and ``gamma`` for an RBF ``SVC`` and return the best model.

    The winning estimator's parameters are printed one per line
    (``name value``), as before.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The ``SVC`` (``kernel='rbf'``, ``probability=True``) carrying the best
        ``C``/``gamma`` found, fitted on the full training data.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
    grid_search = GridSearchCV(SVC(kernel='rbf', probability=True), param_grid, n_jobs=1, verbose=1)
    grid_search.fit(train_x, train_y)
    # GridSearchCV refits the best parameter setting on the whole training set
    # by default (refit=True), so best_estimator_ is already the model the old
    # code rebuilt and trained a second time.
    best_model = grid_search.best_estimator_
    for para, val in best_model.get_params().items():
        print(para, val)
    return best_model

In [2]:
def save_csv(columns, data, csv_path, index=False, header=True):
    """Write column-oriented ``data`` to ``csv_path`` as a GBK-encoded CSV.

    Args:
        columns: Column labels for the output table.
        data: One sequence per column; it is converted to an array and
            transposed so each inner sequence becomes a column.
        csv_path: Destination file path.
        index: Forwarded to ``DataFrame.to_csv``.
        header: Forwarded to ``DataFrame.to_csv``.
    """
    column_major = np.array(data)
    table = pd.DataFrame(np.transpose(column_major), columns=columns)
    table.to_csv(csv_path, encoding='gbk', index=index, header=header)
# Names of the classifiers that the search loop actually evaluates.
test_classifiers = ['GNB', 'KNN', 'LR', 'RF', 'DT', 'SVM', 'GBDT']

# Registry mapping a short name to the function that builds and fits a model.
classifiers = dict(
    NB=naive_bayes_classifier,
    GNB=gaussian_classifier,
    KNN=knn_classifier,
    LR=logistic_regression_classifier,
    RF=random_forest_classifier,
    DT=decision_tree_classifier,
    SVM=svm_classifier,
    SVMCV=svm_cross_validation,
    GBDT=gradient_boosting_classifier,
)

# Parallel per-case lists: entry i of each list belongs to cases[i].
train_xs, train_ys = [issmic_train_features], [issmic_train_labels]
test_xs, test_ys = [issmic_test_features], [issmic_test_labels]
cases = ['neoplasm_diagnosis']

# File name of the source workbook without its directory part.
ori_csv_name = os.path.basename(ori_csv_path)
ori_csv_name

In [8]:
# Save the one-hot encoded table next to the results. The extension is removed
# with splitext instead of a fixed [:-4] slice: the source name ends in
# ".xlsx" (five characters), so the slice left a stray dot and produced
# "<name>._processed.csv".
_processed_name = os.path.splitext(ori_csv_name)[0] + '_processed' + '.csv'
new_df.to_csv(os.path.join(result_path, _processed_name), index=False, encoding='gbk')

In [1]:
# Exhaustive feature-subset search, one pass per entry in `cases`.
# For every subset of at least `min_combination_num` feature columns and every
# classifier named in `test_classifiers`, a model is fitted on the training
# split and scored on both splits; the per-subset accuracies are then written
# to <result_path>/<case>.csv through `save_csv`.
# NOTE(review): the number of subsets grows as 2**n_features and each subset
# trains every listed classifier, so runtime grows very quickly with columns.
for case_idx in range(len(cases)):

    features_choose_list = []

    # One (test accuracy, average accuracy) pair of result lists per classifier.
    NB_acc_list, NB_avg_acc_list = [], []
    # Kept so the names still exist after the cell runs; nothing fills them.
    NB_train_acc_list, NB_sens_list, NB_spec_list = [], [], []
    GNB_acc_list, GNB_avg_acc_list = [], []
    KNN_acc_list, KNN_avg_acc_list = [], []
    LR_acc_list, LR_avg_acc_list = [], []
    RF_acc_list, RF_avg_acc_list = [], []
    DT_acc_list, DT_avg_acc_list = [], []
    SVM_acc_list, SVM_avg_acc_list = [], []
    GBDT_acc_list, GBDT_avg_acc_list = [], []

    acc_dict = {
        'NB': NB_acc_list,
        'GNB': GNB_acc_list,
        'KNN': KNN_acc_list,
        'LR': LR_acc_list,
        'RF': RF_acc_list,
        'DT': DT_acc_list,
        'SVM': SVM_acc_list,
        'GBDT': GBDT_acc_list,
    }

    avg_acc_dict = {
        'NB': NB_avg_acc_list,
        'GNB': GNB_avg_acc_list,
        'KNN': KNN_avg_acc_list,
        'LR': LR_avg_acc_list,
        'RF': RF_avg_acc_list,
        'DT': DT_avg_acc_list,
        'SVM': SVM_avg_acc_list,
        'GBDT': GBDT_avg_acc_list,
    }

    # Give every evaluated classifier a result list, including names (such as
    # 'SVMCV') that have no dedicated list above.
    for clf_name in test_classifiers:
        acc_dict.setdefault(clf_name, [])
        avg_acc_dict.setdefault(clf_name, [])

    # Best-so-far trackers. The feature/method names start as None so the
    # summary prints below cannot hit an unbound name when no accuracy ever
    # exceeds 0 or when there are fewer than two feature columns.
    acc_max = 0
    train_acc_max = 0
    avg_acc_max = 0
    features_acc_max = methods_acc_max = None
    features_train_acc_max = methods_train_acc_max = None
    features_avg_acc_max = methods_avg_acc_max = None

    _train_x = train_xs[case_idx]
    _train_y = train_ys[case_idx]
    _test_x = test_xs[case_idx]
    _test_y = test_ys[case_idx]

    # Count the columns of this case's own matrix rather than a fixed global.
    featrues_len = _train_x.shape[1]
    featrues_idxs = list(range(featrues_len))
    csv_name = cases[case_idx] + '.csv'

    min_combination_num = 2
    for num in tqdm(range(min_combination_num, featrues_len + 1)):
        # Iterate the subsets lazily; building the full list first keeps
        # every subset of this size in memory at once.
        for choose_idx in combinations(featrues_idxs, num):
            column_idxs = list(choose_idx)
            train_x = _train_x[:, column_idxs]
            train_y = _train_y
            test_x = _test_x[:, column_idxs]
            test_y = _test_y
            choose_features = ','.join([str(x) for x in choose_idx])
            features_choose_list.append(choose_features)
            for classifier in test_classifiers:
                model = classifiers[classifier](train_x, train_y)

                # Accuracy = diagonal of the confusion matrix / total count.
                # np.trace gives the same value as c[0][0] + c[1][1] for two
                # classes and stays correct for one class or more than two.
                predict = model.predict(test_x)
                c = confusion_matrix(test_y, predict)
                acc = np.trace(c) / np.sum(c) * 100

                train_pred = model.predict(train_x)
                train_c = confusion_matrix(train_y, train_pred)
                train_acc = np.trace(train_c) / np.sum(train_c) * 100

                # Plain mean of training and test accuracy.
                avg_acc = (train_acc + acc) / 2.

                # NOTE(review): the "best" subset is picked by test-set
                # accuracy, so best_acc is an optimistic estimate.
                if acc > acc_max:
                    features_acc_max = choose_features
                    methods_acc_max = classifier
                    acc_max = acc

                if train_acc > train_acc_max:
                    features_train_acc_max = choose_features
                    methods_train_acc_max = classifier
                    train_acc_max = train_acc

                if avg_acc > avg_acc_max:
                    features_avg_acc_max = choose_features
                    methods_avg_acc_max = classifier
                    avg_acc_max = avg_acc

                acc_dict[classifier].append("{:.2f}%".format(acc))
                avg_acc_dict[classifier].append("{:.2f}%".format(avg_acc))

    print("{},best_acc, {},{},{}".format(cases[case_idx], features_acc_max, methods_acc_max, acc_max))
    print("{},best_avg_acc, {},{},{}".format(cases[case_idx], features_avg_acc_max, methods_avg_acc_max, avg_acc_max))

    # Build the CSV layout from the classifiers that were actually evaluated so
    # every column has one entry per subset; for the default test_classifiers
    # this yields the same columns, in the same order, as the former
    # hard-coded lists.
    columns = ['features']
    data = [features_choose_list]
    for clf_name in test_classifiers:
        columns += [clf_name + '_acc', clf_name + '_avg_acc']
        data += [acc_dict[clf_name], avg_acc_dict[clf_name]]

    csv_path = os.path.join(result_path, csv_name)
    save_csv(columns, data, csv_path)