In [5]:
import numpy as np
import os
from sklearn import metrics
import pandas as pd
from sklearn.preprocessing import StandardScaler,MinMaxScaler
import warnings
warnings.filterwarnings("ignore")  
from tqdm import tqdm_notebook as tqdm
import sys
sys.path.insert(0, '../../trainer')
from core.plot import Plot
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
import joblib
import time

In [None]:
# Source workbooks: training rows are taken from `ori_csv_path`, testing rows
# from `ai_csv_path` (currently the very same file).
ori_csv_path = r"~\feature_extracted.xlsx"
ai_csv_path = ori_csv_path
df = pd.read_excel(ori_csv_path)
df_ai = pd.read_excel(ai_csv_path)

# Cast the categorical columns to str so that pd.get_dummies one-hot encodes
# them even when the workbook stores them as numbers.
for frame in (df, df_ai):
    for column in ('Tone', 'Location'):
        frame[column] = frame[column].astype(str)

# The first three columns are kept verbatim (column index 2 is used as the
# label below); everything from column 3 onwards is treated as a feature.
head = df[df.columns[:3]]
mid = pd.get_dummies(df[df.columns[3:]])
head_ai = df_ai[df_ai.columns[:3]]
mid_ai = pd.get_dummies(df_ai[df_ai.columns[3:]])
# Encoding the two frames independently can yield different dummy columns when
# the workbooks differ; align the test encoding to the training layout so that
# positional feature indices mean the same thing in both matrices.
mid_ai = mid_ai.reindex(columns=mid.columns, fill_value=0)
new_df = pd.concat([head, mid], axis=1)
new_df_ai = pd.concat([head_ai, mid_ai], axis=1)

# Training split.
training_rows = new_df['Training/Testing'] == 'Training'
issmic_train_features = new_df.loc[training_rows].iloc[:, 3:].values
issmic_train_labels = new_df.loc[training_rows].iloc[:, 2].values

# Output folder: model/<MM-DD>_<n features>features_<n training rows>trained
result_name = '{}_{}features_{}trained'.format(
    time.strftime('%m-%d', time.localtime()),
    issmic_train_features.shape[1],
    issmic_train_features.shape[0],
)
result_path = os.path.join("model", result_name)
os.makedirs(result_path, exist_ok=True)

# Testing split.
testing_rows = new_df_ai['Training/Testing'] == 'Testing'
issmic_test_features = new_df_ai.loc[testing_rows].iloc[:, 3:].values
issmic_test_labels = new_df_ai.loc[testing_rows].iloc[:, 2].values
issmic_test_features.shape, issmic_test_labels.shape

In [6]:
def naive_bayes_classifier(train_x, train_y):
    """Fit a multinomial naive Bayes classifier (``alpha=0.01``).

    Args:
        train_x: Training feature matrix handed to ``MultinomialNB.fit``.
        train_y: Training labels handed to ``MultinomialNB.fit``.

    Returns:
        The fitted ``MultinomialNB`` instance.
    """
    from sklearn.naive_bayes import MultinomialNB
    model = MultinomialNB(alpha=0.01)
    # MultinomialNB.fit only accepts (X, y, sample_weight); the previous
    # ``verbose=1`` keyword made every call fail with TypeError.
    model.fit(train_x, train_y)
    return model

def gaussian_classifier(train_x, train_y):
    """Fit a Gaussian naive Bayes classifier with default settings.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The fitted ``GaussianNB`` instance.
    """
    from sklearn import naive_bayes
    clf = naive_bayes.GaussianNB()
    clf.fit(train_x, train_y)
    return clf

def knn_classifier(train_x, train_y):
    """Fit a k-nearest-neighbours classifier with default settings.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The fitted ``KNeighborsClassifier`` instance.
    """
    from sklearn import neighbors
    clf = neighbors.KNeighborsClassifier()
    clf.fit(train_x, train_y)
    return clf

def logistic_regression_classifier(train_x, train_y):
    """Fit an L2-penalised logistic regression (lbfgs solver, 100 iterations max).

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    from sklearn.linear_model import LogisticRegression
    settings = dict(penalty='l2', solver='lbfgs', max_iter=100)
    estimator = LogisticRegression(**settings)
    estimator.fit(train_x, train_y)
    return estimator

def random_forest_classifier(train_x, train_y):
    """Fit a random forest of 8 trees with a fixed seed (``random_state=66``).

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    from sklearn import ensemble
    forest = ensemble.RandomForestClassifier(n_estimators=8, random_state=66)
    forest.fit(train_x, train_y)
    return forest

def decision_tree_classifier(train_x, train_y):
    """Fit a decision tree classifier with default settings.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The fitted ``DecisionTreeClassifier`` instance.
    """
    from sklearn.tree import DecisionTreeClassifier
    fitted_tree = DecisionTreeClassifier()
    fitted_tree.fit(train_x, train_y)
    return fitted_tree

def gradient_boosting_classifier(train_x, train_y):
    """Fit a gradient-boosting classifier with 200 boosting stages.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The fitted ``GradientBoostingClassifier`` instance.
    """
    from sklearn.ensemble import GradientBoostingClassifier as Booster
    booster = Booster(n_estimators=200)
    booster.fit(train_x, train_y)
    return booster

def svm_classifier(train_x, train_y):
    """Fit an RBF-kernel SVM with probability estimates enabled.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The fitted ``SVC`` instance.
    """
    from sklearn import svm
    machine = svm.SVC(kernel='rbf', probability=True)
    machine.fit(train_x, train_y)
    return machine

def svm_cross_validation(train_x, train_y):
    """Grid-search ``C`` and ``gamma`` for an RBF SVM and return the best model.

    The parameters of the winning estimator are printed one per line.

    Args:
        train_x: Training feature matrix.
        train_y: Training labels.

    Returns:
        The ``SVC`` (``kernel='rbf'``, ``probability=True``) carrying the best
        ``C``/``gamma`` found, fitted on the full training data.
    """
    from sklearn.model_selection import GridSearchCV
    from sklearn.svm import SVC
    param_grid = {'C': [1e-3, 1e-2, 1e-1, 1, 10, 100, 1000], 'gamma': [0.001, 0.0001]}
    # refit=True (the library default, spelled out because this function
    # depends on it) makes GridSearchCV retrain the best configuration on all
    # of the training data, so no second manual fit is needed.
    grid_search = GridSearchCV(
        SVC(kernel='rbf', probability=True),
        param_grid,
        n_jobs=1,
        verbose=1,
        refit=True,
    )
    grid_search.fit(train_x, train_y)
    best_model = grid_search.best_estimator_
    for para, val in best_model.get_params().items():
        print(para, val)
    return best_model

def save_csv(columns, data, csv_path, index=False, header=True):
    """Write column-wise data to a GBK-encoded CSV file.

    Args:
        columns: Column names, one per entry of ``data``.
        data: Sequence of columns; it is converted to an array and transposed
            so that each entry of ``data`` becomes one CSV column.
        csv_path: Destination file path.
        index: Forwarded to ``DataFrame.to_csv`` (write the row index or not).
        header: Forwarded to ``DataFrame.to_csv`` (write the header row or not).
    """
    rows = np.transpose(np.array(data))
    table = pd.DataFrame(rows, columns=columns)
    table.to_csv(csv_path, encoding='gbk', index=index, header=header)

# Classifier keys selected for evaluation ('NB' and 'SVMCV' are registered
# below but not part of this list).
test_classifiers = ['GNB', 'KNN', 'LR', 'RF', 'DT', 'SVM', 'GBDT']

# Registry: short key -> training function taking (train_x, train_y) and
# returning a fitted model.
classifiers = {
    'NB': naive_bayes_classifier,
    'GNB': gaussian_classifier,
    'KNN': knn_classifier,
    'LR': logistic_regression_classifier,
    'RF': random_forest_classifier,
    'DT': decision_tree_classifier,
    'SVM': svm_classifier,
    'SVMCV': svm_cross_validation,
    'GBDT': gradient_boosting_classifier,
}

train_xs = [issmic_train_features]
train_ys = [issmic_train_labels]
test_xs = [issmic_test_features]
test_ys = [issmic_test_labels]
cases = ['neoplasm_diagnosis']

# Save the one-hot encoded table next to the model outputs.
ori_csv_name = os.path.split(ori_csv_path)[-1]
# splitext strips the extension whatever its length; the former `[:-4]` slice
# assumed three characters and left a stray dot for ".xlsx"
# ("feature_extracted._processed.csv").
processed_csv_name = os.path.splitext(ori_csv_name)[0] + '_processed' + '.csv'
new_df.to_csv(os.path.join(result_path, processed_csv_name), index=False, encoding='gbk')

In [3]:
# Feature-column subsets (positions in the encoded feature matrix) recorded
# for each classifier key.
feature_subsets = {
    'GNB': [1,3,6,8,13,15,16],
    'KNN': [1,3,5,9,12,15,16,17],
    'LR': [1,2,3,8,10,13,15],
    'RF': [1,3,7,8,12,15],
    'DT': [0,1,3,8,11,15],
    'SVM': [0,1,3,8,10,11,12,15],
    'GBDT': [0,1,3,8,12,16],
}

# Every selection used to be commented out, leaving `classifier` and
# `choose_idx` undefined on a fresh kernel (NameError below).
# NOTE(review): 'RF' is assumed to be the intended run because later cells
# reload the model as `rfc_model` and read `model.feature_importances_` --
# change the key here to train a different classifier.
classifier = 'RF'
choose_idx = feature_subsets[classifier]

train_x = issmic_train_features[:,choose_idx]
train_y = issmic_train_labels

test_x = issmic_test_features[:,choose_idx]
test_y = issmic_test_labels
model = classifiers[classifier](train_x,train_y)

In [4]:
from core.images import save_cache, load_cache
# Persist the trained model. The path is a raw string because "\m" is not a
# valid escape sequence in a normal literal (newer Python versions warn about
# it); the resulting value is unchanged and matches the raw-string path used
# when the model is loaded back.
save_cache(model, r"nihe\model.pkl")

Using TensorFlow backend.


In [4]:
from core.images import save_cache, load_cache
# Reload the persisted model and score it on the test split.
rfc_model = load_cache(r"nihe\model.pkl")
predict = rfc_model.predict(test_x)
# Confusion matrix rows are true classes, columns are predicted classes; the
# metrics below index it as a two-class matrix.
c = confusion_matrix(test_y, predict)
correct_total = c[0][0] + c[1][1]
acc = correct_total / c.sum() * 100
sens = c[1][1] / c[1].sum() * 100
spec = c[0][0] / c[0].sum() * 100
report_template = "testing-- accuracy:{:.2f}%, sensitivity:{:.2f}%, specificity:{:.2f}%"
print(report_template.format(acc, sens, spec))

In [3]:
# Print every feature importance of the trained model with three decimals on
# a single line, each value followed by " ,".
importances = model.feature_importances_
print(''.join('{:.3f} ,'.format(imp) for imp in importances), end='')

In [6]:
# Per-class probability estimates for the test rows.
pred_proba_both = model.predict_proba(test_x)
# NOTE(review): this rebinds `issmic_test_features` (previously the test
# feature matrix) to a plain list of the first-column values of the testing
# rows; re-running an earlier cell that slices it as a matrix will fail.
# The binding is kept because other cells may rely on it.
testing_rows = new_df_ai['Training/Testing'] == 'Testing'
issmic_test_features = list(new_df_ai.loc[testing_rows].iloc[:, 0].values)
# Keep only the second probability column, shaped as a column vector (n, 1).
pred_proba_pos = pred_proba_both[:, 1][:, np.newaxis]

In [1]:
import matplotlib.pyplot as plt

# Output folder for the figures.
# NOTE(review): the folder name starts with a space -- confirm this is
# intended; it is kept unchanged here.
to_check_path_result =" result"
# exist_ok avoids the check-then-create race and also creates missing parents.
os.makedirs(to_check_path_result, exist_ok=True)

# SimHei is set as the sans-serif font, and unicode minus is disabled so the
# minus sign still renders with that font.
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Binarise a copy of the probabilities: above the threshold -> 1, else 0.
threshhold = 0.5
pred_proba_pos_bak = pred_proba_pos.copy()
pred_proba_pos_bak[pred_proba_pos_bak > threshhold] = 1
pred_proba_pos_bak[pred_proba_pos_bak <= threshhold] = 0

Plot.show_matrix(pred_proba_pos_bak, test_y, 2, to_check_path_result)
Plot.get_roc(test_y, pred_proba_pos, to_check_path_result)