In [None]:
import pandas as pd
import numpy as np
import sklearn

from sklearn.linear_model import LogisticRegression,SGDClassifier 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import StandardScaler,MinMaxScaler,Normalizer
from sklearn.model_selection import train_test_split,KFold,LeaveOneOut,cross_val_score,cross_validate,GridSearchCV
from sklearn.metrics import accuracy_score,confusion_matrix

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Baseline classifiers, all with default hyper-parameters.
# Every estimator that draws random numbers gets the same fixed seed so that
# a Restart & Run All reproduces the same scores; KNeighborsClassifier has no
# random_state parameter and is deterministic.
RANDOM_STATE = 0

model_KNN = KNeighborsClassifier()
model_LR = LogisticRegression(random_state=RANDOM_STATE)
model_DT = DecisionTreeClassifier(random_state=RANDOM_STATE)
model_SVM = SVC(random_state=RANDOM_STATE)

model_MLP = MLPClassifier(random_state=RANDOM_STATE)
model_SGD = SGDClassifier(random_state=RANDOM_STATE)
model_RF = RandomForestClassifier(random_state=RANDOM_STATE)
model_AB = AdaBoostClassifier(random_state=RANDOM_STATE)
model_GB = GradientBoostingClassifier(random_state=RANDOM_STATE)

In [None]:
# Read the input workbook (path is relative to the notebook's working directory).
data = pd.read_excel(io='V5_RFEranking4.xlsx')

In [None]:
# Show the first five rows as a quick sanity check of the loaded table.
data.head()

In [None]:
# Features: every column from position 3 onward.
X = data.iloc[:, 3:]
# Target: the column at position 2, selected as a 1-D Series. The previous
# slice `2:3` produced a one-column DataFrame, which scikit-learn estimators
# treat as a column vector and have to convert (with a warning) on every fit.
Y = data.iloc[:, 2]
# Preview only the first rows instead of rendering the whole feature table.
X.head()

In [None]:
# Two-stage preprocessing: StandardScaler first, then Normalizer on its output.
# NOTE(review): both transformers are fitted on the full data set before any
# train/test or cross-validation split, so held-out rows influence the scaling
# statistics — confirm this is acceptable or move the steps into a Pipeline.
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
normalizer = Normalizer()
X_nml = normalizer.fit_transform(X_std)

In [None]:
# Author's note (translated): only accuracy is usable for this multi-class
# problem; the other three metrics only apply to binary classification and
# therefore carry no information here.
model_list = [model_KNN,model_LR,model_DT,model_SVM,model_MLP,model_SGD,model_RF,model_AB,model_GB]

# Mean 10-fold cross-validated accuracy per baseline model on the scaled
# features. Rows are labelled with the estimator class name (a plain string)
# rather than the estimator object itself, so the table and the Excel export
# have stable, readable labels. The target is flattened to 1-D because
# scikit-learn expects a 1-D label vector.
mean_accuracy = {
    type(model).__name__: np.mean(
        cross_val_score(model, X_nml, np.ravel(Y), cv=10, scoring='accuracy')
    )
    for model in model_list
}
# Built in one step instead of growing the frame one row at a time.
KF_result = pd.Series(mean_accuracy, name='accuracy_score').to_frame()
KF_result

In [None]:
# Export the cross-validation summary table to an Excel workbook.
KF_result.to_excel(excel_writer='KF10_result_Former4Feature_No0DPbBr_Oversampling0D.xlsx')

In [None]:
# Single 80/20 hold-out split of the raw (unscaled) feature table, seed 2.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Hyper-parameter tuning for model_RF: exhaustive grid search with 5-fold CV,
# scored by accuracy, on the current training split.
params = {'n_estimators':[2,5,10,20,50],
          'criterion':['gini','entropy'],
          'max_depth':[None,1,5,10,20],
          # An integer min_samples_split must be at least 2. The former value 1
          # made every candidate containing it fail to fit, so a fifth of the
          # grid only produced NaN scores (hidden by the warnings filter).
          'min_samples_split':[2,5,10,20],
          'min_samples_leaf':[1,2,3,4,5,10,20]
         }
clf = GridSearchCV(model_RF,param_grid=params,cv=5,scoring='accuracy')
# Flatten the target: the estimators expect a 1-D label vector.
clf.fit(x_train,np.ravel(y_train))
model_RF_best = clf.best_estimator_
print(clf.best_params_)
clf.best_score_

In [None]:
# Repeat the random-forest grid search on 24 different 80/20 splits of the raw
# features (random_state = 2, 4, ..., 48) to see how stable the selected
# hyper-parameters are.
# The grid does not depend on the split, so it is defined once outside the loop.
params = {'n_estimators':[2,5,10,20,50],
          'criterion':['gini','entropy'],
          'max_depth':[None,1,5,10,20],
          # An integer min_samples_split must be at least 2; the former value 1
          # made every candidate containing it fail to fit.
          'min_samples_split':[2,5,10,20],
          'min_samples_leaf':[1,2,3,4,5,10,20]
         }
for i in range(2,50,2):
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=i)
    clf = GridSearchCV(model_RF,param_grid=params,cv=5,scoring='accuracy')
    # Flatten the target: the estimators expect a 1-D label vector.
    clf.fit(x_train,np.ravel(y_train))
    # Overwritten on every pass: after the loop only the estimator selected on
    # the last split (random_state=48) remains in model_RF_best.
    model_RF_best = clf.best_estimator_
    print("random_state= ",i)
    print(clf.best_params_)
    print(clf.best_score_)

In [None]:
# Save the best-parameter dictionary as a two-column table (name, value).
# `clf` is whichever grid search was fitted most recently.
# The intermediate list used to be stored in `data`, which silently replaced
# the loaded data set and broke any later re-run of the X / Y selection cell;
# the table is now built directly without touching `data`.
nA = pd.DataFrame(list(clf.best_params_.items()))
nA.to_excel('bestRF_Former4Feature.xlsx')


In [None]:
# Hold-out accuracy of the tuned random forest over 50 different 80/20 splits
# of the raw features (random_state = 1..50).
# model_RF_best was fitted on one particular split, so scoring it directly on
# other splits would test it on rows it was trained on. A fresh clone with the
# same hyper-parameters is therefore refitted on each split's training part
# before predicting that split's test part.
# sklearn.base is reachable through the top-level `import sklearn` because the
# estimator imports above already load it.
acc_by_seed = {}
for i in range(1,51):
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=i)
    fitted = sklearn.base.clone(model_RF_best).fit(x_train,np.ravel(y_train))
    y_pred = fitted.predict(x_test)
    acc_by_seed[i] = accuracy_score(y_test,y_pred)
# One row per seed, built in a single step.
best_RF = pd.Series(acc_by_seed, name='acc_score_RF').to_frame()
best_RF

In [None]:
# Export the per-seed random-forest accuracies to an Excel workbook.
best_RF.to_excel(excel_writer='bestRF_Former4Feature_No0DPbBr_Oversampling0D.xlsx')

In [None]:
# Hyper-parameter tuning for model_LR  https://zhuanlan.zhihu.com/p/55438631
# The split is created explicitly from the scaled features. Previously this
# cell reused whatever x_train / y_train the last executed cell had left
# behind (a split of the raw X), while the evaluation cell that follows scores
# the tuned model on X_nml.
x_train,x_test,y_train,y_test = train_test_split(X_nml,Y,test_size=0.2,random_state=2)
params = {'C':[0.0001, 1, 100, 1000],
          'max_iter':[1, 10, 100, 500],
          'class_weight':['balanced', None],
          'solver':['liblinear','sag','lbfgs','newton-cg']
         }
clf = GridSearchCV(model_LR,param_grid=params,cv=5,scoring='accuracy')
# Flatten the target: the estimators expect a 1-D label vector.
clf.fit(x_train,np.ravel(y_train))
model_LR_best = clf.best_estimator_
print(clf.best_params_)
clf.best_score_

In [None]:
# Hold-out accuracy of the tuned logistic regression over 50 different 80/20
# splits of the scaled features (random_state = 1..50).
# A fresh clone with the tuned hyper-parameters is refitted on each split's
# training part; reusing the already-fitted model_LR_best would score it on
# rows it had seen during training.
# sklearn.base is reachable through the top-level `import sklearn` because the
# estimator imports above already load it.
acc_by_seed = {}
for i in range(1,51):
    x_train,x_test,y_train,y_test = train_test_split(X_nml,Y,test_size=0.2,random_state=i)
    fitted = sklearn.base.clone(model_LR_best).fit(x_train,np.ravel(y_train))
    y_pred = fitted.predict(x_test)
    acc_by_seed[i] = accuracy_score(y_test,y_pred)
# One row per seed, built in a single step.
best_LR = pd.Series(acc_by_seed, name='acc_score_LR').to_frame()
best_LR

In [None]:
# Export the per-seed logistic-regression accuracies to an Excel workbook.
best_LR.to_excel(excel_writer='bestLR_Former4Feature_No0DPbBr_Oversampling0D.xlsx')

In [None]:
# Hyper-parameter tuning for model_SVM  https://zhuanlan.zhihu.com/p/55438631
# Uses the x_train / y_train left by the most recently executed split.
params = {'C': [0.001,0.01,0.1,1,10],
          'degree': [3,2,1],
          # A comma was missing between 'poly' and 'rbf'; Python joined the two
          # adjacent literals into the single invalid kernel name 'polyrbf',
          # so neither the polynomial nor the RBF kernel was ever searched.
          'kernel': ['linear','poly','rbf','sigmoid'],
          'max_iter': [1,2,5,10,20,50,-1],
          # NOTE(review): `probability` doubles the grid; confirm it is meant
          # to be tuned when candidates are ranked by accuracy.
          'probability': [True,False],
          'shrinking': [True,False],
          'tol': [1e-1,1e-2,1e-3,1e-4,1e-5]
         }
clf = GridSearchCV(model_SVM,param_grid=params,cv=5,scoring='accuracy')
# Flatten the target: the estimators expect a 1-D label vector.
clf.fit(x_train,np.ravel(y_train))
model_SVM_best = clf.best_estimator_
print(clf.best_params_)
clf.best_score_

In [None]:
# Hold-out accuracy of the tuned SVM over 50 different 80/20 splits of the
# scaled features (random_state = 1..50).
# A fresh clone with the tuned hyper-parameters is refitted on each split's
# training part; reusing the already-fitted model_SVM_best would score it on
# rows it had seen during training.
# sklearn.base is reachable through the top-level `import sklearn` because the
# estimator imports above already load it.
acc_by_seed = {}
for i in range(1,51):
    x_train,x_test,y_train,y_test = train_test_split(X_nml,Y,test_size=0.2,random_state=i)
    fitted = sklearn.base.clone(model_SVM_best).fit(x_train,np.ravel(y_train))
    y_pred = fitted.predict(x_test)
    acc_by_seed[i] = accuracy_score(y_test,y_pred)
# One row per seed, built in a single step.
best_SVM = pd.Series(acc_by_seed, name='acc_score_SVM').to_frame()
best_SVM

In [None]:
# Hyper-parameter tuning for model_SGD  https://zhuanlan.zhihu.com/p/55438631
# Uses the x_train / y_train left by the most recently executed split.
# `verbose` was removed from the grid: it is an integer logging level, not a
# hyper-parameter. Its 11 values (several of them non-integer and therefore
# invalid) multiplied the search size without changing any fitted model.
# NOTE(review): the loss name 'log' was renamed 'log_loss' in newer
# scikit-learn releases; confirm which spelling the installed version accepts.
params = {'loss': ['hinge','log','modified_huber','squared_hinge','perceptron','huber','epsilon_insensitive','squared_epsilon_insensitive'],
          'penalty': ['l1'],
          'alpha': [0.1],
          'l1_ratio': [0.05],
          'fit_intercept': [True,False],
          'max_iter': [1,2,5,10,50,100],
          'tol': [1e-2],
          'shuffle': [True,False],
          'epsilon': [0,0.0001,0.001,0.01,0.1,1,2,5],
         }
clf = GridSearchCV(model_SGD,param_grid=params,cv=5,scoring='accuracy')
# Flatten the target: the estimators expect a 1-D label vector.
clf.fit(x_train,np.ravel(y_train))
model_SGD_best = clf.best_estimator_
print(clf.best_params_)
clf.best_score_

In [None]:
# Hyper-parameter tuning for model_RF on the scaled features, repeated on 50
# different 80/20 splits (random_state = 1..50).
# The grid does not depend on the split, so it is defined once outside the loop.
params = {'n_estimators':[2,5,10,20,50],
          'criterion':['gini','entropy'],
          'max_depth':[None,1,5,10,20],
          # An integer min_samples_split must be at least 2; the former value 1
          # made every candidate containing it fail to fit.
          'min_samples_split':[2,5,10,20],
          'min_samples_leaf':[1,2,3,4,5,10,20]
         }
for i in range(1,51):
    x_train,x_test,y_train,y_test = train_test_split(X_nml,Y,test_size=0.2,random_state=i)
    clf = GridSearchCV(model_RF,param_grid=params,cv=5,scoring='accuracy')
    # Flatten the target: the estimators expect a 1-D label vector.
    clf.fit(x_train,np.ravel(y_train))
    # Overwritten on every pass: only the estimator from the last split remains.
    model_RF_best = clf.best_estimator_
    print('random_state=',i,clf.best_params_)
    print('best_score_',i,clf.best_score_)

In [None]:
# Hyper-parameter tuning for model_LR, repeated on 50 different 80/20 splits
# of the scaled features (random_state = 1..50).
# The candidate grid is identical for every split, so it is built once.
params = {
    'C': [0.0001, 0.01, 1, 100, 1000],
    'max_iter': [1, 10, 50, 100, 500],
    'class_weight': ['balanced', None],
    'solver': ['liblinear', 'sag', 'lbfgs', 'newton-cg'],
}
for i in range(1, 51):
    x_train, x_test, y_train, y_test = train_test_split(
        X_nml, Y, test_size=0.2, random_state=i
    )
    clf = GridSearchCV(model_LR, param_grid=params, cv=5, scoring='accuracy')
    clf.fit(x_train, y_train)
    # Overwritten on every pass: only the estimator from the last split remains.
    model_LR_best = clf.best_estimator_
    print('random_state=', i, clf.best_params_)
    print('best_score_=', i, clf.best_score_)

In [None]:
# Random-forest grid search repeated on 24 different 80/20 splits of the raw
# features (random_state = 2, 4, ..., 48).
# NOTE(review): this cell repeats an earlier random-forest search cell in this
# notebook — consider keeping only one of them.
# The grid does not depend on the split, so it is defined once outside the loop.
params = {'n_estimators':[2,5,10,20,50],
          'criterion':['gini','entropy'],
          'max_depth':[None,1,5,10,20],
          # An integer min_samples_split must be at least 2; the former value 1
          # made every candidate containing it fail to fit.
          'min_samples_split':[2,5,10,20],
          'min_samples_leaf':[1,2,3,4,5,10,20]
         }
for i in range(2,50,2):
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=i)
    clf = GridSearchCV(model_RF,param_grid=params,cv=5,scoring='accuracy')
    # Flatten the target: the estimators expect a 1-D label vector.
    clf.fit(x_train,np.ravel(y_train))
    # Overwritten on every pass: only the estimator from the last split remains.
    model_RF_best = clf.best_estimator_
    print("random_state= ",i)
    print(clf.best_params_)
    print(clf.best_score_)

In [None]:
# Hyper-parameter tuning for model_DT (author's note, translated: no need to
# compute step by step any more);  https://zhuanlan.zhihu.com/p/55438631
# Tuning guide: https://blog.csdn.net/VariableX/article/details/107188730
# Repeated on 24 different 80/20 splits of the raw features
# (random_state = 2, 4, ..., 48). The grid does not depend on the split, so it
# is defined once outside the loop.
params = {'criterion': ['entropy','gini'],
          'splitter': ['best','random'],
          # 'auto' was dropped: for classification trees it was an alias of
          # 'sqrt' (a duplicate candidate) and newer scikit-learn releases
          # no longer accept it.
          'max_features': [None,'log2','sqrt'],
          'max_depth':[3,5,8,10,20,30,50,None],
          'min_samples_leaf':[1,2,5,10],
          # An integer min_samples_split must be at least 2; the former value 1
          # made every candidate containing it fail to fit.
          'min_samples_split':[2,5,10,15,100],
          'class_weight':[None,'balanced'],
         }
for i in range(2,50,2):
    x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=i)
    clf = GridSearchCV(model_DT,param_grid=params,cv=5,scoring='accuracy')
    # Flatten the target: the estimators expect a 1-D label vector.
    clf.fit(x_train,np.ravel(y_train))
    # Overwritten on every pass: only the estimator from the last split remains.
    model_DT_best = clf.best_estimator_
    print("random_state= ",i)
    print(clf.best_params_)
    print(clf.best_score_)