In [1]:
# Python 2 compatibility imports; they have no effect on Python 3, which the version check below requires
from __future__ import division, print_function, unicode_literals

# Require Python 3.5 or newer (tuple comparison, so e.g. 3.10 compares correctly against 3.5)
import sys
assert sys.version_info >= (3, 5)

# Require scikit-learn 0.20 or newer.
# The numeric (major, minor) parts are compared: a plain string comparison is
# lexicographic, so a version such as "0.9" would wrongly satisfy >= "0.20".
import re
import sklearn
_sklearn_version = re.match(r"(\d+)\.(\d+)", sklearn.__version__)
assert _sklearn_version is not None, "unrecognized scikit-learn version string"
assert tuple(int(part) for part in _sklearn_version.groups()) >= (0, 20)
from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import os 
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, r2_score, make_scorer, f1_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
import seaborn as sns
# The result obtained after each run is the same as that of this notebook
np.random.seed(42) 

# Make matplotlib diagrams work better
%matplotlib inline
import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)


In [2]:
# Ignoring Unnecessary Warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [3]:
# Load the labelled training table from a local Excel workbook.
# NOTE(review): absolute, machine-specific Windows path; consider a configurable data directory.
# Author's earlier notes: source folder "Preprocessing\REE trace elements\duplicate labels removed",
# alternative workbook "...\finaltrain\MAJOR1-4.xlsx"; dropping rows with missing values was
# tried and left disabled.
TRAIN_XLSX = 'C:\\Users\\lenovo\\Desktop\\Trainingset\\finaltrain\\TRACE1.xlsx'
orig_data = pd.read_excel(TRAIN_XLSX)

In [4]:
# Target vector: the TRUE_VALUE column.
y = orig_data["TRUE_VALUE"]
# Feature matrix: an independent copy of the table without the TRUE_VALUE column.
X = orig_data.drop(columns=["TRUE_VALUE"]).copy()

In [None]:
# Class balance: number of samples per TRUE_VALUE label.
plt.figure(figsize=(16, 6))
# Pass the column through `x=`. seaborn 0.11 deprecated positional variables and
# seaborn >= 0.12 interprets the first positional argument as `data`, so the
# original positional call no longer selects the variable to count.
sns.countplot(x=orig_data["TRUE_VALUE"], palette="Set2")
plt.xticks(rotation=0)

In [6]:
# Hold out 30% of the rows as a test set; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Show the shapes of the four pieces as a quick sanity check
tuple(part.shape for part in split_parts)

((610, 30), (262, 30), (610,), (262,))

In [7]:
#roc_auc_score = make_scorer(roc_auc_score, multi_class='ovo',needs_proba=True) #3 classes roc_auc
#f1_score = make_scorer(f1_score, multi_class='ovo',needs_proba=True)
#recall_score = make_scorer(recall_score, multi_class='ovo',needs_proba=True)
#precision_score = make_scorer(precision_score, multi_class='ovo',needs_proba=True)

In [None]:
# Baseline: XGBoost classifier with default hyper-parameters, fitted on the training split.
xgb_clf = XGBClassifier().fit(X_train, y_train)
# Predictions for the training rows themselves (not a held-out evaluation).
y_pred_xgb = xgb_clf.predict(X_train)

In [11]:
from sklearn.model_selection import cross_val_score

# Ten-fold cross-validated F1 of the baseline classifier.
# NOTE(review): this uses the full data set (X, y), test rows included, whereas the later
# cells cross-validate on X_train only. Confirm this is intended.
scores = cross_val_score(estimator=xgb_clf, X=X, y=y, scoring="f1", cv=10, n_jobs=-1)
def display_scores(scores):
    """Print the given scores followed by their mean and standard deviation.

    `scores` must provide ``mean()`` and ``std()`` methods (for example a
    NumPy array such as the one returned by ``cross_val_score``).
    Nothing is returned; the three lines are written to standard output.
    """
    print("Scores:", scores)
    for label, statistic in (("Mean:", "mean"), ("Standard deviation:", "std")):
        # Look the statistic up lazily so the lines are printed in order, one by one
        print(label, getattr(scores, statistic)())
    
# Print the per-fold F1 scores of the baseline model with their mean and standard deviation
display_scores(scores)

Scores: [0.84536082 0.94339623 1.         0.99082569 1.         0.97297297
 0.89795918 0.95575221 0.99065421 0.57894737]
Mean: 0.9175868682295109
Standard deviation: 0.12237463150215917


In [10]:
# Show the baseline model's feature importances, one value per feature column.
# NOTE(review): the saved output below lists 9 values while the split cell reports 30
# feature columns, and the execution counts are out of order. The output looks stale;
# re-run the notebook from a fresh kernel.
xgb_clf.feature_importances_

array([0.05618815, 0.10033856, 0.1845818 , 0.12067229, 0.1685267 ,
       0.06916326, 0.05847796, 0.10180902, 0.14024228], dtype=float32)

In [11]:
# Print each feature column name next to the baseline model's importance for it
baseline_importances = xgb_clf.feature_importances_
for feature_name, score in zip(X.columns, baseline_importances):
    print(feature_name, ":", score)

SIO2(WT%) : 0.056188148
TIO2(WT%) : 0.10033856
AL2O3(WT%) : 0.1845818
CR2O3(WT%) : 0.120672286
FEOT(WT%) : 0.1685267
CAO(WT%) : 0.069163255
MGO(WT%) : 0.058477964
MNO(WT%) : 0.10180902
NA2O(WT%) : 0.14024228


In [30]:
# Candidate values for the hyper-parameter grid search (the key each list is searched under
# is given on the right).
learning_rate = [0.1, 0.2, 0.5, 0.6, 0.7]   # searched as 'eta'
depth = list(range(3, 8))                   # searched as 'max_depth': 3, 4, 5, 6, 7
min_split = [0.1, 0.2, 0.5, 1, 2, 3]        # searched as 'gamma'
alpha1 = [0.1, 0.3, 0.5, 0.7, 0.9, 1]       # searched as 'alpha'
# Not searched, kept for reference:
#   colsample_bytree: [0.5, 0.6, 0.7, 0.8, 0.9, 1]
#   n_estimators:     [100, 200, 300, 500, 800, 1000]
#   importance types considered: weight, gain, cover

In [31]:
# Base estimator for the grid search: binary logistic objective, AUC as XGBoost's own
# evaluation metric, histogram tree method, fixed seed, and 'cover' feature importances.
xgb_settings = dict(
    objective='binary:logistic',
    eval_metric='auc',
    tree_method='hist',
    seed=2021,
    importance_type='cover',
)
xgb = XGBClassifier(**xgb_settings)

In [None]:
# Exhaustive grid search on the training split, scored by F1 with 10-fold cross-validation.
# The grid has 5 * 6 * 5 * 6 = 900 combinations, i.e. 9000 cross-validation fits.
search_space = {
    'eta': learning_rate,
    'gamma': min_split,
    'max_depth': depth,
    'alpha': alpha1,
}
xgb_cv = GridSearchCV(estimator=xgb, param_grid=search_space, scoring='f1', cv=10)
xgb_cv.fit(X_train, y_train)

In [33]:
#xgb_cv.best_params_

In [34]:
# Mean cross-validated F1 (the search's scoring metric) of the best parameter combination
xgb_cv.best_score_

0.9728113778113778

In [35]:
# Show the estimator that the grid search refit with the best parameter combination
xgb_cv.best_estimator_

XGBClassifier(alpha=0.1, base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eta=0.7,
              eval_metric='auc', gamma=0.5, gpu_id=-1, importance_type='cover',
              interaction_constraints='', learning_rate=0.699999988,
              max_delta_step=0, max_depth=3, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, random_state=2021, reg_alpha=0.100000001,
              reg_lambda=1, scale_pos_weight=1, seed=2021, subsample=1,
              tree_method='hist', validate_parameters=1, verbosity=None)

In [36]:
# Feature names, the tuned model's importances, and the index order that sorts
# those importances ascending (used by the bar chart).
features = [*X.columns]
importances = xgb_cv.best_estimator_.feature_importances_
indices = importances.argsort()

In [37]:
# Write the (feature, importance) table to an Excel workbook in the working directory
df4 = pd.DataFrame(dict(features=features, importances=importances))
df4.to_excel('fig5-trace.xlsx')

In [None]:
# Horizontal bar chart of the tuned model's feature importances, smallest at the bottom.
# The default figure size is set BEFORE drawing: the original assigned it after
# plt.show(), so it only took effect when the cell was run a second time.
plt.rcParams["figure.figsize"] = (20, 10)
plt.figure()
positions = range(len(indices))
plt.barh(positions, importances[indices], color='c', align='center')
plt.yticks(positions, [features[i] for i in indices], fontsize=20)
plt.xticks(fontsize=20)
plt.xlabel('Relative Importance', fontsize=25)
# NOTE(review): the file name says "Major1" although the training workbook is TRACE1.xlsx.
# Confirm this does not overwrite the figure of another run.
plt.savefig('Major1 Feature.png', dpi=900)
plt.show()


In [None]:
# Ten-fold cross-validated accuracy of the tuned model on the training split.
# `cross_val_score` and `display_scores` are reused from the baseline cross-validation
# cell earlier in the notebook rather than being imported and defined a second time
# (a duplicate definition silently shadows the first one).
scores = cross_val_score(xgb_cv.best_estimator_, X_train, y_train,
                         scoring="accuracy", cv=10,
                         n_jobs=-1)

display_scores(scores)

In [None]:
# Fit the tuned estimator on the training split and predict the held-out test rows.
# `xgb_test` is the same object as `xgb_cv.best_estimator_`, so the fit happens in place.
xgb_test = xgb_cv.best_estimator_
y_test_pred = xgb_test.fit(X_train, y_train).predict(X_test)

In [None]:
# Held-out test metrics for the tuned model.
# ROC AUC is computed from the predicted probability of the positive class (second
# column of predict_proba). Passing hard class predictions, as before, reduces the
# ROC curve to a single threshold and does not measure how well the model ranks samples.
y_test_score = xgb_test.predict_proba(X_test)[:, 1]
print('Accuracy: %.3f' % accuracy_score(y_test, y_test_pred))
print('ROC AUC: %.3f' % roc_auc_score(y_test, y_test_score))
print('Precision: %.3f' % precision_score(y_true=y_test, y_pred=y_test_pred))
print('Recall: %.3f' % recall_score(y_true=y_test, y_pred=y_test_pred))
print('F1 Score: %.3f' % f1_score(y_true=y_test, y_pred=y_test_pred))

In [None]:
from sklearn.metrics import classification_report

# Text report of the per-class metrics on the held-out test set
report = classification_report(y_test, y_test_pred)
print(report)

In [None]:
# Confusion matrix on the test set (true labels vs. predicted labels)
confmat = confusion_matrix(y_test, y_test_pred)
print(confmat)

In [None]:
# Draw the confusion matrix as a lightly shaded grid with the count written in each cell
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for (i, j), count in np.ndenumerate(confmat):
    # Row index is the y position, column index is the x position
    ax.text(x=j, y=i, s=count, va='center', ha='center')
ax.set_xlabel('predicted label', fontsize=20)
ax.set_ylabel('true label', fontsize=20)
fig.savefig('Major1 Confusion.png', dpi=900)
plt.show()

In [86]:
# Load the unlabelled samples that the tuned model should classify
predict_data = pd.read_excel('C:\\Users\\lenovo\\Desktop\\Trainingset\\预测数据\\MAJOR.xlsx')

# The features are used exactly as loaded; a StandardScaler step was tried and left disabled.
# NOTE(review): this workbook is named MAJOR while the model was trained on TRACE1.xlsx.
# Confirm that its columns match the training feature columns.
x_predict_data = predict_data

In [None]:
# Class-probability predictions of the tuned model for the unlabelled samples
predict_results = xgb_cv.best_estimator_.predict_proba(x_predict_data)

In [None]:
# Display the predicted probabilities (the whole array is shown)
predict_results

In [None]:
# Save the predicted class probabilities to an Excel workbook in the working directory
df = pd.DataFrame(predict_results)
df.to_excel('predict_resultsMAJOR2.xlsx')

In [None]:
###Xgboost end