# Apatite_XGBoost_ore_bearing
By Yuyu Zheng       25/03/2023

In [None]:
import pandas as pd
import numpy as np
import sys

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import warnings
warnings.filterwarnings('ignore')

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, r2_score, make_scorer, f1_score, recall_score, precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

import os
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

import pickle

In [None]:
# Load the raw dataset from the Excel workbook (path is relative to the working directory).
df_raw = pd.read_excel(r"Major and Trace dataset.xlsx")

### Data Preprocessing

In [None]:
# Work on a copy so the raw data frame is left untouched.
df = df_raw.copy()
# Drop every column whose fraction of missing values exceeds 0.4 (more than 40% missing).
# NOTE(review): the earlier comment said ">60% missing", which does not match the 0.4
# threshold in the code -- confirm which cutoff is intended.
df.drop(df.columns[df.isna().mean() > 0.4], axis=1, inplace=True)
# Drop the first remaining column (presumably a sample ID / index column -- verify against the spreadsheet).
df=df.drop(df.columns[0:1], axis=1)
df.head()

In [None]:
# Store the class label as a categorical column, then tabulate the number of samples per class.
df['Class'] = df['Class'].astype('category')
df['Class'].value_counts()

In [None]:
# Summarise the columns: dtype and non-null count of each.
df.info()

Missing Value proportions

In [None]:
# Fraction of all cells in the table (rows x columns) that are missing.
df.isna().sum().sum() / df.size

### Split data into Class and elements

In [None]:
# Copy the class labels out of the data frame and cast them to str
# so they can be label-encoded independently of the frame.
classes = df.Class.values.copy()
classes=classes.astype('str')
classes

In [None]:
# Encode the string labels as integer codes. The fitted encoder `le` is kept so the
# codes can be mapped back to the original names (le.classes_ / le.inverse_transform).
le = LabelEncoder()
classes = le.fit_transform(classes)
classes

In [None]:
# Predictor matrix: every column except the class label, as an independent copy.
features = df.drop(columns=['Class']).copy()

### Split into training set and test set

In [None]:
# Hold out 20% of the samples as a test set. Stratifying on the encoded class keeps the
# class proportions similar in both splits; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(features, classes, test_size=0.2, 
                                                    stratify = classes, random_state = 2021) 

# XGBoost

# Base Models

In [None]:
# Baseline XGBoost classifier: binary logistic objective, histogram tree method,
# classification error as the evaluation metric, all other hyperparameters at their defaults.
xgb = XGBClassifier(objective='binary:logistic', 
                    eval_metric = 'error',tree_method='hist', seed=2022)          
xgb.fit(X_train, y_train)
# Baseline predictions on both splits.
y_train_preds = xgb.predict(X_train)
y_test_preds = xgb.predict(X_test)

In [None]:
# Print the importance score of every feature from the fitted baseline model.
# (The bare `xgb.feature_importances_` expression that used to precede the loop had no
# effect mid-cell, and zip() can iterate the column index directly without list().)
for feature_name, score in zip(features.columns, xgb.feature_importances_):
    print(feature_name, ":", score)

In [None]:
def get_metrics(mod, X_train, X_test, y_train, y_test):
    """ Returns a two-row data frame ('Train', 'Test') with recall, precision,
        f1 and accuracy of a fitted classifier.

        mod must expose predict(); predictions are made on X_train and X_test
        and scored against y_train and y_test respectively.
        Recall and precision use average='weighted'; f1 uses the f1_score
        default averaging.
    """
    # Predict each split once, keeping the train -> test order.
    scored_sets = {
        'Train': (y_train, mod.predict(X_train)),
        'Test': (y_test, mod.predict(X_test)),
    }
    # Column name -> scoring callable, in the order the columns appear in the output.
    scorers = (
        ('Recall', lambda truth, pred: recall_score(truth, pred, average='weighted')),
        ('Precision', lambda truth, pred: precision_score(truth, pred, average='weighted')),
        ('f1', lambda truth, pred: f1_score(truth, pred)),
        ('Accuracy', lambda truth, pred: accuracy_score(truth, pred)),
    )
    table = {'Set': list(scored_sets)}
    for column, scorer in scorers:
        table[column] = [scorer(truth, pred) for truth, pred in scored_sets.values()]
    return pd.DataFrame(table)

In [None]:
def plot_importance(feature_df, model, n_feature):
    '''
    Draw a bar chart of the model's largest feature importances.

    Input:
        feature_df: The feature dataframe / The X_train with column names; the
            columns label the bars and must be in the same order as the
            features the model was trained on
        model: The fitted model, exposing `feature_importances_`
        n_feature: number of feature importance you want to display
    Output:
        None. A plot with the top n feature importance in decreasing order is
        drawn on a new 16x9 matplotlib figure.
    '''
    ranked = pd.DataFrame(
        model.feature_importances_, index=feature_df.columns, columns=['Importance']
    ).sort_values(by='Importance', ascending=False)
    top_n = ranked.iloc[:n_feature]
    plt.figure(figsize=(16, 9))
    # Pass the data by keyword: seaborn >= 0.12 no longer accepts x/y positionally
    # and raises TypeError; keywords work on older releases too.
    sns.barplot(x=top_n.index, y=top_n.Importance, palette="Set3")
    plt.xticks(rotation=90)

In [None]:
# Train/test metrics of the baseline model.
get_metrics(xgb, X_train, X_test, y_train, y_test)

In [None]:
# Bar chart of (up to) the 32 highest-ranked feature importances of the baseline model.
plot_importance(features, xgb, 32)

### Model Optimization

In [None]:
# Candidate values searched for each hyperparameter during tuning.
learning_rate = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3]
depth = list(range(3, 8))                  # tree depths 3 through 7
min_split = np.linspace(0.1, 2, num=20)    # 20 evenly spaced values from 0.1 to 2
alpha1 = [0.1, 0.3, 0.5, 0.7, 0.9, 1]

In [None]:
# Fresh estimator used as the template for the grid search. Feature importances are
# reported with importance_type='cover' here.
# NOTE(review): this estimator uses seed=2021 while the baseline model above uses seed=2022 -- confirm that is intended.
xgb = XGBClassifier(objective='binary:logistic', 
                    eval_metric = 'error', tree_method='hist', seed=2021,importance_type = 'cover')  

In [None]:
# Exhaustive grid search over eta, gamma, max_depth and alpha with 5-fold cross-validation,
# selecting the combination with the best F1 score.
# The grid has 6 * 20 * 5 * 6 = 3600 combinations, i.e. 18000 cross-validation fits, so this cell is slow.
xgb_cv = GridSearchCV(xgb, param_grid = {'eta': learning_rate, 'gamma': min_split, 'max_depth': depth, 'alpha':alpha1}, 
                      cv=5, scoring='f1') 
xgb_cv.fit(X_train, y_train) 

In [None]:
# Hyperparameter combination with the best mean cross-validated score.
xgb_cv.best_params_

In [None]:
# Estimator refitted on the full training set with the best hyperparameters.
xgb_cv.best_estimator_

In [None]:
# Mean cross-validated F1 score of the best hyperparameter combination.
xgb_cv.best_score_

In [None]:
# Keep a handle on the tuned model for the evaluation cells below.
xgb_best = xgb_cv.best_estimator_

In [None]:
# Predictions of the tuned model on both splits (these overwrite the baseline predictions).
y_train_preds = xgb_best.predict(X_train)
y_test_preds = xgb_best.predict(X_test)

In [None]:
# Train/test metrics of the tuned model.
get_metrics(xgb_best, X_train, X_test, y_train, y_test)

In [None]:
# Feature names and tuned-model importances, plus the index order that sorts
# the importances from smallest to largest.
features1 = features.columns.tolist()
importances = xgb_best.feature_importances_
indices = importances.argsort()

In [None]:
# Horizontal bar chart of the tuned model's feature importances, smallest at the bottom.
# The figure must be created BEFORE plotting: calling plt.figure() afterwards opens a
# second, empty figure and leaves the chart itself at the default size.
plt.figure(figsize=(16,10))
plt.barh(range(len(indices)), importances[indices], color='c', align='center')
plt.yticks(range(len(indices)), [features1[i] for i in indices],fontsize=20)
plt.xticks(fontsize=20)
plt.xlabel('Relative Importance',fontsize=25)
plt.show()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall, F1 and support on the held-out test set.
print(classification_report(y_true=y_test, y_pred=y_test_preds))

In [None]:
# Confusion matrix on the test set: rows are true labels, columns are predicted labels.
confmat = confusion_matrix(y_true=y_test, y_pred=y_test_preds)

print(confmat)

In [None]:
# Render the confusion matrix as a lightly shaded grid with the count written in each cell.
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(confmat, cmap=plt.cm.Blues, alpha=0.3)
for (row, col), count in np.ndenumerate(confmat):
    # matshow puts columns on the x axis and rows on the y axis.
    ax.text(x=col, y=row, s=count, va='center', ha='center', fontsize=20)
plt.xlabel('predicted label', fontsize=20)
plt.ylabel('true label', fontsize=20)
plt.show()

### ============ The End ==================================