In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

import xgboost as xgb

In [2]:
data = pd.read_csv('K:/Google Drive/DOUTORADO/Tese 2.0/Chapter I/KELLOGs/dataset.csv')

Elements = ['As', 'Ba', 'Cd', 'Co', 'Cr', 'Cu', 'Pb', 'Zn', 'Mo']

filter = pd.read_csv('filter.csv')

data = data[data['id.layer_uuid_c'].isin(filter['id.layer_uuid_c'])]

#for col in Elements:
#    data = data.sort_values(by=col,ascending=False)
#    data = data[5:]

In [3]:
def xgb_cv_val(i, data):

    SEED = 255
    np.random.seed(SEED)

    Class = pd.DataFrame()
    Class['Class'] = pd.cut(data[i],
                               bins = [data[i].min(), data[i].quantile(0.75), data[i].max()],
                               labels = [0, 1],
                               right = False)

    Class = pd.concat([Class, data.iloc[:,10:]], axis=1)
    x = Class.dropna().drop(columns=['Class'])
    y = Class['Class'].dropna()

    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.2,
                                                            shuffle=True,
                                                            stratify=y)
    xgb_classifier = xgb.XGBClassifier()
    
    param_grid = {
        'max_depth': [4, 8],
        'learning_rate': [0.1, 0.3],
        'subsample': [0.5, 1.0],
        'gamma': [0, 0.1]
    }

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    grid_search = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid, cv=cv)

    grid_search.fit(x_train, y_train)
    best_params = grid_search.best_params_
    best_xgb_classifier = xgb.XGBClassifier(**best_params)
    cross_val_scores = cross_val_score(best_xgb_classifier, x_train, y_train, cv=cv)
    mean_cv_accuracy = np.mean(cross_val_scores)
    
    best_xgb_classifier.fit(x_train, y_train)
    y_pred = best_xgb_classifier.predict(x_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"A acurácia para {i} usando XGBoost foi de %.2f%%" % (accuracy*100))

    return mean_cv_accuracy, best_params, accuracy, precision, recall, f1

In [4]:
xgb_results = []

for i in Elements:
    cv_accuracy, best_params, accuracy, precision, recall, f1 = xgb_cv_val(i, data)
    result_dict = {
        'Element': i,
        'CV accuracy': cv_accuracy,
        'Best Parameters': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    xgb_results.append(result_dict)

xgb_results_df = pd.DataFrame(xgb_results)
xgb_results_df.to_csv('xgb_results.csv', index=False)

A acurácia para As usando XGBoost foi de 89.27%
A acurácia para Ba usando XGBoost foi de 85.41%
A acurácia para Cd usando XGBoost foi de 91.85%
A acurácia para Co usando XGBoost foi de 84.12%
A acurácia para Cr usando XGBoost foi de 87.55%
A acurácia para Cu usando XGBoost foi de 87.12%
A acurácia para Pb usando XGBoost foi de 86.27%
A acurácia para Zn usando XGBoost foi de 81.55%
A acurácia para Mo usando XGBoost foi de 88.41%


In [5]:
subsoil = pd.read_csv('subsoil.csv', header=None)
topsoil = pd.read_csv('topsoil.csv', header=None)

topsoil_data = data.loc[data['id.layer_uuid_c'].isin(topsoil[0])]
subsoil_data = data.loc[data['id.layer_uuid_c'].isin(subsoil[0])]
print(topsoil_data.shape, subsoil_data.shape)

(428, 1711) (513, 1711)


In [None]:
topsoil_xgb_results = []

for i in Elements:
    cv_accuracy, best_params, accuracy, precision, recall, f1 = xgb_cv_val(i, topsoil_data)
    result_dict = {
        'Element': i,
        'CV accuracy': cv_accuracy,
        'Best Parameters': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    topsoil_xgb_results.append(result_dict)

topsoil_xgb_results_df = pd.DataFrame(topsoil_xgb_results)
topsoil_xgb_results_df.to_csv('topsoil_xgb_results.csv', index=False)

A acurácia para As usando XGBoost foi de 88.37%
A acurácia para Ba usando XGBoost foi de 80.23%
A acurácia para Cd usando XGBoost foi de 87.21%
A acurácia para Co usando XGBoost foi de 87.21%


In [None]:
subsoil_xgb_results = []

for i in Elements:
    cv_accuracy, best_params, accuracy, precision, recall, f1 = xgb_cv_val(i, subsoil_data)
    result_dict = {
        'Element': i,
        'CV accuracy': cv_accuracy,
        'Best Parameters': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    subsoil_xgb_results.append(result_dict)

subsoil_xgb_results_df = pd.DataFrame(subsoil_xgb_results)
subsoil_xgb_results_df.to_csv('subsoil_xgb_results.csv', index=False)