In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score

import xgboost as xgb

In [2]:
data = pd.read_csv('K:/Google Drive/DOUTORADO/Tese 2.0/Chapter I/KELLOGs/dataset.csv')

Elements = ['As', 'Ba', 'Cd', 'Co', 'Cr', 'Cu', 'Pb', 'Zn', 'Mo']

filter = pd.read_csv('filter.csv')

#for col in Elements:
#    data = data.sort_values(by=col,ascending=False)
#    data = data[5:]

In [3]:
print(data.shape)
data = data[data['id.layer_uuid_c'].isin(filter['id.layer_uuid_c'])]
print(data.shape)

(1337, 1711)
(1162, 1711)


In [4]:
def rf_cv_val(i, data):

    SEED = 255
    np.random.seed(SEED)

    Class = pd.DataFrame()
    Class['Class'] = pd.cut(data[i],
                               bins = [data[i].min(), data[i].quantile(0.75), data[i].max()],
                               labels = [0, 1],
                               right = False)

    Class = pd.concat([Class, data.iloc[:,10:]], axis=1)
    x = Class.dropna().drop(columns=['Class'])
    y = Class['Class'].dropna()

    x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.2,
                                                            shuffle=True,
                                                            stratify=y)
    rf_classifier = RandomForestClassifier()
    
    param_grid = {
        'n_estimators': [100, 300],
        'max_depth': [None, 20],
        'min_samples_split': [5, 10],
        'min_samples_leaf': [2, 4]
    }

    cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=SEED)
    grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=cv)

    grid_search.fit(x_train, y_train)
    best_params = grid_search.best_params_
    best_rf_classifier = RandomForestClassifier(**best_params)
    cross_val_scores = cross_val_score(best_rf_classifier, x_train, y_train, cv=cv)
    mean_cv_accuracy = np.mean(cross_val_scores)
    
    best_rf_classifier.fit(x_train, y_train)
    y_pred = best_rf_classifier.predict(x_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    print(f"A acurácia para {i} usando RandomForest foi de %.2f%%" % (accuracy*100))

    return mean_cv_accuracy, best_params, accuracy, precision, recall, f1

In [5]:
rf_results = []

for i in Elements:
    cv_accuracy, best_params, accuracy, precision, recall, f1 = rf_cv_val(i, data)
    result_dict = {
        'Element': i,
        'CV accuracy': cv_accuracy,
        'Best Parameters': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    rf_results.append(result_dict)

rf_results_df = pd.DataFrame(rf_results)
rf_results_df.to_csv('rf_results.csv', index=False)

A acurácia para As usando RandomForest foi de 89.27%
A acurácia para Ba usando RandomForest foi de 85.84%
A acurácia para Cd usando RandomForest foi de 90.13%
A acurácia para Co usando RandomForest foi de 83.69%
A acurácia para Cr usando RandomForest foi de 86.27%
A acurácia para Cu usando RandomForest foi de 86.27%
A acurácia para Pb usando RandomForest foi de 87.55%
A acurácia para Zn usando RandomForest foi de 81.12%
A acurácia para Mo usando RandomForest foi de 89.70%


In [6]:
subsoil = pd.read_csv('subsoil.csv', header=None)
topsoil = pd.read_csv('topsoil.csv', header=None)

topsoil_data = data.loc[data['id.layer_uuid_c'].isin(topsoil[0])]
subsoil_data = data.loc[data['id.layer_uuid_c'].isin(subsoil[0])]
print(topsoil_data.shape, subsoil_data.shape)

(428, 1711) (513, 1711)


In [None]:
topsoil_rf_results = []

for i in Elements:
    cv_accuracy, best_params, accuracy, precision, recall, f1 = rf_cv_val(i, topsoil_data)
    result_dict = {
        'Element': i,
        'CV accuracy': cv_accuracy,
        'Best Parameters': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    topsoil_rf_results.append(result_dict)

topsoil_rf_results_df = pd.DataFrame(topsoil_rf_results)
topsoil_rf_results_df.to_csv('topsoil_rf_results.csv', index=False)

A acurácia para As usando RandomForest foi de 87.21%
A acurácia para Ba usando RandomForest foi de 84.88%
A acurácia para Cd usando RandomForest foi de 88.37%
A acurácia para Co usando RandomForest foi de 87.21%


In [None]:
subsoil_rf_results = []

for i in Elements:
    cv_accuracy, best_params, accuracy, precision, recall, f1 = rf_cv_val(i, subsoil_data)
    result_dict = {
        'Element': i,
        'CV accuracy': cv_accuracy,
        'Best Parameters': best_params,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }
    subsoil_rf_results.append(result_dict)

subsoil_rf_results_df = pd.DataFrame(subsoil_rf_results)
subsoil_rf_results_df.to_csv('subsoil_rf_results.csv', index=False)