In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import shap
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from matplotlib.font_manager import FontProperties
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
import os
from typing import Counter
import matplotlib.colors as mcolors
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from openpyxl import Workbook
import matplotlib
import shap
from matplotlib.colors import ListedColormap
from PIL import Image

In [None]:
# ---------------------------------------------------------------------------
# Load the training table, clean it, encode the target and create the split.
# Produces: data, df, X, y (integer codes), index (class labels in code order),
#           X_train, X_test, y_train, y_test.
# ---------------------------------------------------------------------------
file_path = r"D:\cddvd\sedex_vms_zhong.xlsx"#Please enter the path to the Supplementary data 3
data = pd.read_excel(file_path)
df = data.loc[:, ["Deposit type", "Co", "Ni", "Zn", "Cd", "Sb", "Pb", "Ag", "Se"]]
X = df.copy(deep=True)
y = X.pop('Deposit type')

# Print the original dataset class distribution
print("Original dataset class distribution:")
print(y.value_counts())

# Drop samples without a label first, keeping X aligned with y through the
# shared row index.  Reassignment is used instead of ``inplace=True`` so that
# re-running the cell never mutates an object another name still points to.
if y.isnull().values.any():
    y = y.dropna()
    X = X.loc[y.index]
    print("X and y lengths:", X.shape[0], y.shape[0])

# Then drop samples with any missing feature and re-align the labels.
if X.isnull().values.any():
    X = X.dropna()
    y = y.loc[X.index]

# Print class distribution after removing missing values
print("Class distribution after missing values removal:")
print(y.value_counts())

# Factorize the target variable.  ``sort=True`` assigns the integer codes in
# sorted label order; ``index`` holds the labels so that index[code] is the
# original class name.  From here on ``y`` is a NumPy integer array, not a
# pandas Series.
y_int, index = pd.factorize(y, sort=True)
y = y_int

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)
print("Training set class distribution after split:")
print(pd.Series(y_train).value_counts())

# NOTE: a second "drop missing values" pass used to follow the split.  X is
# already free of NaNs at this point so it never executed, and it could not
# have worked anyway (``y`` is a NumPy array here and has no ``.loc``); it was
# removed rather than repaired because cleaning after the split would
# desynchronise X/y from the train/test sets created above.

# Baseline: 5-fold cross-validated macro-F1 of a default random forest on the
# training split.  The forest is seeded (same seed as the tuned model in this
# notebook) so the printed baseline is reproducible across kernel restarts;
# without a seed the bootstrap samples, and therefore the score, differ on
# every run.
models = (RandomForestClassifier(random_state=10),)
for clf in models:
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='f1_macro', n_jobs=-1)
    # mean ± standard deviation over the 5 folds
    print(f'{scores.mean():2.2f} ± {scores.std():2.2f}')

# Tuned model: a one-step pipeline around a class-weighted random forest.
# The step is auto-named "randomforestclassifier" by make_pipeline, which is
# the prefix used in the parameter grid keys below.
forest = RandomForestClassifier(
    class_weight='balanced',
    random_state=10,
    oob_score=True,
)
pipe_clf = make_pipeline(forest)

# Each hyper-parameter has a single candidate, so the "search" evaluates
# exactly one configuration with 10-fold CV and then refits it on all of
# X_train.
param_grid = {
    "randomforestclassifier__min_samples_split": [5],
    "randomforestclassifier__min_samples_leaf": [3],
    "randomforestclassifier__max_depth": [11],
    "randomforestclassifier__n_estimators": [130],
}

search_options = {
    "param_grid": param_grid,
    "cv": 10,
    "scoring": "f1_macro",
    "n_jobs": -1,
    "refit": True,
}
grid = GridSearchCV(pipe_clf, **search_options)
grid.fit(X_train, y_train)

best_summary = "Best parameters: %s, score: %0.2f" % (grid.best_params_, grid.best_score_)
print(best_summary)

# ---------------------------------------------------------------------------
# Evaluate the refitted model on both splits and persist the reports.
# ---------------------------------------------------------------------------
y_test_pred = grid.predict(X_test)
t_train_pred = grid.predict(X_train)
# Probability of the class encoded as 1 for every test sample (not used in
# this cell; kept available for later cells).
y_test_proba = grid.predict_proba(X_test)[:, 1]

# Compute each artefact exactly once and reuse it for both the console and
# the report file, so what is printed is guaranteed to match what is saved.
train_report = classification_report(y_train, t_train_pred)
test_report = classification_report(y_test, y_test_pred, output_dict=False)
cm = confusion_matrix(y_test, y_test_pred)

# Training-set report, then test-set report and test-set confusion matrix.
print(train_report)
print(test_report)
print(cm)

report_filename = r"D:\cddvd\RFpyrite_report.txt"
with open(report_filename, 'w', encoding='utf-8') as f:
    f.write("Training set report:\n")
    f.write(train_report)

    f.write("\nTest set report:\n")
    f.write(test_report)

    f.write("\nConfusion Matrix:\n")
    f.write(str(cm))

print(f"Classification report and confusion matrix saved to: {report_filename}")
# ---------------------------------------------------------------------------
# SHAP analysis of the tuned random forest on the training set.
# Writes per-sample SHAP values, per-class mean |SHAP| values and a beeswarm
# figure for class 0 to disk.
# ---------------------------------------------------------------------------
best_rf_model = grid.best_estimator_.named_steps['randomforestclassifier']

# Create SHAP explainer (X_train is supplied as the background data)
explainer = shap.Explainer(best_rf_model, X_train)
raw_shap_values_train = explainer.shap_values(X_train)

# Depending on the installed shap release, a multi-output tree model yields
# either a list with one (n_samples, n_features) array per class, or a single
# (n_samples, n_features, n_classes) array.  Normalise to the list form so
# that shap_values_train[k] is always the matrix for class k; indexing the
# 3-D form with [0] would silently select the first *sample* instead.
if isinstance(raw_shap_values_train, list):
    shap_values_train = [np.asarray(class_values) for class_values in raw_shap_values_train]
else:
    raw_shap_values_train = np.asarray(raw_shap_values_train)
    if raw_shap_values_train.ndim != 3:
        raise ValueError(
            "Expected per-class SHAP values (a list of 2-D arrays or one 3-D array), "
            f"got an array of shape {raw_shap_values_train.shape}"
        )
    shap_values_train = [
        raw_shap_values_train[:, :, class_idx]
        for class_idx in range(raw_shap_values_train.shape[2])
    ]

if len(shap_values_train) < 2:
    raise ValueError("Expected SHAP values for two classes, got %d" % len(shap_values_train))

# Create DataFrame for each class
shap_values_df_train_0 = pd.DataFrame(shap_values_train[0], columns=X_train.columns, index=X_train.index)
shap_values_df_train_1 = pd.DataFrame(shap_values_train[1], columns=X_train.columns, index=X_train.index)

# Sheet names follow the integer class codes used by the model (0 and 1).
output_path_train = 'D:/cddvd/shapvalue_train_rfccp.xlsx'
with pd.ExcelWriter(output_path_train, engine='xlsxwriter') as writer:
    shap_values_df_train_0.to_excel(writer, sheet_name='Class_SEDEX')
    shap_values_df_train_1.to_excel(writer, sheet_name='Class_VMS')

print(f"SHAP values for the training set successfully saved to {output_path_train}")

# Set font and size
plt.rcParams['font.family'] = 'Times New Roman'
plt.rcParams['font.size'] = 8

# Mean absolute SHAP value per feature, one row per class (row k = class k).
shap_abs_values_train = [np.abs(values) for values in shap_values_train]
mean_shap_values_train = [abs_values.mean(axis=0) for abs_values in shap_abs_values_train]

output_path_mean_train = 'D:/cddvd/shapvalue_mean_train_rfccp.xlsx'
mean_shap_df_train = pd.DataFrame(mean_shap_values_train, columns=X_train.columns)
mean_shap_df_train.to_excel(output_path_mean_train, index=False)

print(f"Mean SHAP values for the training set successfully saved to {output_path_mean_train}")

# Beeswarm plot for class 0.  The Explanation is given the expected value of
# class 0 only (repeated per sample) rather than the expected values of all
# classes, and the feature matrix is passed as a plain array with explicit
# feature names.
expected_values = np.atleast_1d(explainer.expected_value)
class0_values = shap_values_train[0]
class0_explanation = shap.Explanation(
    values=class0_values,
    base_values=np.full(class0_values.shape[0], expected_values[0]),
    data=X_train.values,
    feature_names=list(X_train.columns),
)
shap.plots.beeswarm(class0_explanation, max_display=10, show=False)

# Save SHAP figure
plt.savefig('D:/cddvd/SHAP_RF_pyrite.jpg', dpi=600, format='jpg')
plt.close()

In [None]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

# Export the test set: feature values plus the true and predicted class codes.
# An explicit copy is taken because wrapping X_test in pd.DataFrame(...) does
# not guarantee an independent object; adding the two result columns could
# otherwise leak into X_test itself and break any later grid.predict(X_test)
# (the model would suddenly see two extra features).
test_results_df = X_test.copy()
# y_test / y_test_pred are positional arrays in the same row order as X_test.
test_results_df['Actual'] = y_test
test_results_df['Predicted'] = y_test_pred
output_file_path = r"D:\cddvd\test_set_results_Ccp.xlsx"
test_results_df.to_excel(output_file_path, index=False, header=True)

print(f"Test set results have been successfully saved to: {output_file_path}")


In [None]:
# Confusion matrix of the test set, drawn as an annotated heatmap and saved
# as SVG and PDF.
label_order = ["SEDEX", "VMS"]
# Pin the class codes explicitly so the matrix is always 2x2, even when one
# class is absent from the test split or from the predictions.  Assumes the
# integer codes 0 and 1 correspond to label_order, as used elsewhere in this
# notebook.
class_codes = list(range(len(label_order)))
cm = confusion_matrix(y_test, y_test_pred, labels=class_codes)
cm_df = pd.DataFrame(cm, columns=label_order, index=label_order)
# Percentages are normalised per predicted-label column: every column of
# cm_df_percentage sums to 100 (NaN if a class was never predicted).
cm_df_percentage = cm_df.div(cm_df.sum(axis=0), axis=1) * 100

plt.rc('font', family='Times New Roman', size=8)
fig, ax = plt.subplots(figsize=(2.5, 2.5))
ax = sns.heatmap(cm_df, linewidths=.5, ax=ax, cmap="Blues")
# Same colormap/normalisation as the heatmap, used only to look up each
# cell's background colour when choosing a readable text colour.
norm = plt.Normalize(vmin=cm_df.values.min(), vmax=cm_df.values.max())
sm = plt.cm.ScalarMappable(cmap="Blues", norm=norm)

# Manually add text to each cell
for i in range(len(cm_df)):
    for j in range(len(cm_df)):
        value = cm_df.iloc[i, j]
        percentage = cm_df_percentage.iloc[i, j]
        # Choose the text colour from the background's relative luminance.
        # Comparing the RGB tuple against (0.5, 0.5, 0.5) is a lexicographic
        # comparison and in practice only looks at the red channel.
        red, green, blue = sm.to_rgba(value)[0:3]
        luminance = 0.2126 * red + 0.7152 * green + 0.0722 * blue
        color = 'white' if luminance < 0.5 else 'black'
        ax.text(j + 0.5, i + 0.5, f"{value}\n{percentage:.1f}%",
                ha='center', va='center', color=color, family='Times New Roman', size=8)

ax.set_title("Test set confusion matrix (RF)", fontsize=8)
ax.set_xlabel("Predictions", fontsize=8)
ax.set_ylabel("True labels", fontsize=8)
ax.set_xticklabels(label_order, rotation=45, fontsize=8)
ax.set_yticklabels(label_order, fontsize=8)
fig.tight_layout()
fig.savefig(r'D:\cddvd\confusion_matrix_RF_ccp.svg', dpi=600, format='svg')
fig.savefig(r'D:\cddvd\confusion_matrix_RF_ccp.pdf', dpi=600, format='pdf')

# Show the plot
plt.show()


In [None]:
# ---------------------------------------------------------------------------
# Apply the trained model to a new spreadsheet of samples, summarise the
# predicted classes and optionally save a per-sample report.
# ---------------------------------------------------------------------------
print(
    """
RF classifier to predict the genetic classes of the chalcopyrite source with "Co", "Ni", "Zn", "Sb", "Pb", "Ag", "Se", "Cd"values,
Please enter the path of the .xlsx data file.(for example: /path/to/file/example_data.xlsx )
The data are supposed to contain all the 8 features above for prediction.
If any one of the features is missing in a sample, that sample will be discarded.
The columns' names of Co, Ni, Zn, Cd, Sb, Pb, Ag, Se should be exactly as listed above without any prefix and suffix
and MAKE SURE this column name row is the FIRST row.
"""
)
data_file_path =  r"D:\我的论文\dongshengmiao_knn_ccp.xlsx"#Please enter the path to the data file
df = pd.read_excel(data_file_path, sheet_name='sheet1')

# Class names in integer-code order: index[code] is the label for a predicted
# code.
index = ['SEDEX', 'VMS']
print(df)
elements = [ "Co", "Ni", "Zn", "Cd", "Sb", "Pb", "Ag", "Se"]

# Non-numeric entries become NaN so the affected samples are discarded below.
for element in elements:
    df[element] = pd.to_numeric(df[element], errors="coerce")

# Rows with all eight features present.  ``valid_rows`` keeps the original
# row labels of ``df`` so predictions can be attached to the right samples
# later; ``to_predict`` is the same data with a fresh 0..n-1 index.
valid_rows = df.loc[:, elements].dropna()
to_predict = valid_rows.reset_index(drop=True)
print(f"{to_predict.shape[0]} samples available")

# Stop before calling the model: predicting on zero rows raises inside
# scikit-learn, so the check has to come first to show the intended message.
if to_predict.empty:
    input("no sample with the 8 features detected!")
    raise SystemExit()

print(to_predict.describe())
predict_res = [index[code] for code in grid.predict(to_predict)]

c: Counter[str] = Counter(predict_res)

# Build the report with proper dtypes (labels as text, probabilities as
# floats).  Concatenating labels and probabilities into a single NumPy array
# would coerce the probabilities to strings.
proba = grid.predict_proba(to_predict)
res = pd.DataFrame(
    {
        'pred_chalcopyrite_type': predict_res,
        'SEDEX_proba': proba[:, 0],
        'VMS_proba': proba[:, 1],
    }
)
pd.set_option('display.max_columns', 10)
print('Detailed report preview:\n', res)

print("The samples are predicted respectively to be: ")
print(c.most_common(), "\n")
print(
    f"The most possible type of the group of samples is: {c.most_common(1)[0][0]}.\n"
)

if input('Save report? (y/n): ').lower() == 'y':
    base_filename = os.path.basename(data_file_path)
    prefix, _ = os.path.splitext(base_filename)
    save_name = prefix + '_resultrfccp.xlsx'
    # Attach the predictions by original row label.  Joining on a measured
    # value such as Pb mis-assigns or duplicates rows whenever two samples
    # share the same value.  Discarded samples get empty prediction cells.
    res_by_row = res.set_index(valid_rows.index)
    output = df.join(res_by_row)
    output.to_excel(save_name)
    print(f'{save_name} saved.')
input("Press any key to exit.")