# Application of Machine Learning Models for Zircon Composition

In [2]:
import os
import pickle
import warnings
import pandas as pd
from sklearn.metrics import r2_score, mean_squared_error
import joblib
import numpy as np
import textwrap

# Suppress every warning for the remainder of the session so that the
# interactive output stays readable.
warnings.filterwarnings('ignore')

# Define root paths
# The project root is the current working directory at the time this runs.
PROJECT_ROOT_DIR = os.getcwd()
# Directory (under the project root) that holds the pickled model files.
MODEL_PATH = os.path.join(PROJECT_ROOT_DIR, "Zircon ML models")

# Function to load model
def load_model(component):
    """Unpickle and return the model stored for ``component``.

    The model is read from ``MODEL_PATH`` using the file name
    ``Zircon ML-XGBOOST-<component>-ALL1.7.pkl``.
    """
    pkl_name = "Zircon ML-XGBOOST-{}-ALL1.7.pkl".format(component)
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(os.path.join(MODEL_PATH, pkl_name), 'rb') as handle:
        return pickle.load(handle)

# Function to read data
def load_data(file_name):
    """Read an Excel workbook into a DataFrame.

    ``.xlsx`` is appended to ``file_name`` when the name does not already
    end with that extension.
    """
    workbook = file_name if file_name.endswith('.xlsx') else file_name + '.xlsx'
    return pd.read_excel(workbook)

# Function to save results
def save_results(df, component):
    """Write ``df`` to ``Results-<component>.xlsx`` in the working directory.

    The single worksheet is named ``Results-<component>`` and the DataFrame
    index is not written.
    """
    label = "Results-{}".format(component)
    df.to_excel(label + ".xlsx", sheet_name=label, index=False)

# Main function
def main():
    """Interactively predict one component for an Excel data file.

    Lists the components that have a model file in ``MODEL_PATH``, prompts
    for a component and a data file, runs the model on every column except
    ``ID``, appends the predictions as a new column named after the
    component, and saves the table with ``save_results``.

    Raises:
        FileNotFoundError: if no model file exists for the requested
            component, or if the data file cannot be found.
    """
    # Only list files that follow the naming scheme load_model() opens, so
    # every advertised component can actually be loaded and unrelated .pkl
    # files in the folder cannot break the listing.
    prefix, suffix = "Zircon ML-XGBOOST-", "-ALL1.7.pkl"
    available_models = sorted(
        f[len(prefix):-len(suffix)]
        for f in os.listdir(MODEL_PATH)
        if f.startswith(prefix)
        and f.endswith(suffix)
        and len(f) > len(prefix) + len(suffix)
    )
    print("Available components:")
    print(textwrap.fill(", ".join(available_models), width=70))

    # User inputs (using Jupyter notebook's input method).
    # \033[1m / \033[0m switch bold on and off around the prompt text.
    # Surrounding whitespace is stripped because it would otherwise end up
    # inside the file names built from these answers.
    component = input("\033[1mEnter the component (e.g., SiO2, TiO2, etc.):\033[0m ").strip()
    data_file = input("\033[1mEnter the data file name (e.g., XXX.xlsx):\033[0m ").strip()

    # Fail early with a readable message instead of an opaque path error.
    if component not in available_models:
        raise FileNotFoundError(
            f"No model found for component {component!r} in {MODEL_PATH!r}. "
            f"Available components: {', '.join(available_models)}"
        )

    # Load model and data
    model = load_model(component)
    df = load_data(data_file)
    print("Data overview:")
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print().
    df.info()

    # Prepare data for prediction: every column except the sample identifier.
    X_ALL = df.drop(columns=['ID'])

    # Predict using the loaded model
    predictions = model.predict(X_ALL)

    # Append predictions as the last column, named after the component.
    df.insert(df.shape[1], component, predictions)

    # Save the results
    save_results(df, component)
    print(f"Results saved successfully for {component}.")

# Run main only when this code is executed directly (a Jupyter Notebook cell
# also runs as "__main__"), so importing the file does not start the prompts.
if __name__ == "__main__":
    main()

Available components:
Al2O3, CaO, Ce, Dy, Eu, Eu5, Fe2O3T, Gd, K2O, La, MgO, MnO, Na2O, Nb,
Nd, P2O5, SiO2, Sm, Sr, Th, TiO2, Y, Yb
Enter the component (e.g., SiO2, TiO2, etc.): SiO2
Enter the data file name (e.g., XXX.xlsx): Test-Jack Hills no age
Data overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 488 entries, 0 to 487
Data columns (total 28 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        475 non-null    object 
 1   TI(PPM)   430 non-null    float64
 2   Y(PPM)    488 non-null    float64
 3   NB(PPM)   269 non-null    float64
 4   LA(PPM)   488 non-null    float64
 5   CE(PPM)   488 non-null    float64
 6   PR(PPM)   487 non-null    float64
 7   ND(PPM)   488 non-null    float64
 8   SM(PPM)   488 non-null    float64
 9   EU(PPM)   488 non-null    float64
 10  GD(PPM)   488 non-null    float64
 11  TB(PPM)   488 non-null    float64
 12  DY(PPM)   488 non-null    float64
 13  HO(PPM)   488 non-null    float64
 14  ER