In [1]:
from utils.osc_model import calc_auc, open_pickle, turbiscan_resampling, get_raw_files, open_pickle
import pandas as pd
import os

In [2]:
# AUC data using backscattering data and time 0 - 1 hr using 500 estimators for all ET models, clay and silt mse to guide
# the MC, and 300 train-test splits. 

# If retraining the model, ensure the column names of your PSD input file are the same as those in the example file

# Mode switch: True runs the training branch below, False runs the prediction branch
training = True
# Passed to turbiscan_resampling and used as the prefix of the "<prefix>_OSC_results.pkl" file opened after training
output_prefix = 'trial'
# Passed to turbiscan_resampling as its `iters` argument
iterations = 3

if training:
    # ---- Training path: build AUC features + PSD targets and run the resampling ----

    # Takes as input a file with the particle size distribution values (clay, silt, sand)
    # for each training sample; it is indexed by 'sampleid' so it can be joined to the features.
    psd_df = pd.read_excel('training_pipette_psd.xlsx')  # Replace this file with your own PSD data
    psd_df.set_index('sampleid', inplace=True)

    # Grab and process the turbiscan output files, note the turbiscan output files must be *.csv files
    back_data, trans_data = get_raw_files(training)

    # Calculate the area under the curve (AUC) feature for each scan.
    # The last three columns of the raw frames are excluded first
    # (presumably non-scan columns added by get_raw_files in training mode -- TODO confirm).
    back_df = back_data.iloc[:, :-3]
    trans_df = trans_data.iloc[:, :-3]
    auc_back_df = calc_auc(back_df)
    auc_trans_df = calc_auc(trans_df)

    # Set up the backscattering and transmission data inputs to use only the first 6 scans
    # (i.e., the first 10 minutes if using a 2 minute scan), then add the PSD data.
    features_df = pd.merge(auc_back_df.iloc[:, :6], auc_trans_df.iloc[:, :6], on='sampleid')
    df = pd.merge(features_df, psd_df, on='sampleid')
    df.columns = df.columns.astype(str)

    # Set up the X and Y data.
    # The feature columns are the left-hand columns of the merge, so slice by the width of
    # features_df rather than assuming the PSD file contributes exactly three trailing columns;
    # an extra column in a user-supplied PSD file would otherwise leak a target into X.
    y = df['clay']
    X = df.iloc[:, :features_df.shape[1]]

    # Run the resampling for `iterations` number of train-test-validate splits.
    # NOTE: run_name is not used by this cell; the name of the output pickle file is
    # controlled by output_prefix. The assignment is kept in case later cells read it.
    run_name = ''
    turbiscan_resampling(output_prefix, df, X, y, iters=iterations)

else:
    # ---- Prediction path: compute AUC features and apply the pickled models ----

    # Grab and process the turbiscan output files, note the turbiscan output files must be *.csv files
    back_df, trans_df = get_raw_files(training)

    # Calculate the area under the curve (AUC) feature for each scan
    auc_back_df = calc_auc(back_df)
    auc_trans_df = calc_auc(trans_df)

    # Set up the backscattering and transmission data inputs to use only the first 6 scans
    # (i.e., the first 10 minutes if using a 2 minute scan)
    merged_df = pd.merge(auc_back_df.iloc[:, :6], auc_trans_df.iloc[:, :6], right_index=True, left_index=True)

    # Open the pickled models
    clay_model = open_pickle('turbiscan_model/clay_model.pkl')
    silt_model = open_pickle('turbiscan_model/silt_model.pkl')

    # Calculate the clay and silt components of the PSD using the backscattering and transmission inputs;
    # one row per sample (same index as merged_df), one column per predicted fraction.
    psd_results = pd.DataFrame([clay_model.predict(merged_df), silt_model.predict(merged_df)]).T
    psd_results.index = merged_df.index
    psd_results.columns = ['clay', 'silt']

    # Calculate the sand component using the formula 100% - (clay + silt).
    # The row sum is taken while only 'clay' and 'silt' exist, so it is exactly clay + silt.
    psd_results['sand'] = 100 - psd_results.sum(axis=1)

# Surface the outcome of whichever mode was selected by `training`
if not training:
    # Prediction mode: print the predicted PSD results
    print(psd_results)
else:
    # Training mode: reload the pickle file holding all the run information
    # (its name is derived from output_prefix) and split it into its six parts.
    results_path = output_prefix + "_OSC_results.pkl"
    results = open_pickle(results_path)
    (
        error_distributions,
        final_training_test_set,
        final_validate_set,
        best_clay_model,
        best_silt_model,
        best_sand_model,
    ) = results
    