In [1]:
from utils.osc_model import calc_auc, open_pickle, turbiscan_resampling, read_turbiscan, open_pickle, clear_cache, get_trans_back
import pandas as pd
import os

In [2]:
# AUC data using backscattering data and time 0 - 1 hr using 500 estimators for all ET models, clay and silt mse to guide
# the MC, and 300 train-test splits. 

# If retraining the model, ensure the column names of your PSD input file match those in the example
# ---- Run settings ----
data_path = 'run files'      # handed to read_turbiscan below
training = False             # True -> training branch, False -> prediction branch
output_prefix = 'trial'      # prepended to the names of the files this run writes
iterations = 300             # forwarded to turbiscan_resampling as `iters` when training

# ---- Cache refresh ----
# Empty the cache directory, then write the data found at `data_path` into it.
# *** Comment out the next two lines if you have already written your data to the cache ***
clear_cache()
read_turbiscan(data_path)

# ---- AUC calculation ----
# Fetch the backscattering and transmission tables from the cache directory.
back_df, trans_df = get_trans_back()

# Backscattering: missing values become 0 (in place) before the AUC is calculated.
back_df.fillna(value=0, inplace=True)
auc_back_df = calc_auc(back_df)

# Transmission: same treatment as the backscattering table.
trans_df.fillna(value=0, inplace=True)
auc_trans_df = calc_auc(trans_df)

if training:

    # ---- Training mode: refit the models from AUC features plus reference PSD values ----

    # Takes as input a file with the particle size distribution values (clay, silt, sand)
    # for each training sample; the 'sampleid' column becomes the index used for merging.
    psd_df = pd.read_excel('training_pipette_psd.xlsx').set_index('sampleid') # Replace this file with your own PSD data

    # Set up the backscattering and transmission data inputs to use only the first 6 scans
    # (i.e., the first 10 minutes if using a 2 minute scan) and add the PSD data.
    # Every merge aligns on the index; pd.merge defaults to an inner join, so only sample
    # ids present in all three tables are kept.
    df = pd.merge(pd.merge(auc_back_df.iloc[:,0:6], auc_trans_df.iloc[:,0:6], right_index = True, left_index = True), psd_df, right_index = True, left_index = True)
    df.columns = df.columns.astype(str)

    # Set up the X and Y data: clay is the target, every column except the last three is a feature.
    # NOTE(review): the positional slice assumes the PSD file contributes exactly three
    # trailing columns -- confirm this holds for a custom PSD file.
    y = df['clay']
    X = df.iloc[:,:-3]

    # Run the resampling for iterations number of train-test-validate splits
    turbiscan_resampling(output_prefix, df, X, y, iters = iterations)

    # Unpack the pickle file with all the run information
    results = open_pickle(output_prefix+"_OSC_results.pkl")
    error_distributions, final_training_test_set, final_validate_set, best_clay_model, best_silt_model, best_sand_model = results

else:

    # ---- Prediction mode: apply previously pickled models to the new AUC features ----

    # Set up the backscattering and transmission data inputs to use only the first 6 scans
    # (i.e., the first 10 minutes if using a 2 minute scan)
    merged_df = pd.merge(auc_back_df.iloc[:,0:6], auc_trans_df.iloc[:,0:6], right_index = True, left_index = True)
    # Same label normalisation as the training branch, so the feature names seen at
    # prediction time have the same type as those used when fitting.
    merged_df.columns = merged_df.columns.astype(str)

    # Get the pickled models and read in the clay and silt models.
    # Both names start as None so a missing file is reported clearly instead of
    # surfacing later as a NameError.
    # NOTE(review): open_pickle presumably unpickles these files -- keep only trusted
    # model files in this directory.
    directory = 'turbiscan_model'
    clay_model = None
    silt_model = None
    # os.listdir returns entries in arbitrary order; sorting makes the choice
    # deterministic if more than one file name matches (the last match wins).
    for fid in sorted(os.listdir(directory)):
        full_fid = os.path.join(os.getcwd(),directory, fid)
        if 'clay' in fid:
            clay_model = open_pickle(full_fid)
        elif 'silt' in fid:
            silt_model = open_pickle(full_fid)

    missing_models = [name for name, model in (('clay', clay_model), ('silt', silt_model)) if model is None]
    if missing_models:
        raise FileNotFoundError(
            "No pickled model whose file name contains "
            + " / ".join(repr(name) for name in missing_models)
            + " was found in " + repr(os.path.join(os.getcwd(), directory))
        )

    # Predict silt and clay using the pickled models and use these 2 to calculate sand
    results=pd.DataFrame([clay_model.predict(merged_df), silt_model.predict(merged_df)]).T
    results.index = merged_df.index
    results.columns = ['clay', 'silt']
    # Only clay and silt exist at this point, so the row sum is clay + silt.
    results['sand'] = 100-results.sum(axis=1)

    # Write the results to a csv file
    results.to_csv(output_prefix+'_predicted_psd.csv')

    print(results)
    

                    clay       silt       sand
sampleid                                      
AnzaBorrego4A   9.024796   6.849228  84.125976
AnzaBorrego4B  12.778165   9.165200  78.056635
AnzaBorrego4C  12.590780   9.690164  77.719057
Backbone1A      6.099177  11.822811  82.078012
Backbone1B      6.099177  11.822811  82.078012
...                  ...        ...        ...
Yuba7B          7.659624   7.380293  84.960083
Yuba7C          6.099177  11.822811  82.078012
Zippel7A        6.099177  11.822811  82.078012
Zippel7B        6.099177  11.822811  82.078012
Zippel7C        6.099177  11.822811  82.078012

[123 rows x 3 columns]
