In [2]:
import pandas as pd
import numpy as np
import csv
from scipy.stats import linregress
from statsmodels.stats.weightstats import ztest as ztest

# local code
from source.interactive_plots import interactive_linear_regression_calibration_plot
from source.get_elements      import get_elements

Load data from XRF analyses

In [3]:
xrf_data = pd.read_csv("../data/interim/xrf_data_clean.csv") # load xrf data

# Calibration

Load data for standard reference materials and clean

In [4]:
srm_data = pd.read_csv("../data/interim/standard_reference_material_certified_values.csv") # load SRM data
srm_data.drop(["1SD", "95% Confidence Low", "95% Confidence High"], axis=1, inplace=True) # drop unnecessary columns

# clean SRM data
for row in srm_data.iterrows(): 
    row[1]["Sample ID"] = row[1]["Sample ID"].lower()
    row[1]["Analyte"] = row[1]["Analyte"].split(",",1)[0]

    # clean certified value
    cert_val = row[1]["Certified Value"]
    if cert_val.startswith("<"): 
        row[1]["Certified Value"] = float(cert_val.lstrip("< ")) / 2 # replace BDL with half value
    else:
        row[1]["Certified Value"] = float(row[1]["Certified Value"])

    # clean units
    row[1]["Units"] = row[1]["Units"].lstrip("(").rstrip(")")
    if row[1]["Units"] == "wt.%":  # convert units
       row[1]["Certified Value"] = row[1]["Certified Value"] * 1e4
       row[1]["Units"] = "ppm"

## Remove standard data for a given element that is unsuitable for calibration

Compare the measured concentration of each element for a given standard to the distribution of measured concentrations for that element across all non-standards. If the measured concentration 

    1.) falls outside the range of the distribution 
    2.) is more than three standard deviations from the mean 

that value is set to `NaN` (all values set to `NaN` are filtered out during calibration). 

Note: I should consider conducting a test to determine if each distribution is normally or log-normally distributed; if the latter, I should log-transform the data and then calculate the z-score. 

In [5]:
def z_score(data, value): 
    mean = np.mean(data)
    std = np.std(data)

    return (value - mean) / std

In [6]:
non_standards_data = xrf_data.loc[xrf_data["qaqc_type"]!="standard"]

outlier_stddev_cutoff = 5

for standard in xrf_data.loc[xrf_data["qaqc_type"]=="standard", "sample_id"].unique(): 
    for date in xrf_data.loc[xrf_data["sample_id"]==standard, "date"].unique():
        for element in get_elements(xrf_data.columns.to_list()): 
            standard_date_element = xrf_data.loc[(xrf_data["sample_id"]==standard) & (xrf_data["date"]==date)][element].values

            if (non_standards_data[element] > standard_date_element[0]).all() | \
               (non_standards_data[element] < standard_date_element[0]).all(): 
                
                # print(non_standards_data[element].to_numpy())
                z = z_score(non_standards_data[element], standard_date_element[0])
                if abs(z) > outlier_stddev_cutoff: 
                    print(standard + ", " + date + ", " + element + ", " + str(standard_date_element[0]) + ", " + str(z))

                    xrf_data.loc[(xrf_data["sample_id"]==standard) & (xrf_data["date"]==date), element] = np.nan

oreas24b, 2021-10-06, K, 31090.0, 6.471422612919988
oreas24b, 2021-10-14, K, 31369.0, 6.563395682670596
oreas24b, 2021-10-21, K, 31367.0, 6.562736377511094
oreas24b, 2021-10-29, K, 31590.0, 6.6362489027956295
oreas24b, 2021-10-31, K, 31006.0, 6.44373179622088
oreas24c, 2021-10-21, Cr, 248.0, 7.647141304308473
oreas901, 2021-10-06, As, 70.0, 5.172844445093555
oreas901, 2021-10-06, Cu, 1275.0, 36.20572840108871
oreas901, 2021-10-06, K, 41054.0, 9.756080917561775
oreas901, 2021-10-22, Cu, 1270.0, 36.06111662068375
oreas901, 2021-10-22, K, 40904.0, 9.706633030599082
oreas501b, 2021-10-06, Cr, 275.0, 8.764381497151398
oreas501b, 2021-10-06, Cu, 2267.0, 64.89670563343348
oreas501b, 2021-10-06, K, 36229.0, 8.165507220261834
oreas501b, 2021-10-06, Mo, 104.5, 12.065017729332341
oreas501b, 2021-10-06, Rb, 183.4, 5.275784103193051
oreas501b, 2021-10-22, Cr, 280.0, 8.971277829159346
oreas501b, 2021-10-22, Cu, 2301.0, 65.88006574018723
oreas501b, 2021-10-22, K, 37039.0, 8.432525809860373
oreas501b,

Get the elements analyzed by the XRF and for which concentrations are reported for one or more standard reference materials

In [7]:
elements = get_elements(
                        list(
                            set(srm_data["Analyte"].unique()) & \
                            set(xrf_data.columns.to_list())
                            )
                        )

Construct a linear regression model for each element in order to predict the true concentration from the measured concentration

In [8]:
# initialize dictionary to hold lin. reg. models for each element
reg = {}
for element in elements: 

    # get IDs of standard reference materials
    srm = srm_data.loc[srm_data["Analyte"]==element]["Sample ID"].unique() 

    ## TRAIN 
    # limit training data to standards for which we have standard reference material info for the element at hand
    data_train = xrf_data.loc[(xrf_data["qaqc_type"]=="standard") & (xrf_data["sample_id"].isin(srm))]
    data_train = data_train.dropna(subset=[element]) # change to true condition statement

    x_train = [srm_data.loc[
                            (srm_data["Sample ID"]==sample) & \
                            (srm_data["Analyte"]==element)
                            ]["Certified Value"].values[0] for sample in data_train["sample_id"]]
    x_train = np.array(x_train)
    y_train = data_train[element].to_numpy()

    model = linregress(x_train, y_train) # fit linear regression model

    if model.slope != 0: #only use calibration curve if meaningful (i.e., if variance in dep. var. explained by variance in indep. var.)
        # invert model so that measured concentration is independent var. and true concentration is dependent var. (i.e., y = m*x + b --> x = (1/m)*y - (b/m))
        intercept_inv = -model.intercept / model.slope
        slope_inv = model.slope ** -1

        ## PREDICT (i.e., calibrate)
        data_predict = xrf_data.loc[xrf_data["qaqc_type"]!="standard"]
        data_predict = data_predict.dropna(subset=[element])

        x_predict = data_predict[element]
        x_predict = x_predict.to_numpy()
        y_predict = slope_inv * x_predict + intercept_inv

        ## Save model results
        reg[element] = {} # initialize empty dict to save model results

        reg[element]["model"]                 = model
        reg[element]["x_train"]               = x_train
        reg[element]["y_train"]               = y_train
        reg[element]["score"]                 = model.rvalue
        reg[element]["y-intercept std error"] = model.intercept_stderr
        reg[element]["slope_inv"]             = slope_inv
        reg[element]["intercept_inv"]         = intercept_inv
        reg[element]["x_predict"]             = x_predict
        reg[element]["y_predict"]             = y_predict
        
        xrf_data.loc[xrf_data["qaqc_type"]!="standard", element] = y_predict

  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


Display the results of the calibration

In [9]:
dropdown_buttons = {
    "data": 
        {
            "name": "Elements", 
            "columns": list(reg.keys())
        }
    }

interactive_linear_regression_calibration_plot(dropdown_buttons, reg, x_axis_label="True concentration (ppm)", y_axis_label="Measured concentration (ppm)", title="")

VBox(children=(HBox(children=(Dropdown(description='Elements', options=('Ag', 'As', 'Ba', 'Bi', 'Ca', 'Cd', 'C…

Calculate the detection limit for each element. 

In [20]:
for element in elements: 
    detection_limit = reg[element]["slope_inv"] * (reg[element]["model"].intercept + 3*reg[element]["y-intercept std error"]) + reg[element]["intercept_inv"]
    reg[element]["detection_limit"] = detection_limit
    print(detection_limit, element)

9.758418805395854 Ag
2.6244869705522516 As
13.02893554594495 Ba
4.26929417764139 Bi
1432.6020872593424 Ca
nan Cd
-487.1762681329404 Ce
30.441780568272534 Co
61.11209177967383 Cr
5.988465004863517 Cu
3728.128329156112 Fe
2772.1587834927814 K
20.559599300125416 La
95.17841694569336 Mn
2.4797694423678664 Mo
6.528570679765783 Nb
6.358067233908441 Nd
-87.9928841881601 Ni
644.9313652728606 P
7.28235335780706 Pb
-5.108256281669303 Pr
2.401769717040935 Rb
1888.1967632959377 S
1.7869220549759781 Sb
nan Se
-38.65719627635104 Sm
5.342780803987754 Sn
5.793869495134919 Sr
2040.0531359708095 Ti
4.808030478491732 U
28.595683828969484 V
10.34278054504528 Y
34.1266135646 Zn
1868.0650950673244 Zr


# Evaluating Precision

In [36]:
duplicates = xrf_data[xrf_data["qaqc_type"]=="lab duplicate"]

In [26]:
elements_dup = get_elements(xrf_data.columns.to_list())

In [62]:
dup_dict = {}
for element in elements_dup: 
    dup_dict[element] = {"mean": [], "diff": []}
    for row in duplicates.iterrows(): 
        dup_id = row[1]["sample_id"]
        parent_id = dup_id.rstrip('L')
        dup_value = xrf_data.loc[xrf_data["sample_id"]==dup_id, element].values
        if parent_id != "GR1-010": # account for double analyses of this sample
            parent_value = xrf_data.loc[xrf_data["sample_id"]==parent_id, element].values
        else: 
            parent_value = xrf_data.loc[(xrf_data["sample_id"]==parent_id) & (xrf_data["date"]=="2021-10-31"), element].values

        diff = dup_value - parent_value
        dup_dict[element]["diff"].append(diff[0])

        mean = np.mean([dup_value, parent_value])
        dup_dict[element]["mean"].append(mean[0])

    
    dup_dict[element] = [val**2 for val in dup_dict[element]]
    # print(diff)
    # x = xrf_data.loc[xrf_data["sample_id"]==dup_id, elements_dup]
    # y = xrf_data.loc[xrf_data["sample_id"]==parent_id, elements_dup]

    # print(type(x["Ag"]))
    # print(x.subtract(y))


In [None]:
# alpha= 0.05
# element = "Ba"
# normality_test_data = xrf_data.loc[xrf_data["qaqc_type"]!="standard", element]
# k, p = stats.normaltest(normality_test_data)
# if p < alpha:
#     print("reject null hypothesis that sample is normall distributed")

In [None]:
# export data to csv file
xrf_data.to_csv('../data/interim/xrf_data_calib.csv')

In [21]:
# drop uncertainty columns 
xrf_data.drop([column for column in xrf_data.columns if column.endswith("+/-")], axis=1, inplace=True)

for element in reg.keys(): 
    limit = reg[element]["detection_limit"]
    xrf_data[element].where(xrf_data[element] >= limit, other=f"<{limit}", inplace=True)
    # xrf_data.to_excel('../data/interim/xrf_data_calib.xlsx', na_rep="NaN")

In [22]:
xrf_data.head(n=20)

Unnamed: 0.1,Unnamed: 0,sample_id,date,group,sample_type,qaqc_type,Au,As,Sb,Ag,...,Pd,Pt,Rh,Y,La,Ce,Pr,Nd,Sm,comments
0,0,oreas24b,2021-10-06,group 2,standard,standard,1.8,11.1,<1.7869220549759781,32.0,...,0.5,13.0,0.1,44.5,31.0,37.0,<-5.108256281669303,34.0,22.0,
1,1,oreas22d,2021-10-06,group 2,standard,standard,1.0,<2.6244869705522516,<1.7869220549759781,<9.758418805395854,...,0.9,-1.0,0.3,<10.34278054504528,<20.559599300125416,-14.0,25.0,<6.358067233908441,24.0,
2,2,oreas24c,2021-10-06,group 2,standard,standard,2.8,<2.6244869705522516,<1.7869220549759781,72.0,...,-0.6,16.9,-0.5,30.7,<20.559599300125416,13.0,<-5.108256281669303,37.0,21.0,
3,3,oreas901,2021-10-06,group 2,standard,standard,4.8,<2.6244869705522516,6.0,<9.758418805395854,...,0.9,11.2,0.0,56.2,<20.559599300125416,61.0,<-5.108256281669303,26.0,7.0,
4,4,oreas501b,2021-10-06,group 2,standard,standard,2.5,14.7,<1.7869220549759781,40.0,...,0.3,13.7,0.3,35.4,34.0,52.0,<-5.108256281669303,104.0,-26.0,
5,5,oreas45e,2021-10-06,group 2,standard,standard,9.8,11.0,<1.7869220549759781,85.0,...,4.0,13.0,1.4,23.4,<20.559599300125416,5.0,<-5.108256281669303,<6.358067233908441,-20.0,
6,6,oreas25a,2021-10-06,group 2,standard,standard,2.5,9.3,<1.7869220549759781,<9.758418805395854,...,0.9,17.0,0.5,36.4,<20.559599300125416,11.0,<-5.108256281669303,38.0,-2.0,
7,7,oreas902,2021-10-06,group 2,standard,standard,,<2.6244869705522516,<1.7869220549759781,13.0,...,0.0,1.3,0.4,28.1,24.0,91.0,12.0,42.0,3.0,
8,8,sio2blank,2021-10-06,group 2,standard,standard,-0.4,<2.6244869705522516,2.3,<9.758418805395854,...,1.1,-1.5,0.1,<10.34278054504528,<20.559599300125416,5.0,16.0,<6.358067233908441,-5.0,
9,9,oreas503b,2021-10-06,group 2,standard,standard,3.5,15.0,<1.7869220549759781,41.0,...,0.4,6.1,0.4,35.5,<20.559599300125416,30.0,<-5.108256281669303,95.0,3.0,
