In [39]:
import pandas as pd
import numpy as np
import csv
import json
from scipy.stats import linregress
from statsmodels.stats.weightstats import ztest as ztest

# local code
from source.interactive_plots import interactive_linear_regression_calibration_plot
from source.get_elements      import get_elements

Load data from XRF analyses

In [2]:
xrf_data = pd.read_csv("../data/interim/xrf_data_clean.csv") # load xrf data
# drop uncertainty columns 
xrf_data.drop([column for column in xrf_data.columns if column.endswith("+/-")], axis=1, inplace=True)

# Calibration

Load data for standard reference materials and clean

In [3]:
srm_data = pd.read_csv("../data/interim/standard_reference_material_certified_values.csv") # load SRM data
srm_data.drop(["1SD", "95% Confidence Low", "95% Confidence High"], axis=1, inplace=True) # drop unnecessary columns

# clean SRM data
for row in srm_data.iterrows(): 
    row[1]["Sample ID"] = row[1]["Sample ID"].lower()
    row[1]["Analyte"] = row[1]["Analyte"].split(",",1)[0]

    # clean certified value
    cert_val = row[1]["Certified Value"]
    if cert_val.startswith("<"): 
        row[1]["Certified Value"] = float(cert_val.lstrip("< ")) / 2 # replace BDL with half value
    else:
        row[1]["Certified Value"] = float(row[1]["Certified Value"])

    # clean units
    row[1]["Units"] = row[1]["Units"].lstrip("(").rstrip(")")
    if row[1]["Units"] == "wt.%":  # convert units
       row[1]["Certified Value"] = row[1]["Certified Value"] * 1e4
       row[1]["Units"] = "ppm"

## Remove standard data for a given element that is unsuitable for calibration

Compare the measured concentration of each element for a given standard to the distribution of measured concentrations for that element across all non-standards. If the measured concentration 

    1.) falls outside the range of the distribution 
    2.) is more than three standard deviations from the mean 

that value is set to `NaN` (all values set to `NaN` are filtered out during calibration). 

Note: I should consider conducting a test to determine if each distribution is normally or log-normally distributed; if the latter, I should log-transform the data and then calculate the z-score. 

In [4]:
def z_score(data, value): 
    mean = np.mean(data)
    std = np.std(data)

    return (value - mean) / std

In [5]:
non_standards_data = xrf_data.loc[xrf_data["qaqc_type"]!="standard"]

outlier_stddev_cutoff = 5

for standard in xrf_data.loc[xrf_data["qaqc_type"]=="standard", "sample_id"].unique(): 
    for date in xrf_data.loc[xrf_data["sample_id"]==standard, "date"].unique():
        for element in get_elements(xrf_data.columns.to_list()): 
            standard_date_element = xrf_data.loc[(xrf_data["sample_id"]==standard) & (xrf_data["date"]==date)][element].values

            if (non_standards_data[element] > standard_date_element[0]).all() | \
               (non_standards_data[element] < standard_date_element[0]).all(): 
                
                # print(non_standards_data[element].to_numpy())
                z = z_score(non_standards_data[element], standard_date_element[0])
                if abs(z) > outlier_stddev_cutoff: 
                    print(standard + ", " + date + ", " + element + ", " + str(standard_date_element[0]) + ", " + str(z))

                    xrf_data.loc[(xrf_data["sample_id"]==standard) & (xrf_data["date"]==date), element] = np.nan

oreas24b, 2021-10-06, K, 31090.0, 6.471422612919988
oreas24b, 2021-10-14, K, 31369.0, 6.563395682670596
oreas24b, 2021-10-21, K, 31367.0, 6.562736377511094
oreas24b, 2021-10-29, K, 31590.0, 6.6362489027956295
oreas24b, 2021-10-31, K, 31006.0, 6.44373179622088
oreas24c, 2021-10-21, Cr, 248.0, 7.647141304308473
oreas901, 2021-10-06, As, 70.0, 5.172844445093555
oreas901, 2021-10-06, Cu, 1275.0, 36.20572840108871
oreas901, 2021-10-06, K, 41054.0, 9.756080917561775
oreas901, 2021-10-22, Cu, 1270.0, 36.06111662068375
oreas901, 2021-10-22, K, 40904.0, 9.706633030599082
oreas501b, 2021-10-06, Cr, 275.0, 8.764381497151398
oreas501b, 2021-10-06, Cu, 2267.0, 64.89670563343348
oreas501b, 2021-10-06, K, 36229.0, 8.165507220261834
oreas501b, 2021-10-06, Mo, 104.5, 12.065017729332341
oreas501b, 2021-10-06, Rb, 183.4, 5.275784103193051
oreas501b, 2021-10-22, Cr, 280.0, 8.971277829159346
oreas501b, 2021-10-22, Cu, 2301.0, 65.88006574018723
oreas501b, 2021-10-22, K, 37039.0, 8.432525809860373
oreas501b,

Get the elements analyzed by the XRF and for which concentrations are reported for one or more standard reference materials

In [6]:
elements = get_elements(
                        list(
                            set(srm_data["Analyte"].unique()) & \
                            set(xrf_data.columns.to_list())
                            )
                        )

Construct a linear regression model for each element in order to predict the true concentration from the measured concentration

In [7]:
# initialize dictionary to hold lin. reg. models for each element
reg = {}
for element in elements: 

    # get IDs of standard reference materials
    srm = srm_data.loc[srm_data["Analyte"]==element]["Sample ID"].unique() 

    ## TRAIN 
    # limit training data to standards for which we have standard reference material info for the element at hand
    data_train = xrf_data.loc[(xrf_data["qaqc_type"]=="standard") & (xrf_data["sample_id"].isin(srm))]
    data_train = data_train.dropna(subset=[element]) # change to true condition statement

    x_train = [srm_data.loc[
                            (srm_data["Sample ID"]==sample) & \
                            (srm_data["Analyte"]==element)
                            ]["Certified Value"].values[0] for sample in data_train["sample_id"]]
    x_train = np.array(x_train)
    y_train = data_train[element].to_numpy()

    model = linregress(x_train, y_train) # fit linear regression model

    if model.slope != 0: #only use calibration curve if meaningful (i.e., if variance in dep. var. explained by variance in indep. var.)
        # invert model so that measured concentration is independent var. and true concentration is dependent var. (i.e., y = m*x + b --> x = (1/m)*y - (b/m))
        intercept_inv = -model.intercept / model.slope
        slope_inv = model.slope ** -1

        ## PREDICT (i.e., calibrate)
        data_predict = xrf_data.loc[xrf_data["qaqc_type"]!="standard"]
        data_predict = data_predict.dropna(subset=[element])

        x_predict = data_predict[element]
        x_predict = x_predict.to_numpy()
        y_predict = slope_inv * x_predict + intercept_inv

        ## Save model results
        reg[element] = {} # initialize empty dict to save model results

        reg[element]["model"]                 = model
        reg[element]["x_train"]               = x_train
        reg[element]["y_train"]               = y_train
        reg[element]["score"]                 = model.rvalue
        reg[element]["y-intercept std error"] = model.intercept_stderr
        reg[element]["slope_inv"]             = slope_inv
        reg[element]["intercept_inv"]         = intercept_inv
        reg[element]["x_predict"]             = x_predict
        reg[element]["y_predict"]             = y_predict
        
        xrf_data.loc[xrf_data["qaqc_type"]!="standard", element] = y_predict

  slope = ssxym / ssxm
  slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)


Display the results of the calibration

In [8]:
dropdown_buttons = {
    "data": 
        {
            "name": "Elements", 
            "columns": list(reg.keys())
        }
    }

interactive_linear_regression_calibration_plot(dropdown_buttons, reg, x_axis_label="True concentration (ppm)", y_axis_label="Measured concentration (ppm)", title="")

VBox(children=(HBox(children=(Dropdown(description='Elements', options=('Ag', 'As', 'Ba', 'Bi', 'Ca', 'Cd', 'C…

Calculate the detection limit for each element. 

In [9]:
for element in elements: 
    detection_limit = reg[element]["slope_inv"] * (reg[element]["model"].intercept + 3*reg[element]["y-intercept std error"]) + reg[element]["intercept_inv"]
    reg[element]["detection_limit"] = detection_limit

Apply detection limit to dataset.

In [10]:
for element in reg.keys(): 
    limit = reg[element]["detection_limit"]
    xrf_data[element].where(xrf_data[element] >= limit, other=f"<{limit}", inplace=True)

# Evaluating Precision

In [11]:
lab_duplicates = xrf_data[xrf_data["qaqc_type"]=="lab duplicate"]

lab_parent_indexes = []

for row in lab_duplicates.iterrows(): 
    lab_dup_id = row[1]["sample_id"]
    lab_parent_id = lab_dup_id.rstrip('L')

    condition = xrf_data["sample_id"] == lab_parent_id
    index = xrf_data.index[condition][0]
    lab_parent_indexes.append(index)

lab_parents = xrf_data.iloc[lab_parent_indexes]

lab_pairs = pd.concat([lab_duplicates, lab_parents])

In [12]:
field_duplicates = xrf_data[xrf_data["qaqc_type"]=="field duplicate"]
field_parent_indexes = []

for row in field_duplicates.iterrows(): 
        field_dup_id = row[1]["sample_id"]
        field_parent_id = field_dup_id.rstrip('F')

        condition = xrf_data["sample_id"] == field_parent_id
        index = xrf_data.index[condition][0]
        field_parent_indexes.append(index)

field_parents = xrf_data.iloc[field_parent_indexes]

field_pairs = pd.concat([field_duplicates, field_parents])
field_parents.shape

(12, 47)

In [28]:
lab_pair_diffs = lab_parents.copy()

elements_dup = get_elements(xrf_data.columns.to_list())

lab_pair_diffs.loc[:, elements_dup] = lab_pair_diffs.loc[:, elements_dup].apply(pd.to_numeric, errors="coerce")
lab_duplicates = lab_duplicates.copy()
lab_duplicates.loc[:, elements_dup] = lab_duplicates.loc[:, elements_dup].apply(pd.to_numeric, errors="coerce")

for row in lab_pair_diffs.iterrows():
    parent_id = row[1]["sample_id"]

    parent_entry = row[1].to_frame().transpose()
    parent_entry.reset_index(inplace=True)
    parent_values = parent_entry.loc[:, elements_dup]

    duplicate_entry = lab_duplicates.loc[lab_duplicates["sample_id"]==parent_id + "L"]
    duplicate_entry.reset_index(inplace=True)
    duplicate_values = duplicate_entry.loc[:, elements_dup]
    
    condition = lab_pair_diffs["sample_id"] == parent_id
    lab_pair_diffs.loc[condition, elements_dup] = parent_values.subtract(duplicate_values).values
lab_pair_diffs = lab_pair_diffs[["sample_id"] + elements_dup]
 

In [None]:
lab_pair_diffs = lab_parents.copy()

elements_dup = get_elements(xrf_data.columns.to_list())

lab_pair_diffs.loc[:, elements_dup] = lab_pair_diffs.loc[:, elements_dup].apply(pd.to_numeric, errors="coerce")
lab_duplicates = lab_duplicates.copy()
lab_duplicates.loc[:, elements_dup] = lab_duplicates.loc[:, elements_dup].apply(pd.to_numeric, errors="coerce")

for row in lab_pair_diffs.iterrows():
    parent_id = row[1]["sample_id"]

    parent_entry = row[1].to_frame().transpose()
    parent_entry.reset_index(inplace=True)
    parent_values = parent_entry.loc[:, elements_dup]

    duplicate_entry = lab_duplicates.loc[lab_duplicates["sample_id"]==parent_id + "L"]
    duplicate_entry.reset_index(inplace=True)
    duplicate_values = duplicate_entry.loc[:, elements_dup]
    
    condition = lab_pair_diffs["sample_id"] == parent_id
    print(lab_pair_diffs.loc[condition, elements_dup])
    print(parent_values.subtract(duplicate_values))
    # print(type(parent_values.subtract(duplicate_values)))
    # print(type(lab_pair_diffs.loc[lab_pair_diffs["sample_id"] == parent_id, elements_dup]))

# duplicate_values.head()
# parent_values.head()
# dff = pd.concat([duplicate_values, parent_values])
# dff.iloc[1].subtract(dff.iloc[0]).transpose()
# duplicate_entry.loc[:, elements_dup].head()
# lab_pair_diffs.head(n=20)

In [30]:
lab_pair_diffs.head(n=100)

Unnamed: 0,sample_id,Ag,As,Au,Ba,Bi,Ca,Cd,Ce,Cl,...,Se,Sm,Sn,Sr,Ti,U,V,Y,Zn,Zr
19,TE2-018,,13.06715,-1.6,-8.571785,,-420.453794,,-0.547945,62.0,...,,-0.621933,,6.42681,-6.098084,2.202949,0.0,-0.44274,-0.838531,
30,GR1-014,,-7.127536,0.5,3.21442,,-147.232078,,4.109589,-85.0,...,,-1.243866,,-3.213405,-52.595977,0.367158,-4.766385,-1.10685,-2.515594,66.114723
39,TE2-008,,-2.019469,-0.3,-27.858302,,-46.147368,,-18.082192,53.0,...,,-11.816727,,-4.28454,279.749616,,9.532771,,4.695775,1168.026767
75,TE2-021,,-8.315459,1.1,2.142946,,-208.761901,,8.219178,82.0,...,,-9.328995,,-12.853619,42.68659,0.367158,-6.355181,-0.110685,-13.248795,330.573613
73,TE2-036,,-10.691304,-0.7,-10.714732,,1668.630213,,12.328767,-20.0,...,,16.170258,,7.497944,368.934098,-3.304423,0.0,2.103016,-10.062376,1079.873804
63,TE2-017,,-2.375845,0.8,11.786205,,350.866493,,-4.657534,32.0,...,,-3.731598,,7.497944,,-1.591018,,1.881645,5.869719,
58,GR1-030F,,11.879227,1.4,-7.500312,,327.426561,,-1.917808,38.0,...,,-2.487732,,12.853619,148.640804,-0.734316,3.17759,0.110685,-3.354125,396.688336
68,GR1-W04,,2.019469,0.5,-17.143571,,519.34101,,-4.383562,245.0,...,,4.353531,,2.14227,574.744442,0.244772,17.476747,4.648771,9.89467,3327.774375
103,GR1-050F,,2.375845,0.7,18.215044,,-6.592481,,-10.684932,77.0,...,,2.487732,,2.14227,156.263409,,4.766385,,-1.425503,374.650095
98,GR1-030,,16.630918,-0.7,-40.71598,,906.099901,,-3.835616,16.0,...,,1.243866,,143.53208,-471.83927,5.384985,-3.17759,7.194527,46.957754,2380.130016


In [36]:
def std_dev_from_pairs(diffs): 
    diffs = np.asarray(diffs)
    n = diffs.size
    
    std_dev = np.sqrt(np.sum(np.square(diffs)) / (2 * n))

    return std_dev, n

In [57]:
analytical_precision = {}
for element in elements_dup: 
    analytical_precision[element] = dict.fromkeys(["standard deviation", "number of pairs"])

    diffs = lab_pair_diffs[element].dropna()
    std_dev, n = std_dev_from_pairs(diffs)

    if np.isnan(std_dev): 
        std_dev = None

    analytical_precision[element]["standard deviation"] = std_dev
    analytical_precision[element]["number of pairs"] = n


invalid value encountered in double_scalars



In [59]:
with open("../data/interim/analytical_precision.json", "w") as outfile: 
    json.dump(analytical_precision, outfile, indent=4)

In [62]:
dup_dict = {}
for element in elements_dup: 
    dup_dict[element] = {"mean": [], "diff": []}
    for row in duplicates.iterrows(): 
        dup_id = row[1]["sample_id"]
        parent_id = dup_id.rstrip('L')
        dup_value = xrf_data.loc[xrf_data["sample_id"]==dup_id, element].values
        if parent_id == "GR1-010": # account for double analyses of this sample (arbitrarily choose a date)
            parent_value = xrf_data.loc[(xrf_data["sample_id"]==parent_id) & (xrf_data["date"]=="2021-10-31"), element].values
        else: 
            parent_value = xrf_data.loc[xrf_data["sample_id"]==parent_id, element].values


        diff = dup_value - parent_value
        dup_dict[element]["diff"].append(diff[0])

        mean = np.mean([dup_value, parent_value])
        dup_dict[element]["mean"].append(mean[0])

    
    dup_dict[element] = [val**2 for val in dup_dict[element]]
    # print(diff)
    # x = xrf_data.loc[xrf_data["sample_id"]==dup_id, elements_dup]
    # y = xrf_data.loc[xrf_data["sample_id"]==parent_id, elements_dup]

    # print(type(x["Ag"]))
    # print(x.subtract(y))


In [None]:
# alpha= 0.05
# element = "Ba"
# normality_test_data = xrf_data.loc[xrf_data["qaqc_type"]!="standard", element]
# k, p = stats.normaltest(normality_test_data)
# if p < alpha:
#     print("reject null hypothesis that sample is normall distributed")

In [None]:
# export data to csv file
xrf_data.to_csv('../data/interim/xrf_data_calib.csv')

In [21]:

    # xrf_data.to_excel('../data/interim/xrf_data_calib.xlsx', na_rep="NaN")

In [24]:
xrf_data["Ba"].head(n=200)

0                   679.0
1      <13.02893554594495
2                   256.0
3                   197.0
4                   951.0
              ...        
195            501.689504
196             615.26566
197                 653.0
198                 190.0
199    <13.02893554594495
Name: Ba, Length: 200, dtype: object