In [75]:
import pandas as pd
import numpy as np
import csv
from sklearn.linear_model import LinearRegression
from statsmodels.stats.weightstats import ztest as ztest

# local code
from source.interactive_plots import interactive_linear_regression_calibration_plot
from source.get_elements      import get_elements

Load data from XRF analyses

In [76]:
xrf_data = pd.read_csv("../data/interim/xrf_data_clean.csv") # load xrf data

# Calibration

Load data for standard reference materials and clean

In [77]:
srm_data = pd.read_csv("../data/interim/standard_reference_material_certified_values.csv") # load SRM data
srm_data.drop(["1SD", "95% Confidence Low", "95% Confidence High"], axis=1, inplace=True) # drop unnecessary columns

# clean SRM data
for row in srm_data.iterrows(): 
    row[1]["Sample ID"] = row[1]["Sample ID"].lower()
    row[1]["Analyte"] = row[1]["Analyte"].split(",",1)[0]

    # clean certified value
    cert_val = row[1]["Certified Value"]
    if cert_val.startswith("<"): 
        row[1]["Certified Value"] = float(cert_val.lstrip("< ")) / 2 # replace BDL with half value
    else:
        row[1]["Certified Value"] = float(row[1]["Certified Value"])

    # clean units
    row[1]["Units"] = row[1]["Units"].lstrip("(").rstrip(")")
    if row[1]["Units"] == "wt.%":  # convert units
       row[1]["Certified Value"] = row[1]["Certified Value"] * 1e4
       row[1]["Units"] = "ppm"

In [78]:
srm_data[srm_data["Analyte"]=="Mo"].head()

Unnamed: 0,Sample ID,Matrix,Mineralization,Source,Analyte,Units,Certified Value,Method
9,oreas22d,quartz,,https://www.oreas.com/crm/oreas-22d/,Mo,ppm,2.36,4-Acid Digestion
25,oreas24b,granodiorite,,https://www.oreas.com/crm/oreas-24b/,Mo,ppm,4.03,4-Acid Digestion
54,oreas24c,basalt,,https://www.oreas.com/crm/oreas-24c/,Mo,ppm,2.49,4-Acid Digestion
80,oreas25a,ferruginous soil,,https://www.oreas.com/crm/oreas-25a,Mo,ppm,2.55,4-Acid Digestion
107,oreas45d,ferruginous soil,Magmatic Ni-Cu-PGE,https://www.oreas.com/crm/oreas-25a,Mo,ppm,2.5,4-Acid Digestion


In [79]:
xrf_data["Mo"].head(n=20)

0       4.1
1       2.7
2       1.5
3       5.8
4     104.5
5       4.0
6       4.7
7      14.3
8      -0.6
9     335.0
10      4.3
11    247.0
12    508.0
13      2.9
14      5.0
15     12.1
16     27.9
17      1.6
18     25.5
19      9.0
Name: Mo, dtype: float64

## Remove standard data for a given element that in unsuitable for calibration

Compare the measured concentration of each element for a given standard to the distribution of measured concentrations for that element across all non-standards. If the measured concentration 

    1.) falls outside the range of the distribution 
    2.) is more than three standard deviations from the mean 

that value is set to `NaN` (all values set to `NaN` are filtered out during calibration). 

Note: I should consider conducting a test to determine if each distribution is normally or log-normally distributed; if the latter, I should log-transform the data and then calculate the z-score. 

In [80]:
def z_score(data, value): 
    mean = np.mean(data)
    std = np.std(data)

    return (value - mean) / std

In [81]:
non_standards_data = xrf_data.loc[xrf_data["qaqc_type"]!="standard"]

outlier_stddev_cutoff = 5

for standard in xrf_data.loc[xrf_data["qaqc_type"]=="standard", "sample_id"].unique(): 
    for date in xrf_data.loc[xrf_data["sample_id"]==standard, "date"].unique():
        for element in get_elements(xrf_data.columns.to_list()): 
            standard_date_element = xrf_data.loc[(xrf_data["sample_id"]==standard) & (xrf_data["date"]==date)][element].values

            if (non_standards_data[element] > standard_date_element[0]).all() | \
               (non_standards_data[element] < standard_date_element[0]).all(): 
                
                # print(non_standards_data[element].to_numpy())
                z = z_score(non_standards_data[element], standard_date_element[0])
                if abs(z) > outlier_stddev_cutoff: 
                    print(standard + ", " + date + ", " + element + ", " + str(standard_date_element[0]) + ", " + str(z))

                    xrf_data.loc[(xrf_data["sample_id"]==standard) & (xrf_data["date"]==date), element] = np.nan

oreas24b, 2021-10-06, K, 31090.0, 6.471422612919988
oreas24b, 2021-10-14, K, 31369.0, 6.563395682670596
oreas24b, 2021-10-21, K, 31367.0, 6.562736377511094
oreas24b, 2021-10-29, K, 31590.0, 6.6362489027956295
oreas24b, 2021-10-31, K, 31006.0, 6.44373179622088
oreas24c, 2021-10-21, Cr, 248.0, 7.647141304308473
oreas901, 2021-10-06, As, 70.0, 5.172844445093555
oreas901, 2021-10-06, Cu, 1275.0, 36.20572840108871
oreas901, 2021-10-06, K, 41054.0, 9.756080917561775
oreas901, 2021-10-22, Cu, 1270.0, 36.06111662068375
oreas901, 2021-10-22, K, 40904.0, 9.706633030599082
oreas501b, 2021-10-06, Cr, 275.0, 8.764381497151398
oreas501b, 2021-10-06, Cu, 2267.0, 64.89670563343348
oreas501b, 2021-10-06, K, 36229.0, 8.165507220261834
oreas501b, 2021-10-06, Mo, 104.5, 12.065017729332341
oreas501b, 2021-10-06, Rb, 183.4, 5.275784103193051
oreas501b, 2021-10-22, Cr, 280.0, 8.971277829159346
oreas501b, 2021-10-22, Cu, 2301.0, 65.88006574018723
oreas501b, 2021-10-22, K, 37039.0, 8.432525809860373
oreas501b,

Get the elements analyzed by the XRF and for which concentrations are reported for one or more standard reference materials

In [82]:
elements = get_elements(
                        list(
                            set(srm_data["Analyte"].unique()) & \
                            set(xrf_data.columns.to_list())
                            )
                        )

Construct a linear regression model for each element in order to predict the true concentration from the measured concentration

In [83]:
 # initialize dictionary to hold lin. reg. models for each element
reg = {}
for element in elements: 
    # get IDs of standard reference materials
    srm = srm_data.loc[srm_data["Analyte"]==element]["Sample ID"].unique() 

    ## TRAIN 
    # limit training data to standards for which we have standard reference material info for the element at hand
    data_train = xrf_data.loc[(xrf_data["qaqc_type"]=="standard") & (xrf_data["sample_id"].isin(srm))]
    data_train = data_train.dropna(subset=[element]) # change to true condition statement

    x_train = [srm_data.loc[(srm_data["Sample ID"]==sample) & (srm_data["Analyte"]==element)]["Certified Value"].values[0] for sample in data_train["sample_id"]]
    x_train = np.array(x_train)[:, None] # slice to add extra dim. (req. by model)
    y_train = data_train[element]

    model = LinearRegression() # create linear regression model
    model.fit(x_train,y_train) # fit the model
    score = model.score(x_train,y_train)

    if model.coef_ != 0: #only use calibration curve if meaningful
        # invert model so that measured concentration is independent var. and true concentration is dependent var. (i.e., x = (1/m)*y - (b/m))
        model.intercept_ = -model.intercept_ / model.coef_
        model.coef_ = model.coef_**-1

        ## PREDICT (i.e., calibrate)
        data_predict = xrf_data.loc[xrf_data["qaqc_type"]!="standard"]
        data_predict = data_predict.dropna(subset=[element])

        x_predict = data_predict[element]
        x_predict = x_predict.to_numpy()[:, None]
        y_predict = model.predict(x_predict)


        ## Save model results
        reg[element] = {} # initialize empty dict to save model results

        reg[element]["model"]   = model
        reg[element]["x_train"] = x_train
        reg[element]["y_train"] = y_train
        reg[element]["score"]   = score
        reg[element]["x_predict"] = x_predict
        reg[element]["y_predict"] = y_predict
        xrf_data.loc[xrf_data["qaqc_type"]!="standard", element] = y_predict

Display the results of the calibration

In [84]:
dropdown_buttons = {
    "data": 
        {
            "name": "Elements", 
            "columns": list(reg.keys())
        }
    }

interactive_linear_regression_calibration_plot(dropdown_buttons, reg, x_axis_label="True concentration (ppm)", y_axis_label="Measured concentration (ppm)", title="")

VBox(children=(HBox(children=(Dropdown(description='Elements', options=('Ag', 'As', 'Ba', 'Bi', 'Ca', 'Ce', 'C…

In [85]:
duplicates = xrf_data[xrf_data["qaqc_type"]=="lab duplicate"]

In [26]:
elements_dup = get_elements(xrf_data.columns.to_list())

In [62]:
dup_dict = {}
for element in elements_dup: 
    dup_dict[element] = {"mean": [], "diff": []}
    for row in duplicates.iterrows(): 
        dup_id = row[1]["sample_id"]
        parent_id = dup_id.rstrip('L')
        dup_value = xrf_data.loc[xrf_data["sample_id"]==dup_id, element].values
        if parent_id != "GR1-010": # account for double analyses of this sample
            parent_value = xrf_data.loc[xrf_data["sample_id"]==parent_id, element].values
        else: 
            parent_value = xrf_data.loc[(xrf_data["sample_id"]==parent_id) & (xrf_data["date"]=="2021-10-31"), element].values

        diff = dup_value - parent_value
        dup_dict[element]["diff"].append(diff[0])

        mean = np.mean([dup_value, parent_value])
        dup_dict[element]["mean"].append(mean[0])

    
    dup_dict[element] = [val**2 for val in dup_dict[element]]
    # print(diff)
    # x = xrf_data.loc[xrf_data["sample_id"]==dup_id, elements_dup]
    # y = xrf_data.loc[xrf_data["sample_id"]==parent_id, elements_dup]

    # print(type(x["Ag"]))
    # print(x.subtract(y))


In [None]:
dup_dict

In [249]:
print(diff)

     Pt  Pr  Rb  Ti  Rh  Zr   V   U   Y  Sn  ...  Se  Cu  Sm  Ag  Pb  As  Mn  \
220 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN NaN NaN   
222 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN  ... NaN NaN NaN NaN NaN NaN NaN   

     Mo  Nb  Bi  
220 NaN NaN NaN  
222 NaN NaN NaN  

[2 rows x 40 columns]


In [13]:
[(num - 1) / 8 for num in list(range(1,10))]


[0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]

In [None]:
alpha= 0.05
element = "Ba"
normality_test_data = xrf_data.loc[xrf_data["qaqc_type"]!="standard", element]
k, p = stats.normaltest(normality_test_data)
if p < alpha:
    print("reject null hypothesis that sample is normall distributed")

In [103]:
# export data to csv file
xrf_data.to_csv('../data/interim/xrf_data_calib.csv')
#drop uncertainty columns 
xrf_data.drop([column for column in xrf_data.columns if column.endswith("+/-")], axis=1, inplace=True)
# xrf_data[xrf_data.loc[:, get_elements(xrf_data.columns.to_list())] < 0] = 0
xrf_data.where(xrf_data >= 0, other="BDL")
xrf_data.to_excel('../data/interim/xrf_data_calib.xlsx', na_rep="NaN")

TypeError: '>=' not supported between instances of 'str' and 'int'

In [93]:
drop_columns

['Au +/-',
 'As +/-',
 'Sb +/-',
 'Ag +/-',
 'Ba +/-',
 'Bi +/-',
 'Ca +/-',
 'Cd +/-',
 'Cl +/-',
 'Co +/-',
 'Cr +/-',
 'Cu +/-',
 'Fe +/-',
 'Hg +/-',
 'K +/-',
 'Mn +/-',
 'Mo +/-',
 'Ni +/-',
 'P +/-',
 'Pb +/-',
 'Rb +/-',
 'S +/-',
 'Se +/-',
 'Sn +/-',
 'Sr +/-',
 'Ti +/-',
 'U +/-',
 'V +/-',
 'Zn +/-',
 'Zr +/-',
 'Nb +/-',
 'Pd +/-',
 'Pt +/-',
 'Rh +/-',
 'Y +/-',
 'La +/-',
 'Ce +/-',
 'Pr +/-',
 'Nd +/-',
 'Sm +/-']

In [74]:
xrf_data.head(n=200)

Unnamed: 0.1,Unnamed: 0,sample_id,date,group,sample_type,qaqc_type,Au,Au +/-,As,As +/-,...,La +/-,Ce,Ce +/-,Pr,Pr +/-,Nd,Nd +/-,Sm,Sm +/-,comments
0,0,oreas24b,2021-10-06,group 2,standard,standard,1.8,0.9,11.100000,1.8,...,12.0,37.000000,19.0,-71.000000,20.0,34.000000,20.0,22.000000,15.0,
1,1,oreas22d,2021-10-06,group 2,standard,standard,1.0,0.6,0.200000,1.0,...,9.0,-14.000000,15.0,25.000000,15.0,-7.000000,15.0,24.000000,12.0,
2,2,oreas24c,2021-10-06,group 2,standard,standard,2.8,1.0,1.200000,1.7,...,13.0,13.000000,21.0,-22.000000,22.0,37.000000,22.0,21.000000,18.0,
3,3,oreas901,2021-10-06,group 2,standard,standard,4.8,1.1,,2.0,...,12.0,61.000000,18.0,-10.000000,19.0,26.000000,19.0,7.000000,14.0,
4,4,oreas501b,2021-10-06,group 2,standard,standard,2.5,1.0,14.700000,1.9,...,12.0,52.000000,19.0,-113.000000,21.0,104.000000,20.0,-26.000000,16.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,195,TE2-029L,2021-10-31,group 2,soil,lab duplicate,1.9,0.9,53.824658,3.0,...,11.0,88.054795,17.0,2.125581,18.0,17.885710,18.0,-3.787541,13.0,
196,196,GR1-005L,2021-10-31,group 2,soil,lab duplicate,1.0,0.9,4.763451,1.9,...,11.0,88.876712,18.0,4.428633,19.0,23.989105,18.0,1.187923,14.0,
197,197,oreas504b,2021-11-04,group 1,standard,standard,,10.0,11.000000,6.0,...,39.0,-18.000000,62.0,-116.000000,67.0,58.000000,66.0,-39.000000,52.0,
198,198,oreas45d,2021-11-04,group 1,standard,standard,1.6,1.2,11.000000,2.0,...,15.0,33.000000,23.0,-34.000000,24.0,2.000000,24.0,-15.000000,20.0,


In [106]:
np.log(-160)/np.log(289)



invalid value encountered in log



nan