In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from mendeleev.fetch import fetch_table
import numpy as np
from datetime import datetime
from itertools import compress
from sklearn.linear_model import LinearRegression
from source.outliers import *
from source.interactive_plots import interactive_linear_regression_plot

In [2]:
# Load the cleaned XRF measurements produced by the earlier cleaning step.
xrf_data_path = "../data/interim/xrf_data_clean.csv"
xrf_data = pd.read_csv(xrf_data_path)

In [3]:
# Split the measurements into one frame per QA/QC category, selected by the
# value of the "qaqc_type" column.
standards_data = xrf_data.loc[xrf_data["qaqc_type"].eq("standard")]
lab_duplicates_data = xrf_data.loc[xrf_data["qaqc_type"].eq("lab duplicate")]
field_duplicates_data = xrf_data.loc[xrf_data["qaqc_type"].eq("field duplicate")]

In [4]:
# Distinct sample ids within each QA/QC subset, in order of first appearance.
standards, lab_duplicates, field_duplicates = (
    subset["sample_id"].unique()
    for subset in (standards_data, lab_duplicates_data, field_duplicates_data)
)
# Every distinct analysis date present in the full data set.
dates = xrf_data["date"].unique()

In [5]:
# Identify the analyzed elements: keep every column of the XRF table whose
# name matches an element symbol in the mendeleev periodic table, preserving
# the column order of the data file.
columns = xrf_data.columns.tolist()
ptable = fetch_table("elements")["symbol"].to_list()
elements = [name for name in columns if name in ptable]

In [6]:
# Express every standard measurement as a difference from that standard's
# measurement on the reference date, then discard the reference rows.
initial_date = "2021-10-06"

standards_drift_data = standards_data.copy()

# Row mask shared by every standard: was the row measured on the reference date?
on_initial_date = standards_data["date"] == initial_date

for standard in standards_data["sample_id"].unique():
    is_standard = standards_data["sample_id"] == standard

    initial_measurement = standards_data.loc[is_standard & on_initial_date]
    other_measurements = standards_data.loc[is_standard & ~on_initial_date]

    # The first reference-date row is the baseline; this raises if the
    # standard has no row on the reference date.
    baseline = initial_measurement.loc[:, elements].iloc[0]
    diff = other_measurements.loc[:, elements] - baseline

    # Overwrite the element columns of the later rows with their drift values
    # and remove the baseline rows from the drift table.
    standards_drift_data.loc[other_measurements.index, elements] = diff
    standards_drift_data.drop(initial_measurement.index, axis=0, inplace=True)

# The old row labels are kept as an "index" column (drop is not requested).
standards_drift_data.reset_index(inplace=True)


## Outlier detection

In [7]:
# Run the project's Dixon's Q outlier detection (star-imported from
# `source.outliers`) over the drift values of every analyzed element.
outliers = detect_outliers_Dixons_Q(elements, standards_drift_data)
# Hand the detected outliers to `save_outliers`, then rebind `outliers` to
# whatever `get_outliers` returns.
# NOTE(review): presumably this is a save/reload round trip through a file so
# the stored outlier list is the one actually used downstream — confirm
# against `source.outliers`.
save_outliers(outliers);
outliers = get_outliers()

## Outlier removal

In [8]:
# Rebind `standards_drift_data` to the result of the project helper
# `remove_outliers`. The name is reused, so re-running this cell feeds the
# already-processed frame back into the helper.
# NOTE(review): whether that is harmless depends on how `remove_outliers`
# treats values it has already removed — confirm against `source.outliers`.
standards_drift_data = remove_outliers(outliers, standards_drift_data)

## Analysis

In [9]:
# Fit one linear regression per element: drift (difference from the initial
# standard measurement) regressed on analysis date. An element is flagged as
# needing drift correction when the R^2 of its fit exceeds `score_threshold`.

# initialize dictionary to hold drift correction lin. reg. models for each element
reg = {}

score_threshold = 0.5 # threshold R^2 value below which no drift correction is necessary
drift_correction_eval = []  # one bool per entry of `elements`, in the same order

for element in elements:
    # Only rows where this element has a drift value take part in the fit.
    data_train = standards_drift_data[["date", element]].dropna(axis=0)

    # Dates are "%Y-%m-%d" strings; convert them to day ordinals so they can be
    # used as a numeric regressor, shaped (n_samples, 1) as the model requires.
    ordinals = [datetime.strptime(date, "%Y-%m-%d").toordinal() for date in data_train["date"]]
    x_train = np.array(ordinals, dtype=np.int64)[:, None]
    y_train = data_train[element]

    model = LinearRegression()
    model.fit(x_train, y_train)
    score = model.score(x_train, y_train)

    # Predict once per distinct date; these points are stored for plotting the fit.
    x_predict = np.unique(x_train.squeeze())[:, None]
    y_predict = model.predict(x_predict)

    reg[element] = {
        "model": model,
        "x_train": x_train,
        "y_train": y_train,
        "score": score,
        "x_predict": x_predict,
        "y_predict": y_predict,
    }

    # Compare against the configured threshold rather than a repeated literal,
    # so changing `score_threshold` above actually changes the decision.
    drift_correction_eval.append(bool(score > score_threshold))

if any(drift_correction_eval):
    print("Drift correction necessary")
else:
    print("No drift correction necessary")


No drift correction necessary


## Plotting

In [10]:
# One dropdown, labelled "Elements", offering every element that has a fitted
# regression in `reg`.
element_names = list(reg.keys())
dropdown_buttons = {"data": {"name": "Elements", "columns": element_names}}

# The call is the last expression of the cell so the returned widget is displayed.
interactive_linear_regression_plot(
    dropdown_buttons,
    reg,
    x_axis_label="Dates",
    y_axis_label="Concentration (ppm)",
    title="Difference in reported concentration from initial standard analysis regressed on time",
)

VBox(children=(HBox(children=(Dropdown(description='Elements', options=('Au', 'As', 'Sb', 'Ag', 'Ba', 'Bi', 'C…