In [26]:
import pandas as pd
import os

In [43]:
## combine xrf data from different days (sessions) into one dataframe
# Every ".csv" file in `directory` is read as a UTF-16, tab-separated table.
# The text between the first "-" and the following "." of the file name is
# attached to each of its rows as the "date" column.
directory = r'../data/raw/XRF_data' #location of csv files
session_frames = []
for filename in sorted(os.listdir(directory)): # sorted: os.listdir order is arbitrary
    if not filename.endswith(".csv"):
        continue
    file_path = os.path.join(directory, filename)

    date = filename.split("-",1)[1].split(".",1)[0] # get date from filename

    session_df = pd.read_csv(file_path, encoding='utf-16', sep="\t")
    # Skip the first data row (an empty row in these exports) and tag the
    # remaining rows with the session date; assign() returns a new frame, so
    # the sliced view is never written to.
    session_frames.append(session_df.iloc[1:].assign(date=date))

if not session_frames:
    # Fail here with a clear message instead of a NameError on `xrf_data` later.
    raise FileNotFoundError(f"no .csv files found in {directory!r}")

counter = len(session_frames) # number of session files that were loaded
# DataFrame.append was removed in pandas 2.0; a single concat gives the same
# result (original row labels kept, columns not sorted) without re-copying the
# accumulated frame on every iteration.
xrf_data = pd.concat(session_frames)

In [44]:
## clean xrf data
# Remove the columns that are not needed and give the sample ID column its working name.
xrf_data = (
    xrf_data
    .drop(columns=["Analyst", "Field Label 1"])
    .rename(columns={"Field 1": "sample_id"})
)

# Rows whose sample ID was recorded as "UNKNOWN" are relabelled "TE2-009".
is_unknown = xrf_data["sample_id"] == "UNKNOWN"
xrf_data.loc[is_unknown, "sample_id"] = "TE2-009"

# Parse the month-day-year strings and keep only the calendar date of each.
parsed_dates = pd.to_datetime(xrf_data["date"], format="%m-%d-%Y")
xrf_data["date"] = parsed_dates.dt.date

# Discard rows that have no sample ID (empty rows).
xrf_data.dropna(subset=["sample_id"], inplace=True)

In [46]:
## create dataframes for sample processing
sample_processing_path = "../data/raw/EPSC 552 sample processing.xlsx"

# Read the three worksheets in one call; read_excel returns a dict keyed by sheet name.
processing_sheets = pd.read_excel(
    sample_processing_path,
    sheet_name=["lab_processing", "samples", "person_dictionary"],
)

analysis_log = processing_sheets["lab_processing"]    # main sample processing
sample_list = processing_sheets["samples"]            # list of samples (and their types)
person_dict = processing_sheets["person_dictionary"]  # list of people (and their groups)

In [47]:
# Remove empty rows: a row is dropped from each table when its key column is missing.
# Done in place so the three existing frames themselves are modified.
for frame, key_column in (
    (analysis_log, "sample_id"),
    (sample_list, "sample_id"),
    (person_dict, "person_id"),
):
    frame.dropna(subset=[key_column], inplace=True)

In [48]:
## clean main sample processing dataframe
analysis_log = analysis_log.assign(
    # calendar date on which each entry was made
    date=pd.to_datetime(analysis_log["time_entered"], format="%m-%d-%Y %H:%M:%S.%f").dt.date,
    # change dtype from float to integer
    analysis_order_index=analysis_log["analysis_order_index"].astype("int32"),
)

# Keep only the "xrf analysis" entries: collect the row labels of every other
# process and drop those rows.
non_xrf_labels = analysis_log.index[analysis_log["process"] != "xrf analysis"]
analysis_log = analysis_log.drop(index=non_xrf_labels)

In [49]:
## join to get group ID
# Inner join on the first listed person, so log rows whose person_1 has no
# match in person_dict are discarded. The person columns and the description
# are removed once the join is done.
columns_no_longer_needed = ["person_1", "person_2", "person_3", "person_id", "description"]
analysis_log = (
    analysis_log
    .merge(person_dict, how="inner", left_on="person_1", right_on="person_id")
    .drop(columns=columns_no_longer_needed)
)

In [50]:
## join to get sample type and QAQC type
# Inner join on sample_id, so log rows without a match in sample_list are
# discarded. Only the first sample type column is kept, under a shorter name.
analysis_log = (
    analysis_log
    .merge(sample_list, how="inner", on="sample_id")
    .drop(columns=["sample_type_2"])
    .rename(columns={"sample_type_1": "sample_type"})
)

In [62]:
## combine comments from different dataframes
def _comment_text(column):
    """Return `column` as strings, with missing values turned into empty strings.

    Missing values are detected with notna() rather than by comparing against
    the text "nan", so a genuine comment that reads "nan" is preserved.
    """
    return column.astype("str").where(column.notna(), "")

first_comments = _comment_text(analysis_log["comments_x"])
second_comments = _comment_text(analysis_log["comments_y"])

# Insert a separator only when both comments are present, so two notes do not
# run together ("foo" + "bar" -> "foo; bar") and a lone comment is unchanged.
both_present = (first_comments != "") & (second_comments != "")
separator = both_present.map({True: "; ", False: ""})
analysis_log["comments"] = first_comments + separator + second_comments

# Drop the columns that are no longer needed, including the two source comment columns.
analysis_log.drop(columns=["process", "time_entered", "last_modified", "comments_x", "comments_y"], inplace=True)

In [63]:
## ensure that case of sample IDs will match
def _match_sample_id_case(sample_id):
    """Return IDs that start with "GR" or "TE" unchanged; lower-case any other ID."""
    if sample_id.startswith(("GR", "TE")):
        return sample_id
    return sample_id.lower()

# Apply the same rule to both tables so their sample IDs agree when they are joined.
xrf_data["sample_id"] = xrf_data["sample_id"].apply(_match_sample_id_case)
analysis_log["sample_id"] = analysis_log["sample_id"].apply(_match_sample_id_case)

In [64]:
## combine xrf and sample processing data
# Inner join: only rows whose sample_id and date both match in the two tables are kept.
join_keys = ["sample_id", "date"]
xrf_data = xrf_data.merge(analysis_log, how="inner", on=join_keys)

In [66]:
## reset index column
# Order the rows by analysis order, renumber them from 0, then discard the
# ordering column now that it has served its purpose.
xrf_data = (
    xrf_data
    .sort_values(by="analysis_order_index")
    .reset_index(drop=True)
    .drop(columns=["analysis_order_index"])
)

In [67]:
## drop any empty columns (e.g., no data for an element)
# A column is removed only when every one of its values is missing.
xrf_data = xrf_data.dropna(axis="columns", how="all")

In [78]:
## change order of columns
# Move the descriptive columns so they sit directly after the first column,
# in the order listed here.
headers_to_move = ["date", "group", "sample_type", "qaqc_type"]

# Entirely empty columns are dropped before this cell, so one of the headers
# above may be absent; move only those that still exist rather than raising a
# KeyError when selecting the columns.
headers_present = [header for header in headers_to_move if header in xrf_data.columns]
other_columns = [column for column in xrf_data.columns if column not in headers_to_move]

columns = other_columns[:1] + headers_present + other_columns[1:]

xrf_data = xrf_data[columns]

In [80]:
# export data to excel file
# Write the cleaned table to the interim data folder.
clean_xrf_output_path = '../data/interim/xrf_data_clean.xlsx'
xrf_data.to_excel(clean_xrf_output_path)