# Read in Lab data 
Performs some cleaning and groups samples with similar timestamps so that they can be averaged.

In [None]:
import pandas as pd
import numpy as np

def group_timestamps(timestamps, avg_window_hours=24):
    """Partition timestamps into groups of values that lie close together.

    The first timestamp that has not been grouped yet becomes the anchor of a
    new group, and every other ungrouped timestamp strictly less than
    ``avg_window_hours`` away from that anchor joins the group.  Timestamps
    that already belong to a group are never used as an anchor again, so a
    grouped value cannot split the values that follow it into extra groups.

    Args:
        timestamps: Iterable of ``pd.Timestamp`` values (e.g. a Series).
        avg_window_hours: Width of the grouping window in hours.

    Returns:
        A list of groups.  Each group is a list of timestamps in input order
        (duplicates are kept), starting with the group's anchor.
    """
    window = pd.Timedelta(hours=avg_window_hours)
    pending = list(timestamps)

    # a list of timestamp ranges that are close to each other
    timestamp_groups = []

    while pending:
        anchor = pending[0]
        group, rest = [], []
        for t in pending:
            if abs(t - anchor) < window:
                group.append(t)
            else:
                rest.append(t)

        if group:
            timestamp_groups.append(group)
            pending = rest
        else:
            # The anchor is not even within the window of itself (NaT, or a
            # non-positive window): discard it so the loop always advances.
            pending = rest[1:]

    return timestamp_groups

# Load the raw lab results from the fixed workbook / sheet.
with open('Lab Results Compiled.xlsx', 'rb') as f:
    input_df = pd.read_excel(f, sheet_name='Analytics Model - LIMS')

# One {"key", "value", "epoch"} dict per averaged result
dashboard_data = []
# One dict per averaged result.  output_df is built from this list in a single
# step after the loops, instead of re-concatenating a growing DataFrame on
# every iteration.
output_rows = []

# remove pesky trailing spaces
input_df['Tank'] = input_df['Tank'].str.rstrip()

tanks = input_df['Tank'].unique()

for t in tanks:
    # extract data relevant to current tank
    gc_data = input_df[input_df['Tank'] == t]

    timestamps = gc_data["SampleResults[Sampled Timestamp]"]

    # figure out if any timestamps are close enough to be averaged
    timestamp_groups = group_timestamps(timestamps, avg_window_hours=24)

    for g in timestamp_groups:
        # extract the data for the current timestamp group
        data_to_avg = gc_data[gc_data['SampleResults[Sampled Timestamp]'].isin(g)]

        determinands = data_to_avg['Determinand[Determinand Name]'].unique()

        # every result of the group is reported at the group's first timestamp
        time = pd.Timestamp(g[0])

        for d in determinands:
            # extract the data for the current determinand
            determinand_df = data_to_avg[data_to_avg['Determinand[Determinand Name]'] == d]
            determinand_df = determinand_df.reset_index(drop=True)

            result = determinand_df['SampleResults[Sample Result]']
            unit = determinand_df['Determinand[Unit of Measure]'][0]
            sample_nums = determinand_df['SampleResults[SampleNumber]'].unique()

            # remove any < or > from the result, then parse it as a number
            result = (
                result.astype(str)
                .str.replace("<", "", regex=False)
                .str.replace(">", "", regex=False)
            )
            result = pd.to_numeric(result, errors='raise')

            # spread and mean of the samples in the group, to 2 decimal places
            std_dev = round(result.std(), 2)
            result = round(result.mean(), 2)

            # Some formatting for output

            # replace spaces in the determinand name with dashes for the dashboard key
            determinand = d.replace(" ", "-")

            # Normalise the tank name: 5-character names get the space replaced
            # by ".", 4-character names get it replaced by "C."
            tank = t
            if len(tank) == 5:
                tank = tank.replace(" ", ".")
            if len(tank) == 4:
                tank = tank.replace(" ", "C.")

            dashboard_data.append({
                "key"    : f"{tank}.{determinand}",
                "value"  : result,
                "epoch"  : time.timestamp()
            })

            output_rows.append({
                "tank"        : tank,
                "determinand" : d,
                "value"       : result,
                "std_dev"     : std_dev,
                "unit"        : unit,
                "timestamp"   : time,
                "sample_nums" : str(sample_nums)
            })

output_df = pd.DataFrame(output_rows)
print(dashboard_data)
output_df
output_df.to_csv("lab_data.csv", index=False)



In [None]:
# Sanity check: show the distinct (already right-stripped) tank names in the lab data
pd.unique(input_df['Tank'])

In [None]:
# Export the lab data to excel (optional)

# Tank-name prefixes; each one gets its own sheet in the workbook
tanks = ['CON', 'INS', '20C', '30C', 'INLET']

with pd.ExcelWriter("output.xlsx") as writer:

    output_df.to_excel(writer, sheet_name="Averaged Data")
    input_df.to_excel(writer, sheet_name="Original Data")

    for t in tanks:

        # select from output_df where the tank name starts with the prefix, e.g. "INS"
        tab_df = output_df[output_df['tank'].str.startswith(t)]

        # arrange so the individual tanks are shown side by side
        tab_df = tab_df.pivot_table(index=['determinand', 'unit' ], columns=['tank', 'timestamp'], values='value')
        tab_df.to_excel(writer, sheet_name=t)


# resize the columns in the excel file to fit the data, while handling merged cells
import openpyxl
from openpyxl.utils import get_column_letter
from openpyxl.styles import Alignment

wb = openpyxl.load_workbook("output.xlsx")

for ws in wb.worksheets:
    for col in ws.columns:
        max_length = 0

        # Merged cells do not expose `column_letter`, but every cell carries
        # its numeric column index, so derive the letter from that instead of
        # guarding the attribute access with a bare `except`.
        column = get_column_letter(col[0].column)

        for cell in col:
            cell.alignment = Alignment(wrap_text=True)
            if len(str(cell.value)) > max_length:
                max_length = len(str(cell.value))

        adjusted_width = (max_length + 2) * 1.2
        ws.column_dimensions[column].width = adjusted_width

# Tweak the format of the tank specific sheets
for t in tanks:
    ws = wb[t]

    # set the format of row 2, columns C onwards to be a date
    for col in ws.columns:
        cell = col[1]
        letter = get_column_letter(cell.column)
        if letter in ["A", "B"]:
            continue
        cell.number_format = "mmm-dd"
        ws.column_dimensions[letter].width = 9


wb.save("output.xlsx")

In [None]:
# Helper that was used once to generate the label / unit mappings further down.
# There is no need to run it every time.

# distinct (unit, determinand) pairs taken from lab_df
unit_mapping_df = lab_df.loc[:, ["unit", "determinand"]].drop_duplicates()

# label = "<determinand> (<unit>)"
label_column = unit_mapping_df["determinand"] + " (" + unit_mapping_df["unit"] + ")"
unit_mapping_df = unit_mapping_df.assign(label=label_column)

# determinand -> label lookup
unit_mapping_dict = unit_mapping_df.set_index("determinand")["label"].to_dict()
unit_mapping_dict

In [None]:
# Read in the sensor Data

import pandas as pd

# Split into 2 to stay below github 100MB limit
# NOTE(review): only one file is read now; the commented-out lines below are
# earlier / alternative backups and look like leftovers of that split.
# The CSV has no header row, so the three column names are supplied here.
sensor_df = pd.read_csv("2024 data initial_state_backup.csv", names=["timestamp", "key", "value"], low_memory=False)
# sensor_df = pd.read_csv("initial_state_backup.csv", names=["timestamp", "key", "value"], low_memory=False)
# sensor_df2 = pd.read_csv("initial_state_backup_august_onward.csv", names=["timestamp", "key", "value"], low_memory=False)
# sensor_df = pd.read_csv("initial_state_backup_2023-11-29.csv", names=["timestamp", "key", "value"], low_memory=False)


# Unit for each sensor determinand code (the last part of a sensor key)
unit_mapping = {
    "ts" : "C",
    "tl" : "C",
    "ph" : "pH",
    "pr" : "mBar",
    "gc" : "%CH4",
}

# Display label (name plus unit) for every sensor and lab determinand
labels_mapping = {
    "ts"                            : "Top Temp (C)",
    "tl"                            : "Bottom Temp (C)",
    "ph"                            : "pH",
    "pr"                            : "Pressure (mBar)",
    "gc"                            : "CH4 Concentration (%)",
    'Total Nitrogen (as N)'         : 'Total Nitrogen (as N) (mg/l as N)',
    'Ammonia'                       : 'Ammonia (mg/l as N)',
    'Nitrate'                       : 'Nitrate (mg/l as N)',
    'Nitrite'                       : 'Nitrite (mg/l as N)',
    'Total oxidised nitrogen'       : 'Total oxidised nitrogen (mg/l as N)',
    'Phosphorus'                    : 'Phosphorus (mgP/l)',
    'Sulphate'                      : 'Sulphate (mgSO₄/l)',
    'Suspended solids'              : 'Suspended solids (mg/l)',
    'Total Chemical Oxygen Demand'  : 'Total Chemical Oxygen Demand (mgO₂/l)',
    'Soluble Chemical Oxygen Demand': 'Soluble Chemical Oxygen Demand (mgO₂/l)',
    'Biochemical oxygen demand'     : 'Biochemical oxygen demand (mgO₂/l)',
    'Total organic carbon'          : 'Total organic carbon (mgC/l)',
    'Alkalinity'                    : 'Alkalinity (mgCaCO₃/l)',
    'Conductivity'                  : 'Conductivity (mS/cm)',
    'Manganese'                     : 'Manganese (mgMn/l)',
}

def clean_sensor_df(sensor_df):
    """Return a cleaned copy of the raw sensor log.

    Steps, in order:
      1. drop rows whose key contains ".tc", "TEST", "debug" or "feedcontrol";
      2. rewrite the key as ``key[2:5] + "." + key[-1] + key[5:-1]``
         (e.g. "STCON.ts1" -> "CON.1.ts");
      3. rename the "CON.4" ts / gc / pr keys to "AMB" and drop every other
         key containing ".4.";
      4. split the key into ``determinand`` (text after the last ".") and
         ``tank`` (key without its last three characters), add ``unit`` from
         ``unit_mapping`` and drop the key column;
      5. turn "*" values into NaN and convert ``value`` to numeric;
      6. drop negative "ts" readings between 2023-07-14 and 2023-07-19 and
         all "gc" rows of the "AMB" tank.

    All key fragments are matched literally.  With the pandas default
    (``regex=True``) the "." in ".tc" and ".4." would match any character.

    Args:
        sensor_df: DataFrame with "timestamp", "key" and "value" columns.
            It is not modified.

    Returns:
        A new DataFrame with "timestamp", "value", "determinand", "tank" and
        "unit" columns.

    Raises:
        ValueError: if a value other than "*" cannot be parsed as a number.
    """
    # remove keys we don't need ("feedcontrol" also covers "feedcontrol_new")
    for fragment in (".tc", "TEST", "debug", "feedcontrol"):
        sensor_df = sensor_df[~sensor_df["key"].str.contains(fragment, regex=False)]

    # work on an independent copy so the column assignments below never touch
    # (or warn about) the caller's frame
    sensor_df = sensor_df.copy()

    # Reformat the key (Move the final number from the key to the end of the tank name, and drop "ST")
    key = sensor_df["key"]
    sensor_df["key"] = key.str[2:5] + "." + key.str[-1] + key.str[5:-1]

    # rename the CON.4 ts / gc / pr keys to AMB
    for code in ("ts", "gc", "pr"):
        sensor_df["key"] = sensor_df["key"].str.replace(f"CON.4.{code}", f"AMB.{code}", regex=False)

    # drop every remaining key that still contains ".4."
    sensor_df = sensor_df[~sensor_df["key"].str.contains(".4.", regex=False)].copy()

    # add a new "determinand" column containing the last part of the key
    sensor_df["determinand"] = sensor_df["key"].str.split(".").str[-1]

    # add a new "tank" column: the key without its last three characters
    sensor_df["tank"] = sensor_df["key"].str[:-3]

    # add a new "unit" column containing the unit for the determinand
    sensor_df["unit"] = sensor_df["determinand"].map(unit_mapping)

    # drop the key column
    sensor_df = sensor_df.drop(columns=["key"])

    # convert any values that are "*" to NaN
    sensor_df["value"] = sensor_df["value"].replace("*", np.nan)

    # convert the value column to numeric
    sensor_df["value"] = pd.to_numeric(sensor_df["value"], errors='raise')

    # delete any "ts" measurements between 14th and 19th July 2023 which are less than 0
    bad_temperature = (
        (sensor_df["timestamp"] > "2023-07-14")
        & (sensor_df["timestamp"] < "2023-07-19")
        & (sensor_df["determinand"] == "ts")
        & (sensor_df["value"] < 0)
    )
    sensor_df = sensor_df[~bad_temperature]

    # Remove gc data where the tank is "AMB"
    ambient_gc = (sensor_df["tank"] == "AMB") & (sensor_df["determinand"] == "gc")
    sensor_df = sensor_df[~ambient_gc]

    return sensor_df

# Clean the raw sensor log in place of the raw frame
sensor_df = clean_sensor_df(sensor_df)
# sensor_df2 = clean_sensor_df(sensor_df2)

# Persist the cleaned data
# sensor_df2.to_csv('sensor_data_cleaned_august_onward.csv', index=False)
sensor_df.to_csv('sensor_data_cleaned.csv', index=False)


# merge the two dataframes (disabled: only a single input file is loaded)
# sensor_df = pd.concat([sensor_df2, sensor_df], axis=0, ignore_index=True)
sensor_df

In [None]:
# decimate the sensor data by 12 (optional)

# Decimated pieces, one per (tank, determinand); concatenated once at the end
# instead of growing a DataFrame inside the nested loop.
decimated_frames = []
for t in sensor_df["tank"].unique():
    for d in sensor_df["determinand"].unique():

        # select the data for the current tank and determinand
        df = sensor_df[(sensor_df["tank"] == t) & (sensor_df["determinand"] == d)]

        # not every tank has every determinand; nothing to decimate then
        if df.empty:
            continue

        # average every block of 12 consecutive rows, keeping the first value
        # of the other columns; gc data is left at full resolution
        decimation = 1 if d in ["gc"] else 12
        df = df.groupby(np.arange(len(df))//decimation).agg({'value':'mean', 'timestamp':'first', 'determinand':'first', 'tank':'first', 'unit':'first'})

        # round the value column to 2 decimal places
        df["value"] = df["value"].round(2)

        decimated_frames.append(df)

if decimated_frames:
    small_sensor_df = pd.concat(decimated_frames, axis=0, ignore_index=True)
else:
    small_sensor_df = pd.DataFrame()

small_sensor_df.to_csv('sensor_data_decimated_cleaned.csv', index=False)


In [None]:
# A function to combine and average the data from the 3 replicate tanks

def combine_replicates(df_in, groupby="H"):
    """Average the replicate tanks of each tank family into one ".AVG" series.

    For every prefix in INLET, CON, INS, 20C, 30C and AMB, the rows whose tank
    name starts with that prefix are pivoted so each tank is a column.
    Timestamps are floored to ``groupby``, rows sharing the floored timestamp,
    determinand and unit are averaged per tank, and the per-tank means are
    then averaged across tanks.

    Args:
        df_in: DataFrame with "timestamp", "tank", "determinand", "unit" and
            "value" columns.  It is not modified.
        groupby: pandas frequency alias used to floor the timestamps.  The
            legacy aliases "H", "T", "S", "L", "U" and "N" are still accepted
            and translated to their current spellings, because newer pandas
            versions deprecate them.

    Returns:
        DataFrame with "timestamp", "determinand", "unit", "value" (rounded to
        2 decimal places) and "tank" ("<prefix>.AVG") columns, in the prefix
        order listed above.
    """
    legacy_aliases = {"H": "h", "T": "min", "S": "s", "L": "ms", "U": "us", "N": "ns"}
    if isinstance(groupby, str):
        groupby = legacy_aliases.get(groupby, groupby)

    avg_list = ["INLET", "CON", "INS", "20C", "30C", "AMB"]

    averaged_frames = []

    for t in avg_list:

        # select the data of the replicate tanks (independent copy of the rows)
        df = df_in[df_in["tank"].str.startswith(t)].copy()

        # Convert 'timestamp' to datetime if it's not already
        df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)

        # convert to timezone-naive timestamps
        # NOTE(review): the original author was unsure whether dropping the
        # timezone here is correct - confirm.
        df['timestamp'] = df['timestamp'].dt.tz_localize(None)

        # split the tanks into their own columns
        df = df.pivot_table(index=['timestamp', 'determinand', 'unit'], columns='tank', values='value')

        # floor the timestamps to the requested frequency
        df["timestamp2"] = df.index.get_level_values(0)
        df['timestamp2'] = df['timestamp2'].dt.floor(groupby)

        # average rows where the floored timestamp and determinand are the same
        df = df.groupby(['timestamp2', 'determinand', 'unit']).mean()

        # add an extra column for the average across the tanks
        df["value"] = df.mean(axis=1)

        # round to 2 decimal places
        df = df.round(2)

        # remove the per-tank columns (their names start with the prefix)
        df = df.drop(columns=[c for c in df.columns if c.startswith(t)])

        df = df.reset_index()

        # add a new column for the tank name
        df["tank"] = f"{t}.AVG"

        # rename timestamp2 to timestamp
        df = df.rename(columns={"timestamp2": "timestamp"})

        averaged_frames.append(df)

    return pd.concat(averaged_frames, axis=0, ignore_index=True)

In [None]:

# for the ph determinand, apply smoothing to the data, this needs to be done on a tank by tank basis
for t in sensor_df["tank"].unique():
    # select the ph data for the current tank
    ph_df = sensor_df[(sensor_df["tank"] == t) & (sensor_df["determinand"] == "ph")]
    # centred 3-sample rolling mean, rounded to 2 decimal places
    smoothed_values = ph_df["value"].rolling(3, center=True).mean().round(2)

    # replace the original ph data with the smoothed data
    sensor_df.loc[ph_df.index, "value"] = smoothed_values


# Create a new "lab_df" from output_df, but with only the relevant columns
lab_df = output_df[["timestamp", "value", "tank", "determinand", "unit"]].copy()

# replicate averages: hourly for the sensor data, daily for the lab data
avg_sensor_df = combine_replicates(sensor_df)
small_avg_sensor_df = combine_replicates(small_sensor_df)
avg_lab_df = combine_replicates(lab_df, groupby="D")

# join the sensor and lab dataframes
full_df = pd.concat([sensor_df, lab_df], axis=0, ignore_index=True)
small_df = pd.concat([small_sensor_df, lab_df], axis=0, ignore_index=True)
avg_df = pd.concat([avg_sensor_df, avg_lab_df], axis=0, ignore_index=True)
small_avg_df = pd.concat([small_avg_sensor_df, avg_lab_df], axis=0, ignore_index=True)

# sort the output dataframes by tank, and timestamp
# the tank column needs to be in the order INLET, CON.1, CON.2, CON.3, INS.1, INS.2, INS.3, 20C.1, 20C.2, 20C.3, 30C.1, 30C.2, 30C.3

full_order = {
    "INLET": 0,
    "AMB": 1, 
    "CON.AVG": 2,
    "CON.1": 3,
    "CON.2": 4,
    "CON.3": 5,
    "INS.AVG": 6,
    "INS.1": 7,
    "INS.2": 8,
    "INS.3": 9,
    "20C.AVG": 10,
    "20C.1": 11,
    "20C.2": 12,
    "20C.3": 13,
    "30C.AVG": 14,
    "30C.1": 15,
    "30C.2": 16,
    "30C.3": 17,
}

# combine_replicates() names every averaged series "<prefix>.AVG", including
# "INLET.AVG" and "AMB.AVG".  Without entries for those two names their rows
# would map to NaN and be sorted after everything else, so they are listed
# here.  The existing numbers are kept unchanged.
avg_order = {
    "INLET": 0,
    "INLET.AVG": 0,
    "CON.AVG": 1,
    "INS.AVG": 2,
    "20C.AVG": 3,
    "30C.AVG": 4,
    "AMB.AVG": 5,
}

# create a new column that is a number mapped from the tank name, in the order we want
full_df["tank_num"] = full_df["tank"].map(full_order)
small_df["tank_num"] = small_df["tank"].map(full_order)
avg_df["tank_num"] = avg_df["tank"].map(avg_order)
small_avg_df["tank_num"] = small_avg_df["tank"].map(avg_order)


full_df.sort_values(by=["tank_num", "timestamp"], inplace=True)
small_df.sort_values(by=["tank_num", "timestamp"], inplace=True)
avg_df.sort_values(by=["tank_num", "timestamp"], inplace=True)
small_avg_df.sort_values(by=["tank_num", "timestamp"], inplace=True)

full_df.to_csv('full_df.csv', index=False)
small_df.to_csv('small_df.csv', index=False)
avg_df.to_csv('avg_df.csv', index=False)
small_avg_df.to_csv('small_avg_df.csv', index=False)

In [None]:



def export_to_excel_simple_pivot(df_in, sheet_name, timestamp_group="H"):
    """Write a timestamp x (determinand, tank) pivot of ``df_in`` to simple_data.xlsx.

    For every tank, each sensor determinand (ts, tl, pr, ph) is aligned to the
    timestamps of that tank's gc readings by taking the nearest sample.  The
    "AMB" / "AMB.AVG" tanks are aligned to the gc timestamps of "CON.1" /
    "CON.AVG", and the resulting ambient gc columns are removed again.  All
    other determinands (the lab data) are passed through unchanged.
    Timestamps are floored to ``timestamp_group`` and written as strings.

    Two sheets are written: ``sheet_name`` (the pivot) and
    ``sheet_name + " for excel Pivot"`` (the long-format data).  Existing
    sheets with these names are replaced; the workbook is created when it
    does not exist yet.

    Args:
        df_in: DataFrame with "timestamp", "tank", "determinand" and "value"
            columns.  It is not modified.
        sheet_name: Name of the pivot sheet.
        timestamp_group: pandas frequency alias used to floor the timestamps.
            The legacy aliases "H", "T", "S", "L", "U" and "N" are still
            accepted and translated to their current spellings.
    """
    import os  # local import: only needed for the workbook-existence check

    legacy_aliases = {"H": "h", "T": "min", "S": "s", "L": "ms", "U": "us", "N": "ns"}
    if isinstance(timestamp_group, str):
        timestamp_group = legacy_aliases.get(timestamp_group, timestamp_group)

    df = df_in.copy()
    output_data = pd.DataFrame()
    # convert the timestamp column to datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'])

    # for each tank, keep only the sensor data whose timestamp is closest to the gc data
    for t in df["tank"].unique():
        tank_data = pd.DataFrame()

        # Ambient data doesn't have a gc sensor, so we need to use the CON data
        if t == "AMB.AVG":
            gc_tank = "CON.AVG"
        elif t == "AMB":
            gc_tank = "CON.1"
        else:
            gc_tank = t

        gc_data = df[df["tank"] == gc_tank]
        gc_data = gc_data[gc_data["determinand"] == "gc"]
        gc_data = gc_data.rename(columns={"value": "gc"})
        gc_data = gc_data[["timestamp", "gc"]]
        gc_data = gc_data.sort_values("timestamp")

        other_data = df[df["tank"] == t]
        # filter for only the sensor determinands ["ts", "tl", "pr", "ph"]
        sensor_data = other_data[other_data["determinand"].isin(["ts", "tl", "pr", "ph"])]

        for det in sensor_data["determinand"].unique():
            det_data = sensor_data[sensor_data["determinand"] == det]
            det_data = det_data.rename(columns={"value": f"{det}"})
            det_data = det_data[["timestamp", f"{det}"]]
            det_data = det_data.sort_values("timestamp")

            # nearest reading of this determinand for every gc timestamp
            merged_data = pd.merge_asof(gc_data, det_data, on="timestamp", direction="nearest")
            merged_data["tank"] = t

            # add the merged data to the per-tank frame
            if tank_data.empty:
                tank_data = merged_data
            else:
                tank_data = pd.merge(tank_data, merged_data, how="outer", on=["timestamp", "gc", "tank"])

        output_data = pd.concat([output_data, tank_data], axis=0, ignore_index=True)


    # output data will be empty if just lab data is present
    if not output_data.empty:
        # condense the tl ts pr ph and gc data into a single column, with a determinand column to say which is which.
        # Only melt the columns that exist, so a data set lacking one of the
        # sensor determinands does not raise a KeyError.
        value_vars = [c for c in ["gc", "tl", "ts", "pr", "ph"] if c in output_data.columns]
        output_data = output_data.melt(id_vars=["timestamp", "tank"], value_vars=value_vars, var_name="determinand", value_name="value")

    # concat in the lab data
    lab_data = df[~df["determinand"].isin(["ts", "tl", "pr", "ph", "gc"])]
    output_data = pd.concat([output_data, lab_data], axis=0, ignore_index=True)

    # round down the timestamp to the requested frequency, and convert to string for excel
    output_data["timestamp"] = output_data["timestamp"].dt.floor(timestamp_group)
    output_data["timestamp"] = output_data["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S")

    # pivot the data so that the determinands are in columns
    pivot_data = output_data.pivot_table(index=['timestamp' ], columns=['determinand', 'tank'], values='value')

    # Order the columns according to full_order, but with determinand at the top level.
    # Tanks that full_order does not list (e.g. "AMB.AVG") are appended after
    # the known ones instead of being dropped by the reindex.
    present_tanks = pivot_data.columns.get_level_values(1).unique()
    tank_order = list(full_order) + [tank for tank in present_tanks if tank not in full_order]
    new_columns = pd.MultiIndex.from_product([pivot_data.columns.levels[0], tank_order])
    pivot_data = pivot_data.reindex(columns=new_columns)

    # Drop any columns that are all NaN
    pivot_data = pivot_data.dropna(axis=1, how='all')

    # if an amb gc column exists, remove it: it is not valid (actually contains con data)
    if ("gc", "AMB") in pivot_data.columns:
        pivot_data = pivot_data.drop(columns=[("gc", "AMB")])
    if ("gc", "AMB.AVG") in pivot_data.columns:
        pivot_data = pivot_data.drop(columns=[("gc", "AMB.AVG")])

    # Append to the workbook when it exists; otherwise create it.  Append mode
    # requires an existing file, and if_sheet_exists is only valid in append mode.
    workbook_path = "simple_data.xlsx"
    if os.path.exists(workbook_path):
        writer_options = {"mode": "a", "if_sheet_exists": "replace"}
    else:
        writer_options = {"mode": "w"}

    with pd.ExcelWriter(workbook_path, engine='openpyxl', **writer_options) as writer:

        # write the data to the excel file, replacing sheet if it exists
        pivot_data.to_excel(writer, sheet_name=sheet_name)
        output_data.to_excel(writer, sheet_name=sheet_name+" for excel Pivot")


# (frame, sheet name, extra keyword arguments) for every sheet to export;
# only the lab data overrides the default timestamp grouping
exports = [
    (sensor_df, "sensor data", {}),
    (lab_df, "lab data", {"timestamp_group": "D"}),
    (avg_sensor_df, "avg data", {}),
]
for frame, sheet, extra in exports:
    export_to_excel_simple_pivot(frame, sheet, **extra)



In [43]:
# Pivot the current selection held in `df` and write it to simple_data.xlsx.
# NOTE(review): this repeats the pivot/export tail of
# export_to_excel_simple_pivot() without the gc alignment step.
output_data = df.copy()
sheet_name = "selected data"
# "h" is the current spelling of the hourly alias; "H" is deprecated in pandas
timestamp_group = "h"

# round down timestamp to nearest hour, and convert to string for excel
# Ensure the 'timestamp' column is of datetime type
output_data["timestamp"] = pd.to_datetime(output_data["timestamp"])
output_data["timestamp"] = output_data["timestamp"].dt.floor(timestamp_group)
output_data["timestamp"] = output_data["timestamp"].dt.strftime("%Y-%m-%d %H:%M:%S")

# pivot the data so that the determinands are in columns
pivot_data = output_data.pivot_table(index=['timestamp' ], columns=['determinand', 'tank'], values='value')

# Order the columns according to full_order, but with determinand at the top level
new_columns = pd.MultiIndex.from_product([pivot_data.columns.levels[0], full_order.keys()])
pivot_data = pivot_data.reindex(columns=new_columns)

# Drop any columns that are all NaN
pivot_data = pivot_data.dropna(axis=1, how='all')

# if an amb gc column exists, remove it: it is not valid (actually contains con data)
if ("gc", "AMB") in pivot_data.columns:
    pivot_data = pivot_data.drop(columns=[("gc", "AMB")])
if ("gc", "AMB.AVG") in pivot_data.columns:
    pivot_data = pivot_data.drop(columns=[("gc", "AMB.AVG")])

# open excel workbook for appending (simple_data.xlsx must already exist)
with pd.ExcelWriter("simple_data.xlsx", engine='openpyxl', mode='a', if_sheet_exists="replace") as writer:

    # write the data to the excel file, replacing sheet if it exists
    pivot_data.to_excel(writer, sheet_name=sheet_name)
    output_data.to_excel(writer, sheet_name=sheet_name+" for excel Pivot")




'H' is deprecated and will be removed in a future version, please use 'h' instead.



In [None]:
start_date = "2024-10-15"
end_date = "2024-11-08"

# Narrow a copy of the sensor data step by step:
# date window -> wanted determinands -> tanks whose name starts with "20" or "30"
df = sensor_df.copy()
df = df.loc[df["timestamp"].between(start_date, end_date)]
df = df.loc[df["determinand"].isin(["ts", "tl", "gc", "ph"])]
is_20c_tank = df["tank"].str.startswith("20")
is_30c_tank = df["tank"].str.startswith("30")
df = df.loc[is_20c_tank | is_30c_tank]

# remove the AMB data
df = df.loc[~df["tank"].str.contains("AMB")]

# format the timestamp data for excel
df["timestamp"] = pd.to_datetime(df["timestamp"]).dt.strftime("%Y-%m-%d %H:%M:%S")

# export the data to xlsx and csv files
df.to_excel("selected_data.xlsx", index=False)
df.to_csv("selected_data.csv", index=False)


# plot selected data using plotly: one line per tank, one facet row per determinand
import plotly.express as px

fig = px.line(
    df,
    x="timestamp",
    y="value",
    color="tank",
    facet_row="determinand",
    title="CH4 Concentration (%)",
)

fig.update_layout(
    title="CH4 Concentration (%)",
    xaxis_title="Date",
    legend_title="Tank",
)

# let every facet use its own y-axis range
fig.update_yaxes(matches=None)
fig.show()


In [None]:
start_date = "2023-06-01"
end_date = "2023-09-20"

output_data = pd.DataFrame()

df = sensor_df.copy()
df["timestamp"] = pd.to_datetime(df["timestamp"])
df = df[(df["timestamp"] >= start_date) & (df["timestamp"] <= end_date)]

# For each tank, keep only the readings of the other determinands whose
# timestamp is closest to each of that tank's gc readings.  The result has one
# row per gc reading, with one column per determinand.  Tanks without gc data
# contribute no rows.
for tank in df["tank"].unique():
    tank_rows = df[df["tank"] == tank]

    # the gc readings of this tank define the reference timestamps
    gc_data = tank_rows[tank_rows["determinand"] == "gc"]
    gc_data = gc_data.rename(columns={"value": "gc"})
    gc_data = gc_data[["timestamp", "gc"]].sort_values("timestamp")

    tank_data = pd.DataFrame()

    for det in tank_rows["determinand"].unique():

        if det == "gc":
            continue

        det_data = tank_rows[tank_rows["determinand"] == det]
        det_data = det_data.rename(columns={"value": det})
        det_data = det_data[["timestamp", det]].sort_values("timestamp")

        # nearest reading of this determinand for every gc timestamp
        merged_data = pd.merge_asof(gc_data, det_data, on="timestamp", direction="nearest")
        merged_data["tank"] = tank

        # add the merged data to the per-tank frame
        if tank_data.empty:
            tank_data = merged_data
        else:
            tank_data = pd.merge(tank_data, merged_data, how="outer", on=["timestamp", "gc", "tank"])

    output_data = pd.concat([output_data, tank_data], axis=0, ignore_index=True)

output_data.to_csv('output_data.csv', index=False)

In [None]:
start_date = "2023-08-25"
end_date = "2023-09-25"

# rows inside the date window (both ends inclusive), temperature determinands only
in_window = sensor_df["timestamp"].between(start_date, end_date)
selected_data = sensor_df[in_window]
selected_data = selected_data[selected_data["determinand"].isin(["ts", "tl"])]

# format the timestamp data for excel
selected_data["timestamp"] = pd.to_datetime(selected_data["timestamp"]).dt.strftime("%Y-%m-%d %H:%M:%S")

# export the data to a xlsx file
selected_data.to_excel("selected_data.xlsx", index=False)