In [68]:
import pandas as pd

file = "data/EXP_1 V2.xlsx"

xls = pd.ExcelFile(file)
sheets = xls.sheet_names
sheets

['Seedlings measurements',
 'Water quality measurments Senso',
 'Water quality parametersPortabl',
 'Nutrients  Water consumptions',
 'Head diameter',
 'Harvest measurements 842024']

### Seedlings measurements

In [69]:
# Load the "Seedlings measurements" sheet without headers
df_seedlings = pd.read_excel(file, sheet_name="Seedlings measurements", header=None)
df_seedlings.shape

(12, 11)

In [70]:
df_seedlings.head(12)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,,EXP 1,,,,,,,,,
1,Date,plant NO.,plant height (cm),shoot length (cm),root length (cm),head diameter (cm),stem diameter (cm),total weight (g),shoot weight (g),root weight (g),no. of leaves
2,2024-09-03 00:00:00,1,9,2,7,4*2,0.1,1.3,0.5,0.8,5
3,,2,10,3,7,3.5*3,0.1,1.2,0.55,0.65,5
4,,3,10,3,7,3.5*2.5,0.1,1.1,0.53,0.57,5
5,,4,9,2.5,6.5,3*3,0.1,1.14,0.45,0.69,4
6,,5,10,3,7,3.5*2.5,0.1,1,0.4,0.6,5
7,,6,10,3,7,3.5*3,0.1,1.1,0.44,0.66,5
8,,7,9,2,7,2*3.5,0.1,1,0.4,0.6,5
9,,8,9,2,7,4*3.5,0.1,1.5,0.6,0.9,5


In [71]:
# Promote the header
df = df_seedlings.copy()
df.columns = df.iloc[1]
df = df.drop(index=[0, 1])

df.head()


1,Date,plant NO.,plant height (cm),shoot length (cm),root length (cm),head diameter (cm),stem diameter (cm),total weight (g),shoot weight (g),root weight (g),no. of leaves
2,2024-09-03 00:00:00,1,9,2.0,7.0,4*2,0.1,1.3,0.5,0.8,5
3,,2,10,3.0,7.0,3.5*3,0.1,1.2,0.55,0.65,5
4,,3,10,3.0,7.0,3.5*2.5,0.1,1.1,0.53,0.57,5
5,,4,9,2.5,6.5,3*3,0.1,1.14,0.45,0.69,4
6,,5,10,3.0,7.0,3.5*2.5,0.1,1.0,0.4,0.6,5


In [72]:
# Clean column names
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(r"\(.*?\)", "", regex=True)   # remove (cm)
    .str.replace("[^a-zA-Z0-9]+", "_", regex=True)
    .str.replace("_+", "_", regex=True)
    .str.strip("_")
)
df.head()


1,date,plant_no,plant_height,shoot_length,root_length,head_diameter,stem_diameter,total_weight,shoot_weight,root_weight,no_of_leaves
2,2024-09-03 00:00:00,1,9,2.0,7.0,4*2,0.1,1.3,0.5,0.8,5
3,,2,10,3.0,7.0,3.5*3,0.1,1.2,0.55,0.65,5
4,,3,10,3.0,7.0,3.5*2.5,0.1,1.1,0.53,0.57,5
5,,4,9,2.5,6.5,3*3,0.1,1.14,0.45,0.69,4
6,,5,10,3.0,7.0,3.5*2.5,0.1,1.0,0.4,0.6,5


In [73]:
# Fix the date column (fill downward)
df["date"] = df["date"].ffill()
df["date"] = pd.to_datetime(df["date"], errors="coerce")
df.head()



  df["date"] = df["date"].ffill()


1,date,plant_no,plant_height,shoot_length,root_length,head_diameter,stem_diameter,total_weight,shoot_weight,root_weight,no_of_leaves
2,2024-09-03,1,9,2.0,7.0,4*2,0.1,1.3,0.5,0.8,5
3,2024-09-03,2,10,3.0,7.0,3.5*3,0.1,1.2,0.55,0.65,5
4,2024-09-03,3,10,3.0,7.0,3.5*2.5,0.1,1.1,0.53,0.57,5
5,2024-09-03,4,9,2.5,6.5,3*3,0.1,1.14,0.45,0.69,4
6,2024-09-03,5,10,3.0,7.0,3.5*2.5,0.1,1.0,0.4,0.6,5


In [74]:
# Convert all numeric expressions into real numbers
import numpy as np

def clean_numeric(x):
    try:
        return eval(str(x))  # evaluate expressions safely
    except:
        return np.nan         # anything invalid → NaN

numeric_cols = df.columns.drop("date")
numeric_cols

for col in numeric_cols:
    df[col] = df[col].apply(clean_numeric)

for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors="coerce")

df.head()


1,date,plant_no,plant_height,shoot_length,root_length,head_diameter,stem_diameter,total_weight,shoot_weight,root_weight,no_of_leaves
2,2024-09-03,1,9.0,2.0,7.0,8.0,0.1,1.3,0.5,0.8,5
3,2024-09-03,2,10.0,3.0,7.0,10.5,0.1,1.2,0.55,0.65,5
4,2024-09-03,3,10.0,3.0,7.0,8.75,0.1,1.1,0.53,0.57,5
5,2024-09-03,4,9.0,2.5,6.5,9.0,0.1,1.14,0.45,0.69,4
6,2024-09-03,5,10.0,3.0,7.0,8.75,0.1,1.0,0.4,0.6,5


In [75]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 2 to 11
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           10 non-null     datetime64[ns]
 1   plant_no       10 non-null     int64         
 2   plant_height   10 non-null     float64       
 3   shoot_length   10 non-null     float64       
 4   root_length    10 non-null     float64       
 5   head_diameter  10 non-null     float64       
 6   stem_diameter  10 non-null     float64       
 7   total_weight   10 non-null     float64       
 8   shoot_weight   10 non-null     float64       
 9   root_weight    10 non-null     float64       
 10  no_of_leaves   10 non-null     int64         
dtypes: datetime64[ns](1), float64(8), int64(2)
memory usage: 1012.0 bytes


In [76]:
# Save the clean V1 dataset
df.to_csv("data/Seedlings_V1_clean.csv", index=False)

### Water quality measurments Sensor

In [77]:
df_sensor = pd.read_excel(file, sheet_name="Water quality measurments Senso", header=None)
df_sensor.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,,,Replicate 1 T1,,,,,,,,
1,Date,Tme,pH,EC,TDS,Water temp.,Date,Tme,RH %,Air temp. C,CO2 PPM
2,,,,,,,2024-03-10 00:00:00,,,,406.96
3,,,,,,,2024-03-10 00:00:00,10:00:00,49.08,24.18,406.55
4,,,,,,,2024-03-10 00:00:00,11:00:00,47.51,23.55,413.38
5,,,,,,,2024-03-10 00:00:00,12:00:00,46.8,23.81,412.5
6,,,,,,,2024-03-10 00:00:00,13:00:00,47.12,24.48,408.94
7,,,,,,,2024-03-10 00:00:00,14:00:00,47.64,24.74,410.58
8,,,,,,,2024-03-10 00:00:00,15:00:00,48.55,24.43,413.71
9,,,,,,,2024-03-10 00:00:00,16:00:00,49.41,25.27,427.72


In [78]:
for i in range(15):
    print(i, df_sensor.iloc[i].tolist())


0 [nan, nan, 'Replicate 1 T1', nan, nan, nan, nan, nan, nan, nan, nan]
1 ['Date ', 'Tme ', 'pH', 'EC ', 'TDS ', 'Water temp. ', 'Date ', 'Tme ', 'RH %', 'Air temp. C', 'CO2 PPM']
2 [' ', nan, nan, nan, nan, nan, datetime.datetime(2024, 3, 10, 0, 0), nan, nan, nan, 406.96]
3 [nan, nan, nan, nan, nan, nan, datetime.datetime(2024, 3, 10, 0, 0), datetime.time(10, 0), 49.08, 24.18, 406.55]
4 [nan, nan, nan, nan, nan, nan, datetime.datetime(2024, 3, 10, 0, 0), datetime.time(11, 0), 47.51, 23.55, 413.38]
5 [nan, nan, nan, nan, nan, nan, datetime.datetime(2024, 3, 10, 0, 0), datetime.time(12, 0), 46.8, 23.81, 412.5]
6 [nan, nan, nan, nan, nan, nan, datetime.datetime(2024, 3, 10, 0, 0), datetime.time(13, 0), 47.12, 24.48, 408.94]
7 [nan, nan, nan, nan, nan, nan, datetime.datetime(2024, 3, 10, 0, 0), datetime.time(14, 0), 47.64, 24.74, 410.58]
8 [nan, nan, nan, nan, nan, nan, datetime.datetime(2024, 3, 10, 0, 0), datetime.time(15, 0), 48.55, 24.43, 413.71]
9 [nan, nan, nan, nan, nan, nan, dateti

In [79]:
# Promote the header
df = df_sensor.copy()
df.columns = df.iloc[1]
df = df.drop(index=[0, 1])

df.head(10)

1,Date,Tme,pH,EC,TDS,Water temp.,Date.1,Tme.1,RH %,Air temp. C,CO2 PPM
2,,,,,,,2024-03-10 00:00:00,,,,406.96
3,,,,,,,2024-03-10 00:00:00,10:00:00,49.08,24.18,406.55
4,,,,,,,2024-03-10 00:00:00,11:00:00,47.51,23.55,413.38
5,,,,,,,2024-03-10 00:00:00,12:00:00,46.8,23.81,412.5
6,,,,,,,2024-03-10 00:00:00,13:00:00,47.12,24.48,408.94
7,,,,,,,2024-03-10 00:00:00,14:00:00,47.64,24.74,410.58
8,,,,,,,2024-03-10 00:00:00,15:00:00,48.55,24.43,413.71
9,,,,,,,2024-03-10 00:00:00,16:00:00,49.41,25.27,427.72
10,,,,,,,2024-03-10 00:00:00,17:00:00,50.27,20.93,440.55
11,,,,,,,2024-03-10 00:00:00,18:00:00,51.56,16.91,443.4


In [80]:
# Clean column names
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace(r"\(.*?\)", "", regex=True)   # remove (cm)
    .str.replace("[^a-zA-Z0-9]+", "_", regex=True)
    .str.replace("_+", "_", regex=True)
    .str.strip("_")
)
df.head(10)

1,date,tme,ph,ec,tds,water_temp,date.1,tme.1,rh,air_temp_c,co2_ppm
2,,,,,,,2024-03-10 00:00:00,,,,406.96
3,,,,,,,2024-03-10 00:00:00,10:00:00,49.08,24.18,406.55
4,,,,,,,2024-03-10 00:00:00,11:00:00,47.51,23.55,413.38
5,,,,,,,2024-03-10 00:00:00,12:00:00,46.8,23.81,412.5
6,,,,,,,2024-03-10 00:00:00,13:00:00,47.12,24.48,408.94
7,,,,,,,2024-03-10 00:00:00,14:00:00,47.64,24.74,410.58
8,,,,,,,2024-03-10 00:00:00,15:00:00,48.55,24.43,413.71
9,,,,,,,2024-03-10 00:00:00,16:00:00,49.41,25.27,427.72
10,,,,,,,2024-03-10 00:00:00,17:00:00,50.27,20.93,440.55
11,,,,,,,2024-03-10 00:00:00,18:00:00,51.56,16.91,443.4


In [81]:
# Check for non-NaN counts for the left block
df[['date', 'tme', 'ph', 'ec', 'tds', 'water_temp']].notna().sum()

1
date            1
date          640
tme             0
tme           640
ph              0
ec              0
tds             0
water_temp      0
dtype: int64

In [82]:
# Check the column names
df.columns

Index(['date', 'tme', 'ph', 'ec', 'tds', 'water_temp', 'date', 'tme', 'rh',
       'air_temp_c', 'co2_ppm'],
      dtype='object', name=1)

In [83]:
# Rename the column
df.columns = [
    'date_water', 'time_water', 'ph', 'ec', 'tds', 'water_temp',
    'date_air', 'time_air', 'rh', 'air_temp_c', 'co2_ppm'
]

In [84]:
# Drop the empty left block
df = df.drop(columns=['date_water', 'time_water', 'ph', 'ec', 'tds', 'water_temp'])

In [85]:
df.head(700)

Unnamed: 0,date_air,time_air,rh,air_temp_c,co2_ppm
2,2024-03-10 00:00:00,,,,406.96
3,2024-03-10 00:00:00,10:00:00,49.08,24.18,406.55
4,2024-03-10 00:00:00,11:00:00,47.51,23.55,413.38
5,2024-03-10 00:00:00,12:00:00,46.8,23.81,412.5
6,2024-03-10 00:00:00,13:00:00,47.12,24.48,408.94
...,...,...,...,...,...
638,2024-04-08 00:00:00,09:00:00,55.71,24.31,432.27
639,2024-04-08 00:00:00,10:00:00,53.11,25.06,424.63
640,2024-04-08 00:00:00,11:00:00,51.32,25.55,419.67
641,2024-04-08 00:00:00,12:00:00,50.94,25.43,420.01


In [86]:
df['datetime'] = pd.to_datetime(
    df['date_air'].astype(str) + " " + df['time_air'].astype(str),
    errors='coerce'
)
df[['datetime', 'rh', 'air_temp_c', 'co2_ppm']].head()


  df['datetime'] = pd.to_datetime(


Unnamed: 0,datetime,rh,air_temp_c,co2_ppm
2,NaT,,,406.96
3,2024-03-10 10:00:00,49.08,24.18,406.55
4,2024-03-10 11:00:00,47.51,23.55,413.38
5,2024-03-10 12:00:00,46.8,23.81,412.5
6,2024-03-10 13:00:00,47.12,24.48,408.94


In [87]:
df = df.drop(columns=['date_air', 'time_air'])
df = df.dropna(subset=['rh'])

In [88]:
df.head(700)

Unnamed: 0,rh,air_temp_c,co2_ppm,datetime
3,49.08,24.18,406.55,2024-03-10 10:00:00
4,47.51,23.55,413.38,2024-03-10 11:00:00
5,46.8,23.81,412.5,2024-03-10 12:00:00
6,47.12,24.48,408.94,2024-03-10 13:00:00
7,47.64,24.74,410.58,2024-03-10 14:00:00
...,...,...,...,...
638,55.71,24.31,432.27,2024-04-08 09:00:00
639,53.11,25.06,424.63,2024-04-08 10:00:00
640,51.32,25.55,419.67,2024-04-08 11:00:00
641,50.94,25.43,420.01,2024-04-08 12:00:00


In [89]:
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values('datetime').reset_index(drop=True)
df = df.drop_duplicates(subset='datetime', keep='first')
df = df.dropna(subset=['datetime'])
df.head(700)

Unnamed: 0,rh,air_temp_c,co2_ppm,datetime
0,49.08,24.18,406.55,2024-03-10 10:00:00
1,47.51,23.55,413.38,2024-03-10 11:00:00
2,46.8,23.81,412.5,2024-03-10 12:00:00
3,47.12,24.48,408.94,2024-03-10 13:00:00
4,47.64,24.74,410.58,2024-03-10 14:00:00
...,...,...,...,...
634,55.71,24.31,432.27,2024-04-08 09:00:00
635,53.11,25.06,424.63,2024-04-08 10:00:00
636,51.32,25.55,419.67,2024-04-08 11:00:00
637,50.94,25.43,420.01,2024-04-08 12:00:00


In [90]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 636 entries, 0 to 638
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   rh          636 non-null    object        
 1   air_temp_c  636 non-null    object        
 2   co2_ppm     636 non-null    object        
 3   datetime    636 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(3)
memory usage: 24.8+ KB


In [91]:
df['rh'] = pd.to_numeric(df['rh'], errors='coerce')
df['air_temp_c'] = pd.to_numeric(df['air_temp_c'], errors='coerce')
df['co2_ppm'] = pd.to_numeric(df['co2_ppm'], errors='coerce')
df.isna().sum()


rh            0
air_temp_c    0
co2_ppm       0
datetime      0
dtype: int64

In [92]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 636 entries, 0 to 638
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   rh          636 non-null    float64       
 1   air_temp_c  636 non-null    float64       
 2   co2_ppm     636 non-null    float64       
 3   datetime    636 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(3)
memory usage: 24.8 KB


Unnamed: 0,rh,air_temp_c,co2_ppm,datetime
0,49.08,24.18,406.55,2024-03-10 10:00:00
1,47.51,23.55,413.38,2024-03-10 11:00:00
2,46.8,23.81,412.5,2024-03-10 12:00:00
3,47.12,24.48,408.94,2024-03-10 13:00:00
4,47.64,24.74,410.58,2024-03-10 14:00:00


In [93]:
# Save the clean V1 dataset
df.to_csv("data/Sensor_V1_clean.csv", index=False)

### Water quality parameters Portable

In [94]:
df_portable = pd.read_excel(file, sheet_name="Water quality parametersPortabl", header=None)
df_portable

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,Date,10:00:00,,,,,,,,,...,,,,,,,,Air parameters,,
1,,Replicate 1 T1,,,,Replicate 2 T1,,,,Replicate 3 T1,...,,,,Replicate 3 T2,,,,Lux,RH%,Air temp.
2,,pH,EC,TDS,Water temp.,pH,EC,TDS,Water temp.,pH,...,EC,TDS,Water temp.,pH,EC,TDS,Water temp.,,,
3,2024-10-03 00:00:00,-,-,-,-,-,-,-,-,-,...,1.56,0.77,22.9,6.9,1.56,0.76,23,48300,60.8,22.1
4,2024-11-03 00:00:00,5.3,2,1,19.3,5.8,2.02,1.01,19.1,5.6,...,2.02,1.01,22.3,5.6,2.16,1.08,22.9,61270,59.5,21.8
5,2024-12-03 00:00:00,5.7,1.95,0.975,22,5.8,2.02,1.01,21.2,5.7,...,2.02,1.01,25.5,6.3,1.86,0.93,26,45190,55,26.7
6,13/3/2024,5.9,1.97,0.985,22,6,2.02,1.01,22,5.8,...,2.01,1.005,25.4,6.5,1.87,0.935,26,,,
7,14/3/2024,6,1.92,0.96,26.6,6,2,1,25.1,5.8,...,2.02,1.01,25.6,6.4,1.82,0.91,28,,,
8,15/3/2024,5.8,1.94,0.97,24.3,5.9,2.03,1.015,24.9,5.7,...,-,-,-,-,-,-,-,,,
9,16/3/2024,5.7,1.94,0.97,23,5.9,2.01,1.005,24.1,5.7,...,2.02,1.01,23.4,6,1.92,0.96,24.7,,,


This dataset appears to be really messy. I want the end result to turn out like this:
DateTime, pH, EC, TDS, WaterTemp, Lux, RH, AirTemp

In [95]:
# First 3 rows are header info
header_rows = df_portable.iloc[0:3].copy()

# Recreate header_rows and data just to be safe
header_rows = df_portable.iloc[0:3].copy()
data = df_portable.iloc[3:].copy().reset_index(drop=True)

# 🔥 NEW: forward-fill header labels across columns
header_filled = header_rows.ffill(axis=1)

header_filled


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,Date,10:00:00,10:00:00,10:00:00,10:00:00,10:00:00,10:00:00,10:00:00,10:00:00,10:00:00,...,14:00:00,14:00:00,14:00:00,14:00:00,14:00:00,14:00:00,14:00:00,Air parameters,Air parameters,Air parameters
1,,Replicate 1 T1,Replicate 1 T1,Replicate 1 T1,Replicate 1 T1,Replicate 2 T1,Replicate 2 T1,Replicate 2 T1,Replicate 2 T1,Replicate 3 T1,...,Replicate 2 T2,Replicate 2 T2,Replicate 2 T2,Replicate 3 T2,Replicate 3 T2,Replicate 3 T2,Replicate 3 T2,Lux,RH%,Air temp.
2,,pH,EC,TDS,Water temp.,pH,EC,TDS,Water temp.,pH,...,EC,TDS,Water temp.,pH,EC,TDS,Water temp.,Water temp.,Water temp.,Water temp.


In [96]:
new_cols = []

for col_idx in df_portable.columns:
    h0 = header_filled.iloc[0, col_idx]
    h1 = header_filled.iloc[1, col_idx]
    h2 = header_filled.iloc[2, col_idx]

    if col_idx == 0:
        # First column is Date
        new_cols.append("Date")
    else:
        parts = []
        for x in [h0, h1, h2]:
            if pd.notna(x):
                parts.append(str(x).strip())
        if parts:
            new_cols.append("_".join(parts))
        else:
            new_cols.append(f"col_{col_idx}")  # spacer columns with no header

data.columns = new_cols

data.head()


Unnamed: 0,Date,10:00:00_Replicate 1 T1_pH,10:00:00_Replicate 1 T1_EC,10:00:00_Replicate 1 T1_TDS,10:00:00_Replicate 1 T1_Water temp.,10:00:00_Replicate 2 T1_pH,10:00:00_Replicate 2 T1_EC,10:00:00_Replicate 2 T1_TDS,10:00:00_Replicate 2 T1_Water temp.,10:00:00_Replicate 3 T1_pH,...,14:00:00_Replicate 2 T2_EC,14:00:00_Replicate 2 T2_TDS,14:00:00_Replicate 2 T2_Water temp.,14:00:00_Replicate 3 T2_pH,14:00:00_Replicate 3 T2_EC,14:00:00_Replicate 3 T2_TDS,14:00:00_Replicate 3 T2_Water temp.,Air parameters_Lux_Water temp.,Air parameters_RH%_Water temp.,Air parameters_Air temp._Water temp.
0,2024-10-03 00:00:00,-,-,-,-,-,-,-,-,-,...,1.56,0.77,22.9,6.9,1.56,0.76,23.0,48300.0,60.8,22.1
1,2024-11-03 00:00:00,5.3,2,1,19.3,5.8,2.02,1.01,19.1,5.6,...,2.02,1.01,22.3,5.6,2.16,1.08,22.9,61270.0,59.5,21.8
2,2024-12-03 00:00:00,5.7,1.95,0.975,22,5.8,2.02,1.01,21.2,5.7,...,2.02,1.01,25.5,6.3,1.86,0.93,26.0,45190.0,55.0,26.7
3,13/3/2024,5.9,1.97,0.985,22,6,2.02,1.01,22,5.8,...,2.01,1.005,25.4,6.5,1.87,0.935,26.0,,,
4,14/3/2024,6,1.92,0.96,26.6,6,2,1,25.1,5.8,...,2.02,1.01,25.6,6.4,1.82,0.91,28.0,,,


In [97]:
data["Date"] = pd.to_datetime(data["Date"], errors="coerce")
data.head()


Unnamed: 0,Date,10:00:00_Replicate 1 T1_pH,10:00:00_Replicate 1 T1_EC,10:00:00_Replicate 1 T1_TDS,10:00:00_Replicate 1 T1_Water temp.,10:00:00_Replicate 2 T1_pH,10:00:00_Replicate 2 T1_EC,10:00:00_Replicate 2 T1_TDS,10:00:00_Replicate 2 T1_Water temp.,10:00:00_Replicate 3 T1_pH,...,14:00:00_Replicate 2 T2_EC,14:00:00_Replicate 2 T2_TDS,14:00:00_Replicate 2 T2_Water temp.,14:00:00_Replicate 3 T2_pH,14:00:00_Replicate 3 T2_EC,14:00:00_Replicate 3 T2_TDS,14:00:00_Replicate 3 T2_Water temp.,Air parameters_Lux_Water temp.,Air parameters_RH%_Water temp.,Air parameters_Air temp._Water temp.
0,2024-10-03,-,-,-,-,-,-,-,-,-,...,1.56,0.77,22.9,6.9,1.56,0.76,23.0,48300.0,60.8,22.1
1,2024-11-03,5.3,2,1,19.3,5.8,2.02,1.01,19.1,5.6,...,2.02,1.01,22.3,5.6,2.16,1.08,22.9,61270.0,59.5,21.8
2,2024-12-03,5.7,1.95,0.975,22,5.8,2.02,1.01,21.2,5.7,...,2.02,1.01,25.5,6.3,1.86,0.93,26.0,45190.0,55.0,26.7
3,2024-03-13,5.9,1.97,0.985,22,6,2.02,1.01,22,5.8,...,2.01,1.005,25.4,6.5,1.87,0.935,26.0,,,
4,2024-03-14,6,1.92,0.96,26.6,6,2,1,25.1,5.8,...,2.02,1.01,25.6,6.4,1.82,0.91,28.0,,,


In [98]:
if data.columns[0] == "":
    data = data.drop(columns=[data.columns[0]])


In [99]:
clean_cols = {}

for col in data.columns:
    if col.startswith("Air parameters"):
        new_name = col.replace("_Water temp.", "").replace("_Water temp", "")
        clean_cols[col] = new_name

data = data.rename(columns=clean_cols)


In [100]:
[c for c in data.columns if c.startswith("Air parameters")]


['Air parameters_Lux',
 'Air parameters_RH%',
 'Air parameters_Air temp.',
 'Air parameters_Lux',
 'Air parameters_RH%',
 'Air parameters_Air temp.']

In [101]:
for col in data.columns:
    print(col)


Date
10:00:00_Replicate 1 T1_pH
10:00:00_Replicate 1 T1_EC
10:00:00_Replicate 1 T1_TDS
10:00:00_Replicate 1 T1_Water temp.
10:00:00_Replicate 2 T1_pH
10:00:00_Replicate 2 T1_EC
10:00:00_Replicate 2 T1_TDS
10:00:00_Replicate 2 T1_Water temp.
10:00:00_Replicate 3 T1_pH
10:00:00_Replicate 3 T1_EC
10:00:00_Replicate 3 T1_TDS
10:00:00_Replicate 3 T1_Water temp.
10:00:00_Replicate 1 T2_pH
10:00:00_Replicate 1 T2_EC
10:00:00_Replicate 1 T2_TDS
10:00:00_Replicate 1 T2_Water temp.
10:00:00_Replicate 2 T2_pH
10:00:00_Replicate 2 T2_EC
10:00:00_Replicate 2 T2_TDS
10:00:00_Replicate 2 T2_Water temp.
10:00:00_Replicate 3 T2_pH
10:00:00_Replicate 3 T2_EC
10:00:00_Replicate 3 T2_TDS
10:00:00_Replicate 3 T2_Water temp.
Air parameters_Lux
Air parameters_RH%
Air parameters_Air temp.
14:00:00_Replicate 1 T1_pH
14:00:00_Replicate 1 T1_EC
14:00:00_Replicate 1 T1_TDS
14:00:00_Replicate 1 T1_Water temp.
14:00:00_Replicate 2 T1_pH
14:00:00_Replicate 2 T1_EC
14:00:00_Replicate 2 T1_TDS
14:00:00_Replicate 2 T1_

In [102]:
cols = list(data.columns)
len(cols)


55

In [103]:
new_cols = []

# 0 — Date
new_cols.append("Date")

# 1–24 — keep original names (10 AM replicates)
new_cols.extend(cols[1:25])

# 25–27 — rename to Air10
new_cols.extend(["Air10_Lux", "Air10_RH", "Air10_Temp"])

# 28–51 — keep original (14:00 water replicates)
new_cols.extend(cols[28:52])

# 52–54 — rename to Air14
new_cols.extend(["Air14_Lux", "Air14_RH", "Air14_Temp"])


In [104]:
data.columns = new_cols


In [105]:
[data.columns[i] for i in [25, 26, 27, 52, 53, 54]]


['Air10_Lux', 'Air10_RH', 'Air10_Temp', 'Air14_Lux', 'Air14_RH', 'Air14_Temp']

Aggregation stage

In [106]:
def avg_param(df, time_prefix, param):
    """
    time_prefix: '10:00:00' or '14:00:00'
    param: 'pH', 'EC', 'TDS', 'Water temp.'
    """
    cols = [c for c in df.columns
            if c.startswith(time_prefix) and c.endswith(param)]

    if len(cols) == 0:
        return pd.Series([np.nan] * len(df))

    return df[cols].apply(pd.to_numeric, errors="coerce").mean(axis=1)


In [107]:
df_10 = pd.DataFrame({
    "Date": data["Date"],
    "pH": avg_param(data, "10:00:00", "pH"),
    "EC": avg_param(data, "10:00:00", "EC"),
    "TDS": avg_param(data, "10:00:00", "TDS"),
    "WaterTemp": avg_param(data, "10:00:00", "Water temp."),
    "Lux": pd.to_numeric(data["Air10_Lux"], errors="coerce"),
    "RH": pd.to_numeric(data["Air10_RH"], errors="coerce"),
    "AirTemp": pd.to_numeric(data["Air10_Temp"], errors="coerce"),
})

# Remove blank / footer rows
df_10 = df_10.dropna(subset=["Date"]).reset_index(drop=True)

# Build DateTime safely
df_10["DateTime"] = pd.to_datetime(df_10["Date"].dt.strftime("%Y-%m-%d") + " 10:00:00")


In [108]:
df_14 = pd.DataFrame({
    "Date": data["Date"],
    "pH": avg_param(data, "14:00:00", "pH"),
    "EC": avg_param(data, "14:00:00", "EC"),
    "TDS": avg_param(data, "14:00:00", "TDS"),
    "WaterTemp": avg_param(data, "14:00:00", "Water temp."),
    "Lux": pd.to_numeric(data["Air14_Lux"], errors="coerce"),
    "RH": pd.to_numeric(data["Air14_RH"], errors="coerce"),
    "AirTemp": pd.to_numeric(data["Air14_Temp"], errors="coerce"),
})

# Remove blank / footer rows
df_14 = df_14.dropna(subset=["Date"]).reset_index(drop=True)

# Build DateTime
df_14["DateTime"] = pd.to_datetime(df_14["Date"].dt.strftime("%Y-%m-%d") + " 14:00:00")

df_14.head()


Unnamed: 0,Date,pH,EC,TDS,WaterTemp,Lux,RH,AirTemp,DateTime
0,2024-10-03,6.9,1.571667,0.778333,22.65,48300.0,60.8,22.1,2024-10-03 14:00:00
1,2024-11-03,5.583333,2.025,1.0125,22.666667,61270.0,59.5,21.8,2024-11-03 14:00:00
2,2024-12-03,5.8,1.97,0.985,25.716667,45190.0,55.0,26.7,2024-12-03 14:00:00
3,2024-03-13,5.9,1.976667,0.988333,25.016667,,,,2024-03-13 14:00:00
4,2024-03-14,5.966667,1.951667,0.975833,25.9,,,,2024-03-14 14:00:00


In [109]:
df_final = pd.concat([df_10, df_14], ignore_index=True)

df_final = df_final[
    ["DateTime", "pH", "EC", "TDS", "WaterTemp", "Lux", "RH", "AirTemp"]
].sort_values("DateTime").reset_index(drop=True)

df_final


Unnamed: 0,DateTime,pH,EC,TDS,WaterTemp,Lux,RH,AirTemp
0,2024-01-04 10:00:00,6.6,1.55,0.775,26.416667,,,
1,2024-01-04 14:00:00,6.916667,1.211667,0.605833,27.283333,,,
2,2024-02-04 10:00:00,6.383333,1.718333,0.859167,26.366667,,,
3,2024-02-04 14:00:00,6.366667,1.711667,0.855833,27.933333,,,
4,2024-03-04 10:00:00,6.2,1.648333,0.824167,25.766667,,,
5,2024-03-04 14:00:00,6.333333,1.648333,,27.383333,65870.0,61.7,26.3
6,2024-03-13 10:00:00,5.933333,1.978333,0.989167,22.233333,,,
7,2024-03-13 14:00:00,5.9,1.976667,0.988333,25.016667,,,
8,2024-03-14 10:00:00,5.933333,1.946667,0.973333,24.966667,,,
9,2024-03-14 14:00:00,5.966667,1.951667,0.975833,25.9,,,


In [110]:
df_final[['Lux','RH','AirTemp']].isna().sum()


Lux        41
RH         41
AirTemp    41
dtype: int64

handle null values

In [111]:
df_final["Date"] = df_final["DateTime"].dt.date


In [112]:
def fill_air_pair(group):
    for col in ["Lux", "RH", "AirTemp"]:
        morning_val = group.loc[group["DateTime"].dt.hour == 10, col].values
        evening_val = group.loc[group["DateTime"].dt.hour == 14, col].values

        # unpack values
        m = morning_val[0] if len(morning_val) else np.nan
        e = evening_val[0] if len(evening_val) else np.nan

        # apply your rule
        if pd.isna(m) and not pd.isna(e):
            group.loc[group["DateTime"].dt.hour == 10, col] = e
        if pd.isna(e) and not pd.isna(m):
            group.loc[group["DateTime"].dt.hour == 14, col] = m

        # if both NaN → do nothing
    return group


In [113]:
df_filled = df_final.groupby("Date", group_keys=False).apply(fill_air_pair)


  df_filled = df_final.groupby("Date", group_keys=False).apply(fill_air_pair)


In [114]:
df_filled = df_filled.sort_values("DateTime").reset_index(drop=True)


In [115]:
df_filled

Unnamed: 0,DateTime,pH,EC,TDS,WaterTemp,Lux,RH,AirTemp,Date
0,2024-01-04 10:00:00,6.6,1.55,0.775,26.416667,,,,2024-01-04
1,2024-01-04 14:00:00,6.916667,1.211667,0.605833,27.283333,,,,2024-01-04
2,2024-02-04 10:00:00,6.383333,1.718333,0.859167,26.366667,,,,2024-02-04
3,2024-02-04 14:00:00,6.366667,1.711667,0.855833,27.933333,,,,2024-02-04
4,2024-03-04 10:00:00,6.2,1.648333,0.824167,25.766667,65870.0,61.7,26.3,2024-03-04
5,2024-03-04 14:00:00,6.333333,1.648333,,27.383333,65870.0,61.7,26.3,2024-03-04
6,2024-03-13 10:00:00,5.933333,1.978333,0.989167,22.233333,,,,2024-03-13
7,2024-03-13 14:00:00,5.9,1.976667,0.988333,25.016667,,,,2024-03-13
8,2024-03-14 10:00:00,5.933333,1.946667,0.973333,24.966667,,,,2024-03-14
9,2024-03-14 14:00:00,5.966667,1.951667,0.975833,25.9,,,,2024-03-14


In [116]:
# Group columns
water_cols = ["pH", "EC", "TDS", "WaterTemp"]
air_cols = ["Lux", "RH", "AirTemp"]


In [117]:
mask_all_nan = df_filled[water_cols + air_cols].isna().all(axis=1)
df_filled[mask_all_nan]


Unnamed: 0,DateTime,pH,EC,TDS,WaterTemp,Lux,RH,AirTemp,Date
11,2024-03-15 14:00:00,,,,,,,,2024-03-15
26,2024-03-23 10:00:00,,,,,,,,2024-03-23
38,2024-03-29 10:00:00,,,,,,,,2024-03-29
46,2024-05-04 10:00:00,,,,,,,,2024-05-04
47,2024-05-04 14:00:00,,,,,,,,2024-05-04
48,2024-06-04 10:00:00,,,,,,,,2024-06-04
49,2024-06-04 14:00:00,,,,,,,,2024-06-04
50,2024-07-04 10:00:00,,,,,,,,2024-07-04
51,2024-07-04 14:00:00,,,,,,,,2024-07-04
52,2024-08-04 10:00:00,,,,,,,,2024-08-04


In [118]:
df_clean = df_filled[~mask_all_nan].reset_index(drop=True)
df_clean

Unnamed: 0,DateTime,pH,EC,TDS,WaterTemp,Lux,RH,AirTemp,Date
0,2024-01-04 10:00:00,6.6,1.55,0.775,26.416667,,,,2024-01-04
1,2024-01-04 14:00:00,6.916667,1.211667,0.605833,27.283333,,,,2024-01-04
2,2024-02-04 10:00:00,6.383333,1.718333,0.859167,26.366667,,,,2024-02-04
3,2024-02-04 14:00:00,6.366667,1.711667,0.855833,27.933333,,,,2024-02-04
4,2024-03-04 10:00:00,6.2,1.648333,0.824167,25.766667,65870.0,61.7,26.3,2024-03-04
5,2024-03-04 14:00:00,6.333333,1.648333,,27.383333,65870.0,61.7,26.3,2024-03-04
6,2024-03-13 10:00:00,5.933333,1.978333,0.989167,22.233333,,,,2024-03-13
7,2024-03-13 14:00:00,5.9,1.976667,0.988333,25.016667,,,,2024-03-13
8,2024-03-14 10:00:00,5.933333,1.946667,0.973333,24.966667,,,,2024-03-14
9,2024-03-14 14:00:00,5.966667,1.951667,0.975833,25.9,,,,2024-03-14


In [119]:
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   DateTime   49 non-null     datetime64[ns]
 1   pH         38 non-null     float64       
 2   EC         38 non-null     float64       
 3   TDS        41 non-null     float64       
 4   WaterTemp  38 non-null     float64       
 5   Lux        32 non-null     float64       
 6   RH         32 non-null     float64       
 7   AirTemp    32 non-null     float64       
 8   Date       49 non-null     object        
dtypes: datetime64[ns](1), float64(7), object(1)
memory usage: 3.6+ KB


In [120]:
df_clean = df_clean.drop(columns=["Date"])

In [121]:
# Save as CSV
df_clean.to_csv("data/Portable_V1_clean.csv", index=False)