# Step 1: Data Collection

In [1]:
# import library
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [2]:
# Soil Temperature data in Kaipara
st = pd.read_csv('../Data/soil data/Kaipara Heads-644211-20240909095124/KaiparaSoilTempContinuous@644211-20240909095119.csv',  parse_dates=['Timestamp (UTC+12:00)'])
st.head(5)

# smheadtail = (np.r_[0:5, -5:0])
stheadtail = st.iloc[np.r_[0:5, -5:0]]
print(stheadtail)

       Timestamp (UTC+12:00)  SoilTemp(c)
0        2014-07-08 00:30:00       13.400
1        2014-07-08 00:45:00       13.300
2        2014-07-08 01:00:00       13.300
3        2014-07-08 01:15:00       13.300
4        2014-07-08 01:30:00       13.300
356053   2024-09-09 20:45:00       14.675
356054   2024-09-09 21:00:00       14.682
356055   2024-09-09 21:15:00       14.608
356056   2024-09-09 21:30:00       14.608
356057   2024-09-09 21:45:00       14.600


In [3]:
# Rainfall data in Kaipara
rain = pd.read_csv('../Data/soil data/Kaipara Heads-644211-20240909095124/KaiparaRainfall.Continuous@644211-20240909095101.csv',  parse_dates=['Timestamp (UTC+12:00)'])
rain.head(5)

# smheadtail = (np.r_[0:5, -5:0])
rainheadtail = rain.iloc[np.r_[0:5, -5:0]]
print(rainheadtail)

       Timestamp (UTC+12:00)  Rain(mm)
0        1999-03-05 16:19:00      0.49
1        1999-03-05 16:41:00      0.48
2        1999-03-05 16:51:00      0.49
3        1999-03-05 16:58:00      0.49
4        1999-03-05 17:08:00      0.49
209737   2024-09-09 20:45:00      0.00
209738   2024-09-09 21:00:00      0.00
209739   2024-09-09 21:15:00      0.00
209740   2024-09-09 21:30:00      0.00
209741   2024-09-09 21:45:00      0.00


In [4]:
# Soil Moisture data in Kaipara
sm = pd.read_csv('../Data/soil data/Kaipara Heads-644211-20240909095124/KaiparaSoilMoistureContinuous@644211-20240909095109.csv',  parse_dates=['Timestamp (UTC+12:00)'])
sm.head(5)

# smheadtail = (np.r_[0:5, -5:0])
smheadtail = sm.iloc[np.r_[0:5, -5:0]]
print(smheadtail)

       Timestamp (UTC+12:00)  SoilMoisture(%)
0        2014-06-05 14:00:00           34.530
1        2014-06-05 14:15:00           34.489
2        2014-06-05 14:30:00           34.487
3        2014-06-05 14:45:00           34.489
4        2014-06-05 15:00:00           34.492
331610   2024-09-09 20:45:00           40.810
331611   2024-09-09 21:00:00           40.717
331612   2024-09-09 21:15:00           40.717
331613   2024-09-09 21:30:00           40.814
331614   2024-09-09 21:45:00           40.817


Data Description

In [5]:
# Display info to check datatype
print("Kaipara Rainfall Data Info:")
rain.info()
print("\nKaipara Soil Moisture Data Info:")
sm.info()
print("\nnKaipara Soil Temperature Data Info:")
st.info()

Kaipara Rainfall Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209742 entries, 0 to 209741
Data columns (total 2 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Timestamp (UTC+12:00)  209742 non-null  datetime64[ns]
 1   Rain(mm)               209740 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 3.2 MB

Kaipara Soil Moisture Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331615 entries, 0 to 331614
Data columns (total 2 columns):
 #   Column                 Non-Null Count   Dtype         
---  ------                 --------------   -----         
 0   Timestamp (UTC+12:00)  331615 non-null  datetime64[ns]
 1   SoilMoisture(%)        331549 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 5.1 MB

nKaipara Soil Temperature Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 356058 entries, 0 to 356057
Data

In [6]:
print("Kaipara Data statistics :")

print('Rain(mm)', rain['Rain(mm)'].describe())
print('SoilTemp(c)', st['SoilTemp(c)'].describe())
print('SoilMoisture(%)', sm['SoilMoisture(%)'].describe())

Kaipara Data statistics :
Rain(mm) count    209740.000000
mean          0.129901
std           0.220047
min           0.000000
25%           0.000000
50%           0.000000
75%           0.450000
max           3.340000
Name: Rain(mm), dtype: float64
SoilTemp(c) count    355966.000000
mean         17.811233
std           4.731248
min        -400.000000
25%          14.100000
50%          17.400000
75%          21.300000
max          36.800000
Name: SoilTemp(c), dtype: float64
SoilMoisture(%) count    331549.000000
mean         36.089983
std           3.833247
min           0.000000
25%          33.175000
50%          37.283000
75%          39.031000
max          47.686000
Name: SoilMoisture(%), dtype: float64


In [7]:
# Identify potential outliers for Soil Temperature
# First, check for the unreasonable -400 value
abnormal_temp = st[st['SoilTemp(c)'] < -50]  # Adjust threshold as needed

# Calculate Q1 and Q3
Q1_temp = st['SoilTemp(c)'].quantile(0.25)
Q3_temp = st['SoilTemp(c)'].quantile(0.75)
IQR_temp = Q3_temp - Q1_temp

# Define bounds
lower_bound_temp = Q1_temp - 1.5 * IQR_temp
upper_bound_temp = Q3_temp + 1.5 * IQR_temp

# Identify outliers
temp_outliers = st[(st['SoilTemp(c)'] < lower_bound_temp) | (st['SoilTemp(c)'] > upper_bound_temp)]
print(temp_outliers)

       Timestamp (UTC+12:00)  SoilTemp(c)
90112    2017-01-31 16:00:00         32.2
90113    2017-01-31 16:15:00         32.2
90685    2017-02-06 15:15:00         32.3
90686    2017-02-06 15:30:00         32.4
90687    2017-02-06 15:45:00         32.5
...                      ...          ...
351313   2024-07-22 12:15:00       -400.0
351314   2024-07-22 12:30:00       -400.0
351315   2024-07-22 12:45:00       -400.0
351316   2024-07-22 13:00:00       -400.0
351319   2024-07-22 13:29:00          0.0

[132 rows x 2 columns]


Data Cleaning


In [8]:
# Display missing value
print("'Missing Value in Soil Moisture Data")
sm.isna().sum()
print(sm[sm['SoilMoisture(%)'].isnull()])

'Missing Value in Soil Moisture Data
       Timestamp (UTC+12:00)  SoilMoisture(%)
123351   2017-12-11 10:08:00              NaN
123352   2018-01-05 04:59:00              NaN
124586   2018-01-18 01:00:00              NaN
124587   2018-01-18 01:29:00              NaN
126462   2018-02-06 13:45:00              NaN
...                      ...              ...
306325   2023-12-21 11:44:00              NaN
326868   2024-07-22 11:00:00              NaN
326869   2024-07-22 11:59:00              NaN
326875   2024-07-22 13:00:00              NaN
326876   2024-07-22 13:29:00              NaN

[66 rows x 2 columns]


In [9]:
# Display missing value

print("Missing Value in Soil Temperature Data")
st.isna().sum()

print(st[st['SoilTemp(c)'].isnull()])

Missing Value in Soil Temperature Data
       Timestamp (UTC+12:00)  SoilTemp(c)
122249   2018-01-01 09:00:00          NaN
122250   2018-01-01 11:59:00          NaN
149992   2018-10-17 10:00:00          NaN
149993   2018-10-17 13:14:00          NaN
152078   2018-11-08 06:00:00          NaN
...                      ...          ...
330765   2023-12-21 11:44:00          NaN
351309   2024-07-22 11:00:00          NaN
351310   2024-07-22 11:59:00          NaN
351317   2024-07-22 13:00:00          NaN
351318   2024-07-22 13:29:00          NaN

[92 rows x 2 columns]


In [10]:
# Display missing value
print("Missing Value in Rainfall Data")
rain.isna().sum()
print(rain[rain['Rain(mm)'].isnull()])

Missing Value in Rainfall Data
      Timestamp (UTC+12:00)  Rain(mm)
63898   2018-12-26 00:15:00       NaN
63899   2019-01-14 07:59:00       NaN


# Step 2: Handle Missing Values

In [11]:
# soil moisture and soil temperature using linear to fill data
sm.interpolate(method='linear', inplace=True)
st.interpolate(method='linear', inplace=True)

# soil moisture and soil temperature using linear to fill data
rain.fillna(0, inplace=True)


In [12]:
# double check missing value

# sm.isna().sum()
# st.isna().sum()
# rain.isna().sum()


# Step 3: Resample Data to 15 minutes interval

In [13]:
sm.set_index('Timestamp (UTC+12:00)', inplace=True)
st.set_index('Timestamp (UTC+12:00)', inplace=True)
rain.set_index('Timestamp (UTC+12:00)', inplace=True)


In [14]:
# soil_moisture_30min = sm.resample('30T').mean()
soil_moisture_15mins = sm.resample('15T').mean()

soil_temp_15mins = st.resample('15T').mean()
rainfall_15mins = rain.resample('15T').sum()


  soil_moisture_15mins = sm.resample('15T').mean()
  soil_temp_15mins = st.resample('15T').mean()
  rainfall_15mins = rain.resample('15T').sum()


# Step 4: Combine Data

In [15]:
# Determine the latest common start date
latest_start_date = max(rainfall_15mins.index.min(), soil_moisture_15mins.index.min(), soil_temp_15mins.index.min())
end_date = min(rainfall_15mins.index.max(), soil_moisture_15mins.index.max(), soil_temp_15mins.index.max())

In [16]:
# Filter datasets to the common time range
soil_moisture_15mins = soil_moisture_15mins[latest_start_date:end_date]
soil_temp_15mins = soil_temp_15mins[latest_start_date:end_date]
rainfall_15mins = rainfall_15mins[latest_start_date:end_date]


In [17]:
# Merge datasets on the timestamp
combined_data_15mins = rainfall_15mins.join(soil_temp_15mins, how='inner', rsuffix='_rain')
combined_data_15mins = combined_data_15mins.join(soil_moisture_15mins, how='inner', rsuffix='_soil_moisture')
combined_data_15mins.dropna(inplace=True)  # Drop any rows with missing values after merging


In [18]:
# Save the combined data to a new CSV file
combined_data_15mins.to_csv('../Data/soil data/Kaipara Heads-644211-20240909095124/Kaipara_Data_15mins.csv')
print("Combined data (15-minute intervals) saved successfully.")

Combined data (15-minute intervals) saved successfully.


In [19]:
kaipara = pd.read_csv('../Data/soil data/Kaipara Heads-644211-20240909095124/Kaipara_Data_15mins.csv', parse_dates=['Timestamp (UTC+12:00)'])
kaipara.head()

Unnamed: 0,Timestamp (UTC+12:00),Rain(mm),SoilTemp(c),SoilMoisture(%)
0,2014-07-08 00:30:00,0.0,13.4,39.218
1,2014-07-08 00:45:00,0.0,13.3,39.22
2,2014-07-08 01:00:00,0.0,13.3,39.22
3,2014-07-08 01:15:00,0.0,13.3,39.22
4,2014-07-08 01:30:00,0.0,13.3,39.221


In [20]:
# kaipara['Location'] = 'North Auckland'
# kaipara['Land position'] = 'Hill terrace'
# kaipara['Slope(degree)'] = '0-3'
# kaipara['Soil Order'] = 'Brown'
# kaipara['Soil Types / Series'] = 'Red Hill sandy loam'
# kaipara['NZ Soil Classification'] = 'Typic Sandy Brown'
# kaipara['Soil Profile Description'] = '0-10cm dark grey (almost black) sandy silt loam, structure nutty (when dry), firm consistence, compact ; 10-15cm dark grey sandy loam (contains some silt, structure nutty (when dry), somewhat firm, less compact ; 15-50cm dark brown sandy loam (contains some silt), structure single grain (when dry), loose consistence'
# kaipara['Pastoral land use'] = 'Drystock'
# kaipara.head()
# kaipara.to_csv('./Data/soil data/awhitu_soil_data.csv')


Data Transformation


Feature Engineering: Scaling, Normalization and Standardization


Feature Selection


Handling Imbalanced Data


Encoding Categorical Features


Data Splitting