In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

In [3]:
# Load the data
df = pd.read_csv('../transformed_rainfall_data.csv')
df.head()

Unnamed: 0,Date,Vavuniya,Anuradhapura,Maha Illuppallama
0,20210101,0.0,0.0,0.0
1,20210102,2.9,0.2,0.0
2,20210103,3.4,14.2,3.8
3,20210104,15.0,7.2,19.0
4,20210105,0.0,1.1,2.8


In [4]:
#Check for null values
df.isna().sum()

Date                 0
Vavuniya             0
Anuradhapura         0
Maha Illuppallama    0
dtype: int64

In [6]:
# Convert the Date column to datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')

# Extract year, month, and day as features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['Day'] = df['Date'].dt.day

# Select features
features = ['Year', 'Month', 'Day']

print(features)

['Year', 'Month', 'Day']


In [7]:
#Check for null values
df.isna().sum()

Date                 0
Vavuniya             0
Anuradhapura         0
Maha Illuppallama    0
Year                 0
Month                0
Day                  0
dtype: int64

In [8]:
df.head()

Unnamed: 0,Date,Vavuniya,Anuradhapura,Maha Illuppallama,Year,Month,Day
0,2021-01-01,0.0,0.0,0.0,2021,1,1
1,2021-01-02,2.9,0.2,0.0,2021,1,2
2,2021-01-03,3.4,14.2,3.8,2021,1,3
3,2021-01-04,15.0,7.2,19.0,2021,1,4
4,2021-01-05,0.0,1.1,2.8,2021,1,5


In [11]:
# Ensure 'Date' is in datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d')

# Create additional features
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['DayOfYear'] = df['Date'].dt.dayofyear

# Create lag features and rolling mean features for each station
stations = ['Vavuniya', 'Anuradhapura', 'Maha Illuppallama']
for station in stations:
    df[f'{station}_lag1'] = df[station].shift(1)
    df[f'{station}_lag2'] = df[station].shift(2)
    df[f'{station}_lag3'] = df[station].shift(3)
    df[f'{station}_rolling_mean3'] = df[station].rolling(window=3).mean()
    df[f'{station}_rolling_mean7'] = df[station].rolling(window=7).mean()
df.head()

Unnamed: 0,Date,Vavuniya,Anuradhapura,Maha Illuppallama,Year,Month,Day,DayOfYear,Vavuniya_lag1,Vavuniya_lag2,...,Anuradhapura_lag1,Anuradhapura_lag2,Anuradhapura_lag3,Anuradhapura_rolling_mean3,Anuradhapura_rolling_mean7,Maha Illuppallama_lag1,Maha Illuppallama_lag2,Maha Illuppallama_lag3,Maha Illuppallama_rolling_mean3,Maha Illuppallama_rolling_mean7
0,2021-01-01,0.0,0.0,0.0,2021,1,1,1,,,...,,,,,,,,,,
1,2021-01-02,2.9,0.2,0.0,2021,1,2,2,0.0,,...,0.0,,,,,0.0,,,,
2,2021-01-03,3.4,14.2,3.8,2021,1,3,3,2.9,0.0,...,0.2,0.0,,4.8,,0.0,0.0,,1.266667,
3,2021-01-04,15.0,7.2,19.0,2021,1,4,4,3.4,2.9,...,14.2,0.2,0.0,7.2,,3.8,0.0,0.0,7.6,
4,2021-01-05,0.0,1.1,2.8,2021,1,5,5,15.0,3.4,...,7.2,14.2,0.2,7.5,,19.0,3.8,0.0,8.533333,


In [10]:
# Drop the rows with NaN values created by the shift operation
df.dropna(inplace=True)

# Prepare the dataset for each station
results = {}
predictions = {}

for station in stations:
    # Define features and target
    features = ['Year', 'Month', 'DayOfYear',
                f'{station}_lag1', f'{station}_lag2', f'{station}_lag3',
                f'{station}_rolling_mean3', f'{station}_rolling_mean7']
    X = df[features]
    y = df[station]

    # Impute missing values with median
    imputer = SimpleImputer(strategy='median')
    X = imputer.fit_transform(X)

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the GBR model with Grid Search
    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5],
        'subsample': [0.8, 1.0]
    }
    gbr = GradientBoostingRegressor(random_state=42)
    grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    
    # Train the model
    grid_search.fit(X_train, y_train)
    
    # Predict and evaluate
    best_gbr = grid_search.best_estimator_
    y_pred = best_gbr.predict(X_test)
    
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    

Results for Vavuniya:
  MAE: 8.178337051200222
  MSE: 234.03763755668973
  R2: 0.009797863768064019
Results for Anuradhapura:
  MAE: 6.651143559761282
  MSE: 181.30703447336435
  R2: 0.08945637653553928
Results for Maha Illuppallama:
  MAE: 6.7324854046695295
  MSE: 161.62735033025368
  R2: 0.10539237281095903
