In [59]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

weather = pd.read_csv('../data/niwa_cleaned/aucklandDailyWeatherData.csv', parse_dates=['Date(NZST)'])
weather.head()

Unnamed: 0,Date(NZST),Pmsl(hPa),Pstn(hPa),Amount(mm),Period(min),daily_total_rainfall,daily_avg_duration,daily_min_duration,daily_max_duration,daily_median_duration,...,Twet(C),RH(%),Tdew(C),Dir(DegT),Speed(m/s)_y,Percent(%),soil_avg_moist,soil_min_moist,soil_max_moist,soil_median_moist
0,2019-01-01,1016.5,993.2,0.0,4878.8,0.75,1225.3,3.9,4878.8,9.25,...,16.2,82.0,14.9,188,6.2,40.9,40.9,40.9,40.9,40.9
1,2019-01-02,1013.6,990.5,0.0,534.9,0.15,184.1,6.3,534.9,11.1,...,17.7,82.0,16.5,13,6.2,40.4,40.4,40.4,40.4,40.4
2,2019-01-03,1011.3,988.2,0.0,1416.8,0.1,477.233333,3.9,1416.8,11.0,...,17.2,82.0,16.0,201,8.8,40.5,40.5,40.5,40.5,40.5
3,2019-01-06,1016.2,992.9,0.0,4621.0,1.32,632.922222,1.5,4621.0,11.6,...,17.4,88.0,16.6,353,9.8,37.2,37.2,37.2,37.2,37.2
4,2019-01-07,1019.2,995.6,0.0,56.8,0.28,22.85,9.3,56.8,12.65,...,12.9,68.0,10.1,162,9.3,36.6,36.6,36.6,36.6,36.6


In [60]:
weather['Rainfall(mm)'].max()

166.6

In [61]:
weather.shape

(945, 31)

In [62]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 945 entries, 0 to 944
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Date(NZST)             945 non-null    datetime64[ns]
 1   Pmsl(hPa)              945 non-null    float64       
 2   Pstn(hPa)              945 non-null    float64       
 3   Amount(mm)             941 non-null    float64       
 4   Period(min)            945 non-null    float64       
 5   daily_total_rainfall   945 non-null    float64       
 6   daily_avg_duration     945 non-null    float64       
 7   daily_min_duration     945 non-null    float64       
 8   daily_max_duration     945 non-null    float64       
 9   daily_median_duration  945 non-null    float64       
 10  Rainfall(mm)           945 non-null    float64       
 11  Deficit(mm)            945 non-null    float64       
 12  Sunshine(Hrs)          945 non-null    float64       
 13  WindD

#TMIN and TMAX are highly positively correlated to each other as observed from above correlation matrix.

In [63]:
weather.isna().sum()

Date(NZST)               0
Pmsl(hPa)                0
Pstn(hPa)                0
Amount(mm)               4
Period(min)              0
daily_total_rainfall     0
daily_avg_duration       0
daily_min_duration       0
daily_max_duration       0
daily_median_duration    0
Rainfall(mm)             0
Deficit(mm)              0
Sunshine(Hrs)            0
WindDir(DegT)            0
Speed(m/s)_x             0
WindDir StdDev           0
WindSpd StdDev           0
Tmax(C)                  0
Tmin(C)                  0
Tgmin(C)                 0
Tmean(C)                 0
Twet(C)                  0
RH(%)                    0
Tdew(C)                  0
Dir(DegT)                0
Speed(m/s)_y             0
Percent(%)               0
soil_avg_moist           0
soil_min_moist           0
soil_max_moist           0
soil_median_moist        0
dtype: int64

In [64]:
weather['Amount(mm)'] .fillna(value=0, inplace=True)

In [65]:
# Features (X) and target variable (y)
features = ['Pmsl(hPa)', 'Pstn(hPa)', 'Amount(mm)', 'Period(min)', 'daily_total_rainfall', 'daily_avg_duration', 'daily_min_duration','daily_max_duration', 'daily_median_duration','Deficit(mm)', 'Sunshine(Hrs)', 'WindDir(DegT)', 'Speed(m/s)_x','WindDir StdDev', 'WindSpd StdDev', 'Tmax(C)', 'Tmin(C)', 'Tgmin(C)','Tmean(C)', 'Twet(C)', 'RH(%)', 'Tdew(C)', 'Dir(DegT)', 'Speed(m/s)_y','Percent(%)', 'soil_avg_moist', 'soil_min_moist','soil_max_moist', 'soil_median_moist']
target = 'Rainfall(mm)'

X = weather[features]
y = weather[target]

In [66]:
#Splitting train and test dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [67]:
#Default Random forest classifier
from sklearn.ensemble import RandomForestClassifier
model =RandomForestClassifier()

In [68]:
#Fitting train data into model
model.fit(X_train,y_train)

ValueError: Unknown label type: continuous. Maybe you are trying to fit a classifier, which expects discrete classes on a regression target with continuous values.

In [None]:
# Make predictions
predictions = model.predict(X_test)

# Evaluate the model (you can choose a different metric if needed)
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_test, predictions)
print(f"Mean Squared Error: {mse}")

In [None]:
# Printing confusion matrix
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,predictions))

[[3108    0]
 [   0 3138]]


In [None]:
#Classification report
from sklearn import metrics
print(metrics.classification_report(y_test, predictions))

              precision    recall  f1-score   support

       False       1.00      1.00      1.00      3108
        True       1.00      1.00      1.00      3138

    accuracy                           1.00      6246
   macro avg       1.00      1.00      1.00      6246
weighted avg       1.00      1.00      1.00      6246



In [None]:
#Feature Importance
model.feature_importances_

array([0.5059247 , 0.00229308, 0.00799729, 0.00152762, 0.48225731])