# Data Preprocessing Tools

## Importing the libraries

In [25]:
%pip install numpy pandas scikit-learn torch




In [26]:
import pandas as pd
import numpy as np

## Importing the dataset

In [27]:
# Load the preprocessed dataset from CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Data/Preprocessed Data/final_data.csv')

In [28]:
df.head(5)

Unnamed: 0,Year,Month,station_name,elevation,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients,Date
0,2007,4,Batticaloa,8,90.1,31.2,25.1,233.3,1.9,5.8,12.0,2007-04-01
1,2007,4,Colombo,7,91.433333,31.7,24.8,374.100007,1.9,5.0,35.0,2007-04-01
2,2007,4,Galle,12,92.233333,30.8,24.9,342.799994,4.1,6.9,1.0,2007-04-01
3,2007,4,Gampaha,8,93.9,31.8,24.3,249.899997,3.5,7.7,16.0,2007-04-01
4,2007,4,Jaffna,3,,33.8,25.5,39.0,5.4,8.0,0.0,2007-04-01


## Taking care of missing data

In [29]:
# Parse the Date column as datetimes and truncate each value to midnight.
parsed_dates = pd.to_datetime(df['Date'])
df['Date'] = parsed_dates.dt.normalize()

# Store the patient counts as 64-bit integers.
df['patients'] = df['patients'].astype('int64')

In [30]:
df.dtypes

Year                           int64
Month                          int64
station_name                  object
elevation                      int64
humidity                     float64
TMPMAX                       float64
TMPMIN                       float64
precipitation                float64
wind_speed_8_30am            float64
wind_speed_17_30pm           float64
patients                       int64
Date                  datetime64[ns]
dtype: object

In [31]:
# check null value count for the each feature
df.isna().sum() 

Year                    0
Month                   0
station_name            0
elevation               0
humidity               60
TMPMAX                 24
TMPMIN                 37
precipitation           7
wind_speed_8_30am     204
wind_speed_17_30pm    208
patients                0
Date                    0
dtype: int64

In [32]:
df.columns

Index(['Year', 'Month', 'station_name', 'elevation', 'humidity', 'TMPMAX',
       'TMPMIN', 'precipitation', 'wind_speed_8_30am', 'wind_speed_17_30pm',
       'patients', 'Date'],
      dtype='object')

### KNN imputer

* The KNNImputer class is imported from the sklearn.impute module. 
* This class is used for imputing missing values using the k-nearest neighbors algorithm.
* An instance of the KNNImputer class is initialized with the parameter n_neighbors set to 5. This specifies that the algorithm should consider the 5 nearest neighbors when imputing missing values.

In [33]:
from sklearn.impute import KNNImputer
import pandas as pd

# Numeric weather columns that contain missing values and will be imputed.
columns_to_impute = [
    'humidity',
    'TMPMAX',
    'TMPMIN',
    'precipitation',
    'wind_speed_8_30am',
    'wind_speed_17_30pm',
]

# Work on a copy so the original `df` keeps its missing values.
knn_df = df.copy()

# Imputer that fills each missing value from its 5 nearest neighbours.
knn_imp = KNNImputer(n_neighbors=5)

In [34]:

# Perform the imputation on the selected columns.
# NOTE(review): the imputer is fitted on every row, before the train/test
# split further down -- confirm that this is acceptable for the evaluation.
imputed_data = knn_imp.fit_transform(knn_df[columns_to_impute])

# Convert the imputed array back into a DataFrame. It reuses knn_df's index
# because the column assignment below aligns rows on index labels; a fresh
# 0..n-1 index would only line up when knn_df happens to have that index too.
imputed_df = pd.DataFrame(
    imputed_data,
    columns=columns_to_impute,
    index=knn_df.index,
)

# Replace the original columns in the DataFrame with the imputed ones
knn_df[columns_to_impute] = imputed_df


In [35]:
knn_df.isna().sum()

Year                  0
Month                 0
station_name          0
elevation             0
humidity              0
TMPMAX                0
TMPMIN                0
precipitation         0
wind_speed_8_30am     0
wind_speed_17_30pm    0
patients              0
Date                  0
dtype: int64

In [36]:
knn_df['station_name'].nunique()

9

In [37]:
knn_df['station_name'].unique()

array(['Batticaloa', 'Colombo', 'Galle', 'Gampaha', 'Jaffna', 'Kandy',
       'Kurunegala', 'Puttalam', 'Ratnapura'], dtype=object)

In [38]:
knn_df.head()

Unnamed: 0,Year,Month,station_name,elevation,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients,Date
0,2007,4,Batticaloa,8,90.1,31.2,25.1,233.3,1.9,5.8,12,2007-04-01
1,2007,4,Colombo,7,91.433333,31.7,24.8,374.100007,1.9,5.0,35,2007-04-01
2,2007,4,Galle,12,92.233333,30.8,24.9,342.799994,4.1,6.9,1,2007-04-01
3,2007,4,Gampaha,8,93.9,31.8,24.3,249.899997,3.5,7.7,16,2007-04-01
4,2007,4,Jaffna,3,77.184516,33.8,25.5,39.0,5.4,8.0,0,2007-04-01


### Iterative Imputer

In [39]:
# TODO: evaluate this approach as an alternative to the KNN imputer.
# The result `imp_df` is not used by any of the later cells shown in this
# notebook; they continue with the KNN-imputed `knn_df`.
from sklearn.experimental import enable_iterative_imputer  # must be imported before IterativeImputer
from sklearn.impute import IterativeImputer

iterative_cols = [
    'humidity',
    'TMPMAX',
    'TMPMIN',
    'precipitation',
    'wind_speed_8_30am',
    'wind_speed_17_30pm',
]

imp_mean = IterativeImputer(random_state=0)
imp_df = imp_mean.fit_transform(df[iterative_cols])

## Encoding categorical data

### Encoding the Station name Variable

In [40]:
# One-hot encode the station name. The indicator columns are requested as
# floats directly, so the result does not depend on which dtype get_dummies
# uses by default (bool in recent pandas releases, uint8 in older ones).
df_encoded = pd.get_dummies(knn_df, columns=['station_name'], dtype=float)


In [41]:
df_encoded.head()

Unnamed: 0,Year,Month,elevation,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients,Date,station_name_Batticaloa,station_name_Colombo,station_name_Galle,station_name_Gampaha,station_name_Jaffna,station_name_Kandy,station_name_Kurunegala,station_name_Puttalam,station_name_Ratnapura
0,2007,4,8,90.1,31.2,25.1,233.3,1.9,5.8,12,2007-04-01,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2007,4,7,91.433333,31.7,24.8,374.100007,1.9,5.0,35,2007-04-01,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2007,4,12,92.233333,30.8,24.9,342.799994,4.1,6.9,1,2007-04-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2007,4,8,93.9,31.8,24.3,249.899997,3.5,7.7,16,2007-04-01,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,2007,4,3,77.184516,33.8,25.5,39.0,5.4,8.0,0,2007-04-01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [42]:
# Order the rows chronologically; the later train/test split slices by position.
df_encoded = df_encoded.sort_values(by='Date')

In [43]:
df_encoded.head()

Unnamed: 0,Year,Month,elevation,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients,Date,station_name_Batticaloa,station_name_Colombo,station_name_Galle,station_name_Gampaha,station_name_Jaffna,station_name_Kandy,station_name_Kurunegala,station_name_Puttalam,station_name_Ratnapura
39,2007,1,8,85.83871,32.0,22.1,39.399999,12.5,14.5,64,2007-01-01,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
36,2007,1,8,82.064516,28.4,24.0,170.100004,8.6,10.1,0,2007-01-01,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,2007,1,7,82.258065,31.8,22.9,91.100001,5.7,6.0,244,2007-01-01,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,2007,1,12,89.612903,29.6,23.3,78.2,2.2,6.9,22,2007-01-01,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
40,2007,1,3,81.886667,29.7,22.2,12.5,5.3,7.12,0,2007-01-01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [44]:
# Remove any rows that still contain missing values. DataFrame.dropna()
# returns a new frame, so the result must be assigned to take effect
# (the bare call was a no-op).
df_encoded = df_encoded.dropna()

# Drop the date columns, and drop the elevation column since the station
# name column has already been encoded.
df_encoded = df_encoded.drop(columns=['Date', 'Year', 'Month', 'elevation'])

In [45]:
df_encoded.head()

Unnamed: 0,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients,station_name_Batticaloa,station_name_Colombo,station_name_Galle,station_name_Gampaha,station_name_Jaffna,station_name_Kandy,station_name_Kurunegala,station_name_Puttalam,station_name_Ratnapura
39,85.83871,32.0,22.1,39.399999,12.5,14.5,64,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
36,82.064516,28.4,24.0,170.100004,8.6,10.1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,82.258065,31.8,22.9,91.100001,5.7,6.0,244,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,89.612903,29.6,23.3,78.2,2.2,6.9,22,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
40,81.886667,29.7,22.2,12.5,5.3,7.12,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [46]:
# Reorder the columns by name: station indicators first, then the weather
# features, and the target `patients` last (split_sequences uses the last
# column as the target). Selecting by name instead of by position keeps this
# cell idempotent: re-running it yields the same column order.
target_col = 'patients'
station_cols = [col for col in df_encoded.columns if col.startswith('station_name_')]
feature_cols = [
    col for col in df_encoded.columns
    if col not in station_cols and col != target_col
]
df_encoded = df_encoded[station_cols + feature_cols + [target_col]]

In [47]:
df_encoded.head()

Unnamed: 0,station_name_Batticaloa,station_name_Colombo,station_name_Galle,station_name_Gampaha,station_name_Jaffna,station_name_Kandy,station_name_Kurunegala,station_name_Puttalam,station_name_Ratnapura,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients
39,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,85.83871,32.0,22.1,39.399999,12.5,14.5,64
36,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,82.064516,28.4,24.0,170.100004,8.6,10.1,0
37,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,82.258065,31.8,22.9,91.100001,5.7,6.0,244
38,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,89.612903,29.6,23.3,78.2,2.2,6.9,22
40,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,81.886667,29.7,22.2,12.5,5.3,7.12,0


## Splitting the dataset into the Training set and Test set

In [48]:
# Positional hold-out split for the time series: the first 90% of the rows
# form the training set and the remaining rows form the test set.
n_rows = len(df_encoded)
train_size = int(n_rows * 0.9)
test_size = n_rows - train_size
train_data = df_encoded.iloc[:train_size]
test_data = df_encoded.iloc[train_size:]

In [136]:
# Disabled alternative: a random 80/20 split instead of the positional split above.
# NOTE(review): this cell's execution count (136) is out of sequence, and the
# window counts printed further down (1361 training / 261 test windows with
# n_steps = 108) match a 1468/368-row split of the 1836 rows, i.e. this random
# split. The 1652/184 split above would give 1545 and 77 windows. The saved
# outputs therefore appear to come from a run with this cell enabled --
# re-run the notebook from a fresh kernel to confirm the reported RMSE.
# from sklearn.model_selection import train_test_split
# train_data, test_data = train_test_split(df_encoded, test_size = 0.2, random_state = 0)

In [49]:
df_encoded.shape

(1836, 16)

In [50]:
train_data.shape

(1652, 16)

In [51]:
test_data.shape

(184, 16)

# Model Training - CNN


## Importing the libraries

In [52]:
# multivariate data preparation
from numpy import array
from numpy import hstack
from numpy import array
import numpy as np
from tensorflow.keras.models import Sequential      #type: ignore
from tensorflow.keras.layers import Dense           #type: ignore
from tensorflow.keras.layers import Flatten         #type: ignore
from tensorflow.keras.layers import Conv1D          #type: ignore
from tensorflow.keras.layers import MaxPooling1D    #type: ignore
from sklearn.metrics import mean_squared_error

In [53]:
def split_sequences(sequences, n_steps):
  """Cut a 2-D multivariate sequence into sliding windows.

  Each window covers `n_steps` consecutive rows. Its input is every column
  except the last, and its target is the last column's value on the window's
  final row.

  Args:
    sequences: 2-D array indexed as [row, column]; the last column holds the target.
    n_steps: number of consecutive rows per window.

  Returns:
    A tuple (X, y) of arrays: X stacks the window inputs and y holds the
    matching targets. Both are empty when no complete window fits.
  """
  total_rows = len(sequences)
  # One window per start row for which a full window still fits, and never
  # more start rows than there are rows.
  window_count = min(total_rows, total_rows - n_steps + 1)

  inputs, targets = [], []
  for start in range(window_count):
    stop = start + n_steps
    inputs.append(sequences[start:stop, :-1])
    targets.append(sequences[stop - 1, -1])
  return array(inputs), array(targets)

In [54]:
# Extract the training rows as a plain NumPy array for windowing.
train_data_array = train_data.to_numpy()

In [55]:
print(train_data_array)
train_data_array.shape

[[  0.    0.    0.  ...  12.5  14.5  64. ]
 [  1.    0.    0.  ...   8.6  10.1   0. ]
 [  0.    1.    0.  ...   5.7   6.  244. ]
 ...
 [  0.    0.    0.  ...   0.5   1.2  93. ]
 [  0.    0.    0.  ...   5.6  10.7 213. ]
 [  0.    0.    1.  ...   1.3   2.6 115. ]]


(1652, 16)

In [109]:
# Window length in rows.
# NOTE(review): 108 presumably equals 9 stations x 12 months of interleaved rows -- confirm.
n_steps = 108

# Turn the training array into (window, target) pairs.
X_train, y_train = split_sequences(train_data_array, n_steps)

# The window tensor is indexed (sample, step, feature); read off the feature count.
n_features = X_train.shape[-1]
print(X_train.shape, y_train.shape)

(1361, 108, 15) (1361,)


## Building the CNN

In [110]:
# define model
from tensorflow.keras import Input                      #type: ignore
from tensorflow.keras.utils import set_random_seed      #type: ignore

# Seed Python, NumPy and TensorFlow so weight initialisation and training
# are repeatable across runs.
set_random_seed(0)

model = Sequential()
# Declare the input shape with an explicit Input layer; passing input_shape=
# to the first layer makes Keras emit a warning.
model.add(Input(shape=(n_steps, n_features)))
model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(50, activation='relu'))
# Single linear output unit: the model regresses one value per window.
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

  super().__init__(


In [111]:
# Train on the windowed training data for 1000 epochs with progress output
# disabled. No validation data is passed to fit().
model.fit(x=X_train, y=y_train, epochs=1000, verbose=0)

<keras.src.callbacks.history.History at 0x2294f95ec30>

In [112]:
# Window the held-out rows with the same n_steps used for training.
test_data_array = test_data.values
X_test, y_test = split_sequences(test_data_array, n_steps)
print(X_test.shape, y_test.shape)

(261, 108, 15) (261,)


In [141]:
# Predict the target for every test window.
y_pred = model.predict(x=X_test)

# Error metric: mean squared error, and its square root (RMSE), which is in
# the same units as the target.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

print("Root Mean Squared Error (RMSE):", rmse)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Root Mean Squared Error (RMSE): 495.84383714870756
