In [1]:
%pip install numpy pandas scikit-learn torch

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.1.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('Data/Preprocessed Data/final_data.csv')

In [4]:
df.head(5)

Unnamed: 0,Year,Month,station_name,elevation,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients,Date
0,2007,4,Batticaloa,8,90.1,31.2,25.1,233.3,1.9,5.8,12.0,2007-04-01
1,2007,4,Colombo,7,91.433333,31.7,24.8,374.100007,1.9,5.0,35.0,2007-04-01
2,2007,4,Galle,12,92.233333,30.8,24.9,342.799994,4.1,6.9,1.0,2007-04-01
3,2007,4,Gampaha,8,93.9,31.8,24.3,249.899997,3.5,7.7,16.0,2007-04-01
4,2007,4,Jaffna,3,,33.8,25.5,39.0,5.4,8.0,0.0,2007-04-01


### Feature engineering

In [1]:
# convert date into date64 datatype
df['Date'] = pd.to_datetime(df['Date']).dt.normalize()
df['patients'] = df.patients.astype('int64')

NameError: name 'pd' is not defined

In [6]:
df.dtypes

Year                           int64
Month                          int64
station_name                  object
elevation                      int64
humidity                     float64
TMPMAX                       float64
TMPMIN                       float64
precipitation                float64
wind_speed_8_30am            float64
wind_speed_17_30pm           float64
patients                       int64
Date                  datetime64[ns]
dtype: object

In [7]:
df.isna().sum() # check null value count for the each feature

Year                    0
Month                   0
station_name            0
elevation               0
humidity               60
TMPMAX                 24
TMPMIN                 37
precipitation           7
wind_speed_8_30am     204
wind_speed_17_30pm    208
patients                0
Date                    0
dtype: int64

In [8]:
df.columns

Index(['Year', 'Month', 'station_name', 'elevation', 'humidity', 'TMPMAX',
       'TMPMIN', 'precipitation', 'wind_speed_8_30am', 'wind_speed_17_30pm',
       'patients', 'Date'],
      dtype='object')

## Imputations 

#### KNN imputator

In [9]:
from sklearn.impute import KNNImputer
import pandas as pd

# Initialize the KNNImputer
knn_imp = KNNImputer(n_neighbors=5)

knn_df = df.copy()

# Select the columns you want to impute
columns_to_impute = ['humidity', 'TMPMAX', 'TMPMIN', 'precipitation', 'wind_speed_8_30am', 'wind_speed_17_30pm']

In [10]:

# Perform the imputation on the selected columns
imputed_data = knn_imp.fit_transform(knn_df[columns_to_impute])

# Convert the imputed data back into a DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=columns_to_impute)

# Replace the original columns in the DataFrame with the imputed ones
knn_df[columns_to_impute] = imputed_df


In [11]:
knn_df.isna().sum()

Year                  0
Month                 0
station_name          0
elevation             0
humidity              0
TMPMAX                0
TMPMIN                0
precipitation         0
wind_speed_8_30am     0
wind_speed_17_30pm    0
patients              0
Date                  0
dtype: int64

#### Use Iterative Imputer

In [12]:
from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer


imp_mean = IterativeImputer(random_state=0)
imp_df = imp_mean.fit_transform(df[['humidity', 'TMPMAX', 'TMPMIN', 'precipitation','wind_speed_8_30am', 'wind_speed_17_30pm']])

## Model Training

In [13]:
import torch
import torch.nn as nn

In [14]:
# check cuda availability

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")


GPU: NVIDIA GeForce MX330 is available.


In [15]:
knn_df['station_name'].nunique()
knn_df['station_name'].unique()

array(['Batticaloa', 'Colombo', 'Galle', 'Gampaha', 'Jaffna', 'Kandy',
       'Kurunegala', 'Puttalam', 'Ratnapura'], dtype=object)

In [16]:
knn_df.head()

Unnamed: 0,Year,Month,station_name,elevation,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients,Date
0,2007,4,Batticaloa,8,90.1,31.2,25.1,233.3,1.9,5.8,12,2007-04-01
1,2007,4,Colombo,7,91.433333,31.7,24.8,374.100007,1.9,5.0,35,2007-04-01
2,2007,4,Galle,12,92.233333,30.8,24.9,342.799994,4.1,6.9,1,2007-04-01
3,2007,4,Gampaha,8,93.9,31.8,24.3,249.899997,3.5,7.7,16,2007-04-01
4,2007,4,Jaffna,3,77.184516,33.8,25.5,39.0,5.4,8.0,0,2007-04-01


#### Create timeseries sequence by sorting according to the date and remove the date afterwards

In [17]:
knn_df.sort_values('Date',inplace=True)
knn_df.drop('Date',axis=1,inplace=True)

In [18]:
knn_df.head()

Unnamed: 0,Year,Month,station_name,elevation,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients
39,2007,1,Gampaha,8,85.83871,32.0,22.1,39.399999,12.5,14.5,64
36,2007,1,Batticaloa,8,82.064516,28.4,24.0,170.100004,8.6,10.1,0
37,2007,1,Colombo,7,82.258065,31.8,22.9,91.100001,5.7,6.0,244
38,2007,1,Galle,12,89.612903,29.6,23.3,78.2,2.2,6.9,22
40,2007,1,Jaffna,3,81.886667,29.7,22.2,12.5,5.3,7.12,0


In [19]:
knn_df.dropna()
knn_df.drop(columns=['Year','Month','elevation'],inplace=True)

#### Use one hot encoding for the station name

In [20]:
df_encoded = pd.get_dummies(knn_df, columns=['station_name'])

# Assuming df_encoded is your dataframe after get_dummies
bool_cols = [col for col in df_encoded.columns if df_encoded[col].dtype == np.dtype('bool')]
df_encoded[bool_cols] = df_encoded[bool_cols].astype(float)


In [21]:
df_encoded.head()

Unnamed: 0,humidity,TMPMAX,TMPMIN,precipitation,wind_speed_8_30am,wind_speed_17_30pm,patients,station_name_Batticaloa,station_name_Colombo,station_name_Galle,station_name_Gampaha,station_name_Jaffna,station_name_Kandy,station_name_Kurunegala,station_name_Puttalam,station_name_Ratnapura
39,85.83871,32.0,22.1,39.399999,12.5,14.5,64,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
36,82.064516,28.4,24.0,170.100004,8.6,10.1,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
37,82.258065,31.8,22.9,91.100001,5.7,6.0,244,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
38,89.612903,29.6,23.3,78.2,2.2,6.9,22,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
40,81.886667,29.7,22.2,12.5,5.3,7.12,0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


#### LSTM Model 

In [22]:
from torch.autograd import Variable

# Define LSTM model
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=1, num_layers=2):
        super(LSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #hidden state
        c_0 = Variable(torch.zeros(self.num_layers, x.size(0), self.hidden_size)) #internal state
        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h_0, c_0)) # lstm with input, hidden, and internal state
        hn = hn.view(-1, self.hidden_size) # reshaping the data for Dense layer next
        out = self.fc(hn)
        return out


In [23]:
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

In [24]:
df_encoded.dtypes

humidity                   float64
TMPMAX                     float64
TMPMIN                     float64
precipitation              float64
wind_speed_8_30am          float64
wind_speed_17_30pm         float64
patients                     int64
station_name_Batticaloa    float64
station_name_Colombo       float64
station_name_Galle         float64
station_name_Gampaha       float64
station_name_Jaffna        float64
station_name_Kandy         float64
station_name_Kurunegala    float64
station_name_Puttalam      float64
station_name_Ratnapura     float64
dtype: object

In [25]:
# split df to into features and labels and convert into numpy arrays
features = df_encoded.drop('patients', axis=1)
labels = df_encoded['patients'].astype(float)

# Convert to numpy arrays
features = features.to_numpy()
labels = labels.to_numpy()

In [26]:
# Split the data into training and testing sets
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Create DataLoader objects
train_data = TensorDataset(features_train, labels_train)
# test_data = TensorDataset(features_test, labels_test)

TypeError: 'int' object is not callable