In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight

In [2]:
#importing dataset by reading csv file (The data was downloaded from Google Cloud using BIGQUERY)
df=pd.read_csv('/root/code/thesis/codeFolder/LatestDataInUse/csv/coding_challenge.csv')

In [3]:
df.head(5)

Unnamed: 0,date,station_number,mean_temp,mean_dew_point,mean_sealevel_pressure,mean_visibility,mean_wind_speed,max_temperature,total_precipitation,snow_depth,fog,snow,hail,thunder
0,2006-01-24,725300,33.200001,26.799999,1008.5,7.0,15.2,23.0,0.0,1.2,False,False,False,False
1,2006-01-25,725300,33.299999,21.4,1021.099976,10.0,14.0,28.9,0.0,1.2,False,False,False,False
2,2006-01-26,725300,30.299999,19.700001,1031.400024,9.9,4.6,19.4,0.0,2.66,False,False,False,False
3,2006-01-27,725300,39.900002,25.299999,1027.599976,10.0,10.7,35.599998,0.0,2.66,False,False,False,False
4,2006-01-28,725300,43.200001,33.799999,1019.900024,9.4,6.6,35.099998,0.0,2.66,False,False,False,False


In [4]:
df = df.sort_values(by='date') 
df = df.set_index('date')

In [5]:
df.head()

Unnamed: 0_level_0,station_number,mean_temp,mean_dew_point,mean_sealevel_pressure,mean_visibility,mean_wind_speed,max_temperature,total_precipitation,snow_depth,fog,snow,hail,thunder
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2006-01-24,725300,33.200001,26.799999,1008.5,7.0,15.2,23.0,0.0,1.2,False,False,False,False
2006-01-24,725315,34.400002,28.0,1013.700012,7.2,17.9,25.0,0.0,0.0,False,False,False,False
2006-01-24,725314,39.400002,24.799999,1018.099976,8.4,9.4,24.1,0.0,0.0,False,False,False,False
2006-01-24,725326,33.099998,27.0,22.528572,7.6,15.6,19.4,0.01,0.0,False,False,False,False
2006-01-24,725327,32.799999,28.1,1010.299988,6.7,16.700001,26.6,0.0,0.0,False,False,False,False


In [6]:
# Converting the categorical values to numerical values
df['fog'] = df['fog'].astype(int)
df['hail'] = df['hail'].astype(int)
df['thunder'] = df['thunder'].astype(int)
df['snow'] = df['snow'].astype(int)

In [7]:
df['snow'].value_counts()

0    13919
1     1413
Name: snow, dtype: int64

In [8]:
df = df.sort_values(by=['station_number', 'date'])

In [9]:
df['snow_tomorrow'] = df.groupby('station_number')['snow'].shift(-1)
df.dropna(inplace=True)

In [10]:
# Create lag features
df['mean_temp_lag_1'] = df.groupby('station_number')['mean_temp'].shift(1)
df['mean_dew_point_lag_1'] = df.groupby('station_number')['mean_dew_point'].shift(1)
df['mean_sealevel_pressure'] = df.groupby('station_number')['mean_sealevel_pressure'].shift(1)
df['mean_visibility_lag_1'] = df.groupby('station_number')['mean_visibility'].shift(1)
df['mean_wind_speed_lag_1'] = df.groupby('station_number')['mean_wind_speed'].shift(1)
df['total_precipitation_lag_1'] = df.groupby('station_number')['total_precipitation'].shift(1)
df['snow_depth'] = df.groupby('station_number')['snow_depth'].shift(1)
df['fog_lag_1'] = df.groupby('station_number')['fog'].shift(1)
df['snow_lag_1'] = df.groupby('station_number')['snow'].shift(1)
df['hail_lag_1'] = df.groupby('station_number')['hail'].shift(1)
df['thunder_lag_1'] = df.groupby('station_number')['thunder'].shift(1)

In [11]:
df.dropna(inplace=True)

In [12]:
df.columns

Index(['station_number', 'mean_temp', 'mean_dew_point',
       'mean_sealevel_pressure', 'mean_visibility', 'mean_wind_speed',
       'max_temperature', 'total_precipitation', 'snow_depth', 'fog', 'snow',
       'hail', 'thunder', 'snow_tomorrow', 'mean_temp_lag_1',
       'mean_dew_point_lag_1', 'mean_visibility_lag_1',
       'mean_wind_speed_lag_1', 'total_precipitation_lag_1', 'fog_lag_1',
       'snow_lag_1', 'hail_lag_1', 'thunder_lag_1'],
      dtype='object')

In [13]:
features=['station_number', 'mean_temp', 'mean_dew_point',
       'mean_sealevel_pressure', 'mean_visibility', 'mean_wind_speed',
       'total_precipitation', 'snow_depth', 'fog', 'snow',
       'hail', 'thunder', 'mean_temp_lag_1',
       'mean_dew_point_lag_1', 'mean_visibility_lag_1',
       'mean_wind_speed_lag_1', 'total_precipitation_lag_1', 'fog_lag_1',
       'snow_lag_1', 'hail_lag_1', 'thunder_lag_1']

In [14]:
# Train-test split, ensuring the split is done per station to maintain the time series nature
def train_test_split_per_station(df, test_size=0.2):
    train_data = pd.DataFrame()
    test_data = pd.DataFrame()
    for station in df['station_number'].unique():
        station_data = df[df['station_number'] == station]
        train_station, test_station = train_test_split(station_data, test_size=test_size, shuffle=False)
        train_data = pd.concat([train_data, train_station])
        test_data = pd.concat([test_data, test_station])
    return train_data, test_data

train_data, test_data = train_test_split_per_station(df)
X_train = train_data[features].values
y_train = train_data['snow_tomorrow'].values
X_test = test_data[features].values
y_test = test_data['snow_tomorrow'].values

# Normalizing the features to ensure equal weightage of all the variables
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

#Converting to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)  # unsqueeze to add a dimension
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

In [15]:
X_train

tensor([[-1.8892, -0.9182, -1.0240,  ..., -0.3121, -0.3121, -0.3121],
        [-1.8892, -1.0713, -1.1146,  ..., -0.3121, -0.3121, -0.3121],
        [-1.8892, -0.5812, -0.8161,  ..., -0.3121, -0.3121, -0.3121],
        ...,
        [ 1.4532,  0.9147,  1.1561,  ..., -0.3121, -0.3121, -0.3121],
        [ 1.4532,  0.6288,  0.9695,  ...,  3.2042,  3.2042,  3.2042],
        [ 1.4532,  0.7411,  1.1987,  ..., -0.3121, -0.3121, -0.3121]])

In [16]:
# Defining the neural network model
class SnowForecastModel(nn.Module):
    def __init__(self):
        super(SnowForecastModel, self).__init__()
        self.fc1 = nn.Linear(len(features), 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

In [17]:
# Function to calculate accuracy
def calculate_accuracy(preds, labels):
    predicted = preds.round()
    correct = (predicted == labels).float()
    accuracy = correct.sum() / len(correct)
    return accuracy*100

In [18]:
np.unique(y_train)

array([0., 1.], dtype=float32)

In [19]:

# Ensure y_train is a numpy array
if isinstance(y_train, torch.Tensor):
    y_train_copy = y_train.cpu().numpy()


# Flatten y_train to ensure it's a 1D array
y_train_flat = y_train_copy.flatten()

# Convert to int
y_train_int = y_train_flat.astype(int)

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_int), y=y_train_int)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Use class weights in the loss function
criterion = nn.BCEWithLogitsLoss(pos_weight=class_weights[1])

# Instantiating the model, defining the loss function and the optimizer
model = SnowForecastModel()
#criterion = nn.BCELoss()  # Binary Cross Entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.00001, weight_decay=0.01)

# Training & evaluating the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    train_loss = criterion(outputs, y_train)
    train_loss.backward()
    optimizer.step()

    train_accuracy = calculate_accuracy(outputs, y_train)

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test)
        test_loss = criterion(test_outputs, y_test)
        test_accuracy = calculate_accuracy(test_outputs, y_test)

    
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.item():.4f}, Train Accuracy: {train_accuracy.item():.4f}, Test Loss: {test_loss.item():.4f}, Test Accuracy: {test_accuracy.item():.4f}')

Epoch [1/100], Train Loss: 1.1224, Train Accuracy: 72.0830, Test Loss: 1.1522, Test Accuracy: 72.3328
Epoch [2/100], Train Loss: 1.1224, Train Accuracy: 72.1483, Test Loss: 1.1522, Test Accuracy: 72.5285
Epoch [3/100], Train Loss: 1.1224, Train Accuracy: 72.2054, Test Loss: 1.1521, Test Accuracy: 72.5938
Epoch [4/100], Train Loss: 1.1223, Train Accuracy: 72.3116, Test Loss: 1.1521, Test Accuracy: 72.5938
Epoch [5/100], Train Loss: 1.1223, Train Accuracy: 72.4586, Test Loss: 1.1521, Test Accuracy: 72.6264
Epoch [6/100], Train Loss: 1.1223, Train Accuracy: 72.5484, Test Loss: 1.1521, Test Accuracy: 72.6917
Epoch [7/100], Train Loss: 1.1223, Train Accuracy: 72.6219, Test Loss: 1.1521, Test Accuracy: 72.6917
Epoch [8/100], Train Loss: 1.1223, Train Accuracy: 72.7035, Test Loss: 1.1521, Test Accuracy: 72.6917
Epoch [9/100], Train Loss: 1.1223, Train Accuracy: 72.7933, Test Loss: 1.1521, Test Accuracy: 72.7243
Epoch [10/100], Train Loss: 1.1223, Train Accuracy: 72.9158, Test Loss: 1.1521, Te

In [20]:
# printing classification report
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_predictions = test_outputs.round().numpy()
    print(f'Classification Report:\n{classification_report(y_test.numpy(), test_predictions)}')

Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.85      0.88      2739
         1.0       0.15      0.22      0.18       326

    accuracy                           0.78      3065
   macro avg       0.53      0.54      0.53      3065
weighted avg       0.82      0.78      0.80      3065

