In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

##### Query for downloading the data from Google Cloud BIGQUERY

##### The dataset was downloaded as csv and later uploaded in Visual Studio where futher processing was performed

In [2]:
#importing dataset by reading csv file (The data was downloaded from Google Cloud using BIGQUERY)
df=pd.read_csv('/root/code/thesis/codeFolder/LatestDataInUse/csv/7learnings.csv')

  df=pd.read_csv('/root/code/thesis/codeFolder/LatestDataInUse/csv/7learnings.csv')


In [3]:
df.head()

Unnamed: 0,station_number,wban_number,year,month,day,mean_temp,num_mean_temp_samples,mean_dew_point,num_mean_dew_point_samples,mean_sealevel_pressure,...,min_temperature_explicit,total_precipitation,snow_depth,fog,rain,snow,hail,thunder,tornado,formatted_date
0,725940,99999,2005,1,25,51.299999,4,45.299999,4.0,1013.5,...,,0.0,,False,False,False,False,False,False,2005-01-25
1,725940,99999,2005,3,7,51.5,4,48.0,4.0,1025.0,...,,0.0,,False,False,False,False,False,False,2005-03-07
2,725940,99999,2005,5,11,51.0,4,45.700001,4.0,1021.799988,...,,0.01,,False,False,False,False,False,False,2005-05-11
3,725869,99999,2005,1,27,30.200001,5,26.6,5.0,,...,,0.0,,False,False,False,False,False,False,2005-01-27
4,725827,99999,2005,10,23,55.0,5,28.799999,5.0,1015.599976,...,,0.0,,False,False,False,False,False,False,2005-10-23


# Data Preprocessing

In [4]:
# Filtering stations 725300 to 725330 that have information from 2005 till 2009.
df = df[(df['station_number'] >= 725300) & (df['station_number'] <= 725330)]

# Specify the columns to drop 
# Some of these column like were irrelevant, some coulumns had missing values
columns_to_drop = ['min_temperature', 'min_temperature_explicit','mean_station_pressure','mean_sealevel_pressure', 'num_mean_station_pressure_samples','year','month','day','snow_depth', 'num_mean_sealevel_pressure_samples', 'wban_number', 'num_mean_temp_samples','num_mean_dew_point_samples', 'num_mean_visibility_samples' ,'max_sustained_wind_speed','max_gust_wind_speed','max_temperature_explicit', 'num_mean_wind_speed_samples','tornado','max_temperature']

# Drop the specified columns
df_dropped = df.drop(columns=columns_to_drop)


In [5]:
df_dropped.shape

(1544, 12)

In [6]:
df_dropped

Unnamed: 0,station_number,mean_temp,mean_dew_point,mean_visibility,mean_wind_speed,total_precipitation,fog,rain,snow,hail,thunder,formatted_date
60,725314,72.099998,62.799999,9.7,3.6,0.00,False,False,False,False,False,2005-08-31
66,725317,49.799999,31.900000,10.0,9.2,0.00,False,False,False,False,False,2005-04-25
85,725327,72.599998,57.000000,10.0,4.5,0.00,False,False,False,False,False,2005-07-08
95,725317,53.200001,45.000000,7.3,6.9,0.03,False,False,False,False,False,2005-10-31
99,725314,76.500000,65.699997,8.6,2.2,0.00,False,False,False,False,False,2005-07-08
...,...,...,...,...,...,...,...,...,...,...,...,...
32678,725326,72.099998,59.700001,8.4,5.8,0.00,False,False,False,False,False,2005-06-04
32741,725326,54.000000,36.900002,10.0,3.4,0.00,False,False,False,False,False,2005-09-30
32803,725326,58.200001,37.500000,10.0,6.5,0.00,False,False,False,False,False,2005-11-04
32810,725326,42.400002,33.900002,9.1,1.7,0.00,False,False,False,False,False,2005-10-26


In [7]:
# Checking columns with NAN values
columns_with_nan = df_dropped.columns[df_dropped.isna().any()].tolist()

print(columns_with_nan)


['mean_wind_speed', 'total_precipitation']


##### Strategies for dealing with missing values

In [8]:
#### Strategy 1: Filling the missing values with mean values
#data.fillna(data.mean(), inplace=True)

### Strategy 2: Interpolating the missing values
#df_dropped['mean_wind_speed'] = df_dropped['mean_wind_speed'].interpolate()
#df_dropped['total_precipitation'] = df_dropped['total_precipitation'].interpolate()
#df_dropped['mean_sealevel_pressure'] = df_dropped['mean_sealevel_pressure'].interpolate()

### Strategy 3: Droping the missing rows
df_dropped=df_dropped.dropna()

#### Droping the rows with missing values reduces data size but other strategies are more likely to compromise data quality

In [9]:
df_dropped

Unnamed: 0,station_number,mean_temp,mean_dew_point,mean_visibility,mean_wind_speed,total_precipitation,fog,rain,snow,hail,thunder,formatted_date
60,725314,72.099998,62.799999,9.7,3.6,0.00,False,False,False,False,False,2005-08-31
66,725317,49.799999,31.900000,10.0,9.2,0.00,False,False,False,False,False,2005-04-25
85,725327,72.599998,57.000000,10.0,4.5,0.00,False,False,False,False,False,2005-07-08
95,725317,53.200001,45.000000,7.3,6.9,0.03,False,False,False,False,False,2005-10-31
99,725314,76.500000,65.699997,8.6,2.2,0.00,False,False,False,False,False,2005-07-08
...,...,...,...,...,...,...,...,...,...,...,...,...
32678,725326,72.099998,59.700001,8.4,5.8,0.00,False,False,False,False,False,2005-06-04
32741,725326,54.000000,36.900002,10.0,3.4,0.00,False,False,False,False,False,2005-09-30
32803,725326,58.200001,37.500000,10.0,6.5,0.00,False,False,False,False,False,2005-11-04
32810,725326,42.400002,33.900002,9.1,1.7,0.00,False,False,False,False,False,2005-10-26


In [10]:
# Find columns with NaN values
columns_with_nan = df_dropped.columns[df_dropped.isna().any()].tolist()

print(columns_with_nan)


[]


In [11]:
# Sorting the dataframe with respect to data and resting the dataframe index
final_df = df_dropped.sort_values(by='formatted_date') 
final_df.reset_index(drop=True)

Unnamed: 0,station_number,mean_temp,mean_dew_point,mean_visibility,mean_wind_speed,total_precipitation,fog,rain,snow,hail,thunder,formatted_date
0,725314,60.799999,57.599998,8.9,4.8,0.40,False,False,False,False,False,2005-01-02
1,725300,32.299999,29.400000,7.7,9.3,0.34,True,True,True,True,True,2005-01-04
2,725330,24.799999,20.400000,9.7,7.7,0.03,False,False,False,False,False,2005-01-07
3,725305,16.700001,13.100000,8.4,6.9,0.01,False,False,False,False,False,2005-01-07
4,725316,28.600000,25.500000,7.3,5.4,0.10,True,True,True,True,True,2005-01-08
...,...,...,...,...,...,...,...,...,...,...,...,...
1519,725330,25.500000,16.500000,9.9,5.8,0.00,False,False,False,False,False,2009-12-29
1520,725327,22.799999,18.400000,7.7,6.7,0.00,False,False,False,False,False,2009-12-30
1521,725316,26.100000,21.299999,6.9,7.8,0.01,False,False,False,False,False,2009-12-30
1522,725315,24.000000,19.799999,7.2,7.6,0.00,False,False,False,False,False,2009-12-30


In [12]:
final_df.head(5)

Unnamed: 0,station_number,mean_temp,mean_dew_point,mean_visibility,mean_wind_speed,total_precipitation,fog,rain,snow,hail,thunder,formatted_date
7019,725314,60.799999,57.599998,8.9,4.8,0.4,False,False,False,False,False,2005-01-02
3583,725300,32.299999,29.4,7.7,9.3,0.34,True,True,True,True,True,2005-01-04
19855,725330,24.799999,20.4,9.7,7.7,0.03,False,False,False,False,False,2005-01-07
22833,725305,16.700001,13.1,8.4,6.9,0.01,False,False,False,False,False,2005-01-07
3563,725316,28.6,25.5,7.3,5.4,0.1,True,True,True,True,True,2005-01-08


In [13]:
final_df.columns

Index(['station_number', 'mean_temp', 'mean_dew_point', 'mean_visibility',
       'mean_wind_speed', 'total_precipitation', 'fog', 'rain', 'snow', 'hail',
       'thunder', 'formatted_date'],
      dtype='object')

In [14]:
# Converting the categorical values to numerical values
final_df['fog'] = final_df['fog'].astype(int)
final_df['rain'] = final_df['rain'].astype(int)
final_df['hail'] = final_df['hail'].astype(int)
final_df['thunder'] = final_df['thunder'].astype(int)
final_df['snow'] = final_df['snow'].astype(int)

In [15]:
final_df.head()

Unnamed: 0,station_number,mean_temp,mean_dew_point,mean_visibility,mean_wind_speed,total_precipitation,fog,rain,snow,hail,thunder,formatted_date
7019,725314,60.799999,57.599998,8.9,4.8,0.4,0,0,0,0,0,2005-01-02
3583,725300,32.299999,29.4,7.7,9.3,0.34,1,1,1,1,1,2005-01-04
19855,725330,24.799999,20.4,9.7,7.7,0.03,0,0,0,0,0,2005-01-07
22833,725305,16.700001,13.1,8.4,6.9,0.01,0,0,0,0,0,2005-01-07
3563,725316,28.6,25.5,7.3,5.4,0.1,1,1,1,1,1,2005-01-08


In [16]:
final_df.shape

(1524, 12)

In [17]:
np.unique(final_df['station_number'])

array([725300, 725305, 725314, 725315, 725316, 725317, 725320, 725326,
       725327, 725330])

### The dataset has huge class imbalance with 1350 values representing class 0 "no snow" and only 174 values prepresenting class 1 "snow"

In [24]:
final_df['snow'].value_counts()

0    1350
1     174
Name: snow, dtype: int64

# Splitting the data into train and Test Split

In [18]:

# Create features and labels
features = ['station_number', 'mean_temp', 'mean_dew_point', 'mean_visibility',
            'mean_wind_speed', 'total_precipitation', 'fog', 'rain', 'hail',
            'thunder']
X = final_df[features]

y = final_df['snow'].shift(-1).fillna(0).astype(int)  # Forecasting snow on the next day

# Splitting the data into train test split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Converting the labels to numpy arrays
y_train = y_train.values
y_test = y_test.values



In [19]:
# Standardizing the features to ensure equal weitage of each feature
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# Model

In [20]:
# Create a custom dataset class
class ClimateDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]


In [21]:
# Define the neural network model
class SnowForecastNN(nn.Module):
    def __init__(self, input_size):
        super(SnowForecastNN, self).__init__()
        self.fc1 = nn.Linear(input_size, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

# Training and Testing

In [22]:
train_dataset = ClimateDataset(X_train, y_train)
test_dataset = ClimateDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


# Initialize the model, loss function, and optimizer
model = SnowForecastNN(input_size=X_train.shape[1])
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training the model
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        labels = labels.view(-1, 1)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        predicted = (outputs > 0.5).float()
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    
    train_loss = running_loss / len(train_loader)
    train_accuracy = 100 * correct / total
    
    # Evaluate the model
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    all_labels = []
    all_predictions = []
    with torch.no_grad():
        for inputs, labels in test_loader:
            labels = labels.view(-1, 1)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            predicted = (outputs > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            all_labels.extend(labels.cpu().numpy())
            all_predictions.extend(predicted.cpu().numpy())
    
    test_loss /= len(test_loader)
    test_accuracy = 100 * correct / total
    
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%, Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')
    

Epoch 1/100, Train Loss: 0.5190, Train Accuracy: 88.60%, Test Loss: 0.4087, Test Accuracy: 88.52%
Epoch 2/100, Train Loss: 0.3817, Train Accuracy: 88.60%, Test Loss: 0.3578, Test Accuracy: 88.52%
Epoch 3/100, Train Loss: 0.3455, Train Accuracy: 88.60%, Test Loss: 0.3486, Test Accuracy: 88.52%
Epoch 4/100, Train Loss: 0.3409, Train Accuracy: 88.60%, Test Loss: 0.3458, Test Accuracy: 88.52%
Epoch 5/100, Train Loss: 0.3404, Train Accuracy: 88.60%, Test Loss: 0.3439, Test Accuracy: 88.52%
Epoch 6/100, Train Loss: 0.3492, Train Accuracy: 88.60%, Test Loss: 0.3422, Test Accuracy: 88.52%
Epoch 7/100, Train Loss: 0.3370, Train Accuracy: 88.60%, Test Loss: 0.3411, Test Accuracy: 88.52%
Epoch 8/100, Train Loss: 0.3355, Train Accuracy: 88.60%, Test Loss: 0.3407, Test Accuracy: 88.52%
Epoch 9/100, Train Loss: 0.3342, Train Accuracy: 88.60%, Test Loss: 0.3394, Test Accuracy: 88.52%
Epoch 10/100, Train Loss: 0.3388, Train Accuracy: 88.60%, Test Loss: 0.3402, Test Accuracy: 88.52%
Epoch 11/100, Train

In [25]:
# Print classification report
print("Classification Report:")
print(classification_report(all_labels, all_predictions, target_names=['No Snow', 'Snow']))

Classification Report:
              precision    recall  f1-score   support

     No Snow       0.88      0.97      0.92       270
        Snow       0.00      0.00      0.00        35

    accuracy                           0.86       305
   macro avg       0.44      0.48      0.46       305
weighted avg       0.78      0.86      0.82       305



### Although 86% test accuracy is achieved but the model is only predicting the majority class. In such cases techniques like stratifies group k-fold cross-validation and weighting random sampling can be used to mitigate the class imabalce