In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

##### Query for downloading the data from Google Cloud BIGQUERY

##### The dataset was downloaded as csv and later uploaded in Visual Studio where futher processing was performed

In [2]:
#importing dataset by reading csv file (The data was downloaded from Google Cloud using BIGQUERY)
df=pd.read_csv('/root/code/thesis/codeFolder/LatestDataInUse/csv/7learnings.csv')

  df=pd.read_csv('/root/code/thesis/codeFolder/LatestDataInUse/csv/7learnings.csv')


In [3]:
df.head()

Unnamed: 0,station_number,wban_number,year,month,day,mean_temp,num_mean_temp_samples,mean_dew_point,num_mean_dew_point_samples,mean_sealevel_pressure,...,min_temperature_explicit,total_precipitation,snow_depth,fog,rain,snow,hail,thunder,tornado,formatted_date
0,725940,99999,2005,1,25,51.299999,4,45.299999,4.0,1013.5,...,,0.0,,False,False,False,False,False,False,2005-01-25
1,725940,99999,2005,3,7,51.5,4,48.0,4.0,1025.0,...,,0.0,,False,False,False,False,False,False,2005-03-07
2,725940,99999,2005,5,11,51.0,4,45.700001,4.0,1021.799988,...,,0.01,,False,False,False,False,False,False,2005-05-11
3,725869,99999,2005,1,27,30.200001,5,26.6,5.0,,...,,0.0,,False,False,False,False,False,False,2005-01-27
4,725827,99999,2005,10,23,55.0,5,28.799999,5.0,1015.599976,...,,0.0,,False,False,False,False,False,False,2005-10-23


# Data Preprocessing

In [4]:
# Filtering stations 725300 to 725330 that have information from 2005 till 2009.
df = df[(df['station_number'] >= 725300) & (df['station_number'] <= 725330)]

# Specify the columns to drop 
# Some of these column like were irrelevant, some coulumns had missing values
columns_to_drop = ['min_temperature', 'min_temperature_explicit','mean_station_pressure','mean_sealevel_pressure', 'num_mean_station_pressure_samples','year','month','day','snow_depth', 'num_mean_sealevel_pressure_samples', 'wban_number', 'num_mean_temp_samples','num_mean_dew_point_samples', 'num_mean_visibility_samples' ,'max_sustained_wind_speed','max_gust_wind_speed','max_temperature_explicit', 'num_mean_wind_speed_samples','tornado','max_temperature']

# Drop the specified columns
df_dropped = df.drop(columns=columns_to_drop)


In [5]:
# Checking columns with NAN values
columns_with_nan = df_dropped.columns[df_dropped.isna().any()].tolist()

print(columns_with_nan)


['mean_wind_speed', 'total_precipitation']


### Strategies for dealing with missing values

In [6]:
#### Strategy 1: Filling the missing values with mean values
#data.fillna(data.mean(), inplace=True)

### Strategy 2: Interpolating the missing values
#df_dropped['mean_wind_speed'] = df_dropped['mean_wind_speed'].interpolate()
#df_dropped['total_precipitation'] = df_dropped['total_precipitation'].interpolate()
#df_dropped['mean_sealevel_pressure'] = df_dropped['mean_sealevel_pressure'].interpolate()

### Strategy 3: Droping the missing rows
df_dropped=df_dropped.dropna()

##### Droping the rows with missing values reduces data size but other strategies are more likely to compromise data quality

In [7]:
# Finding columns with NaN values
columns_with_nan = df_dropped.columns[df_dropped.isna().any()].tolist()

print(columns_with_nan)


[]


In [8]:
# Sorting the dataframe with respect to data and resting the dataframe index
final_df = df_dropped.sort_values(by='formatted_date') 
final_df = final_df.reset_index(drop=True)

In [9]:
final_df.head(5)

Unnamed: 0,station_number,mean_temp,mean_dew_point,mean_visibility,mean_wind_speed,total_precipitation,fog,rain,snow,hail,thunder,formatted_date
0,725314,60.799999,57.599998,8.9,4.8,0.4,False,False,False,False,False,2005-01-02
1,725300,32.299999,29.4,7.7,9.3,0.34,True,True,True,True,True,2005-01-04
2,725330,24.799999,20.4,9.7,7.7,0.03,False,False,False,False,False,2005-01-07
3,725305,16.700001,13.1,8.4,6.9,0.01,False,False,False,False,False,2005-01-07
4,725316,28.6,25.5,7.3,5.4,0.1,True,True,True,True,True,2005-01-08


In [10]:
final_df.columns

Index(['station_number', 'mean_temp', 'mean_dew_point', 'mean_visibility',
       'mean_wind_speed', 'total_precipitation', 'fog', 'rain', 'snow', 'hail',
       'thunder', 'formatted_date'],
      dtype='object')

In [11]:
# Converting the categorical values to numerical values
final_df['fog'] = final_df['fog'].astype(int)
final_df['rain'] = final_df['rain'].astype(int)
final_df['hail'] = final_df['hail'].astype(int)
final_df['thunder'] = final_df['thunder'].astype(int)
final_df['snow'] = final_df['snow'].astype(int)

In [12]:
final_df.head()

Unnamed: 0,station_number,mean_temp,mean_dew_point,mean_visibility,mean_wind_speed,total_precipitation,fog,rain,snow,hail,thunder,formatted_date
0,725314,60.799999,57.599998,8.9,4.8,0.4,0,0,0,0,0,2005-01-02
1,725300,32.299999,29.4,7.7,9.3,0.34,1,1,1,1,1,2005-01-04
2,725330,24.799999,20.4,9.7,7.7,0.03,0,0,0,0,0,2005-01-07
3,725305,16.700001,13.1,8.4,6.9,0.01,0,0,0,0,0,2005-01-07
4,725316,28.6,25.5,7.3,5.4,0.1,1,1,1,1,1,2005-01-08


In [13]:
final_df.shape

(1524, 12)

In [14]:
np.unique(final_df['station_number'])

array([725300, 725305, 725314, 725315, 725316, 725317, 725320, 725326,
       725327, 725330])

### The dataset has huge class imbalance with 1350 values representing class 0 "no snow" and only 174 values prepresenting class 1 "snow"

In [15]:
final_df['snow'].value_counts()

0    1350
1     174
Name: snow, dtype: int64

In [16]:
# Converting date to datetime
final_df['formatted_date'] = pd.to_datetime(final_df['formatted_date'])

# Sorting the data by station number and date
final_df = final_df.sort_values(by=['station_number', 'formatted_date'])


# Splitting the data into train and Test Split

In [17]:
# Extracting features and target
features = ['mean_temp', 'mean_dew_point', 'mean_visibility', 'mean_wind_speed', 'total_precipitation', 'fog', 'rain', 'hail', 'thunder']
target = 'snow'

X = final_df[features].values
y = final_df[target].values

# Normalizing the features to ensure equal weightage of all the variables
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Spliting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Converting to PyTorch tensors
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)  # unsqueeze to add a dimension
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

# Model

In [18]:
# Defining the neural network model
class SnowForecastModel(nn.Module):
    def __init__(self):
        super(SnowForecastModel, self).__init__()
        self.fc1 = nn.Linear(len(features), 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

### Function to calculate accuracy

In [19]:
# Function to calculate accuracy
def calculate_accuracy(preds, labels):
    predicted = preds.round()
    correct = (predicted == labels).float()
    accuracy = correct.sum() / len(correct)
    return accuracy

# Model Trainging and Evaluation

In [20]:
# Instantiating the model, defining the loss function and the optimizer
model = SnowForecastModel()
criterion = nn.BCELoss()  # Binary Cross Entropy loss for binary classification
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training & evaluating the model
num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()
    outputs = model(X_train)
    train_loss = criterion(outputs, y_train)
    train_loss.backward()
    optimizer.step()

    train_accuracy = calculate_accuracy(outputs, y_train)

    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test)
        test_loss = criterion(test_outputs, y_test)
        test_accuracy = calculate_accuracy(test_outputs, y_test)

    
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {train_loss.item():.4f}, Train Accuracy: {train_accuracy.item():.4f}, Test Loss: {test_loss.item():.4f}, Test Accuracy: {test_accuracy.item():.4f}')

Epoch [1/50], Train Loss: 0.5888, Train Accuracy: 0.8778, Test Loss: 0.5705, Test Accuracy: 0.9180
Epoch [2/50], Train Loss: 0.5783, Train Accuracy: 0.8786, Test Loss: 0.5601, Test Accuracy: 0.9246
Epoch [3/50], Train Loss: 0.5679, Train Accuracy: 0.8884, Test Loss: 0.5498, Test Accuracy: 0.9410
Epoch [4/50], Train Loss: 0.5575, Train Accuracy: 0.9221, Test Loss: 0.5395, Test Accuracy: 0.9705
Epoch [5/50], Train Loss: 0.5472, Train Accuracy: 0.9598, Test Loss: 0.5291, Test Accuracy: 0.9902
Epoch [6/50], Train Loss: 0.5368, Train Accuracy: 0.9795, Test Loss: 0.5188, Test Accuracy: 1.0000
Epoch [7/50], Train Loss: 0.5265, Train Accuracy: 0.9893, Test Loss: 0.5085, Test Accuracy: 1.0000
Epoch [8/50], Train Loss: 0.5160, Train Accuracy: 0.9918, Test Loss: 0.4981, Test Accuracy: 1.0000
Epoch [9/50], Train Loss: 0.5056, Train Accuracy: 0.9975, Test Loss: 0.4877, Test Accuracy: 1.0000
Epoch [10/50], Train Loss: 0.4950, Train Accuracy: 0.9984, Test Loss: 0.4772, Test Accuracy: 1.0000
Epoch [11

In [21]:
# printing classification report
model.eval()
with torch.no_grad():
    test_outputs = model(X_test)
    test_predictions = test_outputs.round().numpy()
    print(f'Classification Report:\n{classification_report(y_test.numpy(), test_predictions)}')

Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       280
         1.0       1.00      1.00      1.00        25

    accuracy                           1.00       305
   macro avg       1.00      1.00      1.00       305
weighted avg       1.00      1.00      1.00       305

