In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torchvision import datasets, transforms
import torch.optim as optim
from torch.utils.data import random_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
print(torch.cuda.is_available())

frac_train = 0.8
batch_size = 64
learning_rate = 0.0001
learning_rates = [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
optimizers = {
    'SGD': torch.optim.SGD,
    'RMSprop': torch.optim.RMSprop,
    'Adam': torch.optim.Adam,
    #'LBFGS' : torch.optim.LBFGS,
}
epoch_num = 20

False


In [3]:
# Download training data from open datasets.
dataset = pd.read_csv("/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [4]:
dataset.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No


In [5]:
numeric_columns = dataset.select_dtypes(exclude=['object']).columns

for column in numeric_columns:
    median_value = int(dataset[column].median())
    dataset[column].fillna(median_value, inplace=True)

In [6]:
columns_to_encode = ['RainToday', 'RainTomorrow']

for column in columns_to_encode:
    mode_value = dataset[column].mode()[0]
    dataset[column].fillna(mode_value, inplace=True)
    
dataset['RainToday'].value_counts()

RainToday
No     113580
Yes     31880
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for column in columns_to_encode:
    dataset[column] = label_encoder.fit_transform(dataset[column])
    
#from deduction, all the null values remained null, hence produced encoded value of 2
dataset['RainToday'].replace(2, 0, inplace=True)
dataset['RainTomorrow'].replace(2, 0, inplace=True)

In [8]:
dataset['Date'] = pd.to_datetime(dataset['Date'], errors='coerce')
dataset['Year'] = dataset['Date'].dt.year
dataset['Month'] = dataset['Date'].dt.month
dataset['Day'] = dataset['Date'].dt.day

In [9]:
#one-hot encode the locations
encoded_cols = pd.get_dummies(dataset[['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm']])
dataset = pd.concat([dataset, encoded_cols], axis=1)
dataset.drop(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'Date'], axis=1, inplace=True)
#changing the boolean values to integers
boolean_columns = dataset.select_dtypes(include=bool).columns
dataset[boolean_columns] = dataset[boolean_columns].astype(int)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Columns: 118 entries, MinTemp to WindDir3pm_WSW
dtypes: float64(16), int32(3), int64(99)
memory usage: 129.3 MB


In [10]:
temp = dataset.select_dtypes(include=['number'])
X = temp.drop(columns=['RainTomorrow'])  # Features
y = dataset['RainTomorrow']  # Target

In [11]:
y.value_counts()

RainTomorrow
0    113583
1     31877
Name: count, dtype: int64

In [12]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y.view(-1, 1)  # Reshape y to have shape [num_samples, 1]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

custom_dataset = CustomDataset(X_tensor, y_tensor)

In [13]:
train_size = int(0.6 * len(custom_dataset))  # 60% for training
val_size = int(0.2 * len(custom_dataset))   # 20% for validation
test_size = len(custom_dataset) - train_size - val_size  # Remaining 20% for testing

# Split the dataset into train, validation, and test sets
train_dataset, val_dataset, test_dataset = random_split(custom_dataset, [train_size, val_size, test_size])

# Create DataLoader for train, validation, and test data
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [14]:
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([64, 117])
Shape of y: torch.Size([64, 1]) torch.float32


In [15]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

class EarlyStopping:
    def __init__(self, min_delta=0.001, patience=20, restore_best_weights=True):
        self.min_delta = min_delta
        self.patience = patience
        self.restore_best_weights = restore_best_weights
        self.best_loss = float('inf')
        self.counter = 0
        self.best_model = None

    def __call__(self, current_loss):
        if current_loss < self.best_loss - self.min_delta:
            self.best_loss = current_loss
            self.counter = 0
            if self.restore_best_weights:
                self.best_model = model.state_dict()
        else:
            self.counter += 1
            if self.counter >= self.patience:
                if self.restore_best_weights:
                    model.load_state_dict(self.best_model)
                return True
        return False

# Define model
class RainPredictionModel(nn.Module):
    def __init__(self, input_size):
        super(RainPredictionModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, 16)
        self.dropout1 = nn.Dropout(0.25)
        self.fc4 = nn.Linear(16, 8)
        self.dropout2 = nn.Dropout(0.5)
        self.fc5 = nn.Linear(8, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc4(x))
        x = self.dropout2(x)
        x = torch.sigmoid(self.fc5(x))
        return x

input_size = len(dataset.columns) - 1
model = RainPredictionModel(input_size).to(device)
print(model)

Using cpu device
RainPredictionModel(
  (fc1): Linear(in_features=117, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=16, bias=True)
  (dropout1): Dropout(p=0.25, inplace=False)
  (fc4): Linear(in_features=16, out_features=8, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (fc5): Linear(in_features=8, out_features=1, bias=True)
)


In [16]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

In [17]:
def train(dataloader, model, loss_fn, optimizer, early_stopping=None, patience=20):
    size = len(dataloader.dataset)
    model.train()
    best_val_loss = float('inf')
    no_improvement = 0
    
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss_value, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")
            
            # Validate the model and check for early stopping
            if early_stopping is not None:
                val_loss = validate(val_dataloader, model, loss_fn)
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    no_improvement = 0
                else:
                    no_improvement += 1
                    if no_improvement >= patience:
                        print(f"Early stopping after {batch} batches.")
                        break

def validate(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    model.eval()
    val_loss = 0.0
    
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            val_loss += loss_fn(pred, y).item()
    
    val_loss /= len(dataloader)
    return val_loss

In [18]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [19]:
# epochs = 5
for t in range(epoch_num):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 1.296614  [   64/87276]
loss: 0.714250  [ 6464/87276]
loss: 0.565615  [12864/87276]
loss: 0.642743  [19264/87276]
loss: 0.645066  [25664/87276]
loss: 0.597777  [32064/87276]
loss: 0.540913  [38464/87276]
loss: 0.515097  [44864/87276]
loss: 0.508729  [51264/87276]
loss: 0.666823  [57664/87276]
loss: 0.618542  [64064/87276]
loss: 0.503358  [70464/87276]
loss: 0.535612  [76864/87276]
loss: 0.572420  [83264/87276]
Test Error: 
 Accuracy: 50.1%, Avg loss: 0.533891 

Epoch 2
-------------------------------
loss: 0.532730  [   64/87276]
loss: 0.553558  [ 6464/87276]
loss: 0.667963  [12864/87276]
loss: 0.586269  [19264/87276]
loss: 0.589268  [25664/87276]
loss: 0.562558  [32064/87276]
loss: 0.575293  [38464/87276]
loss: 0.508452  [44864/87276]
loss: 0.496037  [51264/87276]
loss: 0.517039  [57664/87276]
loss: 0.607502  [64064/87276]
loss: 0.539752  [70464/87276]
loss: 0.543677  [76864/87276]
loss: 0.604666  [83264/87276]
Test Error: 
 Accuracy: 50.1

In [20]:
model.eval()    # 평가시에는 dropout이 OFF 된다.
correct = 0
for data, target in test_dataloader:
    data = data.to(device)
    target = target.to(device)
    output = model(data)
    prediction = output.data.max(1)[1]
    correct += prediction.eq(target.data).sum()
print('Test set Accuracy : {:.2f}%'.format(correct / len(test_dataloader.dataset)))

Test set Accuracy : 50.05%


In [21]:
# Create an empty list to store results
results = []

# Number id
number_id = 0

# Iterate through optimizers and learning rates
for optimizer_name, optimizer_class in optimizers.items():
    for lr in learning_rates:
        print(f"Training with optimizer: {optimizer_name}, Learning Rate: {lr}")

        optimizer = optimizer_class(model.parameters(), lr=lr)

        for epoch in range(epoch_num):
            print(f"Epoch {epoch + 1}\n-------------------------------")
            train(train_dataloader, model, loss_fn, optimizer)
            test(test_dataloader, model, loss_fn)

        model.eval()
        correct = 0
        test_loss = 0
        for data, target in test_dataloader:
            data = data.to(device)
            target = target.to(device)
            output = model(data)
            test_loss += loss_fn(output, target).item()  # Calculating loss
            prediction = output.data.max(1)[1]
            correct += prediction.eq(target.data).sum()

        accuracy = correct / len(test_dataloader.dataset)
        avg_loss = test_loss / len(test_dataloader.dataset)  # Average test loss
        
        print(f'Test set Accuracy : {accuracy:.2f}%')
        print(f'Average Test Loss : {avg_loss:.4f}')

        results.append((number_id, optimizer_name, lr, accuracy.item(), avg_loss))

        number_id += 1

results_df = pd.DataFrame(results, columns=['Number', 'Optimizer', 'Learning Rate', 'Accuracy', 'Average Loss'])

Training with optimizer: SGD, Learning Rate: 0.01
Epoch 1
-------------------------------
loss: 0.553930  [   64/87276]
loss: 0.514273  [ 6464/87276]
loss: 0.555884  [12864/87276]
loss: 0.565869  [19264/87276]
loss: 0.432834  [25664/87276]
loss: 0.484872  [32064/87276]
loss: 0.636015  [38464/87276]
loss: 0.518173  [44864/87276]
loss: 0.631445  [51264/87276]
loss: 0.578633  [57664/87276]
loss: 0.432113  [64064/87276]
loss: 0.506124  [70464/87276]
loss: 0.499836  [76864/87276]
loss: 0.558418  [83264/87276]
Test Error: 
 Accuracy: 50.1%, Avg loss: 0.529472 

Epoch 2
-------------------------------
loss: 0.466951  [   64/87276]
loss: 0.546825  [ 6464/87276]
loss: 0.477695  [12864/87276]
loss: 0.577441  [19264/87276]
loss: 0.542198  [25664/87276]
loss: 0.583887  [32064/87276]
loss: 0.566990  [38464/87276]
loss: 0.530054  [44864/87276]
loss: 0.493131  [51264/87276]
loss: 0.582123  [57664/87276]
loss: 0.477099  [64064/87276]
loss: 0.600345  [70464/87276]
loss: 0.602751  [76864/87276]
loss: 0.

In [22]:
results_df.to_csv("Results.csv", index=False)
results_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21 entries, 0 to 20
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Number         21 non-null     int64  
 1   Optimizer      21 non-null     object 
 2   Learning Rate  21 non-null     float64
 3   Accuracy       21 non-null     float64
 4   Average Loss   21 non-null     float64
dtypes: float64(3), int64(1), object(1)
memory usage: 968.0+ bytes
