In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
from torchvision import datasets, transforms
import torch.optim as optim
from torch.utils.data import random_split

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
print(torch.cuda.is_available())

frac_train = 0.8
batch_size = 64
learning_rate = 0.0001
learning_rates = [0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005, 0.00001]
optimizers = {
    'SGD': torch.optim.SGD,
    'RMSprop': torch.optim.RMSprop,
    'Adam': torch.optim.Adam,
    #'LBFGS' : torch.optim.LBFGS,
}
epoch_num = 20

True


In [3]:
# Download training data from open datasets.
dataset = pd.read_csv("/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 145460 entries, 0 to 145459
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           145460 non-null  object 
 1   Location       145460 non-null  object 
 2   MinTemp        143975 non-null  float64
 3   MaxTemp        144199 non-null  float64
 4   Rainfall       142199 non-null  float64
 5   Evaporation    82670 non-null   float64
 6   Sunshine       75625 non-null   float64
 7   WindGustDir    135134 non-null  object 
 8   WindGustSpeed  135197 non-null  float64
 9   WindDir9am     134894 non-null  object 
 10  WindDir3pm     141232 non-null  object 
 11  WindSpeed9am   143693 non-null  float64
 12  WindSpeed3pm   142398 non-null  float64
 13  Humidity9am    142806 non-null  float64
 14  Humidity3pm    140953 non-null  float64
 15  Pressure9am    130395 non-null  float64
 16  Pressure3pm    130432 non-null  float64
 17  Cloud9am       89572 non-null

In [4]:
dataset.head(3)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Albury,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Albury,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Albury,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No


In [5]:
numeric_columns = dataset.select_dtypes(exclude=['object']).columns

for column in numeric_columns:
    median_value = int(dataset[column].median())
    dataset[column].fillna(median_value, inplace=True)

In [6]:
columns_to_encode = ['RainToday', 'RainTomorrow']

for column in columns_to_encode:
    mode_value = dataset[column].mode()
    dataset[column].fillna(mode_value, inplace=True)
    
dataset['RainToday'].value_counts()

RainToday
No     110319
Yes     31880
Name: count, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

for column in columns_to_encode:
    dataset[column] = label_encoder.fit_transform(dataset[column])
    
#from deduction, all the null values remained null, hence produced encoded value of 2
dataset['RainToday'].replace(2, 0, inplace=True)
dataset['RainTomorrow'].replace(2, 0, inplace=True)

In [8]:
dataset['Date'] = pd.to_datetime(dataset['Date'], errors='coerce')
dataset['Year'] = dataset['Date'].dt.year
dataset['Month'] = dataset['Date'].dt.month
dataset['Day'] = dataset['Date'].dt.day

In [9]:
object_columns = dataset.select_dtypes(include=['object']).columns
dataset = dataset.drop(object_columns, axis=1)

In [10]:
temp = dataset.select_dtypes(include=['number'])
X = temp.drop(columns=['RainTomorrow'])  # Features
y = dataset['RainTomorrow']  # Target

In [11]:
y.value_counts()

RainTomorrow
0    113583
1     31877
Name: count, dtype: int64

In [12]:
class CustomDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y.view(-1, 1)  # Reshape y to have shape [num_samples, 1]
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

X_tensor = torch.tensor(X.values, dtype=torch.float32)
y_tensor = torch.tensor(y.values, dtype=torch.float32)

custom_dataset = CustomDataset(X_tensor, y_tensor)

In [13]:
train_size = int(0.8 * len(custom_dataset))  # 80% of the data for training
test_size = len(custom_dataset) - train_size  # Remaining 20% for testing

# Split the dataset
train_dataset, test_dataset = random_split(custom_dataset, [train_size, test_size])

# Create DataLoader for training data
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Create DataLoader for test data
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [14]:
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([64, 20])
Shape of y: torch.Size([64, 1]) torch.float32


In [15]:
# Get cpu, gpu or mps device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
device = torch.device('cpu')
print(f"Using {device} device")

# Sample layers of Neural Network Class
# nn.Linear(28*28, 512),
# nn.ReLU(),
# nn.Linear(512, 512),
# nn.ReLU(),
# nn.Linear(512, 10)

# Define model
class RainPredictionModel(nn.Module):
    def __init__(self, input_size):
        super(RainPredictionModel, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(input_size, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.flatten(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        x = self.sigmoid(x)
        return x

input_size = 20
model = RainPredictionModel(input_size).to(device)
print(model)

Using cpu device
RainPredictionModel(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (fc1): Linear(in_features=20, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [16]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

In [17]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [18]:
def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [19]:
# epochs = 5
for t in range(epoch_num):
    print(f"Epoch {t+1}\n-------------------------------")
    train(train_dataloader, model, loss_fn, optimizer)
    test(test_dataloader, model, loss_fn)
print("Done!")

Epoch 1
-------------------------------
loss: 10.717715  [   64/116368]
loss: 9.303443  [ 6464/116368]
loss: 7.727368  [12864/116368]
loss: 8.543629  [19264/116368]
loss: 10.766202  [25664/116368]
loss: 8.546373  [32064/116368]
loss: 7.794606  [38464/116368]
loss: 8.491985  [44864/116368]
loss: 14.788044  [51264/116368]
loss: 11.605762  [57664/116368]
loss: 13.767898  [64064/116368]
loss: 8.312973  [70464/116368]
loss: 13.150848  [76864/116368]
loss: 8.494238  [83264/116368]
loss: 10.214846  [89664/116368]
loss: 9.213172  [96064/116368]
loss: 13.926218  [102464/116368]
loss: 9.344378  [108864/116368]
loss: 16.139441  [115264/116368]
Test Error: 
 Accuracy: 50.1%, Avg loss: 10.700687 

Epoch 2
-------------------------------
loss: 8.437740  [   64/116368]
loss: 5.425517  [ 6464/116368]
loss: 9.235620  [12864/116368]
loss: 14.690083  [19264/116368]
loss: 11.514810  [25664/116368]
loss: 11.579418  [32064/116368]
loss: 7.783665  [38464/116368]
loss: 9.327372  [44864/116368]
loss: 9.251909 

In [20]:
model.eval()    # 평가시에는 dropout이 OFF 된다.
correct = 0
for data, target in test_dataloader:
    data = data.to(device)
    target = target.to(device)
    output = model(data)
    prediction = output.data.max(1)[1]
    correct += prediction.eq(target.data).sum()
print('Test set Accuracy : {:.2f}%'.format(correct / len(test_dataloader.dataset)))

Test set Accuracy : 50.12%
