In [1]:
import copy
import os

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import tqdm
from numpy import save
from torch import nn

from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedKFold

In [2]:
# Read the raw CSV export into the working frame that the following cells transform.
df = pd.read_csv(filepath_or_buffer="raw_data.csv")

In [3]:
# Discard rows that have no RainTomorrow value (the label is derived from it).
# The surviving rows keep their original index labels, so the index has gaps.
df = df.dropna(subset=["RainTomorrow"])

In [4]:
# Binary flag: 1 where RainToday is exactly "Yes", 0 for every other value
# (missing entries therefore also become 0).
df["TodayRain"] = df["RainToday"].eq("Yes").astype(int)

In [5]:
# Binary label: 1 where RainTomorrow is exactly "Yes", 0 otherwise.
df["target"] = df["RainTomorrow"].eq("Yes").astype(int)

In [6]:
# Parse the Date column into datetimes so calendar fields can be derived from it.
df["Timestamp"] = pd.to_datetime(df["Date"])

In [7]:
# Derive calendar features from the parsed timestamp through its .dt accessor.
df = df.assign(
    year=df["Timestamp"].dt.year,
    month=df["Timestamp"].dt.month,
    dayofyear=df["Timestamp"].dt.dayofyear,
    weekofyear=df["Timestamp"].dt.isocalendar().week,
)

In [8]:
# Remove the columns that were only needed to build TodayRain, target and the
# calendar features above.
df = df.drop(["RainToday", "RainTomorrow", "Date", "Timestamp"], axis="columns")

In [9]:
# One-hot encode every object-dtype column and replace it with its indicator columns.
#
# `df` still carries the gapped index labels left behind by the earlier row drop,
# and `pd.concat(axis=1)` aligns on index labels. Each one-hot frame is therefore
# built with `index=temp.index`; with a default RangeIndex the encoded rows would
# be attached to the wrong observations and padded with NaN.
temp = df.copy()
print(len(df))

categorical_cols = [col for col in temp.columns if temp[col].dtype == "object"]

one_hot_frames = []
for cat_col in categorical_cols:
    encoder = OneHotEncoder()
    one_hot_array = encoder.fit_transform(temp[[cat_col]]).toarray()

    # Keep the source rows' labels so the later concat lines up row for row.
    one_hot_df = pd.DataFrame(
        one_hot_array,
        columns=encoder.get_feature_names_out(),
        index=temp.index,
    )
    one_hot_frames.append(one_hot_df)

# Single concat: remaining original columns first, then each indicator group in
# the order the categorical columns appeared.
df = pd.concat([df.drop(columns=categorical_cols), *one_hot_frames], axis=1)

# Sanity check: encoding must neither add nor drop observations.
assert len(df) == len(temp), "one-hot encoding changed the number of rows"

142193


In [10]:
# Fill every remaining NaN with its column mean. The imputer returns a bare
# array, so the column names are re-attached (the row index is reset to 0..n-1).
imp_mean = SimpleImputer(strategy="mean", missing_values=np.nan)
imputed_values = imp_mean.fit_transform(df)
df = pd.DataFrame(imputed_values, columns=df.columns)

In [11]:
# Shuffle all rows. The fixed seed makes the saved arrays and the downstream
# (unshuffled) cross-validation folds reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [12]:
# Feature matrix: every column except the label.
X = df.drop("target", axis=1)

In [13]:
# Label vector: the binary target column.
y = df.loc[:, "target"]

In [14]:
# Directory prefix for the saved .npy arrays. File names are appended to it
# directly inside f-strings, so the trailing slash is required.
data_folder: str = "./data/"

In [15]:
# Persist the feature matrix and labels as .npy arrays so later cells can reload
# them without re-running the preprocessing.
# np.save does not create missing directories, so make sure the target exists.
os.makedirs(data_folder, exist_ok=True)
save(f"{data_folder}X.npy", X)
save(f"{data_folder}y.npy", y)

In [24]:
from numpy import load

In [25]:
# Reload the saved feature and label arrays from the data folder.
X, y = (load(f"{data_folder}{array_name}.npy") for array_name in ("X", "y"))

In [32]:
# Wrap both arrays in DataFrames so the cross-validation cell can use .iloc / .values.
X, y = pd.DataFrame(X), pd.DataFrame(y)

In [16]:
# Make device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [17]:
class Wide(nn.Module):
    """Binary classifier with a single wide hidden layer.

    Architecture: Linear(n_features -> hidden_units) -> ReLU ->
    Linear(hidden_units -> 1) -> Sigmoid.

    Args:
        n_features: Number of input columns per sample. Defaults to 121, the
            size that was previously hard-coded.
        hidden_units: Width of the hidden layer. Defaults to 363.

    The forward pass returns a tensor with a trailing dimension of 1 holding
    sigmoid outputs, which is the form ``nn.BCELoss`` expects.
    """

    def __init__(self, n_features=121, hidden_units=363):
        super().__init__()
        # Attribute names are kept as-is so existing state_dict keys still load.
        self.hidden = nn.Linear(n_features, hidden_units)
        self.relu = nn.ReLU()
        self.output = nn.Linear(hidden_units, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature rows to one sigmoid output per row."""
        x = self.relu(self.hidden(x))
        x = self.sigmoid(self.output(x))
        return x

In [18]:
class Deep(nn.Module):
    """Binary classifier with three equally sized hidden layers.

    Architecture: three Linear + ReLU stages followed by
    Linear(hidden_units -> 1) -> Sigmoid.

    Args:
        n_features: Number of input columns per sample. Defaults to 121, the
            size that was previously hard-coded.
        hidden_units: Width of each hidden layer. Defaults to 121.

    The forward pass returns a tensor with a trailing dimension of 1 holding
    sigmoid outputs, which is the form ``nn.BCELoss`` expects.
    """

    def __init__(self, n_features=121, hidden_units=121):
        super().__init__()
        # Attribute names are kept as-is so existing state_dict keys still load.
        self.layer1 = nn.Linear(n_features, hidden_units)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(hidden_units, hidden_units)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(hidden_units, hidden_units)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(hidden_units, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a batch of feature rows to one sigmoid output per row."""
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x

In [19]:
def model_train(model, X_train, y_train, X_val, y_val,
                n_epochs=50, batch_size=200, lr=0.0001, show_progress=False):
    """Train a binary classifier and keep the weights of its best epoch.

    The model is trained with Adam and binary cross entropy on consecutive
    mini-batches of ``X_train`` (no reshuffling between epochs). After every
    epoch the accuracy on ``(X_val, y_val)`` is measured; the weights of the
    epoch with the highest validation accuracy are restored before returning.

    Args:
        model: ``nn.Module`` whose output is a probability per sample (it is
            fed straight into ``nn.BCELoss`` and rounded for accuracy).
        X_train, y_train: Training features and labels as tensors. ``y_train``
            must have the same shape as the model output.
        X_val, y_val: Validation features and labels, on the same device as
            the training tensors.
        n_epochs: Number of passes over the training data.
        batch_size: Number of rows per mini-batch.
        lr: Adam learning rate.
        show_progress: When True, show a tqdm bar with per-batch loss and
            accuracy. Off by default, matching the previously disabled bar.

    Returns:
        The best validation accuracy as a float (``-inf`` if no epoch ran).
    """
    # Follow the data's device instead of forcing CUDA, so the function also
    # works on CPU-only machines.
    model.to(X_train.device)

    # loss function and optimizer
    loss_fn = nn.BCELoss()  # binary cross entropy
    optimizer = optim.Adam(model.parameters(), lr=lr)

    batch_starts = range(0, len(X_train), batch_size)

    # Hold the best model
    best_acc = -np.inf
    best_weights = None

    for epoch in range(n_epochs):
        model.train()

        bar = None
        if show_progress:
            bar = tqdm.tqdm(batch_starts, unit="batch", mininterval=0)
            bar.set_description(f"Epoch {epoch}")

        for start in (bar if bar is not None else batch_starts):
            # take a batch
            X_batch = X_train[start:start + batch_size]
            y_batch = y_train[start:start + batch_size]
            # forward pass
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            # backward pass and weight update
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # Per-batch metrics are only needed for the progress bar; skipping
            # them otherwise avoids a host/device sync on every batch.
            if bar is not None:
                batch_acc = (y_pred.round() == y_batch).float().mean()
                bar.set_postfix(loss=float(loss), acc=float(batch_acc))

        if bar is not None:
            bar.close()

        # Evaluate at the end of each epoch. no_grad keeps autograd from
        # building a graph over the whole validation set.
        model.eval()
        with torch.no_grad():
            y_pred = model(X_val)
        acc = float((y_pred.round() == y_val).float().mean())
        if epoch % 10 == 0:
            print(f"Acc is {acc}")
        if acc > best_acc:
            best_acc = acc
            best_weights = copy.deepcopy(model.state_dict())

    # Restore the best weights; they are absent when no epoch ran or no epoch
    # produced a comparable accuracy.
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return best_acc

In [None]:
# 5-fold stratified cross-validation comparing the Wide and Deep models.
# The folds are taken in row order (shuffle=False).
sss = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)

# enumerate supplies the fold number; a manual counter that is never
# incremented would label every fold "Split 0".
for counter, (train_index, test_index) in enumerate(sss.split(X, y)):
    Xtrain = X.iloc[train_index]
    Xtest = X.iloc[test_index]
    ytrain = y.iloc[train_index]
    ytest = y.iloc[test_index]

    # Labels are reshaped to (n, 1) to match the models' output shape.
    X_train_tensor = torch.tensor(Xtrain.values, dtype=torch.float32).to(device)
    y_train_tensor = torch.tensor(ytrain.values, dtype=torch.float32).to(device).reshape(-1, 1)
    X_test_tensor = torch.tensor(Xtest.values, dtype=torch.float32).to(device)
    y_test_tensor = torch.tensor(ytest.values, dtype=torch.float32).to(device).reshape(-1, 1)

    print(f"Split {counter}")

    model_w = Wide()
    acc = model_train(model_w, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)
    print("Accuracy (wide): %.2f" % acc)

    model_d = Deep()
    acc = model_train(model_d, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)
    # Label this result as the Deep model's, not the Wide model's.
    print("Accuracy (deep): %.2f" % acc)

Split 0
Acc is 0.8323428630828857
