In [58]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
%matplotlib inline

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
from scipy import stats

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [59]:
# Load the raw weather observations and convert the Date column to datetime64
# so that the later `.dt` accessors and `resample(on='Date')` calls work.
df = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')
df['Date'] = pd.to_datetime(df['Date'])
# Preview a handful of rows, transposed so every column is visible.
# Transposing the whole frame only for display is needlessly expensive.
df.head().T

In [60]:
# Print the column dtypes and non-null counts of the raw frame.
df.info()

In [61]:
# Summary statistics of the numeric columns.
df.describe()

In [62]:
# Mean rainfall at weekly, monthly and yearly resolution, one panel per rule.
fig, axs = plt.subplots(3, 1, figsize=(10, 6))
for panel, rule in zip(axs, ('W', 'M', 'Y')):
    mean_rain = df.resample(rule=rule, on='Date')['Rainfall'].mean()
    mean_rain.plot(ax=panel)

fig.tight_layout()

In [63]:
# Visualise where values are missing: one row per observation, one column per feature.
fig, ax = plt.subplots(figsize=(12, 6))
missing_mask = df.isnull()
sns.heatmap(missing_mask, ax=ax)
ax.set_title('Null values')

In [64]:
# Share of observations per RainToday value.
rain_today_counts = df['RainToday'].value_counts()
rain_today_counts.plot(
    kind='pie',
    autopct='%1.1f%%',
    radius=1.5,
    textprops={"fontsize": 16},
    title='% of rainy days',
)

In [65]:
# Mean rainfall for every (year, month) pair.
rain_by_year_month = pd.crosstab(
    df.Date.dt.year,
    df.Date.dt.month,
    values=df.Rainfall,
    aggfunc='mean',
)
sns.heatmap(rain_by_year_month, linewidth=0.1, cmap="Blues")

In [66]:
# Mean rainfall per calendar month.
# Select the Rainfall column *before* aggregating: calling `.mean()` on the whole
# grouped frame tries to average the object-dtype columns as well, which raises a
# TypeError on pandas >= 2.0 and wastes work on older versions.
monthly_rainfall = df.groupby(df.Date.dt.month)['Rainfall'].mean().to_frame()
sns.heatmap(monthly_rainfall)

Drop sparse rows.

A row is kept only if more than half of its columns are non-null (this also removes rows that are entirely null).

In [67]:
# A row survives only if it has at least this many non-null values
# (strictly more than half of the columns).
min_non_null = len(df.columns) // 2 + 1
df.dropna(thresh=min_non_null, inplace=True)

Fill missing numerical values with the column mean, and missing categorical values with the last observed value (forward fill).

In [68]:
# Impute missing values column by column and record which columns are
# categorical (object dtype) and which are numerical (float64).
# Columns of any other dtype (e.g. the datetime Date column) are left untouched.
num_cols = []
cat_cols = []

for col in df:
    dtype_name = df[col].dtype.name
    if dtype_name == 'object':
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary under copy-on-write
        # and `fillna(method=...)` is deprecated.
        # `bfill` only covers leading gaps that forward fill cannot reach.
        # NOTE(review): the target RainTomorrow is object dtype too, so its missing
        # labels are imputed here as well — confirm this is intended.
        df[col] = df[col].ffill().bfill()
        cat_cols.append(col)
        print('filled', col)

    elif dtype_name == 'float64':
        df[col] = df[col].fillna(df[col].mean())
        num_cols.append(col)
        print('filled', col)
        

In [69]:
# Histogram of daily rainfall, with the x-axis clipped to 0-150.
p = df['Rainfall'].plot(kind='hist')
p.set_xlim(0, 150)

In [70]:
# Mean rainfall for every (month, day-of-month) pair.
rain_by_month_day = pd.crosstab(
    df.Date.dt.month,
    df.Date.dt.day,
    values=df.Rainfall,
    aggfunc='mean',
)
sns.heatmap(rain_by_month_day, linewidth=0.1, cmap="Blues")

## Find & clean outliers!

In [71]:
# Standardise every numerical column in place.
# NOTE(review): the scaler is fitted on all rows of `df`, i.e. before the
# train/validation/test split performed later in the notebook, so the held-out
# rows contribute to the scaling statistics.
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
df.loc[:, num_cols] = std_scaler.fit_transform(df[num_cols])

In [16]:
# One horizontal boxplot per numerical column, stacked vertically.
n_panels = len(num_cols)
fig, axs = plt.subplots(n_panels, 1, figsize=(10, 20))
for col, ax in zip(num_cols, axs.flatten()):
    sns.boxplot(data=df, x=col, ax=ax)

fig.tight_layout()

In [72]:
# Row count before outlier filtering; a later cell subtracts the post-filter count.
n_outliers = len(df)

In [73]:
# Keep only rows whose numerical columns all have an absolute value below 4.
# `.copy()` makes the result an independent frame: later cells assign new
# columns to `df`, which on a filtered slice triggers SettingWithCopyWarning.
df = df[(df[num_cols].abs() < 4).all(axis=1)].copy()

In [74]:
# Subtract the remaining row count so n_outliers holds the number of rows dropped.
# Not idempotent: re-running this cell subtracts len(df) again.
n_outliers -= len(df)
n_outliers

In [75]:
# Histograms of every column except the first one and the last two.
hist_frame = df.iloc[:, slice(1, -2)]
hist_frame.hist(figsize=(10, 6))
plt.tight_layout()

In [22]:
# Rainfall plotted against each numerical column on a 4x4 grid.
fig, axs = plt.subplots(4, 4, figsize=(12, 6))
for col, ax in zip(num_cols, axs.flatten()):
    sns.lineplot(data=df, x=col, y='Rainfall', ax=ax)

fig.tight_layout()

## Remove columns of high collinearity

In [76]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Variance inflation factor of every numerical column, shown as a two-column table.
design_matrix = df[num_cols].values
VIFs = pd.DataFrame({
    'cols': num_cols,
    'VIFs': [variance_inflation_factor(design_matrix, i)
             for i in range(len(num_cols))],
})
VIFs

In [24]:
# VIF per numerical column, kept both as a list (`vifs`) and as a column -> VIF mapping.
design_matrix = df[num_cols].values
vifs = []
for i in range(len(num_cols)):
    vifs.append(variance_inflation_factor(design_matrix, i))
hVIF_cols = dict(zip(num_cols, vifs))

In [None]:
# Iteratively drop the numerical column with the highest VIF until every
# remaining column has a VIF of at most 5, then shrink `num_cols` to the survivors.
VIF_cols = dict(zip(num_cols, vifs))
max_cols = []
# `VIF_cols and ...` guards against calling max() on an empty mapping should
# every column end up being removed.
while VIF_cols and max(VIF_cols.values()) > 5:
    max_col = max(VIF_cols, key=VIF_cols.get)
    max_cols.append(max_col)
    # Preserve the original column order. A set difference yields an arbitrary
    # order that can differ between kernel sessions, which would silently change
    # the feature order the network (and any saved checkpoint) expects.
    new_cols = [col for col in num_cols if col not in max_cols]
    design_matrix = df[new_cols].values
    new_vifs = [variance_inflation_factor(design_matrix, i) for i in range(len(new_cols))]
    VIF_cols = dict(zip(new_cols, new_vifs))

print(f'Cols Removed (n={len(max_cols)}): ', *max_cols)
num_cols = list(VIF_cols.keys())
pd.DataFrame(VIF_cols, index=[0]).T

## Categorical data

In [77]:
# Object-dtype columns collected by the imputation cell.
cat_cols

In [78]:
from sklearn.preprocessing import LabelEncoder

# Replace every categorical column by integer codes.
# A fresh encoder is fitted per column and then discarded.
for col in cat_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])

In [79]:
# RainTomorrow is the prediction target, so it must not be used as an input feature.
# The membership check keeps the cell re-runnable: a bare `.remove` raises
# ValueError the second time it is executed.
if 'RainTomorrow' in cat_cols:
    cat_cols.remove('RainTomorrow')
#cat_cols.remove('Location') 

In [80]:
# List the categorical columns that remain as model inputs.
print(f'Trainable categorical features (n={len(cat_cols)}):', *cat_cols)

In [81]:
# inspired by https://jovian.ai/aakanksha-ns/shelter-outcome
embd_lens = {col: len(df[col].unique()) 
             for col in cat_cols if len(df[col].unique()) > 2}
embd_lens

In [82]:
# (cardinality, embedding width) pairs; the width is half the cardinality
# (rounded up) and capped at 50.
embd_sizes = [(n_levels, min(50, (n_levels + 1) // 2))
              for n_levels in embd_lens.values()]
embd_sizes

## Encoding Month/Day 

In [83]:
# Inspired by https://ianlondon.github.io/blog/encoding-cyclical-features-24hour-time/
c = 2 * np.pi
month_d = c *  df.Date.dt.month / 12
df['month_cos'] = np.cos(month_d)
df['month_sin'] = np.sin(month_d)


day_d = c *  df.Date.dt.day / 31 
df['day_cos'] = np.cos(day_d)
df['day_sin'] = np.sin(day_d)

df.drop(['Date'], axis=1, inplace=True)

In [84]:
# Sanity check: each (cos, sin) pair should trace a circle.
fig, axs = plt.subplots(1, 2, figsize=(8, 4))

cyclic_pairs = [('month_cos', 'month_sin'), ('day_cos', 'day_sin')]
for panel, (cos_col, sin_col) in zip(axs, cyclic_pairs):
    df.plot.scatter(cos_col, sin_col, ax=panel)
fig.tight_layout()

In [85]:
# Register the cyclical date features as numerical inputs.
# Only missing names are appended, so re-running the cell does not duplicate
# columns (duplicates would widen X and break the network's input size).
for cyc_col in ['day_cos', 'day_sin', 'month_cos', 'month_sin']:
    if cyc_col not in num_cols:
        num_cols.append(cyc_col)
num_cols

## Data balance

In [86]:
# Feature matrix: numerical columns first, then categorical ones. Target: RainTomorrow.
selected_cols = [*num_cols, *cat_cols]
y = df['RainTomorrow']
X = df[selected_cols]

In [87]:
# Preview the first rows with one line per feature.
# Take the head *before* transposing: `X.T` materialises a transposed copy of
# the entire feature matrix only to display it.
X.head().T

## Data split

In [88]:
from sklearn.model_selection import train_test_split

# Actual proportions: 5% test, then 6% of the remaining 95% for validation,
# i.e. roughly 89.3% training / 5.7% validation / 5% testing.
# NOTE(review): the previous comment claimed 80/10/10 — confirm which was intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05,
                                                    random_state=1337)

# Not idempotent: X_train/y_train are overwritten, so re-running this cell alone
# splits the already reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.06,
                                                    random_state=1337)

In [89]:
from imblearn.over_sampling import SMOTENC

# Balance the training classes by synthesising minority-class samples.
# Plain SMOTE interpolates between neighbours on every column, which produces
# meaningless fractional values for the label-encoded categorical columns
# (later truncated by the int64 cast in the Dataset). SMOTENC interpolates the
# continuous columns only and picks a category by majority vote of the neighbours.
categorical_idx = [X_train.columns.get_loc(col) for col in cat_cols]
smote = SMOTENC(categorical_features=categorical_idx, random_state=1337)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [90]:
# Class counts of the training target after resampling.
train_class_counts = y_train.value_counts()
train_class_counts.plot.bar()

## Model

In [91]:
import torch
from torch.utils.data import Dataset, DataLoader
class Dataset(torch.utils.data.Dataset):
    """Tabular dataset yielding ``(x_cont, x_cat, y)`` triples.

    The base class is referenced by its qualified name so this class no longer
    inherits from the very name it rebinds.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame containing the categorical and numerical columns.
    y_df : pandas.Series or array-like
        Target values, aligned row by row with ``X``.
    cat_features : list of str, optional
        Columns returned as int64 codes. Defaults to the notebook-level ``cat_cols``.
    num_features : list of str, optional
        Columns returned as float32 values. Defaults to the notebook-level ``num_cols``.
    """

    def __init__(self, X, y_df, cat_features=None, num_features=None):
        # inspired by https://jovian.ai/aakanksha-ns/shelter-outcome
        # Fall back to the notebook globals only when no explicit lists are given.
        if cat_features is None:
            cat_features = cat_cols
        if num_features is None:
            num_features = num_cols

        self.X_cat = X[list(cat_features)].to_numpy().astype(np.int64)
        self.X_cont = X[list(num_features)].to_numpy().astype(np.float32)
        # A plain array gives positional access regardless of the original index
        # and avoids a pandas label lookup for every sample.
        self.y = np.asarray(y_df)
        assert len(self.X_cont) == len(self.y) == len(self.X_cat)

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(continuous features, categorical codes, target)`` for row ``idx``."""
        return self.X_cont[idx], self.X_cat[idx], self.y[idx]


# Shared DataLoader settings (also reused for the test loader further down).
ld_params = {'batch_size': 128,
          'shuffle': True,}

train_ds = Dataset(X_train, y_train)
train_ld = DataLoader(train_ds, **ld_params)

# Evaluation data is read in a fixed order: shuffling brings no benefit there and
# makes batch-averaged validation metrics change from one call to the next.
val_ds = Dataset(X_val, y_val)
val_ld = DataLoader(val_ds, **{**ld_params, 'shuffle': False})

In [93]:
# embedding inspired by https://jovian.ai/aakanksha-ns/shelter-outcome

import torch.nn as nn
class NNet(nn.Module):
    """Feed-forward binary classifier over embedded categorical and continuous inputs.

    Each of the first ``len(emb_sizes)`` categorical columns is passed through its
    own embedding table; the embeddings are concatenated with the continuous
    features and fed to an MLP whose last layer is a sigmoid, so the output is a
    probability of shape ``(batch, 1)``.

    Parameters
    ----------
    n_cont : int
        Number of continuous input features.
    emb_sizes : list of (int, int), optional
        ``(cardinality, embedding width)`` per embedded categorical column.
        Defaults to the notebook-level ``embd_sizes``.

    Notes
    -----
    Columns of ``x_cat`` beyond ``len(emb_sizes)`` are ignored by ``forward``.
    Module names and the layer order inside ``init`` are unchanged, so
    previously saved state dicts still load.
    """

    def __init__(self, n_cont, emb_sizes=None):
        super().__init__()
        if emb_sizes is None:
            emb_sizes = embd_sizes

        self.embds = nn.ModuleList([nn.Embedding(n_cats, size)
                                    for n_cats, size in emb_sizes])
        n_emb = sum(e.embedding_dim for e in self.embds)
        self.emb_drop = nn.Dropout(0.05)
        self.act = nn.ReLU()
        n_in = n_emb + n_cont

        # Hidden stack: Linear -> BatchNorm -> ReLU -> Dropout per (width, dropout) pair.
        hidden = [(n_in * 2, 0.3), (256, 0.3), (128, 0.3), (64, 0.3), (16, 0.1)]
        layers = []
        prev_width = n_in
        for width, drop_p in hidden:
            layers += [
                nn.Linear(prev_width, width),
                nn.BatchNorm1d(width),
                self.act,
                nn.Dropout(drop_p),
            ]
            prev_width = width
        layers += [nn.Linear(prev_width, 1), nn.Sigmoid()]
        self.init = nn.Sequential(*layers)

    def forward(self, x_cont, x_cat):
        """Return probabilities of shape ``(batch, 1)``.

        ``x_cont`` is the float tensor of continuous features and ``x_cat`` the
        integer tensor of categorical codes, one column per categorical feature.
        """
        if len(self.embds) > 0:
            embedded = torch.cat(
                [emb(x_cat[:, i]) for i, emb in enumerate(self.embds)], 1)
            # Apply the embedding dropout that was declared in __init__ but
            # previously never used.
            embedded = self.emb_drop(embedded)
            x = torch.cat([embedded, x_cont], 1)
        else:
            # torch.cat cannot concatenate an empty list of embeddings.
            x = x_cont
        return self.init(x)

## Train 

In [94]:
import torch.optim as optim

# Network sized for the current numerical features, binary cross-entropy on the
# sigmoid outputs, and an Adam optimiser.
n_cont_features = len(num_cols)
model = NNet(n_cont_features)
criterion = nn.BCELoss()
optimizer = optim.Adam(params=model.parameters(), lr=0.001)
print(model)

In [142]:
# Resume from a previously saved checkpoint when one exists.
# Without the guard this cell fails on a fresh run, because './model' is only
# written by the training loop further down.
# NOTE: torch.load unpickles the file — only load checkpoints you created yourself.
if os.path.exists('./model'):
    model.load_state_dict(torch.load('./model', map_location='cpu'))

In [95]:
from sklearn.metrics import f1_score, accuracy_score
def calc_scores(model, data_ld, criterion):
    """Evaluate ``model`` on every batch of ``data_ld``.

    Parameters
    ----------
    model : callable ``model(x_cont, x_cat)`` returning probabilities, one per sample.
    data_ld : iterable yielding ``(x_cont, x_cat, labels)`` batches.
    criterion : loss called as ``criterion(preds, labels.float())``; assumed to
        return the batch mean (the default reduction of ``nn.BCELoss``).

    Returns
    -------
    (loss, accuracy, f1) : tuple of float
        Sample-weighted mean loss, plus accuracy and F1 computed over all
        samples at once.

    Raises
    ------
    ValueError
        If ``data_ld`` yields no samples.
    """
    model.eval()
    total_loss = 0.0
    n_samples = 0
    all_lbls = []
    all_preds = []

    # No gradients are needed for evaluation; tracking them wastes memory and
    # keeps every batch's graph alive through the accumulated loss.
    with torch.no_grad():
        for x_cont, x_cat, lbls in data_ld:
            # reshape(-1) keeps a 1-D tensor even for a batch of one sample,
            # where a bare squeeze() would collapse it to a scalar.
            preds = model(x_cont, x_cat).reshape(-1)
            lbls = lbls.reshape(-1)
            batch_size = lbls.shape[0]

            # Weight by batch size so a short last batch does not count as a full one.
            total_loss += criterion(preds, lbls.float()).item() * batch_size
            n_samples += batch_size

            all_preds.extend(torch.round(preds).tolist())
            all_lbls.extend(lbls.tolist())

    if n_samples == 0:
        raise ValueError('calc_scores received an empty data loader')

    # Accuracy and F1 are computed once over the whole set: the mean of per-batch
    # F1 scores is not the F1 score of the dataset.
    acc = accuracy_score(all_lbls, all_preds)
    f1 = f1_score(all_lbls, all_preds)
    return total_loss / n_samples, acc, f1
        

In [96]:
# Validation metrics of the model as it stands before the training loop below.
print("val loss: %0.4f acc: %0.4f f1: %0.4f" % calc_scores(model, val_ld, criterion))


In [97]:
# Validation F1 that must be exceeded before the training loop writes a checkpoint.
# NOTE(review): 0.6405 looks like a best score carried over from an earlier
# session — confirm; on a fresh run no checkpoint is saved until it is beaten.
max_val_f1 = 0.6405

In [98]:
# Run a garbage-collection pass before the training loop.
import gc
gc.collect()

In [99]:
# Train for a fixed number of epochs. After every epoch the model is scored on
# the validation and training loaders, the metrics are stored in `scores`, and
# the weights are checkpointed to 'model' whenever the validation F1 improves
# on `max_val_f1`.
scores = dict()
for epoch in range(200):  # loop over the dataset multiple times
    # calc_scores switches the model to eval mode, so training mode has to be
    # restored once per epoch (not once per batch).
    model.train(True)
    running_loss = 0.0
    n_batches = 0
    for x_cont, x_cat, labels in train_ld:
        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(x_cont, x_cat)
        loss = criterion(outputs, labels.float().unsqueeze(1))
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        n_batches += 1

    val_loss, val_acc, val_f1 = calc_scores(model, val_ld, criterion)
    t_loss, t_acc, t_f1 = calc_scores(model, train_ld, criterion)

    # Keep the per-epoch history; `scores` was created but never filled before.
    scores[epoch] = {
        'running_loss': running_loss / max(n_batches, 1),
        'val_loss': float(val_loss), 'val_acc': float(val_acc), 'val_f1': float(val_f1),
        'train_loss': float(t_loss), 'train_acc': float(t_acc), 'train_f1': float(t_f1),
    }

    print(epoch, "val loss:   %0.6f acc: %0.4f f1: %0.4f" %  (val_loss, val_acc, val_f1) )
    print(epoch, "train loss: %0.6f acc: %0.4f f1: %0.4f" %  (t_loss, t_acc, t_f1) )
    if val_f1 > max_val_f1:
        torch.save(model.state_dict(), 'model')
        max_val_f1 = val_f1
        print('model saved')

    print()
    gc.collect()


print('Finished Training')

In [106]:
from sklearn.metrics import confusion_matrix

In [None]:
# Held-out test set. It is read in a fixed order: shuffling an evaluation loader
# brings no benefit and makes batch-averaged metrics vary between calls.
test_ds = Dataset(X_test, y_test)
test_ld = DataLoader(test_ds, **{**ld_params, 'shuffle': False})

In [133]:
# Collect the rounded predictions and the true labels of the whole test set
# (used for the confusion matrix below).
y_preds = []
y_full = []
model.eval()
# Inference only: skip gradient tracking.
with torch.no_grad():
    for x_cont, x_cat, lbls in test_ld:
        # reshape(-1) keeps a 1-D tensor even when the last batch holds a single
        # sample; a bare squeeze() would yield a scalar, whose .tolist() is a
        # float that cannot be appended with `+=`.
        preds = model(x_cont, x_cat).reshape(-1)
        lbl_preds = torch.round(preds).tolist()
        y_preds += lbl_preds
        y_full += lbls.reshape(-1).tolist()

In [145]:
# Final metrics on the held-out test loader.
print("test loss:   %0.6f acc: %0.4f f1: %0.4f" %  calc_scores(model, test_ld, criterion) )

In [146]:
# Final metrics on the training loader. The label previously read "test loss",
# which mislabelled the output.
print("train loss: %0.6f acc: %0.4f f1: %0.4f" %  calc_scores(model, train_ld, criterion) )

In [147]:
# Final metrics on the validation loader. The label previously read "test loss",
# which mislabelled the output.
print("val loss:   %0.6f acc: %0.4f f1: %0.4f" %  calc_scores(model, val_ld, criterion) )

In [135]:
# Confusion matrix of the test predictions, normalised over the true labels
# so that each row sums to 1.
test_confusion = confusion_matrix(y_full, y_preds, normalize='true')
sns.heatmap(test_confusion, annot=True)