# Catalyst example on tabular data
@DBusAI

In [None]:
from collections import OrderedDict
import numpy as np
from matplotlib.pylab import plt
%matplotlib inline
from sklearn.datasets.california_housing import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

from catalyst.dl import SupervisedRunner
from catalyst.dl.callbacks import SchedulerCallback
from catalyst.contrib.nn import Lookahead
from catalyst.utils import set_global_seed

### Reproduce all
Catalyst provides a special utility for making research results reproducible.

In [None]:
# Fix a single seed and pass it to Catalyst's set_global_seed so runs are repeatable.
SEED=42
set_global_seed(SEED)

### Get some data
In this tutorial we will use the
[California housing dataset](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html)<br>
We also split the data <b>75/25</b> into training and validation sets.

In [None]:
# Load the California housing data as a (features, target) pair of arrays.
X, y = fetch_california_housing(return_X_y=True)

# Hold out 25% of the samples; SEED makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=SEED,
)

### Dataset definition

We have to standardize all the input features (X)

In [None]:
# Scaler that will be fitted on the training features and reused for the test features.
mscl = StandardScaler()

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split.
x_train = mscl.fit_transform(x_train)
x_test = mscl.transform(x_test)

And prepare PyTorch Datasets

In [None]:
# Targets are reshaped into a single column before being wrapped as tensors.
train_features = torch.FloatTensor(x_train)
train_targets = torch.FloatTensor(y_train.reshape(-1, 1))
train_ds = TensorDataset(train_features, train_targets)

test_features = torch.FloatTensor(x_test)
test_targets = torch.FloatTensor(y_test.reshape(-1, 1))
test_ds = TensorDataset(test_features, test_targets)

### DataLoader definition

We have to define the batch size and shuffle the training data:

In [None]:
batch = 120

# Settings shared by both loaders.
loader_kwargs = {"batch_size": batch, "num_workers": 2}

# Only the training loader shuffles; the test loader keeps dataset order.
train_dl = DataLoader(train_ds, shuffle=True, **loader_kwargs)
test_dl = DataLoader(test_ds, shuffle=False, **loader_kwargs)

Catalyst loader:

In [None]:
# Ordered name -> DataLoader mapping: "train" first, then "valid".
data = OrderedDict([
    ("train", train_dl),
    ("valid", test_dl),
])

### Define model

Our neural network structure will be very simple: just an MLP with linear layers of 40, 20 and 1 units. Weights use Xavier-uniform initialization and biases start at zero.

In [None]:
class Net(nn.Module):
    """Small MLP regressor: ``num_features -> 40 -> 20 -> 1``.

    Every linear layer gets Xavier-uniform weights and zero biases. The two
    hidden layers are followed by ReLU; the output layer is left linear so
    the network can emit any real value and always passes gradient back.

    Args:
        num_features: size of the last dimension of the input tensor.
    """

    def __init__(self, num_features):
        super().__init__()
        layers = [40, 20]
        # Layers are created and initialized one at a time so the order in
        # which random numbers are drawn stays L1, L2, L3.
        self.L1 = self._make_linear(num_features, layers[0])
        self.L2 = self._make_linear(layers[0], layers[1])
        self.L3 = self._make_linear(layers[1], 1)

    @staticmethod
    def _make_linear(in_features, out_features):
        """Build a linear layer with Xavier-uniform weights and zero bias."""
        linear = nn.Linear(in_features, out_features)
        nn.init.xavier_uniform_(linear.weight)
        nn.init.zeros_(linear.bias)
        return linear

    def forward(self, x):
        """Map a batch of feature rows to one regression output per row.

        Args:
            x: tensor whose last dimension equals ``num_features``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        x = F.relu(self.L1(x))
        x = F.relu(self.L2(x))
        # No activation here: a ReLU on the output would clamp predictions to
        # >= 0 and zero the gradient whenever the pre-activation is negative.
        return self.L3(x)

In [None]:
# The input width is the number of feature columns in the training matrix.
model = Net(x_train.shape[1])

Adam optimizer and <b>L2 (MSE) loss</b>

In [None]:
# Adam over all model parameters with a learning rate of 1e-2.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
# Mean squared error is the training criterion.
crit = nn.MSELoss()

For model training we need SupervisedRunner and train method:

In [None]:
runner = SupervisedRunner()
# Train for 20 epochs on the loaders in `data`, using "run" as the log directory.
runner.train(
    model=model,
    criterion=crit,
    optimizer=optimizer,
    loaders=data,
    logdir="run",
    num_epochs=20)

### Inference

The inference part is much easier: <br>
<b>/checkpoints/best.pth</b> - the default checkpoint location inside the logdir<br>
<b>run</b> - our logdir

In [None]:
# Predict on the validation loader, resuming from the best checkpoint stored
# under the "run" log directory.
predictions = runner.predict_loader(
    model, data["valid"], resume="run/checkpoints/best.pth", verbose=True
)

### Results

Let's calculate the mean squared error (MSE)

In [None]:
# Mean squared error between the held-out targets and the predictions.
mean_squared_error(y_test, predictions)

###  Prediction Viz

And finally, show a scatter plot of the predictions against the true values

In [None]:
# True targets on the x-axis, flattened predictions on the y-axis.
plt.scatter(y_test, predictions.flatten())