[![Open in Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/biodatlab/deep-learning-skooldio/blob/master/student_notebooks/04_pytorch_for_tabular_data.ipynb)


## **Predicting second hand car price from Rodkaidee: Regression**

This notebook walks through the following steps for predicting second-hand car prices from the [Rodkaidee website](https://rod.kaidee.com/):

- Import libraries
- Prepare data: Categorical and numerical features
- Train linear regression model
- Train one-layer neural network model for regression (equivalent to linear regression)
- Train multi-layer neural network model for regression

In [None]:
import os
import os.path as op

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.metrics import mean_squared_error
import joblib

## **Data Preparation**

- Download data from [`github.com/biodatlab/deep-learning-skooldio`](https://github.com/biodatlab/deep-learning-skooldio)
- Clean and split data

In [None]:
!git clone https://github.com/biodatlab/deep-learning-skooldio

In [None]:
# Read the listings CSV from the cloned course repository and preview the first rows.
dataset_csv = op.join("deep-learning-skooldio", "rodkaidee-dataset", "Rodkaidee Data.csv")
df = pd.read_csv(dataset_csv)
df.head(n=5)

## Clean and split data

In [None]:
import os.path as op

# TODO: Clean price and mileage from string to float

# TODO: Split dataset using `train_test_split` to 80% train and 20% test

In [None]:
# TODO: Check the size (shape) of the split dataframes

## Apply One-hot Encoding to categorical values

For example, if a column has three unique values 'A', 'B', and 'C', rows that contain 'C' will be transformed to [0, 0, 1].

In [None]:
# TODO: Create one-hot encoder class using OneHotEncoder for transforming categorical columns in variable name ``ohe``

In [None]:
# TODO: Use ``ohe`` to transform our dataframe


## Normalize continuous values

We will use `MinMaxScaler` to normalize continuous values.
For example, column X has values from 0 to 1000000, so we need to normalize them to 0 to 1.
Alternatively `StandardScaler` is also an option.

In [None]:
# TODO: Create MinMaxScaler for numerical (continuous) columns

In [None]:
# TODO: Use Scaler to transform our dataframe

## Apply one-hot encoding and normalization to create train and test dataset

In [None]:
# TODOs: Apply one-hot encoder and MinMaxScaler to the training and validation dataframe

In [None]:
# Display the shapes of the training and validation feature/target arrays as a sanity check.
X_train.shape, y_train.shape, X_val.shape, y_val.shape

In [None]:
# Persist the one-hot encoder and the three scalers with joblib so they can be reloaded later.
save_dir = "trained_models"
os.makedirs(save_dir, exist_ok=True)

# File name -> object, written in this order.
artifacts = {
    "one_hot_encoder.joblib": ohe,
    "year_scaler.joblib": year_scaler,
    "mileage_scaler.joblib": mileage_scaler,
    "price_scaler.joblib": price_scaler,
}
for artifact_name, artifact in artifacts.items():
    saved_paths = joblib.dump(artifact, op.join(save_dir, artifact_name))

# Show the return value of the final dump, as the original cell did.
saved_paths

## **Train a Linear Regression Model**

In [None]:
# Create a linear regression model and fit it to the training data.
# `fit` returns the estimator itself, so the chained form yields the fitted model.
lr_model = LinearRegression().fit(X_train, y_train)
lr_model

In [None]:
# Predict on the validation features with the fitted linear model and show the output shape.
y_pred_lr = lr_model.predict(X_val)
y_pred_lr.shape

In [None]:
# Pass the predictions through `price_scaler.inverse_transform` and store the flattened result
# as a new column (presumably this maps scaled prices back to Baht - confirm how
# `price_scaler` was fitted).
df_val["Predicted_Price_LR"] = price_scaler.inverse_transform(y_pred_lr).ravel()

In [None]:
# Preview a few rows instead of dumping the whole validation dataframe.
df_val.head(10)

## **Evaluate and plot relationship between actual and predicted values**

In [None]:
# Calculate the mean squared error between actual value and prediction

# TODO: Create your own MSE

# TODO: Use `mean_squared_error` from scikit learn

In [None]:
# Create the figure before drawing anything. `plt.figure` registers the figure with pyplot;
# `plt.Figure(...)` only builds a detached object, so the requested size was never applied.
plt.figure(figsize=(10, 10))
# Plot a perfect prediction line.
plt.plot([0, 1], [0, 1], "--", color="black")
# Plot the linear-regression predictions vs the actual values. `y_pred_lr` is the output of
# `lr_model.predict(X_val)`; a bare `y_pred` is not defined yet at this point on a fresh kernel.
plt.scatter(y_pred_lr, y_val, alpha=0.2)

plt.xlabel("Predictions value")
plt.ylabel("Actual value")
plt.title("Predictions vs Actual")
# Axis limits of 0-1 presumably match min-max scaled prices - confirm against the scaler.
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.show()

In [None]:
# Save the fitted linear regression model with joblib under the "trained_models" directory.
joblib.dump(lr_model, "trained_models/carprice_model_linear_regression.joblib")

## **Train a Neural Network to predict car price**

In [None]:
from torch.utils.data import Dataset, DataLoader

# Dataset created from numpy arrays X, y (same behaviour as the dataset used in the
# inference section of this notebook).
class CarPriceDataset(Dataset):
    """Map-style dataset over a feature array and an optional target array.

    Args:
        X: Indexable collection of feature rows (e.g. a 2-D numpy array).
        y: Optional indexable collection of targets aligned with ``X``.
            When omitted, items are feature rows only.
    """

    def __init__(self, X, y = None):
        self.X = X
        self.y = y

    def __len__(self):
        """Return the number of rows in ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(X[idx], y[idx])`` when targets were given, otherwise ``X[idx]``."""
        if self.y is None:
            return self.X[idx]
        return self.X[idx], self.y[idx]

In [None]:
# TODO: Create dataset and dataloader


In [None]:
class CarPriceModel(nn.Module):
    """Single linear layer mapping ``input_size`` features to ``output_size`` outputs.

    Args:
        input_size: Width of each input feature vector.
        output_size: Width of the model output.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Without at least one layer the model has no parameters, and building an
        # optimizer from `model.parameters()` fails on an empty parameter list.
        self.linear = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Apply the linear layer to ``x`` and return the result."""
        return self.linear(x)

In [None]:
# TODO: Create a model

In [None]:
def train(
    model,
    train_dataloader,
    test_dataloader,
    optimizer,
    loss_fn = nn.MSELoss(reduction='mean'),
    num_epochs = 100,
    update_frequency = 10,
):
    """Train ``model`` and periodically print its loss on ``test_dataloader``.

    Args:
        model: Module to optimise; called as ``model(x.float())``.
        train_dataloader: Iterable of ``(x, y)`` batches used for parameter updates.
        test_dataloader: Iterable of ``(x, y)`` batches used only for loss reporting.
        optimizer: Optimizer already constructed over ``model``'s parameters.
        loss_fn: Callable ``loss_fn(y_pred, y)`` returning a scalar tensor.
        num_epochs: Number of passes over ``train_dataloader``.
        update_frequency: Print the evaluation loss every this many epochs;
            ``0`` or ``None`` disables printing.

    Returns:
        None. Progress is reported through ``tqdm`` and ``print``.
    """
    for epoch in tqdm(range(num_epochs)):
        # Set the model to training mode.
        model.train()
        for x, y in train_dataloader:
            # Clear gradients first so nothing left over from earlier calls leaks into this step.
            optimizer.zero_grad()
            # Forward pass.
            y_pred = model(x.float())
            # Calculate the loss.
            loss = loss_fn(y_pred, y.float())
            # Backward pass.
            loss.backward()
            # Update the model parameters.
            optimizer.step()

        # Set the model to evaluation mode and accumulate the loss over the WHOLE
        # evaluation loader. Reporting only the last batch would be noisy and misleading.
        model.eval()
        total_loss = 0.0
        n_samples = 0
        with torch.no_grad():
            for x, y in test_dataloader:
                y_pred = model(x.float())
                batch_size = len(x)
                # Weight each batch by its size; with a mean-reduced loss this gives the
                # per-sample average even when the final batch is smaller.
                total_loss += loss_fn(y_pred, y.float()).item() * batch_size
                n_samples += batch_size

        # Print every `update_frequency` epochs; skip if reporting is disabled or the
        # evaluation loader yielded nothing.
        if update_frequency and n_samples and epoch % update_frequency == 0:
            print(f"Epoch {epoch}: {total_loss / n_samples}")

In [None]:
# Optimisation setup: Adam over the model parameters with a mean-squared-error loss.
learning_rate = 0.001
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss(reduction="mean")

# Training schedule: number of epochs and how often the loss is printed.
num_epochs, update_frequency = 120, 10

# Positional arguments follow the order of `train`'s signature; the validation loader
# is passed as `test_dataloader`.
train(model, train_dataloader, val_dataloader, optimizer, loss_fn, num_epochs, update_frequency)

### Evaluate the model on the validation set with Mean Squared Error

In [None]:
def predict(model, test_dataloader):
    """Run ``model`` over a labelled dataloader and collect predictions and targets.

    Args:
        model: Module called as ``model(x.float())``; it is switched to eval mode.
        test_dataloader: Iterable of ``(x, y)`` tensor batches.

    Returns:
        Tuple ``(y_pred, y_true)`` of flat 1-D numpy arrays in dataloader order.
        Both are empty arrays when the dataloader yields no batches.
    """
    model.eval()
    pred_batches = []
    true_batches = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for x, y in test_dataloader:
            y_pred = model(x.float())
            # Flatten each batch so that both (batch, 1) and (batch,) shaped tensors
            # concatenate into the same flat layout.
            pred_batches.append(y_pred.detach().cpu().numpy().reshape(-1))
            true_batches.append(y.numpy().reshape(-1))
    if not pred_batches:
        # np.concatenate raises on an empty list; return empty results instead.
        return np.empty(0, dtype=np.float32), np.empty(0, dtype=np.float32)
    return np.concatenate(pred_batches), np.concatenate(true_batches)

In [None]:
# Get the predictions and the matching targets from the validation dataloader.
y_pred, y_true = predict(model, val_dataloader)

In [None]:
# Mean squared error between the targets and the predictions returned by `predict`.
mean_squared_error(y_true, y_pred)

In [None]:
# Draw on an explicit Axes object rather than through the pyplot state machine.
fig, ax = plt.subplots()
# Dashed diagonal marking perfect predictions.
ax.plot([0, 1], [0, 1], "--", color="black")
# Predictions on the x-axis, actual values on the y-axis.
ax.scatter(y_pred, y_true, alpha=0.2)

ax.set_xlabel("Predictions")
ax.set_ylabel("Actual")
ax.set_title("Predictions vs Actual")
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.show()

In [None]:
# Save only the model's state_dict (its parameters) under "trained_models".
# NOTE(review): the file name hard-codes an MSE value ("mse_0003"); confirm it matches this run.
torch.save(model.state_dict(), "trained_models/carprice_model_mse_0003.pth")

## **Train a Neural Network with multiple layers (MLP)**

In [None]:
class CarPriceTwoLayerModel(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        input_size: Width of each input feature vector.
        output_size: Width of the model output.
        intermediate_dim: Number of hidden units between the two linear layers.
    """

    def __init__(self, input_size, output_size, intermediate_dim = 10):
        super().__init__()
        # Attribute names double as state_dict keys, so they are kept stable.
        self.linear1 = nn.Linear(in_features=input_size, out_features=intermediate_dim)
        self.linear2 = nn.Linear(in_features=intermediate_dim, out_features=output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``linear2(relu(linear1(x)))``."""
        hidden = self.relu(self.linear1(x))
        return self.linear2(hidden)

# Build the two-layer model with the default hidden width and display its layer summary.
# NOTE(review): `n_features` and `n_output` are not defined in the visible cells - presumably
# they come from the "Create a model" TODO cell; confirm.
model_two_layers = CarPriceTwoLayerModel(input_size=n_features, output_size=n_output)
model_two_layers

In [None]:
# Optimisation setup for the two-layer network: Adam with a mean-squared-error loss.
learning_rate = 0.001  # same value as 1e-3
optimizer = torch.optim.Adam(params=model_two_layers.parameters(), lr=learning_rate)
loss_fn = nn.MSELoss(reduction="mean")

# Training schedule: number of epochs and how often the loss is printed.
num_epochs, update_frequency = 120, 10

# Positional arguments follow the order of `train`'s signature; the validation loader
# is passed as `test_dataloader`.
train(model_two_layers, train_dataloader, val_dataloader, optimizer, loss_fn, num_epochs, update_frequency)

In [None]:
# `predict` returns (predictions, targets), so despite its name `y_test` holds the model
# predictions and `y_true_dl` holds the targets.
y_test, y_true_dl = predict(model=model_two_layers, test_dataloader=val_dataloader)
# MSE is symmetric in its two arguments; they are passed by keyword to make the roles explicit.
mse_score = mean_squared_error(y_true=y_true_dl, y_pred=y_test)
mse_score

In [None]:
# Plot a perfect prediction line.
plt.plot([0, 1], [0, 1], "--", color="black")
# Plot the predictions vs the actual values. `y_test` holds the predictions and `y_true_dl`
# the targets (see the `predict` call), so predictions go on the x-axis to match the labels.
plt.scatter(y_test, y_true_dl, alpha=0.2)

plt.xlabel("Predictions value")
plt.ylabel("Actual value")
plt.title("Predictions vs Actual")
plt.xlim(0, 1)
plt.ylim(0, 1)
plt.show()

In [None]:
# Save only the two-layer model's state_dict (its parameters) under "trained_models".
# The path has no placeholders, so a plain string literal is used instead of an f-string.
torch.save(model_two_layers.state_dict(), "trained_models/carprice_two_layer_model_mse_00015.pth")

## **Use the models to predict on new data**

- Create a model
- Load one-hot-encoder and scaler
- Transform dataframe
- Predict

In [None]:
import joblib
import numpy as np
import pandas as pd
import torch.nn as nn
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

In [None]:
# Load the data again (same CSV as at the top of the notebook).
df = pd.read_csv(op.join("deep-learning-skooldio", "rodkaidee-dataset", "Rodkaidee Data.csv"))

# Strip thousands separators and convert to float. The vectorised `.str` accessor replaces
# the per-row lambda and leaves missing values as NaN instead of raising.
for column in ("Price", "Mileage"):
    df[column] = df[column].str.replace(",", "", regex=False).astype(float)

# NOTE(review): the split parameters presumably have to match the ones used for training so
# that this is the same held-out set - confirm against the training split cell.
_, df_test = train_test_split(df, test_size=0.2, random_state=42)
# Work on an independent copy so that adding a column later does not write into a slice of `df`.
df_test = df_test.copy()

In [None]:
# TODO: create model and load trained weights
class CarPriceTwoLayerModel(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    The layers and attribute names match the definition used for training in this
    notebook, so that the state_dict saved there can be loaded into this model.

    Args:
        input_size: Width of each input feature vector.
        output_size: Width of the model output.
        intermediate_dim: Number of hidden units between the two linear layers.
    """

    def __init__(self, input_size, output_size, intermediate_dim = 10):
        super().__init__()
        # Attribute names are the state_dict keys ("linear1.*", "linear2.*").
        self.linear1 = nn.Linear(input_size, intermediate_dim)
        self.linear2 = nn.Linear(intermediate_dim, output_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``linear2(relu(linear1(x)))``."""
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        return x

# Build the inference model with explicit argument names.
# NOTE(review): 138 is presumably the number of input feature columns - confirm.
model = CarPriceTwoLayerModel(input_size=138, output_size=1)
# TODO: load trained weights

In [None]:
# TODO: Load the one-hot encoder and the year, mileage and price scalers (used in the cells below)

In [None]:
class CarPriceDataset(Dataset):
    """Map-style dataset over a feature array and an optional target array.

    Args:
        X: Indexable collection of feature rows.
        y: Optional indexable collection of targets aligned with ``X``.
    """

    def __init__(self, X, y = None):
        self.X = X
        # `y` is stored as given; None marks an unlabelled dataset.
        self.y = y

    def __len__(self):
        """Return the number of rows in ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(X[idx], y[idx])`` when targets exist, otherwise ``X[idx]``."""
        features = self.X[idx]
        if self.y is None:
            return features
        return features, self.y[idx]

In [None]:
# Prepare the test dataset: encoded categorical columns followed by scaled Year and Mileage.
CAT_COLUMNS = ["Brand", "Model", "Engine", "Segment", "Province", "Color"]

categorical_part = ohe.transform(df_test[CAT_COLUMNS])
year_part = year_scaler.transform(df_test[["Year"]])
mileage_part = mileage_scaler.transform(df_test[["Mileage"]])
# NOTE(review): np.hstack needs dense arrays - confirm `ohe` was built to return dense output.
X_test = np.hstack([categorical_part, year_part, mileage_part])

# Unlabelled dataset, iterated in order so predictions line up with the rows of `df_test`.
test_dataset = CarPriceDataset(X_test)
test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=32)

In [None]:
def predict(model, test_dataloader):
    """Run ``model`` over an unlabelled dataloader and collect its predictions.

    Args:
        model: Module called as ``model(x.float())``; it is switched to eval mode.
        test_dataloader: Iterable of feature tensor batches (no targets).

    Returns:
        Flat 1-D numpy array of predictions in dataloader order; an empty array
        when the dataloader yields no batches.
    """
    model.eval()
    pred_batches = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for x in test_dataloader:
            y_pred = model(x.float())
            # Flatten each batch so that both (batch, 1) and (batch,) shaped outputs
            # concatenate into the same flat layout.
            pred_batches.append(y_pred.detach().cpu().numpy().reshape(-1))
    if not pred_batches:
        # np.concatenate raises on an empty list; return an empty result instead.
        return np.empty(0, dtype=np.float32)
    return np.concatenate(pred_batches)

In [None]:
# Predict on the test dataloader, then reshape to a single column and pass the result
# through `price_scaler.inverse_transform` (presumably back to Baht - confirm how the
# scaler was fitted).
y_pred = predict(model, test_dataloader)
y_pred_price = price_scaler.inverse_transform(y_pred.reshape(-1, 1))

In [None]:
# `y_pred_price` has one column (it was reshaped to (-1, 1) for the scaler); flatten it
# so a plain 1-D array is assigned, matching how "Predicted_Price_LR" is stored in `df_val`.
df_test["Pred_Price"] = y_pred_price.ravel()

In [None]:
# Preview the first ten test rows, including the new "Pred_Price" column.
df_test.head(10)

In [None]:
# Draw on an explicit Axes object rather than through the pyplot state machine.
fig, ax = plt.subplots()
# Dashed diagonal marking perfect predictions.
ax.plot([0, 1_000_000], [0, 1_000_000], color="black", alpha=0.5, linestyle="--")

# Predicted price on the x-axis, listed price on the y-axis.
ax.scatter(df_test["Pred_Price"].values, df_test["Price"], alpha=0.2)
ax.set_xlabel("Predictions (Baht)")
ax.set_ylabel("Actual (Baht)")
ax.set_title("Predictions vs Actual")

# Both axes are limited to 0 - 1,000,000.
ax.set_xlim(0, 1_000_000)
ax.set_ylim(0, 1_000_000)
plt.show()