In [4]:

import pandas as pd
import numpy as np
import torch as t
from matplotlib import pyplot as plt

In [5]:
# read the training data
df_data = pd.read_csv('cars.csv')
df_data = df_data.drop(columns=['Car_ID', 'Brand', 'Model', 'Fuel_Type', 'Transmission'])
df_data['Owner_Type'] = df_data['Owner_Type'].replace({'First': 1, 'Second': 2, 'Third': 3})

print(df_data)

    Year  Kilometers_Driven  Owner_Type  Mileage  Engine  Power  Seats  \
0   2018              50000           1       15    1498    108      5   
1   2019              40000           2       17    1597    140      5   
2   2017              20000           1       10    4951    395      4   
3   2020              30000           3       23    1248     74      5   
4   2016              60000           2       18    1999    194      5   
..   ...                ...         ...      ...     ...    ...    ...   
95  2019              22000           1       16    1950    191      5   
96  2017              38000           2       13    2755    171      7   
97  2018              26000           3       18    1497    121      5   
98  2019              24000           2       17    1497    113      5   
99  2020              18000           1       20    1199     85      5   

      Price  
0    800000  
1   1000000  
2   2500000  
3    600000  
4    850000  
..      ...  
95  2900000  

In [6]:

df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Year               100 non-null    int64
 1   Kilometers_Driven  100 non-null    int64
 2   Owner_Type         100 non-null    int64
 3   Mileage            100 non-null    int64
 4   Engine             100 non-null    int64
 5   Power              100 non-null    int64
 6   Seats              100 non-null    int64
 7   Price              100 non-null    int64
dtypes: int64(8)
memory usage: 6.4 KB


In [7]:
y_np = df_data['Price'].to_numpy()

df_corr = df_data.corr()
select_columns = df_corr['Price'][np.abs(df_corr['Price'])>= 0.03]
print(select_columns)
select_columns = select_columns.index.to_list()[:-1]
print(select_columns)

x_np = df_data[select_columns].to_numpy()

Year                -0.232687
Kilometers_Driven   -0.051104
Owner_Type          -0.301488
Mileage             -0.595252
Engine               0.714465
Power                0.856620
Price                1.000000
Name: Price, dtype: float64
['Year', 'Kilometers_Driven', 'Owner_Type', 'Mileage', 'Engine', 'Power']


In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


scaler = StandardScaler()

X_scaled = scaler.fit_transform(x_np)

y_scaler = StandardScaler()

y_scaled = y_scaler.fit_transform(y_np.reshape(-1, 1)).flatten()


X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2)

X_train_tensor = t.tensor(X_train, dtype=t.float32)
X_val_tensor = t.tensor(X_val, dtype=t.float32)
y_train_tensor = t.tensor(y_train, dtype=t.float32)
y_val_tensor = t.tensor(y_val, dtype=t.float32)

In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


scaler = StandardScaler()

X_scaled = scaler.fit_transform(x_np)

y_scaler = StandardScaler()

y_scaled = y_scaler.fit_transform(y_np.reshape(-1, 1)).flatten()


X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2)

X_train_tensor = t.tensor(X_train, dtype=t.float32)
X_val_tensor = t.tensor(X_val, dtype=t.float32)
y_train_tensor = t.tensor(y_train, dtype=t.float32)
y_val_tensor = t.tensor(y_val, dtype=t.float32)

In [10]:
import torch.nn.functional as F
import torch.nn as nn
from   torch.autograd import Variable

class Model(nn.Module):
    def __init__(self, input_dim = 6):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(in_features=input_dim, out_features=15)
        #self.dropout1 = nn.Dropout(p=0.3)  # Drop 30% of output nodes from the previous layer during training only
        self.layer2 = nn.Linear(in_features=15, out_features=12)
        #self.dropout2 = nn.Dropout(p=0.25)
        self.layer3 = nn.Linear(in_features=12, out_features=1)


    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        x = self.layer3(x)

        return x

In [11]:
model = Model(input_dim=X_train.shape[1])
optimizer = t.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

print(model)

y0 = model.forward(X_train_tensor[0, :])
print(y0)

Model(
  (layer1): Linear(in_features=6, out_features=15, bias=True)
  (layer2): Linear(in_features=15, out_features=12, bias=True)
  (layer3): Linear(in_features=12, out_features=1, bias=True)
)
tensor([-0.1052], grad_fn=<ViewBackward0>)


In [12]:
def train_model(n_epochs, model, train_loader, optimizer, loss_fn,
                x_val, y_val):

    for epoch in range(n_epochs+1):
        train_loss = 0.0
        for xb,yb in train_loader: # for each batch
            model.train()       # set model in training mode = with dropout
            ym = model.forward(xb)
            loss = loss_fn(ym, yb.view(-1, 1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item() * xb.size(0)

        if epoch == 1 or epoch % 10 == 0:
            model.eval() # set model in evaluation mode = no dropout
            with t.no_grad():  # no learning
                ym_val   = model.forward(x_val)
                loss_val = loss_fn(ym_val, y_val.view(-1, 1))


            print(f"Epoch {epoch}, Training loss {loss.item():.4f},"
f" Validation loss {loss_val.item():.4f}")

In [13]:

# prepare the data loaders
# DataLoader = a class that shuffles the data and splits in into batches
# you should use it during training (SGD - accumulate error over batches of data )
train_data = [(X_train_tensor[i], y_train_tensor[i]) for i in range(X_train_tensor.shape[0])]
print("Sample train_data = ", train_data[:3], " type = ", type(train_data))
trainloader = t.utils.data.DataLoader(train_data, batch_size = 64, shuffle=True)
for x,label in trainloader:  # shuffles the data
    print(x,label)

Sample train_data =  [(tensor([-0.3347, -0.0165, -1.0023,  1.1508, -0.5687, -0.7852]), tensor(-0.7275)), (tensor([ 0.5235,  1.3057,  0.4503, -0.0638, -0.4111, -0.2367]), tensor(-0.5767)), (tensor([-1.1928,  1.0853,  0.4503, -1.2783,  1.4324,  0.1681]), tensor(-0.1748))]  type =  <class 'list'>
tensor([[-1.1928e+00,  2.0384e-01, -1.0023e+00, -9.7470e-01,  1.7699e+00,
          1.1343e+00],
        [ 5.2347e-01, -4.5727e-01,  4.5032e-01, -6.3766e-02, -6.2602e-01,
         -7.1988e-01],
        [-3.3468e-01, -1.6528e-02, -1.0023e+00, -6.3766e-02,  1.7953e-01,
          3.7698e-01],
        [ 1.3816e+00, -1.1184e+00, -1.0023e+00,  8.4717e-01, -1.0447e+00,
         -9.5492e-01],
        [-1.1928e+00,  1.5261e+00,  4.5032e-01,  2.3654e+00, -5.6870e-01,
         -7.8517e-01],
        [-1.1928e+00,  1.0853e+00,  4.5032e-01, -3.6741e-01, -7.3268e-01,
         -1.3228e-01],
        [-3.3468e-01, -1.6528e-02, -1.0023e+00, -6.3766e-02,  1.7953e-01,
          3.7698e-01],
        [-1.1928e+00,  1.0

In [14]:
train_model(n_epochs = 100, model = model,
            train_loader = trainloader, optimizer = optimizer,
            loss_fn = loss_fn,
            x_val   = X_val_tensor,   y_val = y_val_tensor)

Epoch 0, Training loss 1.1704, Validation loss 0.5349
Epoch 1, Training loss 0.7201, Validation loss 0.5320
Epoch 10, Training loss 1.0043, Validation loss 0.5045
Epoch 20, Training loss 1.2834, Validation loss 0.4658
Epoch 30, Training loss 0.5016, Validation loss 0.4150
Epoch 40, Training loss 0.5570, Validation loss 0.3521
Epoch 50, Training loss 0.3593, Validation loss 0.2923
Epoch 60, Training loss 0.1342, Validation loss 0.2500
Epoch 70, Training loss 0.2566, Validation loss 0.2185
Epoch 80, Training loss 0.1634, Validation loss 0.1874
Epoch 90, Training loss 0.2184, Validation loss 0.1670
Epoch 100, Training loss 0.1754, Validation loss 0.1481
