In [29]:
import pandas as pd
import sklearn
from  sklearn import model_selection
import numpy as np
import torch

In [30]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

In [31]:
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower','Weight', 'Acceleration', 'Model Year', 'Origin']

In [32]:
df = pd.read_csv(url, names=column_names,na_values = "?", comment='\t',\
                 sep=" ", skipinitialspace=True)

In [33]:
df.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,17.0,8,302.0,140.0,3449.0,10.5,70,1


In [34]:
df = df.dropna()

df = df.reset_index(drop=True)

In [35]:
df_train, df_test = model_selection.train_test_split(df, train_size=0.8, random_state=1)

In [36]:
train_stats = df_train.describe().transpose()
train_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.404153,7.666909,9.0,17.5,23.0,29.0,46.6
Cylinders,313.0,5.402556,1.701506,3.0,4.0,4.0,8.0,8.0
Displacement,313.0,189.51278,102.675646,68.0,104.0,140.0,260.0,455.0
Horsepower,313.0,102.929712,37.919046,46.0,75.0,92.0,120.0,230.0
Weight,313.0,2961.198083,848.602146,1613.0,2219.0,2755.0,3574.0,5140.0
Acceleration,313.0,15.704473,2.725399,8.5,14.0,15.5,17.3,24.8
Model Year,313.0,75.929712,3.675305,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.591054,0.807923,1.0,1.0,1.0,2.0,3.0


In [37]:
numeric_columns = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

In [38]:
df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

# standardisation of features

for col_name in numeric_columns:
    mean = train_stats.loc[col_name,'mean']
    std = train_stats.loc[col_name,'std']
    df_train_norm.loc[:,col_name] = (df_train_norm.loc[:, col_name]- mean)/std
    df_test_norm.loc[:,col_name] = (df_test_norm.loc[:, col_name]- mean)/std


In [39]:
df_train_norm

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
334,27.2,-0.824303,-0.530922,-0.499214,-0.555264,-0.001641,81,1
258,18.6,0.351127,0.345625,0.186457,0.776338,1.099115,78,1
139,29.0,-0.824303,-0.891280,-0.525586,-0.874613,0.291894,74,2
310,37.2,-0.824303,-1.008153,-1.000281,-1.110294,0.255202,80,3
349,33.0,-0.824303,-0.823104,-0.762934,-0.908786,-0.552019,81,2
...,...,...,...,...,...,...,...,...
203,28.0,-0.824303,-0.901020,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.413800,-0.340982,0.293190,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.891280,-1.053025,-1.072585,0.475353,77,1


### Bucketize = pd.cut
bucket = { 0 if year < 73, 1 if73 ≤ year < 76, 2 if 76 ≤ year, 3 if year  >= 79

In [40]:
boundaries = torch.tensor([73,76,79])
v = torch.tensor(df_train_norm['Model Year'].values)
df_train_norm['Model_Year_bucketed'] = torch.bucketize(v, boundaries, right=True)
v = torch.tensor(df_test_norm['Model Year'].values)
df_test_norm['Model_Year_bucketed'] = torch.bucketize(v, boundaries, right=True)

numeric_columns.append('Model_Year_bucketed')
df_train_norm

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Model_Year_bucketed
334,27.2,-0.824303,-0.530922,-0.499214,-0.555264,-0.001641,81,1,3
258,18.6,0.351127,0.345625,0.186457,0.776338,1.099115,78,1,2
139,29.0,-0.824303,-0.891280,-0.525586,-0.874613,0.291894,74,2,1
310,37.2,-0.824303,-1.008153,-1.000281,-1.110294,0.255202,80,3,3
349,33.0,-0.824303,-0.823104,-0.762934,-0.908786,-0.552019,81,2,3
...,...,...,...,...,...,...,...,...,...
203,28.0,-0.824303,-0.901020,-0.736562,-0.950031,0.255202,76,3,2
255,19.4,0.351127,0.413800,-0.340982,0.293190,0.548737,78,1,2
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1,0
235,30.5,-0.824303,-0.891280,-1.053025,-1.072585,0.475353,77,1,2


### One_hot encoding for Origin

In [43]:
from torch.nn.functional import one_hot

# train set
total_origin = len(set(df_train_norm['Origin'])) # 3

origin_encoded = one_hot(torch.from_numpy(df_train_norm['Origin'].values) % total_origin)
x_train_numeric = torch.tensor(df_train_norm[numeric_columns].values)
x_train = torch.cat([x_train_numeric, origin_encoded],1).float() # tensor form

# test set
origin_encoded = one_hot(torch.from_numpy(df_test_norm['Origin'].values) % total_origin)
x_test_numeric = torch.tensor(df_test_norm[numeric_columns].values)
x_test = torch.cat([x_test_numeric, origin_encoded],1).float() # tensor form

tensor([[-8.2430e-01, -5.3092e-01, -4.9921e-01, -5.5526e-01, -1.6412e-03,
          3.0000e+00],
        [ 3.5113e-01,  3.4562e-01,  1.8646e-01,  7.7634e-01,  1.0991e+00,
          2.0000e+00],
        [-8.2430e-01, -8.9128e-01, -5.2559e-01, -8.7461e-01,  2.9189e-01,
          1.0000e+00]], dtype=torch.float64)
tensor([[-8.2430e-01, -5.3092e-01, -4.9921e-01, -5.5526e-01, -1.6412e-03,
          3.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00],
        [ 3.5113e-01,  3.4562e-01,  1.8646e-01,  7.7634e-01,  1.0991e+00,
          2.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00],
        [-8.2430e-01, -8.9128e-01, -5.2559e-01, -8.7461e-01,  2.9189e-01,
          1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]])


In [45]:
# tensor form of target
y_train = torch.tensor(df_train_norm['MPG'].values).float()
y_test = torch.tensor(df_test_norm['MPG'].values).float()


### Training a DNN regression model

we will create a data loader that uses a batch size of 8 for the train data

For torch, we need to change the data from tensor to TensorDataset =====> DataLoader

In [47]:
from torch.utils.data import TensorDataset, DataLoader
train_ds = TensorDataset(x_train, y_train)
batch_size = 8
torch.manual_seed(1)
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

## Model Parameters

In [59]:
hidden_units = [8,4]
input_size = x_train.shape[1] # number of x features
all_layers = []

for hidden in hidden_units:
    layer = torch.nn.Linear(input_size, hidden) # 1.loop => (9,8),  2.loop ==>(8,4)
    all_layers.append(layer)
    all_layers.append(torch.nn.ReLU())
    input_size = hidden

all_layers.append(torch.nn.Linear(hidden_units[-1],1))

model = torch.nn.Sequential( *all_layers)
model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

## Calculate loss

In [60]:
loss_fn = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr = 0.001)

In [61]:
torch.manual_seed(1)
num_epochs = 200
log_epochs = 20

In [66]:
for epoch in range(num_epochs):
    loss_hist_train = 0
    for x_batch, y_batch in train_dl:
        pred = model(x_batch)[:, 0]
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_hist_train += loss.item()
    if epoch % log_epochs ==0:
        print(f'Epoch {epoch} Loss '\
              f'{loss_hist_train/len(train_dl):.4f}')


Epoch 0 Loss 5.8677
Epoch 20 Loss 6.3415
Epoch 40 Loss 5.7029
Epoch 60 Loss 6.9662
Epoch 80 Loss 6.2425
Epoch 100 Loss 5.9412
Epoch 120 Loss 6.5680
Epoch 140 Loss 6.3236
Epoch 160 Loss 6.1156
Epoch 180 Loss 6.0745


In [64]:
with torch.no_grad():
    pred = model(x_test.float())[:, 0]
    loss = loss_fn(pred, y_test)
    print(f'Test MSE: {loss.item():.4f}')
    print(f'Test MAE: {torch.nn.L1Loss()(pred, y_test).item():.4f}')

Test MSE: 17.4365
Test MAE: 3.2026
