## Load Data & Preprocess it

In [39]:
import torch
import torch.nn as nn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

# Column layout of the raw file. `numerical_columns` are the features that get
# standardized; everything else (label, model year, origin) is passed through as-is.
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model_Year', 'Origin']
numerical_columns = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']
non_numerical_columns = [name for name in column_names if name not in numerical_columns]

# Read the file, then drop incomplete rows and renumber the index from 0.
# '?' is parsed as NaN; comment='\t' makes pandas ignore everything after a tab on a line
# (presumably the trailing car-name field -- confirm against the raw file).
df = (
    pd.read_csv(url, names=column_names, na_values='?', comment='\t', sep=" ", skipinitialspace=True)
    .dropna()
    .reset_index(drop=True)
)

# 80/20 train/test split with a fixed seed
df_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

# Fit the scaler on the training rows only, then apply the same scaling to the test rows
sc = StandardScaler()
df_train_sc = sc.fit_transform(df_train[numerical_columns])
df_test_sc = sc.transform(df_test[numerical_columns])

# The scaler returns bare arrays; wrap them in frames that carry the original row index
df_train_df = pd.DataFrame(data=df_train_sc, index=df_train.index, columns=numerical_columns)
df_test_df = pd.DataFrame(data=df_test_sc, index=df_test.index, columns=numerical_columns)

# Re-attach the untouched columns next to the scaled ones (aligned on the row index)
df_train_full = pd.concat([df_train_df, df_train[non_numerical_columns]], axis=1)
df_test_full = pd.concat([df_test_df, df_test[non_numerical_columns]], axis=1)

df_train_full.tail()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,MPG,Model_Year,Origin
203,-0.825623,-0.902462,-0.737741,-0.951552,0.255611,28.0,76,3
255,0.351689,0.414463,-0.341528,0.29366,0.549616,19.4,78,1
72,1.529001,1.146088,0.71504,1.341762,-0.626404,13.0,72,1
235,-0.825623,-0.892707,-1.054711,-1.074303,0.476114,30.5,77,1
37,1.529001,1.565553,1.639537,1.472775,-1.361417,14.0,71,1


## Bucketize 'Model_Year' & append the bucket column to numerical_columns

In [48]:
# Bucket edges for the two-digit model year. With right=True a year y falls into bucket i
# when boundries[i-1] <= y < boundries[i]:  <73 -> 0, 73-75 -> 1, 76-78 -> 2, >=79 -> 3.
boundries = torch.tensor([73, 76, 79])

# Training split: .numpy() hands pandas a plain integer array instead of a tensor object
v = torch.tensor(df_train_full['Model_Year'].values)
df_train_full['Model_Year_Buckets'] = torch.bucketize(v, boundries, right=True).numpy()

# Test split: same edges, so both splits share one bucket definition
v = torch.tensor(df_test_full['Model_Year'].values)
df_test_full['Model_Year_Buckets'] = torch.bucketize(v, boundries, right=True).numpy()

# Register the new column as a model feature. Guarded so that re-running this cell does not
# append the name twice (a duplicate would silently add an extra input column later on).
if 'Model_Year_Buckets' not in numerical_columns:
    numerical_columns.append('Model_Year_Buckets')

## Encode 'Origin' with One Hot Encoder

In [60]:
from torch.nn.functional import one_hot

# Number of distinct 'Origin' codes in the training split; this is the one-hot width
total_origin = len(set(df_train_full['Origin']))

# `% total_origin` folds the integer codes into 0..total_origin-1 (for codes 1, 2, 3: 1->1, 2->2, 3->0).
# num_classes is passed explicitly so the encoded width is always total_origin; without it
# one_hot infers the width from the largest value present, which can differ between splits.
origin_encoded = one_hot(
    torch.from_numpy(df_train_full['Origin'].values) % total_origin,
    num_classes=total_origin,
)
X_train_numerical = torch.tensor(df_train_full[numerical_columns].values)  # numeric feature columns as a tensor
X_train = torch.cat([X_train_numerical, origin_encoded], 1).float()  # numeric features followed by the origin one-hot

# Same encoding for the test split, using the class count learned from the training split
origin_encoded = one_hot(
    torch.from_numpy(df_test_full['Origin'].values) % total_origin,
    num_classes=total_origin,
)
X_test_numerical = torch.tensor(df_test_full[numerical_columns].values)
X_test = torch.cat([X_test_numerical, origin_encoded], 1).float()

## Create Label Tensors for 'MPG'

In [62]:
# Regression targets: the 'MPG' column of each split as a float32 tensor
y_train = torch.tensor(df_train_full['MPG'].values, dtype=torch.float32)
y_test = torch.tensor(df_test_full['MPG'].values, dtype=torch.float32)

## Create Data Loader w/ Batch Size 8

In [66]:
from torch.utils.data import DataLoader, TensorDataset

batch_size = 8  # number of rows per training mini-batch
train_dataset = TensorDataset(X_train, y_train)  # pairs each feature row with its MPG label

# Seed torch's global RNG, then build the shuffling loader over the training pairs
torch.manual_seed(1)
train_dataload = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

## Define the NN model class

In [72]:
class Model(nn.Module):
    """Small fully connected regressor: input -> 8 -> 4 -> 1, with a ReLU after each hidden layer."""

    def __init__(self, input_size = X_train.shape[1]):
        """Create the layers.

        input_size: number of input features. The default is read from
        X_train.shape[1] once, at the moment this class definition is executed.
        """
        super().__init__()
        first_width, second_width = 8, 4

        # Sub-modules are created in the same order in which forward() applies them
        self.layer1 = nn.Linear(input_size, first_width, bias=True)
        self.relu1 = nn.ReLU()

        self.layer2 = nn.Linear(first_width, second_width, bias=True)
        self.relu2 = nn.ReLU()

        # Single output unit: one predicted value per input row
        self.out = nn.Linear(second_width, 1)

    def forward(self, x):
        """Run x through both hidden layers and the output layer; the last axis of the result has size 1."""
        hidden = self.layer1(x)
        hidden = self.relu1(hidden)
        hidden = self.layer2(hidden)
        hidden = self.relu2(hidden)
        return self.out(hidden)
    

## Initialize the model

In [74]:
# Instantiate the network with its default input_size (the column count of X_train)
model = Model()

## Loss/Error & Optimizer

In [78]:
# SGD over all model parameters with a learning rate of 0.001, minimizing mean squared error
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

## Train Model

In [85]:
torch.manual_seed(1)
num_epochs, log_epochs = 200, 20  # total passes over the data / how often to print

for epochs in range(num_epochs):
    loss_hist_train = 0  # running sum of the per-batch losses in this epoch
    for X_batch, y_batch in train_dataload:
        # The model returns one column per row; take column 0 so pred lines up with y_batch
        pred = model(X_batch)[:, 0]
        loss = loss_fn(pred, y_batch)
        loss_hist_train += loss.item()

        # One optimizer update, then clear the gradients before the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Report the mean of the batch losses every `log_epochs` epochs (starting with epoch 0)
    if not epochs % log_epochs:
        print(f'Epoch: {epochs} | Loss: {loss_hist_train/len(train_dataload):.4f}')

Epoch: 0 | Loss: 35.5917
Epoch: 20 | Loss: 8.0433
Epoch: 40 | Loss: 7.3729
Epoch: 60 | Loss: 7.1512
Epoch: 80 | Loss: 6.7276
Epoch: 100 | Loss: 6.7571
Epoch: 120 | Loss: 6.3095
Epoch: 140 | Loss: 6.5172
Epoch: 160 | Loss: 6.7965
Epoch: 180 | Loss: 6.2448


## Evaluate the model on the test data

In [90]:
# Score the held-out split; gradients are not needed here, so autograd tracking is switched off
with torch.no_grad():
    pred = model(X_test)[:, 0]
    loss = loss_fn(pred, y_test)       # mean squared error
    mae = nn.L1Loss()(pred, y_test)    # mean absolute error

print(f"Test MSE: {loss.item():.4f}")
print(f'Test MAE: {mae.item():.4f}')

Test MSE: 9.5496
Test MAE: 2.1327


## Testing New Car Data

In [107]:
# One raw (unscaled) car record, in the column layout the prediction helper reads
new_car = pd.DataFrame([
    dict(
        Cylinders=6,
        Displacement=97.00,
        Horsepower=88.00,
        Weight=2130.0,
        Acceleration=15,
        Model_Year=2020,
        Origin='Europe',
    )
])

def Predicted_mpg(new_car_data):
    """Print the model's MPG prediction for every car in ``new_car_data``.

    Parameters
    ----------
    new_car_data : pandas.DataFrame
        Raw (unscaled) rows with the columns 'Cylinders', 'Displacement',
        'Horsepower', 'Weight', 'Acceleration', 'Model_Year' and 'Origin'.
        'Origin' must be one of 'USA', 'Europe' or 'Japan'.

    Raises
    ------
    ValueError
        If an 'Origin' value is not one of the known names.

    Notes
    -----
    Uses the notebook-level ``sc`` (the fitted scaler) and ``model``.
    One line is printed per input row; nothing is returned.
    """
    # Read from the argument (not a notebook global), so the function works for any frame passed in
    numeric_columns = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']
    scaled_numeric = torch.tensor(sc.transform(new_car_data[numeric_columns]), dtype=torch.float32)

    # Bucket 'Model_Year' with the same edges used for training.
    # Training years are two-digit (e.g. 76); a four-digit year such as 2020 is above every
    # edge and therefore always lands in the last bucket.
    boundaries = torch.tensor([73, 76, 79])
    model_year_bucketed = torch.bucketize(
        torch.tensor(new_car_data['Model_Year'].values), boundaries, right=True
    )

    # One-hot encode 'Origin' exactly as the training features were built: the dataset's integer
    # code (1 = USA, 2 = Europe, 3 = Japan per the UCI Auto MPG documentation) is folded with
    # `code % number_of_origins`, giving slots Japan -> 0, USA -> 1, Europe -> 2.
    origin_codes = {'USA': 1, 'Europe': 2, 'Japan': 3}
    n_origins = len(origin_codes)
    origin_encoded = torch.zeros(len(new_car_data), n_origins, dtype=torch.float32)
    for row, origin_name in enumerate(new_car_data['Origin']):
        if origin_name not in origin_codes:
            raise ValueError(
                f"Unknown Origin {origin_name!r}; expected one of {sorted(origin_codes)}"
            )
        origin_encoded[row, origin_codes[origin_name] % n_origins] = 1

    # Feature order must match training: scaled numerics, year bucket, origin one-hot
    features = torch.cat((
        scaled_numeric,
        model_year_bucketed.float().unsqueeze(1),
        origin_encoded
    ), dim=1)

    # Make predictions (one value per row)
    model.eval()
    with torch.no_grad():
        predictions = model(features)[:, 0].tolist()

    for prediction in predictions:
        print(f'Your predicted MPG is: {prediction:.2f}')

# Run the prediction helper on the sample car defined above
Predicted_mpg(new_car)

Your predicted MPG is: 36.04
