In [1]:
import numpy as np
import pandas as pd

# Feature Engineering

In [2]:
# Read the cleaned auto-mpg table from the working directory and preview
# its first five rows.
auto = pd.read_csv(filepath_or_buffer="./clean-auto-mpg.csv")
auto.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1


Right off the bat it can be seen that `origin`, although stored as an integer, is a categorical code (1 = USA, 2 = Europe, 3 = Japan) rather than a true numerical quantity. It needs to be one-hot encoded.

In [3]:
# One-hot encode the categorical `origin` code: one float indicator column
# per region, holding 1.0 where the row has that origin and 0.0 otherwise.
origin_labels = {1: "USA", 2: "Europe", 3: "Japan"}
for origin_code, region in origin_labels.items():
    auto[region] = auto["origin"].eq(origin_code).astype("float64")
auto.tail()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,USA,Europe,Japan
387,27.0,4,140.0,86,2790,15.6,82,1,1.0,0.0,0.0
388,44.0,4,97.0,52,2130,24.6,82,2,0.0,1.0,0.0
389,32.0,4,135.0,84,2295,11.6,82,1,1.0,0.0,0.0
390,28.0,4,120.0,79,2625,18.6,82,1,1.0,0.0,0.0
391,31.0,4,119.0,82,2720,19.4,82,1,1.0,0.0,0.0


## Train, Val, and Test splits

In [4]:
# Fixed seed so the 80/10/10 split -- and every statistic and CSV derived
# from it -- is identical on each Restart & Run All.
SEED = 42

# 80% of the rows go to training; the remainder is halved into val and test.
auto_train = auto.sample(frac=0.8, random_state=SEED)
auto_ = auto.drop(auto_train.index)
auto_val = auto_.sample(frac=0.5, random_state=SEED)
auto_test = auto_.drop(auto_val.index)

# The splits are built by dropping index labels, so together they must
# account for every original row exactly once.
assert len(auto_train) + len(auto_val) + len(auto_test) == len(auto)
print(len(auto_train), len(auto_val), len(auto_test))

314 39 39


## Normalize
The following columns, which will be used as inputs, have widely differing scales. They need to be normalized so that their means are 0 and their standard deviations are 1.
  * cylinders
  * displacement
  * horsepower
  * weight
  * acceleration

For all three datasets, use the train set's statistics to normalize, so that no information from the validation or test sets leaks into the preprocessing.

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of every
# numeric column, computed on the training split only.
train_stats = auto_train.describe()
train_stats

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,USA,Europe,Japan
count,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0
mean,23.296497,5.477707,195.136943,104.60828,2982.455414,15.642675,75.859873,1.566879,0.627389,0.178344,0.194268
std,7.79919,1.711028,103.77207,37.655188,819.011447,2.850014,3.740733,0.797551,0.484272,0.383413,0.396267
min,9.0,3.0,70.0,46.0,1613.0,8.0,70.0,1.0,0.0,0.0,0.0
25%,17.125,4.0,107.0,78.0,2254.25,13.925,73.0,1.0,0.0,0.0,0.0
50%,22.0,4.0,151.0,95.0,2832.5,15.5,76.0,1.0,1.0,0.0,0.0
75%,28.0,8.0,293.25,125.0,3618.25,17.275,79.0,2.0,1.0,0.0,0.0
max,46.6,8.0,455.0,225.0,4955.0,24.8,82.0,3.0,1.0,1.0,1.0


In [7]:
# Standardize the input columns of all three splits with the TRAINING
# split's mean and (sample, ddof=1) standard deviation.
#
# Raw values and statistics are always read from the untouched `auto` frame
# via each split's index labels, never from the split being overwritten.
# That keeps this cell idempotent: re-running it yields the same result
# instead of normalizing already-normalized data a second time.
cols_to_norm = ["cylinders", "displacement", "horsepower", "weight", "acceleration"]
for col in cols_to_norm:
    raw_train_col = auto.loc[auto_train.index, col]
    mean = raw_train_col.mean()
    std = raw_train_col.std()
    for split in (auto_train, auto_val, auto_test):
        split[col] = (auto.loc[split.index, col] - mean) / std

Now the mean of these columns should be 0 and their std dev should be 1. Let's verify.

In [9]:
# Check the normalization programmatically as well as by eye: on the
# training split each normalized column must have mean ~0 and std ~1
# (up to floating-point round-off).
np.testing.assert_allclose(auto_train[cols_to_norm].mean(), 0.0, atol=1e-9)
np.testing.assert_allclose(auto_train[cols_to_norm].std(), 1.0, atol=1e-9)
auto_train.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,USA,Europe,Japan
count,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0,314.0
mean,23.296497,1.470869e-16,-1.343582e-16,-5.6571870000000004e-18,-2.828594e-18,5.968333e-16,75.859873,1.566879,0.627389,0.178344,0.194268
std,7.79919,1.0,1.0,1.0,1.0,1.0,3.740733,0.797551,0.484272,0.383413,0.396267
min,9.0,-1.448081,-1.205883,-1.556446,-1.672083,-2.681627,70.0,1.0,0.0,0.0,0.0
25%,17.125,-0.863637,-0.849332,-0.7066299,-0.8891273,-0.60269,73.0,1.0,0.0,0.0,0.0
50%,22.0,-0.863637,-0.4253258,-0.2551648,-0.1830932,-0.05006121,76.0,1.0,1.0,0.0,0.0
75%,28.0,1.474139,0.9454669,0.5415381,0.7762951,0.5727427,79.0,2.0,1.0,0.0,0.0
max,46.6,1.474139,2.504171,3.197215,2.408446,3.21308,82.0,3.0,1.0,1.0,1.0


# Conclusion
The original *clean-auto-mpg.csv* has been split into train, val, and test sets, and the data within them has been normalized appropriately. Let's write these splits out as CSVs.

In [10]:
# Persist each split next to the notebook; the index is omitted from the
# written files.
split_outputs = {
    "./train-auto-mpg.csv": auto_train,
    "./val-auto-mpg.csv": auto_val,
    "./test-auto-mpg.csv": auto_test,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)