In [2]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedShuffleSplit

import warnings
warnings.filterwarnings('ignore')

In [3]:
#Column names for the header-less data file, in file order
columns = [
    "MPG",
    "Cylinders",
    "Displacement",
    "Horsepower",
    "Weight",
    "Acceleration",
    "Model Year",
    "Origin",
]

#Read the space-separated file: "?" is parsed as NaN, and everything from a tab
#character to the end of a line is treated as a comment and ignored
df = pd.read_csv(
    "./auto-mpg.data",
    names=columns,
    na_values="?",
    comment="\t",
    sep=" ",
    skipinitialspace=True,
)

#Work on a copy so the frame as loaded stays available in df
data = df.copy()

In [5]:
#Set aside test data via stratified sampling (homogeneous groups with the right number of instances of each sub-group). From the correlation found in EDA, the Cylinders column can be used to create the strata.

#One 80/20 shuffle split stratified on Cylinders; the fixed random_state keeps it reproducible
split = StratifiedShuffleSplit(n_splits = 1, test_size = 0.2, random_state = 42)

#split() yields positional row indices, so rows are selected with .iloc.
#Label-based .loc only gives the same rows while the frame has a default 0..n-1 index.
#With n_splits = 1 the generator yields exactly one (train, test) pair, so next() is enough.
train_index, test_index = next(split.split(data, data["Cylinders"]))
strat_train_set = data.iloc[train_index]
strat_test_set = data.iloc[test_index]

In [6]:
#Split the training set into the feature columns (everything except MPG)
#and the target column (MPG); the target is copied so it is independent of the training frame
data = strat_train_set.drop(columns = ["MPG"])
data_labels = strat_train_set.loc[:, "MPG"].copy()

In [7]:
#Preview the training features (the MPG target column has been removed)
data

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,3
151,4,79.0,67.0,2000.0,16.0,74,2
388,4,156.0,92.0,2585.0,14.5,82,1
48,6,250.0,88.0,3139.0,14.5,71,1
114,4,98.0,90.0,2265.0,15.5,73,2
...,...,...,...,...,...,...,...
147,4,90.0,75.0,2108.0,15.5,74,2
156,8,400.0,170.0,4668.0,11.5,75,1
395,4,135.0,84.0,2295.0,11.6,82,1
14,4,113.0,95.0,2372.0,15.0,70,3


In [9]:
#preprocessing the origin column
def preprocess_origin_column(df):
    """Return a new DataFrame whose numeric ``Origin`` codes are replaced by labels.

    The input frame is left untouched, so the function can be called again on
    the same frame and give the same result. Mapping in place would turn every
    value into NaN on a second call, because the labels written by the first
    call are not keys of the mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``Origin`` column with the integer codes 1, 2 and 3.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``Origin`` mapped to its label. Values that are not
        keys of the mapping become NaN.
    """
    # NOTE(review): nothing in this notebook shows where these labels come from;
    # verify them against the dataset documentation.
    origin_labels = {1: "India", 2: "USA", 3: "Germany"}
    # assign() builds a new frame and keeps the Origin column in its original position.
    return df.assign(Origin=df["Origin"].map(origin_labels))
#Map the numeric Origin codes to labels on the training features, then preview the first rows
data_tr = preprocess_origin_column(data)
data_tr.head()

Unnamed: 0,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
145,4,83.0,61.0,2003.0,19.0,74,Germany
151,4,79.0,67.0,2000.0,16.0,74,USA
388,4,156.0,92.0,2585.0,14.5,82,India
48,6,250.0,88.0,3139.0,14.5,71,India
114,4,98.0,90.0,2265.0,15.5,73,USA


In [10]:
#Inspect dtypes and non-null counts to spot the columns that need encoding or imputation
data_tr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 318 entries, 145 to 362
Data columns (total 7 columns):
Cylinders       318 non-null int64
Displacement    318 non-null float64
Horsepower      314 non-null float64
Weight          318 non-null float64
Acceleration    318 non-null float64
Model Year      318 non-null int64
Origin          318 non-null object
dtypes: float64(4), int64(2), object(1)
memory usage: 29.9+ KB


In [12]:
#Isolating the Origin column: its dtype is "object", so it is a categorical column that needs to be dealt with separately.
#The double brackets keep the selection as a one-column DataFrame rather than a Series.
data_category = data_tr[["Origin"]]
data_category.head()

Unnamed: 0,Origin
145,Germany
151,USA
388,India
48,India
114,USA


In [13]:
#One hot encoding the categorical values
from sklearn.preprocessing import OneHotEncoder

#Fit the encoder on the Origin column and transform it in one step.
#The result is a sparse matrix, which is why the cell shows a summary repr instead of the values.
category_encoder = OneHotEncoder()
data_category_1hot = category_encoder.fit_transform(data_category)
data_category_1hot

<318x3 sparse matrix of type '<class 'numpy.float64'>'
	with 318 stored elements in Compressed Sparse Row format>

In [14]:
#Converting the sparse matrix into a dense numpy array; this example just checks the first five rows
data_category_1hot.toarray()[:5]

array([[1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [15]:
#Categories learned during fitting; their order matches the one-hot columns of the array shown above
category_encoder.categories_

[array(['Germany', 'India', 'USA'], dtype=object)]

In [None]:
#Handling missing values using SimpleImputer (10:32 in video)