In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

In [41]:
# Data source (UCI Machine Learning Repository, Auto MPG):
# https://archive.ics.uci.edu/ml/datasets/auto+mpg
url = (
    'https://archive.ics.uci.edu/ml/machine-learning-databases/'
    'auto-mpg/auto-mpg.data'
)

# Column labels, in file order, to assign when the data is loaded.
cols = [
    'mpg',
    'cylinders',
    'displacement',
    'hp',
    'weight',
    'acceleration',
    'year',
    'origin',
    'carname',
]

In [42]:
# Read the space-separated file into a DataFrame using our own column labels.
# '?' marks a missing value; anything from a tab character to the end of a
# line is treated as a comment and ignored.
dataset = pd.read_csv(
    url,
    names=cols,
    na_values='?',
    comment='\t',
    sep=' ',
    skipinitialspace=True,
)

In [43]:
# Peek at the first three rows.
dataset.head(3)

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin,carname
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,


In [44]:
# Number of missing values in each column.
dataset.isnull().sum()

mpg               0
cylinders         0
displacement      0
hp                6
weight            0
acceleration      0
year              0
origin            0
carname         398
dtype: int64

In [45]:
# Missing values -> either fix them or drop them.
#   drop -> remove the affected rows or columns
#   fix  -> replace with the MEAN, MEDIAN, MODE or a custom value
# Rule of thumb used here: only fix when < 0.5% of the values are missing.
# 6 missing `hp` values out of 398 rows -> about 1.5% missing, so drop instead.

# `carname` is NaN in every row, so it has to go before dropna(), otherwise
# every row would be discarded. It is removed by name (errors='ignore') rather
# than by position: re-running a positional `iloc[:, :-1]` on the reassigned
# frame would strip one more real column (origin, year, ...) each time.
dataset = dataset.drop(columns=['carname'], errors='ignore').dropna()

In [46]:
# Show the last ten rows of the cleaned frame.
dataset.iloc[-10:]

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin
388,26.0,4,156.0,92.0,2585.0,14.5,82,1
389,22.0,6,232.0,112.0,2835.0,14.7,82,1
390,32.0,4,144.0,96.0,2665.0,13.9,82,3
391,36.0,4,135.0,84.0,2370.0,13.0,82,1
392,27.0,4,151.0,90.0,2950.0,17.3,82,1
393,27.0,4,140.0,86.0,2790.0,15.6,82,1
394,44.0,4,97.0,52.0,2130.0,24.6,82,2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1
396,28.0,4,120.0,79.0,2625.0,18.6,82,1
397,31.0,4,119.0,82.0,2720.0,19.4,82,1


In [48]:
# Origin codes -> 1: Bengaluru, 2: Mumbai, 3: Cochin
# `replace` only touches values that appear as keys, so running this cell a
# second time is a no-op. `.map` would turn the already-converted labels
# into NaN on a re-run.
dataset['origin'] = dataset['origin'].replace(
    {1: 'Bengaluru', 2: 'Mumbai', 3: 'Cochin'}
)

In [50]:
# Show the last ten rows to check the origin labels.
dataset.iloc[-10:]

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,origin
388,26.0,4,156.0,92.0,2585.0,14.5,82,Bengaluru
389,22.0,6,232.0,112.0,2835.0,14.7,82,Bengaluru
390,32.0,4,144.0,96.0,2665.0,13.9,82,Cochin
391,36.0,4,135.0,84.0,2370.0,13.0,82,Bengaluru
392,27.0,4,151.0,90.0,2950.0,17.3,82,Bengaluru
393,27.0,4,140.0,86.0,2790.0,15.6,82,Bengaluru
394,44.0,4,97.0,52.0,2130.0,24.6,82,Mumbai
395,32.0,4,135.0,84.0,2295.0,11.6,82,Bengaluru
396,28.0,4,120.0,79.0,2625.0,18.6,82,Bengaluru
397,31.0,4,119.0,82.0,2720.0,19.4,82,Bengaluru


In [51]:
# Label encoding -> [Hello, World, How] -> [0, 1, 2]
# When the use could be filtering or classification, I prefer a 1/0 encoded
# mechanism instead.
#
# ONE-HOT ENCODING -> one new column per category:
#   price = rest_of_the_factors*their_weights + w1*Mum + w2*Ben + w3*Coch + bias
#   Ben -> price = rest_of_the_factors*their_weights + w1*0 + w2*1 + w3*0 + bias

# dtype is pinned because pandas >= 2.0 returns bool indicator columns by
# default; 'uint8' keeps the numeric 1/0 columns shown in the output below.
encodeddataset = pd.get_dummies(
    dataset,
    columns=['origin'],
    prefix='hot',
    prefix_sep='_',
    dtype='uint8',
)
encodeddataset.tail()

Unnamed: 0,mpg,cylinders,displacement,hp,weight,acceleration,year,hot_Bengaluru,hot_Cochin,hot_Mumbai
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,0,0
394,44.0,4,97.0,52.0,2130.0,24.6,82,0,0,1
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,0,0
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,0,0
397,31.0,4,119.0,82.0,2720.0,19.4,82,1,0,0
