In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the Auto MPG CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/Auto MPG - Sheet1.csv')

In [3]:
# Display the loaded DataFrame (last expression of the cell is rendered as output).
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,18.0,8,307.0,130,3504,12.0,70,1
1,15.0,8,350.0,165,3693,11.5,70,1
2,18.0,8,318.0,150,3436,11.0,70,1
3,16.0,8,304.0,150,3433,12.0,70,1
4,17.0,8,302.0,140,3449,10.5,70,1
...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86,2790,15.6,82,1
394,44.0,4,97.0,52,2130,24.6,82,2
395,32.0,4,135.0,84,2295,11.6,82,1
396,28.0,4,120.0,79,2625,18.6,82,1


## a)

There are no non-numeric ordinal attributes; however, there are discrete attributes such as cylinders, model year, and origin.

Applying one-hot encoding to these discrete attributes:

In [4]:
def one_hot_encoding(df, column_name):
    """Replace a discrete column with one 0/1 indicator column per distinct value.

    For every distinct non-missing value ``v`` of ``df[column_name]`` (taken in
    sorted order) a column named ``"<column_name> <v>"`` is appended. It holds
    1 in the rows where the source column equals ``v`` and 0 elsewhere. The
    source column is then removed.

    Note: ``df`` is modified in place and the very same object is returned.
    Because the source column is dropped, calling this twice for the same
    column raises ``KeyError``.

    Args:
        df: pandas DataFrame that contains ``column_name``.
        column_name: label of the column to encode.

    Returns:
        The same DataFrame object, with the indicator columns added and
        ``column_name`` removed.

    Raises:
        KeyError: if ``column_name`` is not a column of ``df``.
    """
    column = df[column_name]
    # Missing values are skipped: NaN never compares equal to anything, so it
    # could only produce an all-zero column, and it has no defined sort order.
    categories = sorted(column.dropna().unique().tolist())
    for category in categories:
        # Vectorised comparison instead of evaluating a Python lambda per row.
        df[column_name + " " + str(category)] = (column == category).astype("int64")
    df.drop(column_name, axis=1, inplace=True)
    return df

In [5]:
# Encode each discrete attribute in turn; the order fixes the order of the new columns.
for discrete_column in ('origin', 'cylinders', 'model year'):
    df = one_hot_encoding(df, discrete_column)

In [6]:
# Write the one-hot encoded frame to disk, leaving out the row index.
df.to_csv(path_or_buf='data/Auto MPG - Sheet1 - one hot encoded.csv', index=False)

## b)

Computing Mean and Variance of Features

In [7]:
# Reload the one-hot encoded data from its CSV file.
df = pd.read_csv(filepath_or_buffer='data/Auto MPG - Sheet1 - one hot encoded.csv')

In [8]:
# Drop the rows whose horsepower entry is the '?' placeholder.
# .copy() makes the filtered frame independent of the original, so later
# column assignments on it do not trigger SettingWithCopyWarning.
df = df[df.horsepower != '?'].copy()

In [9]:
def get_mean_and_variance(df):
    """Compute the mean and population variance of every column of ``df``.

    The variance divides the summed squared deviations by the number of rows
    ``N`` (not ``N - 1``), i.e. it is the population variance.

    Args:
        df: pandas DataFrame whose columns all hold numeric data.

    Returns:
        A tuple ``(mean_arr, variance_arr)`` of two lists of floats, each with
        one entry per column of ``df``, in column order.

    Raises:
        ZeroDivisionError: if ``df`` has at least one column but no rows.
    """
    n_rows, n_cols = df.shape
    if n_cols == 0:
        return [], []
    if n_rows == 0:
        # The statistics divide by the row count; fail loudly rather than
        # returning NaN for an empty frame.
        raise ZeroDivisionError(
            "cannot compute mean and variance of a DataFrame with no rows"
        )
    # One float matrix for the whole frame; the reductions along axis 0 are
    # vectorised instead of reading every cell through .iloc in Python loops.
    values = df.to_numpy(dtype=float)
    mean_arr = values.mean(axis=0).tolist()
    # ndarray.var defaults to ddof=0, i.e. division by N.
    variance_arr = values.var(axis=0).tolist()
    return mean_arr, variance_arr

In [10]:
# Cast horsepower to float with a single vectorised conversion before
# computing the per-column statistics.
df['horsepower'] = df['horsepower'].astype(float)
mean, variance = get_mean_and_variance(df)

In [11]:
# Tabulate the statistics: one row per column of df, one column per statistic.
summary_columns = {'mean': mean, 'variance': variance}
df_mean_var = pd.DataFrame(summary_columns, index=df.columns)
df_mean_var

Unnamed: 0,mean,variance
mpg,23.445918,60.762738
displacement,194.41199,10922.432943
horsepower,104.469388,1477.789879
weight,2977.584184,719644.186791
acceleration,15.541327,7.591915
origin 1,0.625,0.234375
origin 2,0.173469,0.143378
origin 3,0.201531,0.160916
cylinders 3,0.010204,0.0101
cylinders 4,0.507653,0.249941


## c)

Normalizing with mean and variance

In [12]:
# normalizing the data (z-score): (value - column mean) / column standard deviation
# `mean` and `variance` are positional lists, so label them with the column
# names to make the arithmetic align explicitly by column.
column_mean = pd.Series(mean, index=df.columns, dtype=float)
column_std = np.sqrt(pd.Series(variance, index=df.columns, dtype=float))
# Build a new float frame in one vectorised step instead of writing floats
# into the existing columns through .iloc: some columns hold integers, and
# assigning floats into them relies on silent dtype upcasting, which recent
# pandas versions deprecate.
df = (df - column_mean) / column_std

In [13]:
# Display the DataFrame after normalization.
df

Unnamed: 0,mpg,displacement,horsepower,weight,acceleration,origin 1,origin 2,origin 3,cylinders 3,cylinders 4,...,model year 73,model year 74,model year 75,model year 76,model year 77,model year 78,model year 79,model year 80,model year 81,model year 82
0,-0.698638,1.077290,0.664133,0.620540,-1.285258,0.774597,-0.458123,-0.50239,-0.101535,-1.015425,...,-0.3371,-0.26653,-0.287877,-0.308175,-0.27735,-0.317999,-0.282648,-0.271979,-0.27735,-0.287877
1,-1.083498,1.488732,1.574594,0.843334,-1.466724,0.774597,-0.458123,-0.50239,-0.101535,-1.015425,...,-0.3371,-0.26653,-0.287877,-0.308175,-0.27735,-0.317999,-0.282648,-0.271979,-0.27735,-0.287877
2,-0.698638,1.182542,1.184397,0.540382,-1.648189,0.774597,-0.458123,-0.50239,-0.101535,-1.015425,...,-0.3371,-0.26653,-0.287877,-0.308175,-0.27735,-0.317999,-0.282648,-0.271979,-0.27735,-0.287877
3,-0.955212,1.048584,1.184397,0.536845,-1.285258,0.774597,-0.458123,-0.50239,-0.101535,-1.015425,...,-0.3371,-0.26653,-0.287877,-0.308175,-0.27735,-0.317999,-0.282648,-0.271979,-0.27735,-0.287877
4,-0.826925,1.029447,0.924265,0.555706,-1.829655,0.774597,-0.458123,-0.50239,-0.101535,-1.015425,...,-0.3371,-0.26653,-0.287877,-0.308175,-0.27735,-0.317999,-0.282648,-0.271979,-0.27735,-0.287877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
393,0.455941,-0.520637,-0.480448,-0.221125,0.021294,0.774597,-0.458123,-0.50239,-0.101535,0.984809,...,-0.3371,-0.26653,-0.287877,-0.308175,-0.27735,-0.317999,-0.282648,-0.271979,-0.27735,3.473711
394,2.636813,-0.932079,-1.364896,-0.999134,3.287676,-1.290994,2.182821,-0.50239,-0.101535,0.984809,...,-0.3371,-0.26653,-0.287877,-0.308175,-0.27735,-0.317999,-0.282648,-0.271979,-0.27735,3.473711
395,1.097374,-0.568479,-0.532474,-0.804632,-1.430430,0.774597,-0.458123,-0.50239,-0.101535,0.984809,...,-0.3371,-0.26653,-0.287877,-0.308175,-0.27735,-0.317999,-0.282648,-0.271979,-0.27735,3.473711
396,0.584228,-0.712005,-0.662540,-0.415627,1.110088,0.774597,-0.458123,-0.50239,-0.101535,0.984809,...,-0.3371,-0.26653,-0.287877,-0.308175,-0.27735,-0.317999,-0.282648,-0.271979,-0.27735,3.473711


In [14]:
# Recompute the per-column statistics on the normalized data.
post_norm_stats = get_mean_and_variance(df)
mean_post_norm, variance_post_norm = post_norm_stats

In [15]:
# Tabulate the post-normalization statistics, one row per column of df.
post_norm_columns = {'mean': mean_post_norm, 'variance': variance_post_norm}
df_mean_var_post_norm = pd.DataFrame(post_norm_columns, index=df.columns)

In [16]:
# Display the per-column mean and variance computed after normalization.
df_mean_var_post_norm

Unnamed: 0,mean,variance
mpg,-3.834801e-16,1.0
displacement,-2.537653e-16,1.0
horsepower,-3.965082e-16,1.0
weight,6.839767e-17,1.0
acceleration,6.241039e-15,1.0
origin 1,-3.4213e-16,1.0
origin 2,-1.7276430000000002e-17,1.0
origin 3,8.11709e-16,1.0
cylinders 3,-2.2020370000000003e-17,1.0
cylinders 4,1.272791e-15,1.0


Hence, after normalization, every feature has a mean of zero (up to floating-point rounding error) and a variance of one.