In [1]:
# The usual preamble
import pandas as pd

import matplotlib as mpl
# Make the graphs a bit prettier, and bigger 
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60) 

In [2]:
import numpy as np

## import the dataset first

In [3]:
df = pd.read_csv('./data/DataSet.txt')

In [4]:
header_names = []
#set the headers for the dataframe
with open('./data/header.txt', 'r') as headers:
    for line in headers:
        for word in line.split():
            header_names.append(word)
print(header_names)

['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']


## Use the header names as column headers for csv file

In [5]:
df.columns = header_names

In [6]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
1,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
2,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
3,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
4,2,?,audi,gas,std,two,sedan,fwd,front,99.8,177.3,66.3,53.1,2507,ohc,five,136,mpfi,3.19,3.4,8.5,110,5500,19,25,15250


In [7]:
df.describe()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
count,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0,204.0
mean,0.823529,98.806373,174.075,65.916667,53.74902,2555.602941,126.892157,10.148137,25.240196,30.769608
std,1.239035,5.994144,12.362123,2.146716,2.424901,521.96082,41.744569,3.981,6.551513,6.898337
min,-2.0,86.6,141.1,60.3,47.8,1488.0,61.0,7.0,13.0,16.0
25%,0.0,94.5,166.3,64.075,52.0,2145.0,97.0,8.575,19.0,25.0
50%,1.0,97.0,173.2,65.5,54.1,2414.0,119.5,9.0,24.0,30.0
75%,2.0,102.4,183.2,66.9,55.5,2939.25,142.0,9.4,30.0,34.5
max,3.0,120.9,208.1,72.3,59.8,4066.0,326.0,23.0,49.0,54.0


In [8]:
df.describe(include = ['O'])

Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system,bore,stroke,horsepower,peak-rpm,price
count,204,204,204,204,204,204,204,204,204,204,204,204.0,204.0,204,204,204
unique,52,22,2,2,3,5,3,2,7,7,8,39.0,37.0,60,24,186
top,?,toyota,gas,std,four,sedan,fwd,front,ohc,four,mpfi,3.62,3.4,68,5500,?
freq,40,32,184,167,114,96,120,201,148,158,93,23.0,20.0,19,37,4


In [9]:
df.dtypes

symboling              int64
normalized-losses     object
make                  object
fuel-type             object
aspiration            object
num-of-doors          object
body-style            object
drive-wheels          object
engine-location       object
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-type           object
num-of-cylinders      object
engine-size            int64
fuel-system           object
bore                  object
stroke                object
compression-ratio    float64
horsepower            object
peak-rpm              object
city-mpg               int64
highway-mpg            int64
price                 object
dtype: object

# Create the dataframe for continuous data
* convert all continous to float-64

In [10]:
#create a df for continous features
df_cont = df[['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']].copy()

In [11]:
len(df_cont.columns)

15

## convert all to float-64

In [12]:
df_cont.head(5)

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,?,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500
1,?,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500
2,164,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950
3,164,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450
4,?,99.8,177.3,66.3,53.1,2507,136,3.19,3.4,8.5,110,5500,19,25,15250


In [13]:
df_cont = df_cont.apply(pd.to_numeric, errors='coerce') #errors converts the ? value to NaN

In [14]:
df_cont.dtypes

normalized-losses    float64
wheel-base           float64
length               float64
width                float64
height               float64
curb-weight            int64
engine-size            int64
bore                 float64
stroke               float64
compression-ratio    float64
horsepower           float64
peak-rpm             float64
city-mpg               int64
highway-mpg            int64
price                float64
dtype: object

In [15]:
df_cont.describe()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,164.0,204.0,204.0,204.0,204.0,204.0,204.0,200.0,200.0,204.0,202.0,202.0,204.0,204.0,200.0
mean,122.0,98.806373,174.075,65.916667,53.74902,2555.602941,126.892157,3.32905,3.2583,10.148137,104.222772,5125.990099,25.240196,30.769608,13205.69
std,35.442168,5.994144,12.362123,2.146716,2.424901,521.96082,41.744569,0.274044,0.314868,3.981,39.810182,480.44368,6.551513,6.898337,7966.982558
min,65.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,94.0,94.5,166.3,64.075,52.0,2145.0,97.0,3.15,3.11,8.575,70.0,4800.0,19.0,25.0,7775.0
50%,115.0,97.0,173.2,65.5,54.1,2414.0,119.5,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10270.0
75%,150.0,102.4,183.2,66.9,55.5,2939.25,142.0,3.59,3.41,9.4,116.0,5500.0,30.0,34.5,16500.75
max,256.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


# Create the dataframe for the categorical data
* ensure all the data is in the form of an object

In [16]:
#get names of categorical headers
l = ['normalized-losses', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']
cat_list = []
for x in header_names:
    if x not in l:
        cat_list.append(x)

In [17]:
#create a df for continous features
df_cat = df[cat_list].copy()

In [18]:
df_cat.head()

Unnamed: 0,symboling,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,3,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,1,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
2,2,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
3,2,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
4,2,audi,gas,std,two,sedan,fwd,front,ohc,five,mpfi


In [19]:
df_cat.dtypes

symboling            int64
make                object
fuel-type           object
aspiration          object
num-of-doors        object
body-style          object
drive-wheels        object
engine-location     object
engine-type         object
num-of-cylinders    object
fuel-system         object
dtype: object

In [20]:
df_cat.symboling = df_cat.symboling.apply(str)

In [21]:
df_cat.describe(include="all")

Unnamed: 0,symboling,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
count,204,204,204,204,204,204,204,204,204,204,204
unique,6,22,2,2,3,5,3,2,7,7,8
top,0,toyota,gas,std,four,sedan,fwd,front,ohc,four,mpfi
freq,67,32,184,167,114,96,120,201,148,158,93


In [28]:
cont_desc = df_cont.describe()

In [30]:
cont_desc

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,164.0,204.0,204.0,204.0,204.0,204.0,204.0,200.0,200.0,204.0,202.0,202.0,204.0,204.0,200.0
mean,122.0,98.806373,174.075,65.916667,53.74902,2555.602941,126.892157,3.32905,3.2583,10.148137,104.222772,5125.990099,25.240196,30.769608,13205.69
std,35.442168,5.994144,12.362123,2.146716,2.424901,521.96082,41.744569,0.274044,0.314868,3.981,39.810182,480.44368,6.551513,6.898337,7966.982558
min,65.0,86.6,141.1,60.3,47.8,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,13.0,16.0,5118.0
25%,94.0,94.5,166.3,64.075,52.0,2145.0,97.0,3.15,3.11,8.575,70.0,4800.0,19.0,25.0,7775.0
50%,115.0,97.0,173.2,65.5,54.1,2414.0,119.5,3.31,3.29,9.0,95.0,5200.0,24.0,30.0,10270.0
75%,150.0,102.4,183.2,66.9,55.5,2939.25,142.0,3.59,3.41,9.4,116.0,5500.0,30.0,34.5,16500.75
max,256.0,120.9,208.1,72.3,59.8,4066.0,326.0,3.94,4.17,23.0,288.0,6600.0,49.0,54.0,45400.0


In [32]:
cont_desc = cont_desc.transpose()

In [33]:
cont_desc

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
normalized-losses,164.0,122.0,35.442168,65.0,94.0,115.0,150.0,256.0
wheel-base,204.0,98.806373,5.994144,86.6,94.5,97.0,102.4,120.9
length,204.0,174.075,12.362123,141.1,166.3,173.2,183.2,208.1
width,204.0,65.916667,2.146716,60.3,64.075,65.5,66.9,72.3
height,204.0,53.74902,2.424901,47.8,52.0,54.1,55.5,59.8
curb-weight,204.0,2555.602941,521.96082,1488.0,2145.0,2414.0,2939.25,4066.0
engine-size,204.0,126.892157,41.744569,61.0,97.0,119.5,142.0,326.0
bore,200.0,3.32905,0.274044,2.54,3.15,3.31,3.59,3.94
stroke,200.0,3.2583,0.314868,2.07,3.11,3.29,3.41,4.17
compression-ratio,204.0,10.148137,3.981,7.0,8.575,9.0,9.4,23.0


# Add missing values to report: Cardinality, Missing_%

In [76]:
cont_features = ['cardinality', 'missing%']
cols = cont_desc.index

#for the two features generate the data for each column in the continuous dataframe

total = df.shape[0] #gets the total number of instances in the dataframe to calculate missing %

for feature in cont_features:
    
    for col in cols:
        if feature in 'cardinality':
            #cont_desc[feature] = np.nan
            uniq_count = len(df_cont[col].unique())
            cont_desc[feature][col] = uniq_count
        elif feature in 'missing%':
            #cont_desc[feature] = np.nan
            miss_perc = (df.shape[0] - cont_desc['count'][col]) / df.shape[0] * 100
            cont_desc[feature][col] = miss_perc

## Result -> 

In [77]:
cont_desc

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,cardinality,missing%
normalized-losses,164.0,122.0,35.442168,65.0,94.0,115.0,150.0,256.0,52.0,19.607843
wheel-base,204.0,98.806373,5.994144,86.6,94.5,97.0,102.4,120.9,53.0,0.0
length,204.0,174.075,12.362123,141.1,166.3,173.2,183.2,208.1,75.0,0.0
width,204.0,65.916667,2.146716,60.3,64.075,65.5,66.9,72.3,44.0,0.0
height,204.0,53.74902,2.424901,47.8,52.0,54.1,55.5,59.8,49.0,0.0
curb-weight,204.0,2555.602941,521.96082,1488.0,2145.0,2414.0,2939.25,4066.0,171.0,0.0
engine-size,204.0,126.892157,41.744569,61.0,97.0,119.5,142.0,326.0,44.0,0.0
bore,200.0,3.32905,0.274044,2.54,3.15,3.31,3.59,3.94,39.0,1.960784
stroke,200.0,3.2583,0.314868,2.07,3.11,3.29,3.41,4.17,37.0,1.960784
compression-ratio,204.0,10.148137,3.981,7.0,8.575,9.0,9.4,23.0,32.0,0.0
