In [None]:
import numpy as np
import pandas as pd
# imports-85 automobile data from the UCI repository; the file has no header row,
# so the column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'

headers = ["symboling","normalized-losses","make","fuel-type","aspiration", "num-of-doors","body-style",
         "drive-wheels","engine-location","wheel-base", "length","width","height","curb-weight","engine-type",
         "num-of-cylinders", "engine-size","fuel-system","bore","stroke","compression-ratio","horsepower",
         "peak-rpm","city-mpg","highway-mpg","price"]

df = pd.read_csv(url, header=None, names=headers)

# Missing values arrive as the literal string "?". They have to become NaN
# *before* dropna() runs; otherwise dropna() finds nothing to drop and rows
# without a price stay in the frame.
df = df.replace("?", np.nan).dropna(subset=["price"], axis=0)
df.head(5)


In [None]:
# Turn every "?" placeholder in the frame into NaN so pandas treats it as missing.
df.replace(to_replace="?", value=np.nan, inplace=True)

In [None]:
# Boolean mask with the same shape as df: True wherever a value is missing.
missing_data = df.isnull()

# Print the True/False tally of every column, one column after the other.
for column in missing_data.columns:
    print(missing_data[column].value_counts())
    print(" ")
    

# Deal with missing data

1. Drop data
    a. Drop the whole row
    b. Drop the whole column
2. Replace data
    a. Replace it by the mean
    b. Replace it by the most frequent value
    c. Replace it based on other functions

In [None]:
# Impute missing values in these numeric columns with the column mean.
# The result is assigned back to df[...] rather than calling an inplace method
# on the df[...] selection: the chained inplace form operates on a temporary
# object and leaves df untouched under pandas Copy-on-Write.
mean_imputed_columns = ["normalized-losses", "stroke", "bore", "horsepower", "peak-rpm"]

for col in mean_imputed_columns:
    # The values may still be stored as strings here, so parse them to float first.
    numeric_values = df[col].astype("float")
    df[col] = numeric_values.fillna(numeric_values.mean())



In [None]:
# How often each door-count category occurs (missing values are not counted).
door_counts = df["num-of-doors"].value_counts()
door_counts

In [None]:
# Label of the most frequent door-count category.
most_common_doors = df["num-of-doors"].value_counts().idxmax()
most_common_doors

In [None]:
# Fill missing door counts with the most frequent category.
# The filled column is assigned back to df[...]; calling an inplace method on
# the df[...] selection would modify a temporary object and leave df unchanged
# under pandas Copy-on-Write.
most_frequent_doors = df['num-of-doors'].value_counts().idxmax()
df['num-of-doors'] = df['num-of-doors'].fillna(most_frequent_doors)

In [None]:
# Rebuild a contiguous 0..n-1 index after rows were dropped; the old index
# is discarded rather than kept as a column.
df.reset_index(inplace=True, drop=True)

In [None]:
# Preview the first five rows (the default for head()).
df.head()

In [None]:
# List the dtype currently held by each column.
df.dtypes

In [None]:
# Columns that hold continuous measurements are stored as float.
float_columns = ['bore', 'stroke', 'price', 'peak-rpm']
df[float_columns] = df[float_columns].astype("float")

In [None]:
# Store normalized-losses as integers.
int_columns = ["normalized-losses"]
df[int_columns] = df[int_columns].astype("int")
print("data type changed")

In [None]:
# Fuel consumption in litres per 100 km, derived from miles per gallon
# using L/100km = 235 / mpg.
MPG_TO_L_PER_100KM = 235
df["city-L/100km"] = MPG_TO_L_PER_100KM / df["city-mpg"]
df.head(5)


In [None]:
# The converted figures already live in 'city-L/100km', so the raw mpg column
# is removed rather than renamed: renaming it would add a second column with
# the same label that still contains mpg values.
# errors="ignore" keeps the cell safe to re-run once the column is gone.
df.drop(columns=["city-mpg"], errors="ignore", inplace=True)
df.head()

In [None]:
# Simple feature scaling: divide each dimension by its own maximum so the
# largest value in every column becomes 1.
dimension_columns = ['length', 'width', 'height']
for dimension in dimension_columns:
    df[dimension] = df[dimension] / df[dimension].max()

df[dimension_columns].head()


In [None]:
# Quick look at how np.arange spaces its values: start at 1, step by 5,
# stop before 10.
arr = np.arange(start=1, stop=10, step=5)
print(arr)

In [None]:
# Cast horsepower to int. astype() returns a new Series and accepts no
# `inplace` argument, so the result is assigned back to the column.
df['horsepower'] = df['horsepower'].astype(int)

min1 = df['horsepower'].min()
max1 = df['horsepower'].max()

print(min1)
print(max1)

# Three equal-width bins (labelled low/medium/high in the pd.cut step) need
# four edges that span the whole range, maximum included.
n_bins = 3
bandwidth = (max1 - min1) / n_bins
print(bandwidth)

# linspace includes both end points. arange(min1, max1, step) stops short of
# max1, which would leave the highest horsepower values outside every bin.
bins = np.linspace(min1, max1, n_bins + 1)

print(bins)




In [None]:
# Labels for the horsepower bands, ordered from the lowest interval upward.
group_names = ['low', 'medium', 'high']

# Assign every car to a band. include_lowest=True closes the first interval
# on the left so the minimum horsepower value is binned as well.
df['horsepower-bin'] = pd.cut(
    x=df['horsepower'],
    bins=bins,
    labels=group_names,
    include_lowest=True,
)
df[['horsepower', 'horsepower-bin']].head(20)

In [None]:
# data Visualisation

%matplotlib inline
import matplotlib as plt
from matplotlib import pyplot

plt.pyplot.hist(df['horsepower'],bins = 3)
plt.pyplot.xlabel("horsepower")
plt.pyplot.ylabel("count")
plt.pyplot.title("horsepower bins")

In [None]:
# One-hot encode fuel type: one indicator column per distinct value.
dummy_variable1 = pd.get_dummies(data=df['fuel-type'])
dummy_variable1.head()


In [None]:
# Map prefixed indicator labels to short names.
# NOTE(review): these keys only match if the indicator columns carry a
# 'fuel-type-' prefix; if they are already named 'gas'/'diesel' this rename
# changes nothing - confirm which naming is intended.
fuel_label_map = {'fuel-type-gas': 'gas', 'fuel-type-diesel': 'diesel'}
dummy_variable1.rename(columns=fuel_label_map, inplace=True)
dummy_variable1.head()

In [None]:
# Append the fuel-type indicator columns to df and, in the same step, discard
# the original 'fuel-type' column they replace.
df = pd.concat([df, dummy_variable1], axis=1).drop(columns='fuel-type')
df.head()
