In [9]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from pandas.core.dtypes.common import is_numeric_dtype
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso, Ridge
from sklearn.model_selection import train_test_split

url = 'https://raw.githubusercontent.com/rashakil-ds/Public-Datasets/main/automobile.csv'
df = pd.read_csv(url)

In [10]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [11]:
df.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

# Goal & Steps:

1. Predicting Car Prices:
   - Task: Build a regression model to predict the price (y=price) of cars based on other features in the dataset. This could involve preprocessing the data, selecting relevant features, and implementing `linear regression` and `KNN` algorithm.

2. Feature Engineering:
   - Task: Challenge students to come up with creative ways to engineer new features from the existing ones. For example, they could create a new feature representing the ratio of horsepower to curb weight.

3. Exploratory Data Analysis (EDA):
   - Task: Have students perform in-depth exploratory data analysis to understand the relationships between different features. Visualization tools can be used to communicate their findings effectively.

4. Handling Missing Data:
   - Task: Teach students how to handle missing data in a dataset. They can explore different strategies such as imputation or removal of missing values.

5. Training:
   - Task: Use relavant ML Models

6. Model evaluation:
- Evaluate models and Summarizing their findings. This should include a clear explanation of the model's performance and any insights gained from the analysis.

In [12]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [13]:
df.shape

(202, 26)

In [14]:
X = df.iloc[: , 0:25]
Y = df.price

In [15]:
# Number columns

num_column = X.select_dtypes(include=['int64', 'float64']).columns
num_column

Index(['symboling', 'normalized-losses', 'wheel-base', 'length', 'width',
       'height', 'curb-weight', 'engine-size', 'compression-ratio', 'city-mpg',
       'highway-mpg'],
      dtype='object')

In [16]:
# Text/Object columns

object_column = X.select_dtypes(include=['object']).columns
object_column

Index(['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
       'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders',
       'fuel-system', 'bore', 'stroke', 'horsepower', 'peak-rpm'],
      dtype='object')

In [17]:
# Finding the percentage of characters (with special/unique) from a feature 
# where majority of numbers exist

col_obj_to_num = []

for index, column in enumerate(object_column):
    is_numeric = df[column].str.match(r'^-?\d+(\.\d+)?$')
    num_count = is_numeric.sum()
    text_count = len(df[column]) - num_count

    percent_num = float(((num_count/len(df[column])) * 100))
    percent_text = float(((text_count/len(df[column])) * 100))
    
    if percent_num > 95:
        col_obj_to_num.append(column)

print(col_obj_to_num)

['bore', 'stroke', 'horsepower', 'peak-rpm']


In [18]:
df[col_obj_to_num]

Unnamed: 0,bore,stroke,horsepower,peak-rpm
0,3.47,2.68,111,5000
1,3.47,2.68,111,5000
2,2.68,3.47,154,5000
3,3.19,3.4,102,5500
4,3.19,3.4,115,5500
...,...,...,...,...
197,3.78,3.15,160,5300
198,3.58,2.87,134,5500
199,3.01,3.4,106,4800
200,3.78,3.15,114,5400


In [19]:
# Convert Object columns into Numeric type

for index, column in enumerate(col_obj_to_num):
    df[column] = pd.to_numeric(df[column], errors='coerce')

In [20]:
df_clean_merge = pd.concat([df[num_column], df[col_obj_to_num]], axis=1)
df_clean_merge.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg,bore,stroke,horsepower,peak-rpm
0,3,,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,3.47,2.68,111.0,5000.0
1,3,,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,3.47,2.68,111.0,5000.0
2,1,,94.5,171.2,65.5,52.4,2823,152,9.0,19,26,2.68,3.47,154.0,5000.0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,10.0,24,30,3.19,3.4,102.0,5500.0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,8.0,18,22,3.19,3.4,115.0,5500.0


In [21]:
df_clean_merge.isnull().sum()

symboling             0
normalized-losses    37
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-size           0
compression-ratio     0
city-mpg              0
highway-mpg           0
bore                  4
stroke                4
horsepower            2
peak-rpm              2
dtype: int64

In [22]:
# Replace NaN/Null using mean function
for index, col in enumerate(df_clean_merge.columns):
        means = df_clean_merge[col].mean()
        df_clean_merge[col] = df_clean_merge[col].fillna(means)
df_clean_merge.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg,bore,stroke,horsepower,peak-rpm
0,3,121.836364,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,3.47,2.68,111.0,5000.0
1,3,121.836364,88.6,168.8,64.1,48.8,2548,130,9.0,21,27,3.47,2.68,111.0,5000.0
2,1,121.836364,94.5,171.2,65.5,52.4,2823,152,9.0,19,26,2.68,3.47,154.0,5000.0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,10.0,24,30,3.19,3.4,102.0,5500.0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,8.0,18,22,3.19,3.4,115.0,5500.0


In [23]:
# df_merge_num = pd.concat([df_clean_merge, df[object_column]], axis=1)
# df_merge_num

In [24]:
# df_merge_num.to_csv('exception.csv')

In [25]:
# num_column_new = df_merge_num.select_dtypes(include=['int64', 'float64']).columns
# num_column_new

In [26]:
# object_column_new = df_merge_num.select_dtypes(include=['object']).columns
# object_column_new

In [27]:
object_column_new = df.select_dtypes(include=['object']).columns
object_column_new

Index(['make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
       'drive-wheels', 'engine-location', 'engine-type', 'num-of-cylinders',
       'fuel-system'],
      dtype='object')

In [28]:
label_encoder = LabelEncoder()
object_column_new = df.select_dtypes(include=['object']).columns
for col_obj in df[object_column_new].columns:
    df[col_obj] = label_encoder.fit_transform(df[col_obj])

df[object_column_new]

Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,0,1,0,1,0,2,0,0,2,5
1,0,1,0,1,0,2,0,0,2,5
2,0,1,0,1,2,2,0,4,3,5
3,1,1,0,0,3,1,0,2,2,5
4,1,1,0,0,3,0,0,2,1,5
...,...,...,...,...,...,...,...,...,...,...
197,21,1,1,0,3,2,0,2,2,5
198,21,1,0,0,3,2,0,4,3,5
199,21,0,1,0,3,2,0,2,3,3
200,21,1,1,0,3,2,0,2,2,5


In [29]:
clean_and_transformed = pd.concat([df_clean_merge, df[object_column_new]], axis=1)
clean_and_transformed.head()

Unnamed: 0,symboling,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,...,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,engine-type,num-of-cylinders,fuel-system
0,3,121.836364,88.6,168.8,64.1,48.8,2548,130,9.0,21,...,0,1,0,1,0,2,0,0,2,5
1,3,121.836364,88.6,168.8,64.1,48.8,2548,130,9.0,21,...,0,1,0,1,0,2,0,0,2,5
2,1,121.836364,94.5,171.2,65.5,52.4,2823,152,9.0,19,...,0,1,0,1,2,2,0,4,3,5
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,10.0,24,...,1,1,0,0,3,1,0,2,2,5
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,8.0,18,...,1,1,0,0,3,0,0,2,1,5


In [30]:
# Correlations with respect to each features
corr_number_column = df.select_dtypes(include=['int64', 'float64']).corr()
corr_number_column

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
symboling,1.0,0.531064,-0.138394,0.19323,-0.06905,0.634824,-0.600069,-0.061339,0.212095,-0.541598,...,-0.112429,0.084678,-0.154722,-0.005726,-0.180012,0.073553,0.274601,-0.028289,0.042025,-0.090242
normalized-losses,0.531064,1.0,-0.292454,0.115028,-0.017713,0.410565,-0.319323,0.350388,,-0.082988,...,0.163334,0.248635,-0.043964,0.066865,-0.131651,0.292232,0.261356,-0.25155,-0.204832,0.191829
make,-0.138394,-0.292454,1.0,-0.106493,0.084672,-0.183553,0.093195,-0.026242,0.052209,0.084673,...,-0.081998,0.149758,0.246559,-0.206036,0.130768,-0.06242,-0.208079,0.058683,0.052987,-0.154161
fuel-type,0.19323,0.115028,-0.106493,1.0,-0.400081,0.125488,-0.146811,-0.128596,0.040702,-0.302191,...,-0.070179,0.044157,-0.0513,-0.241736,-0.985168,0.169462,0.476461,-0.266577,-0.199666,-0.107964
aspiration,-0.06905,-0.017713,0.084672,-0.400081,1.0,-0.036796,0.075893,0.105748,-0.058142,0.269937,...,0.112352,0.287673,0.24164,0.21264,0.302328,0.251639,-0.181717,-0.19678,-0.247491,0.189366
num-of-doors,0.634824,0.410565,-0.183553,0.125488,-0.036796,1.0,-0.642376,0.081945,0.131789,-0.44568,...,-0.040116,0.009267,-0.138859,0.025362,-0.112946,0.07637,0.216559,0.059293,0.07723,-0.062527
body-style,-0.600069,-0.319323,0.093195,-0.146811,0.075893,-0.642376,1.0,-0.157685,-0.278475,0.407756,...,-0.062894,-0.052779,0.024702,-0.014253,0.134181,-0.136902,-0.105138,0.012063,-0.023116,-0.070042
drive-wheels,-0.061339,0.350388,-0.026242,-0.128596,0.105748,0.081945,-0.157685,1.0,0.148085,0.472777,...,0.526009,0.43872,0.484321,0.077623,0.118686,0.546663,-0.032044,-0.472226,-0.475363,0.591249
engine-location,0.212095,,0.052209,0.040702,-0.058142,0.131789,-0.278475,0.148085,1.0,-0.188796,...,0.199095,0.106006,0.186085,-0.139116,-0.020364,0.341915,0.201405,-0.156179,-0.102274,0.329178
wheel-base,-0.541598,-0.082988,0.084673,-0.302191,0.269937,-0.44568,0.407756,0.472777,-0.188796,1.0,...,0.570654,0.385107,0.501912,0.154513,0.247128,0.371294,-0.352788,-0.474246,-0.545504,0.588382


In [31]:
# Save Correlation table
corr_number_column.to_csv('correlation.csv')

In [32]:
# Correlation of all features with respect to Price
most_correlated_with_price = corr_number_column['price'].sort_values(ascending=False)
most_correlated_with_price

price                1.000000
engine-size          0.871051
curb-weight          0.835277
horsepower           0.809232
width                0.753236
length               0.692774
drive-wheels         0.591249
wheel-base           0.588382
bore                 0.547582
fuel-system          0.523984
engine-location      0.329178
normalized-losses    0.191829
aspiration           0.189366
height               0.139003
engine-type          0.125031
stroke               0.080014
compression-ratio    0.069881
num-of-cylinders     0.004485
num-of-doors        -0.062527
body-style          -0.070042
symboling           -0.090242
peak-rpm            -0.097752
fuel-type           -0.107964
make                -0.154161
city-mpg            -0.688254
highway-mpg         -0.705924
Name: price, dtype: float64

In [33]:
# Analyze the Highly correlated features with respect to Price
high_corr_features = []
predictor = 'price'

for col in df.columns:
    if is_numeric_dtype(df[col]):
                high_corr_val = clean_and_transformed[predictor].corr(clean_and_transformed[col])
                if high_corr_val >= 0.5 and high_corr_val <= 1:
                    high_corr_features.append(col)
print(high_corr_features)

KeyError: 'price'

In [None]:
# Correlation heatmap with respect to Price
sns.heatmap(clean_and_transformed[high_corr_features].corr(), annot=True)
plt.title('Highly correlated features with Price')

In [None]:
high_corr_features

In [None]:
xHC = clean_and_transformed[high_corr_features]
yHC = df.price

In [None]:
xTrain, xTest, yTrain, yTest = train_test_split(
    xHC, yHC, test_size=0.30, random_state=0
)

In [None]:
linear_reg_model = LinearRegression()
linear_reg_model

In [None]:
training_for_lr = linear_reg_model.fit(xTrain, yTrain)
training_for_lr

In [None]:
test_for_lr = linear_reg_model.fit(xTest, yTest)
test_for_lr

In [None]:
score_training_lr = linear_reg_model.score(xTrain, yTrain)
score_training_lr

In [None]:
score_test_lr = linear_reg_model.score(xTest, yTest)
score_test_lr