In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns

# 1. Problem Statement

# 2. Data Gathering

In [2]:
df = pd.read_csv('autos_dataset.csv')
df.head().T

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


In [3]:
df.shape

(205, 26)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [5]:
# df.isna().sum()

# 3. EDA

In [6]:
df.replace({'?':np.nan},inplace = True)

In [7]:
# df.isna().sum()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       203 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [8]:
df.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [9]:
df.drop(['make','aspiration','fuel-type','num-of-doors','body-style','drive-wheels','engine-location',
        'engine-type','fuel-system'],axis = 1, inplace=True)

In [10]:
# df.info()

In [11]:
df.drop('normalized-losses',axis =1 , inplace =True)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   wheel-base         205 non-null    float64
 2   length             205 non-null    float64
 3   width              205 non-null    float64
 4   height             205 non-null    float64
 5   curb-weight        205 non-null    int64  
 6   num-of-cylinders   205 non-null    object 
 7   engine-size        205 non-null    int64  
 8   bore               201 non-null    object 
 9   stroke             201 non-null    object 
 10  compression-ratio  205 non-null    float64
 11  horsepower         203 non-null    object 
 12  peak-rpm           203 non-null    object 
 13  city-mpg           205 non-null    int64  
 14  highway-mpg        205 non-null    int64  
 15  price              201 non-null    object 
dtypes: float64(5), int64(5), o

In [13]:
# df.isna().sum()

In [14]:
df['bore'].fillna(df['bore'].median(),inplace= True)
df['stroke'].fillna(df['stroke'].median(),inplace= True)
df['horsepower'].fillna(df['horsepower'].median(),inplace= True)
df['peak-rpm'].fillna(df['peak-rpm'].median(),inplace= True)
df['price'].fillna(df['price'].median(),inplace= True)



In [15]:
# df.isna().sum()

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   wheel-base         205 non-null    float64
 2   length             205 non-null    float64
 3   width              205 non-null    float64
 4   height             205 non-null    float64
 5   curb-weight        205 non-null    int64  
 6   num-of-cylinders   205 non-null    object 
 7   engine-size        205 non-null    int64  
 8   bore               205 non-null    object 
 9   stroke             205 non-null    object 
 10  compression-ratio  205 non-null    float64
 11  horsepower         205 non-null    object 
 12  peak-rpm           205 non-null    object 
 13  city-mpg           205 non-null    int64  
 14  highway-mpg        205 non-null    int64  
 15  price              205 non-null    object 
dtypes: float64(5), int64(5), o

In [17]:
df['bore'] = df['bore'].astype(float)
df['stroke'] = df['stroke'].astype(float)
df['horsepower'] = df['horsepower'].astype(float)
df['peak-rpm'] = df['peak-rpm'].astype(float)
df['price'] = df['price'].astype(float)


In [20]:
# df['num-of-cylinders'] = df['num-of-cylinders'].astype(float)


In [21]:
df['num-of-cylinders'].unique()

array(['four', 'six', 'five', 'three', 'twelve', 'two', 'eight'],
      dtype=object)

In [22]:
df['num-of-cylinders'].replace({'four':4, 'six':6, 'five':5, 'three':3, 'twelve':12, 
                                'two':2, 'eight':8},inplace = True)


In [23]:
# df['num-of-cylinders']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   wheel-base         205 non-null    float64
 2   length             205 non-null    float64
 3   width              205 non-null    float64
 4   height             205 non-null    float64
 5   curb-weight        205 non-null    int64  
 6   num-of-cylinders   205 non-null    int64  
 7   engine-size        205 non-null    int64  
 8   bore               205 non-null    float64
 9   stroke             205 non-null    float64
 10  compression-ratio  205 non-null    float64
 11  horsepower         205 non-null    float64
 12  peak-rpm           205 non-null    float64
 13  city-mpg           205 non-null    int64  
 14  highway-mpg        205 non-null    int64  
 15  price              205 non-null    float64
dtypes: float64(10), int64(6)
m

# Train and Test Data

In [24]:
x = df.drop('price',axis = 1)
y = df['price']

In [25]:
x_train,x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=10)
# x_train

# Model Training 

In [26]:
lr_model = LinearRegression()
lr_model.fit(x_train,y_train)

LinearRegression()

In [27]:
# Evaluation

### Residual

### Test Data

In [34]:
y_pred = lr_model.predict(x_test)
# y_pred[:5]
mse = mean_squared_error(y_test, y_pred)
print("Mean squared value is ",mse)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error is ",mae)

r2score = r2_score(y_test, y_pred)
print("R2 score is :",r2score)

Mean squared value is  41136422.3598216
Mean Absolute Error is  3667.936869305307
R2 score is : 0.07063200888270205


In [29]:
# y_test[:5]

### Train Data

In [None]:
y_pred = lr_model.predict(x_test)
# y_pred[:5]
mse = mean_squared_error(y_test, y_pred)
print("Mean squared value is ",mse)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error is ",mae)

r2score = r2_score(y_test, y_pred)
print("R2 score is :",r2score)