# import Libraries

In [4]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV

In [5]:
df = pd.read_csv('autos_dataset.csv')

In [9]:
df.head().T

Unnamed: 0,0,1,2,3,4
symboling,3,3,1,2,2
normalized-losses,?,?,?,164,164
make,alfa-romero,alfa-romero,alfa-romero,audi,audi
fuel-type,gas,gas,gas,gas,gas
aspiration,std,std,std,std,std
num-of-doors,two,two,two,four,four
body-style,convertible,convertible,hatchback,sedan,sedan
drive-wheels,rwd,rwd,rwd,fwd,4wd
engine-location,front,front,front,front,front
wheel-base,88.6,88.6,94.5,99.8,99.4


In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [13]:
df.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

In [14]:
df.drop(['make','aspiration','fuel-type','num-of-doors','body-style','drive-wheels','engine-location',
        'engine-type','fuel-system','normalized-losses'],axis = 1, inplace=True)

In [17]:
df.replace({'?':np.nan},inplace = True)
df['bore'].fillna(df['bore'].median(),inplace= True)
df['stroke'].fillna(df['stroke'].median(),inplace= True)
df['horsepower'].fillna(df['horsepower'].median(),inplace= True)
df['peak-rpm'].fillna(df['peak-rpm'].median(),inplace= True)
df['price'].fillna(df['price'].median(),inplace= True)


In [19]:
df['bore'] = df['bore'].astype(float)
df['stroke'] = df['stroke'].astype(float)
df['horsepower'] = df['horsepower'].astype(float)
df['peak-rpm'] = df['peak-rpm'].astype(float)
df['price'] = df['price'].astype(float)


In [20]:
df['num-of-cylinders'].replace({'four':4, 'six':6, 'five':5, 'three':3, 'twelve':12, 
                                'two':2, 'eight':8},inplace = True)

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   wheel-base         205 non-null    float64
 2   length             205 non-null    float64
 3   width              205 non-null    float64
 4   height             205 non-null    float64
 5   curb-weight        205 non-null    int64  
 6   num-of-cylinders   205 non-null    int64  
 7   engine-size        205 non-null    int64  
 8   bore               205 non-null    float64
 9   stroke             205 non-null    float64
 10  compression-ratio  205 non-null    float64
 11  horsepower         205 non-null    float64
 12  peak-rpm           205 non-null    float64
 13  city-mpg           205 non-null    int64  
 14  highway-mpg        205 non-null    int64  
 15  price              205 non-null    float64
dtypes: float64(10), int64(6)
m

# Train Test Split

In [26]:
x = df.drop('price',axis = 1)
y = df['price']
x_train,x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=10)
y_train

160     7738.0
98      8249.0
127    34028.0
47     32250.0
91      6649.0
        ...   
113    16695.0
64     11245.0
15     30760.0
125    22018.0
9      10295.0
Name: price, Length: 164, dtype: float64

# Model TRaining

In [43]:
dt_reg_model = DecisionTreeRegressor(min_samples_split=15,
                            min_samples_leaf=10)
dt_reg_model.fit(x_train, y_train)

DecisionTreeRegressor(min_samples_leaf=10, min_samples_split=15)

# Evaluation

In [41]:
# Test Data Accuracy 
y_pred = dt_reg_model.predict(x_test)
y_pred

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Value :",mse)

rmse = np.sqrt(mse)
print("Root Mean Squared Value :",rmse)

mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error is ",mae)

r2_value = r2_score(y_test, y_pred)
print('R squared value is :',r2_value)

Mean Squared Value : 22694925.502211247
Root Mean Squared Value : 4763.919132627174
Mean Absolute Error is  2735.617180205415
R squared value is : 0.4872685539336662


In [42]:
# Train Data Accuracy 

y_pred_train = dt_reg_model.predict(x_train)
y_pred_train

mse = mean_squared_error(y_train, y_pred_train)
print("Mean Squared Value :",mse)

rmse = np.sqrt(mse)
print("Root Mean Squared Value :",rmse)

r2_value = r2_score(y_train, y_pred_train)
print('R squared value is :',r2_value)

Mean Squared Value : 5746585.433388844
Root Mean Squared Value : 2397.2036695676993
R squared value is : 0.9131143091894993


In [None]:
# Train Data Accuracy