## Importing Library

In [1]:
import numpy as np
import pandas as pd

## Importing Dataset

In [2]:
df = pd.read_csv('audi.csv')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10668 entries, 0 to 10667
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         10668 non-null  object 
 1   year          10668 non-null  int64  
 2   price         10668 non-null  int64  
 3   transmission  10668 non-null  object 
 4   mileage       10668 non-null  int64  
 5   fuelType      10668 non-null  object 
 6   tax           10668 non-null  int64  
 7   mpg           10668 non-null  float64
 8   engineSize    10668 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 750.2+ KB


In [4]:
df.isnull().sum()

model           0
year            0
price           0
transmission    0
mileage         0
fuelType        0
tax             0
mpg             0
engineSize      0
dtype: int64

## Split into Features and Target Variable

In [5]:
X = df.loc[:, ('model', 'year', 'transmission', 'mileage', 'fuelType', 'tax', 'mpg', 'engineSize')].values
X

array([[' A1', 2017, 'Manual', ..., 150, 55.4, 1.4],
       [' A6', 2016, 'Automatic', ..., 20, 64.2, 2.0],
       [' A1', 2016, 'Manual', ..., 30, 55.4, 1.4],
       ...,
       [' A3', 2020, 'Manual', ..., 150, 49.6, 1.0],
       [' Q3', 2017, 'Automatic', ..., 150, 47.9, 1.4],
       [' Q3', 2016, 'Manual', ..., 150, 47.9, 1.4]], dtype=object)

In [6]:
y = df.loc[:, 'price'].values
y

array([12500, 16500, 11000, ..., 17199, 19499, 15999], dtype=int64)

## Encoding Categorical Data

In [7]:
df.loc[:, ('model', 'transmission', 'fuelType')].head()

Unnamed: 0,model,transmission,fuelType
0,A1,Manual,Petrol
1,A6,Automatic,Diesel
2,A1,Manual,Petrol
3,A4,Automatic,Diesel
4,A3,Manual,Petrol


In [8]:
df['model'].unique()

array([' A1', ' A6', ' A4', ' A3', ' Q3', ' Q5', ' A5', ' S4', ' Q2',
       ' A7', ' TT', ' Q7', ' RS6', ' RS3', ' A8', ' Q8', ' RS4', ' RS5',
       ' R8', ' SQ5', ' S8', ' SQ7', ' S3', ' S5', ' A2', ' RS7'],
      dtype=object)

In [9]:
df['transmission'].unique()

array(['Manual', 'Automatic', 'Semi-Auto'], dtype=object)

In [10]:
df['fuelType'].unique()

array(['Petrol', 'Diesel', 'Hybrid'], dtype=object)

### Encode Column model

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X[:, 0] = le.fit_transform(X[:, 0])
X

array([[0, 2017, 'Manual', ..., 150, 55.4, 1.4],
       [5, 2016, 'Automatic', ..., 20, 64.2, 2.0],
       [0, 2016, 'Manual', ..., 30, 55.4, 1.4],
       ...,
       [2, 2020, 'Manual', ..., 150, 49.6, 1.0],
       [9, 2017, 'Automatic', ..., 150, 47.9, 1.4],
       [9, 2016, 'Manual', ..., 150, 47.9, 1.4]], dtype=object)

### Encode Column transmission

In [12]:
le = LabelEncoder()
X[:, 2] = le.fit_transform(X[:, 2])
X

array([[0, 2017, 1, ..., 150, 55.4, 1.4],
       [5, 2016, 0, ..., 20, 64.2, 2.0],
       [0, 2016, 1, ..., 30, 55.4, 1.4],
       ...,
       [2, 2020, 1, ..., 150, 49.6, 1.0],
       [9, 2017, 0, ..., 150, 47.9, 1.4],
       [9, 2016, 1, ..., 150, 47.9, 1.4]], dtype=object)

### Encode Column fuelType

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [4])], remainder='passthrough')
X = np.array(ct.fit_transform(X))
X

array([[0.0, 0.0, 1.0, ..., 150, 55.4, 1.4],
       [1.0, 0.0, 0.0, ..., 20, 64.2, 2.0],
       [0.0, 0.0, 1.0, ..., 30, 55.4, 1.4],
       ...,
       [0.0, 0.0, 1.0, ..., 150, 49.6, 1.0],
       [0.0, 0.0, 1.0, ..., 150, 47.9, 1.4],
       [0.0, 0.0, 1.0, ..., 150, 47.9, 1.4]], dtype=object)

In [14]:
# df_X = pd.DataFrame(X)
# df_X.head()

## Split into Training Set and Test Set

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Training Model

In [16]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators=11, random_state=0)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=11, random_state=0)

## Predict

In [17]:
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.reshape(len(y_test), 1)), 1))

[[19027.36 19000.  ]
 [17607.73 18799.  ]
 [23426.36 26664.  ]
 ...
 [19043.45 17505.  ]
 [30856.36 28500.  ]
 [55044.55 54000.  ]]


## Evaluate Model

In [18]:
from sklearn.metrics import r2_score, mean_squared_error
print(np.sqrt(mean_squared_error(y_test, y_pred)))
print(f'r2 score: {"{:.2f}".format(r2_score(y_test, y_pred)*100)} %')

2535.756374078876
r2 score: 94.96 %


## Tuning

In [19]:
n_estimators = np.arange(10, 160, 10)
random_state = np.arange(0, 21)

regressor_scores = list()
for i in random_state:
    for j in n_estimators:
        regressor = RandomForestRegressor(n_estimators=j, random_state=i, n_jobs=-1)
        regressor.fit(X_train, y_train)
        y_pred = regressor.predict(X_test)
        regressor_scores.append([j, i, r2_score(y_test, y_pred)])

In [20]:
df_scores = pd.DataFrame(regressor_scores, columns=['n_estimators', 'random_state', 'score'])
df_scores.sort_values(by='score', ascending=False).head()

Unnamed: 0,n_estimators,random_state,score
101,120,6,0.952401
100,110,6,0.952389
99,100,6,0.952376
207,130,13,0.952213
208,140,13,0.952207


In [21]:
regressor = RandomForestRegressor(n_estimators=120, random_state=6)
regressor.fit(X_train, y_train)

RandomForestRegressor(n_estimators=120, random_state=6)

In [22]:
y_pred = regressor.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred)))
print(f'r2 score: {"{:.2f}".format(r2_score(y_test, y_pred)*100)} %')

2464.023856848722
r2 score: 95.24 %
