<div style = 'background-color: #6482AD; text-align:center; padding: 10px; color:#F5EDED'>
    <h1>Random Forest Regression</h1>
    <h2><a href = 'https://www.linkedin.com/in/ahmedsharaf9/' style = "text-decoration: none; color: #F5EDED">Ahmed Sharaf</a></h2>
</div>   </ul>
</div>

<div style = 'background-color: #6482AD; text-align:center; padding: 10px; color:#F5EDED'>
    <h2>Import Libraries</h2>
</div>

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

<div style = 'background-color: #6482AD; text-align:center; padding: 10px; color:#F5EDED'>
    <h2>Import Data</h2>
</div>

In [19]:
df = pd.read_csv('cardekho_imputated.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


<div style = 'background-color: #6482AD; text-align:center; padding: 10px; color:#F5EDED'>
    <h2>Feature Engineering</h2>
</div>

In [20]:
# check null values
df.isnull().sum()

Unnamed: 0           0
car_name             0
brand                0
model                0
vehicle_age          0
km_driven            0
seller_type          0
fuel_type            0
transmission_type    0
mileage              0
engine               0
max_power            0
seats                0
selling_price        0
dtype: int64

In [21]:
# Removing un necessary columns
df.drop(columns = ['Unnamed: 0', 'car_name', 'brand'], axis = 1, inplace = True)
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [22]:
# getting all different types of features
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print(f'Num Of Numerical Features : {len(num_features)}')

cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print(f'Num Of Categoricak Features : {len(cat_features)}')

discrete_features = [feature for feature in num_features if len(df[feature].unique()) >= 25]
print(f'Num Of Discrete Features : {len(discrete_features)}')

continuous_features = [feature for feature in num_features if feature not in discrete_features]
print(f'Num Of Continuous Features : {len(continuous_features)}')

Num Of Numerical Features : 7
Num Of Categoricak Features : 4
Num Of Discrete Features : 5
Num Of Continuous Features : 2


In [23]:
# get dependent and independent Features
X = df.drop(['selling_price'], axis = 1)
y = df['selling_price']

In [24]:
# train, test, split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [25]:
# Feature Scaing And Encoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

In [26]:
le = LabelEncoder()
X['model'] = le.fit_transform(X['model'])

In [27]:
num_features = X.select_dtypes(exclude='object').columns
onehot_columns = ['seller_type', 'fuel_type', 'transmission_type']

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop = 'first')

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', oh_transformer, onehot_columns),
        ('StandardScaler', numeric_transformer, num_features)
    ], remainder='passthrough'
)

X = preprocessor.fit_transform(X)

In [29]:
# train, test, split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
X_train.shape, X_test.shape

((12328, 14), (3083, 14))

<div style = 'background-color: #6482AD; text-align:center; padding: 10px; color:#F5EDED'>
    <h2>Build Model</h2>
</div>

In [30]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [35]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2 = r2_score(true, predicted)
    return mae, mse, rmse, r2

In [38]:
## Model Training
models = {
        'Linear Regression': LinearRegression(),
        'Lasso': Lasso(),
        'Ridge': Ridge(),
        'KNN': KNeighborsRegressor(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])

    print()
    print('Model Performance In Training Set')
    print(f'- MAE : {model_train_mae}')
    print(f'- MSE : {model_train_mse}')
    print(f'- RMSE : {model_train_rmse}')
    print(f'- R2 : {model_train_r2}')
    print('-'* 40)
    print('Model Performance In Test Set')
    print(f'- MAE : {model_test_mae}')
    print(f'- MSE : {model_test_mse}')
    print(f'- RMSE : {model_test_rmse}')
    print(f'- R2 : {model_test_r2}')
    print('='* 40)
    print()

Linear Regression

Model Performance In Training Set
- MAE : 268101.6070829936
- MSE : 306756099359.7596
- RMSE : 553855.6665411663
- R2 : 0.6217719576765959
----------------------------------------
Model Performance In Test Set
- MAE : 279618.5794158427
- MSE : 252550062888.56555
- RMSE : 502543.5930230984
- R2 : 0.6645109298852006

Lasso

Model Performance In Training Set
- MAE : 268099.22264981153
- MSE : 306756104248.3742
- RMSE : 553855.6709544231
- R2 : 0.6217719516489696
----------------------------------------
Model Performance In Test Set
- MAE : 279614.7461034126
- MSE : 252549134806.78134
- RMSE : 502542.66963789385
- R2 : 0.6645121627547996

Ridge

Model Performance In Training Set
- MAE : 268059.8014688309
- MSE : 306756818740.9266
- RMSE : 553856.3159709624
- R2 : 0.6217710706848425
----------------------------------------
Model Performance In Test Set
- MAE : 279557.2168930272
- MSE : 252540243247.9687
- RMSE : 502533.82298902894
- R2 : 0.6645239743566809

KNN

Model Per

In [40]:
# we select random forest and KNN
# now try hyperparameter Tuning

from sklearn.model_selection import RandomizedSearchCV
knn_params = {'n_neighbors': [2, 3, 10, 20, 40, 50]}
rf_params = {
    'max_depth': [5, 8, 15, None, 10],
    'max_features': [5, 7, 'auto', 8],
    'min_samples_split': [2, 8, 15, 20],
    'n_estimators': [100, 200, 500, 1000]
}

randomcv_models = [
    ('KNN', KNeighborsRegressor(), knn_params),
    ('RF', RandomForestRegressor(), rf_params)
]

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(estimator=model, 
                               param_distributions=params,
                               n_iter=100, 
                               cv = 3,
                               verbose=2,
                               n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_

for model_name in model_param:
    print(f'------------------ Best Params For {model_name} ------------------')
    print(model_param[model_name])


Fitting 3 folds for each of 6 candidates, totalling 18 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
------------------ Best Params For KNN ------------------
{'n_neighbors': 10}
------------------ Best Params For RF ------------------
{'n_estimators': 100, 'min_samples_split': 2, 'max_features': 5, 'max_depth': None}


In [42]:
# Finally Retrain Model With Best Hyper Parameter
## Model Training
models = {
        'KNN': KNeighborsRegressor(n_neighbors = 10),
        'Random Forest': RandomForestRegressor(max_depth = None, max_features = 5, min_samples_split = 2, n_estimators = 100)
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_mse, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_mse, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])

    print()
    print('Model Performance In Training Set')
    print(f'- MAE : {model_train_mae}')
    print(f'- MSE : {model_train_mse}')
    print(f'- RMSE : {model_train_rmse}')
    print(f'- R2 : {model_train_r2}')
    print('-'* 40)
    print('Model Performance In Test Set')
    print(f'- MAE : {model_test_mae}')
    print(f'- MSE : {model_test_mse}')
    print(f'- RMSE : {model_test_rmse}')
    print(f'- R2 : {model_test_r2}')
    print('='* 40)
    print()

KNN

Model Performance In Training Set
- MAE : 103472.04737183647
- MSE : 132103731766.70993
- RMSE : 363460.7706021517
- R2 : 0.8371170582947722
----------------------------------------
Model Performance In Test Set
- MAE : 117496.21310411936
- MSE : 69636909424.46481
- RMSE : 263888.06230003055
- R2 : 0.9074938975612508

Random Forest

Model Performance In Training Set
- MAE : 39507.070358973084
- MSE : 19907710578.208397
- RMSE : 141094.6865697231
- R2 : 0.9754539374608946
----------------------------------------
Model Performance In Test Set
- MAE : 99107.58657449696
- MSE : 46013801782.88382
- RMSE : 214508.2790544081
- R2 : 0.9388749802869863

