<a href="https://colab.research.google.com/github/abdulwasaeee/Random-Forest/blob/main/RandomForestRegression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as plot
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression


In [12]:
df=pd.read_csv('cardekho_imputated.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,car_name,brand,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,0,Maruti Alto,Maruti,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,1,Hyundai Grand,Hyundai,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,2,Hyundai i20,Hyundai,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,3,Maruti Alto,Maruti,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,4,Ford Ecosport,Ford,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [13]:
df.isnull().sum()

Unnamed: 0,0
Unnamed: 0,0
car_name,0
brand,0
model,0
vehicle_age,0
km_driven,0
seller_type,0
fuel_type,0
transmission_type,0
mileage,0


In [14]:
df=df.drop(columns=['Unnamed: 0',	'car_name',	'brand'])
df.head()

Unnamed: 0,model,vehicle_age,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price
0,Alto,9,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000
1,Grand,5,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000
2,i20,11,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000
3,Alto,9,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000
4,Ecosport,6,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000


In [22]:
le=LabelEncoder()
df["model"]=le.fit_transform(df["model"])

In [23]:
x=df.drop(['selling_price'],axis=1)
y=df['selling_price']

In [24]:
ct=ColumnTransformer([
    ("onehotencoder",OneHotEncoder(drop='first'),['seller_type','fuel_type','transmission_type']),
    ("standardscaler",StandardScaler(),x.select_dtypes(exclude="object").columns)
],remainder='passthrough')


In [25]:
x=ct.fit_transform(x)

In [26]:
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, random_state=42
)

In [30]:
#function to evaluate model
def evaluate(true,pred):
  mae=mean_absolute_error(true,pred)
  mse=mean_squared_error(true,pred)
  rmse=np.sqrt((mse))
  r2=r2_score(true,pred)
  return mae,mse,rmse,r2

In [32]:
models={
    "linear regression":LinearRegression(),
    "decision tress": DecisionTreeRegressor(),
    "knn": KNeighborsRegressor(),
    "random forest": RandomForestRegressor()
}

In [34]:
for name,model in models.items():
  model.fit(x_train,y_train)
  y_pred=model.predict(x_train)
  y_pred2=model.predict(x_test)

  mae,mse,rmse,r2=evaluate(y_train,y_pred)
  mae2,mse2,rmse2,r22=evaluate(y_test,y_pred2)

  print('*****'+name+'*****')
  print("train data performance:")
  print("mean absolute error: "+ str(mae))
  print("mean squared error: "+ str(mse))
  print("root mean squared error: "+ str(rmse))
  print("r2: "+str(r2))
  print("test data performance:")
  print("mean absolute error: "+ str(mae2))
  print("mean squared error: "+ str(mse2))
  print("root mean squared error: "+ str(rmse2))
  print("r2: "+str(r22))
  print("")

*****linear regression*****
train data performance:
mean absolute error: 266675.10755425063
mean squared error: 304874315292.8461
root mean squared error: 552154.249547032
r2: 0.6219860307551311
test data performance:
mean absolute error: 284283.44595338305
mean squared error: 270286925822.7529
root mean squared error: 519891.26346069033
r2: 0.6524693637784766

*****decision tress*****
train data performance:
mean absolute error: 4991.751744823211
mean squared error: 426210181.9807348
root mean squared error: 20644.85848778661
r2: 0.999471541567979
test data performance:
mean absolute error: 132064.01721602216
mean squared error: 96210823798.42906
root mean squared error: 310178.69655801484
r2: 0.876293650888643

*****knn*****
train data performance:
mean absolute error: 90941.30472400069
mean squared error: 108567823568.09137
root mean squared error: 329496.31798867095
r2: 0.8653866466913428
test data performance:
mean absolute error: 116412.17882169738
mean squared error: 78588031890

In [35]:
params={
    "max_features":[2,3,5,8,10],
    "max_depth":["auto",2,5,8,10],
    "n_estimators":[100,200,500]
}

In [40]:
rcv=RandomizedSearchCV(RandomForestRegressor(),param_distributions=params,n_iter=100,cv=3,verbose=0)
rcv.fit(x_train,y_train)
rcv.best_params_

{'n_estimators': 200, 'max_features': 8, 'max_depth': 10}