In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Never truncate columns when a frame is displayed.
pd.set_option('display.max_columns', None)

# Training sheet (contains the Price target) and test sheet (no Price column).
train, test = (
    pd.read_excel(workbook)
    for workbook in ('Data_Train.xlsx', 'Data_Test.xlsx')
)

In [3]:
# Preview the first rows of the training data; Mileage, Engine and Power
# are stored as strings with a unit suffix (e.g. "998 CC").
train.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [4]:
# Preview the first rows of the test data. It has the same columns as the
# training data minus Price; note that Power can hold the literal "null bhp".
test.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats
0,Maruti Alto K10 LXI CNG,Delhi,2014,40929,CNG,Manual,First,32.26 km/kg,998 CC,58.2 bhp,4.0
1,Maruti Alto 800 2016-2019 LXI,Coimbatore,2013,54493,Petrol,Manual,Second,24.7 kmpl,796 CC,47.3 bhp,5.0
2,Toyota Innova Crysta Touring Sport 2.4 MT,Mumbai,2017,34000,Diesel,Manual,First,13.68 kmpl,2393 CC,147.8 bhp,7.0
3,Toyota Etios Liva GD,Hyderabad,2012,139000,Diesel,Manual,First,23.59 kmpl,1364 CC,null bhp,5.0
4,Hyundai i20 Magna,Mumbai,2014,29000,Petrol,Manual,First,18.5 kmpl,1197 CC,82.85 bhp,5.0


## Data Preprocessing (Initial Stage)

In [5]:
# Count missing values per column before any cleaning.
train.isnull().sum()

Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                36
Seats                42
Price                 0
dtype: int64

In [6]:
# Remove fully identical rows, keeping the first occurrence of each.
train = train.drop_duplicates(subset=None, keep='first')

In [7]:
# Location and Name are removed here, so they are not used as model features.
# `columns=` already selects the axis, so the redundant `axis=1` is dropped,
# and the result is rebound instead of mutating the frame with inplace=True.
train = train.drop(columns=['Location', 'Name'])

In [8]:
# How many columns of each dtype remain; the object columns still need
# parsing or encoding before modelling.
train.dtypes.value_counts()

object     6
float64    2
int64      2
dtype: int64

In [9]:
import datetime as dt  # kept so that `dt` stays available to any later cell

# Replace the model year with the car's age in years.
# The reference year is pinned to 2020, the year implied by the recorded
# output (model year 2010 -> age 10), instead of being read from the system
# clock. Re-running the notebook at a later date therefore produces the same
# feature values. Plain vectorised subtraction replaces the per-row
# .apply(lambda ...).
# NOTE: this cell is not idempotent; running it twice converts the ages again.
REFERENCE_YEAR = 2020
train['Year'] = REFERENCE_YEAR - train['Year']

In [10]:
# Inspect the converted column: it now holds the age in years, not the model year.
train['Year']

0       10
1        5
2        9
3        8
4        7
        ..
6014     6
6015     5
6016     8
6017     7
6018     9
Name: Year, Length: 6019, dtype: int64

In [11]:
# Convert strings such as "26.6 km/kg" / "19.67 kmpl" to floats.
# str.rstrip() treats its argument as a *set of characters*, not a suffix, so
# rstrip('kmplkm/kg') only worked because none of those characters can end a
# number. Strip the actual unit suffix (and the space before it) instead.
# Missing values stay NaN; any unexpected format still raises in astype().
train['Mileage'] = (
    train['Mileage']
    .str.replace(r'\s*(?:kmpl|km/kg)$', '', regex=True)
    .astype('float')
)

In [12]:
# Inspect the parsed column; it should now be float64.
train['Mileage']

0       26.60
1       19.67
2       18.20
3       20.77
4       15.20
        ...  
6014    28.40
6015    24.40
6016    14.00
6017    18.90
6018    25.44
Name: Mileage, Length: 6019, dtype: float64

In [13]:
# Convert strings such as "58.16 bhp" to floats.
# str.rstrip('bhp') strips a *set of characters*, not the suffix "bhp";
# remove the real unit suffix (and the space before it) instead.
power_text = train['Power'].str.replace(r'\s*bhp$', '', regex=True)

# The data contains the placeholder "null bhp"; errors='coerce' turns the
# remaining "null" (and anything else non-numeric) into NaN for later imputation.
train['Power'] = pd.to_numeric(power_text, errors='coerce')

In [14]:
# Inspect the parsed column; it should now be float64.
train['Power']

0        58.16
1       126.20
2        88.70
3        88.76
4       140.80
         ...  
6014     74.00
6015     71.00
6016    112.00
6017     67.10
6018     57.60
Name: Power, Length: 6019, dtype: float64

In [15]:
# Convert strings such as "998 CC" to floats.
# str.rstrip('CC') strips a *set of characters*, not the suffix "CC";
# remove the real unit suffix (and the space before it) instead.
# Missing values stay NaN; any unexpected format still raises in astype().
train['Engine'] = (
    train['Engine']
    .str.replace(r'\s*CC$', '', regex=True)
    .astype('float')
)

In [16]:
# Inspect the parsed column; it should now be float64.
train['Engine']

0        998.0
1       1582.0
2       1199.0
3       1248.0
4       1968.0
         ...  
6014    1248.0
6015    1120.0
6016    2498.0
6017     998.0
6018     936.0
Name: Engine, Length: 6019, dtype: float64

In [17]:
# Re-count missing values after parsing. Power rises from 36 to 143 nulls
# because non-numeric entries were coerced to NaN by pd.to_numeric.
train.isnull().sum()

Year                   0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage                2
Engine                36
Power                143
Seats                 42
Price                  0
dtype: int64

In [18]:
# Fill missing numeric values with the column mean.
# The original `train[col].fillna(..., inplace=True)` is chained assignment:
# with pandas Copy-on-Write it modifies a temporary and leaves `train`
# untouched, and the warning about it is hidden by the global warnings filter.
# Assigning the filled columns back to the frame works on every pandas version.
# The columns are already float64 at this point, so no astype('float') is needed.
# NOTE(review): the means are computed on the full data set before the
# train/test split, and a mean-imputed Seats value can be fractional.
impute_cols = ['Mileage', 'Power', 'Engine', 'Seats']
train[impute_cols] = train[impute_cols].fillna(train[impute_cols].mean())

In [19]:
# Confirm that no missing values remain after imputation.
train.isnull().sum()

Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

In [20]:
# List the remaining columns to pick out the categorical ones to encode.
train.columns

Index(['Year', 'Kilometers_Driven', 'Fuel_Type', 'Transmission', 'Owner_Type',
       'Mileage', 'Engine', 'Power', 'Seats', 'Price'],
      dtype='object')

In [21]:
# One-hot encode the categorical columns; drop_first=True omits one level per
# column to avoid a redundant indicator.
# dtype=int pins the indicators to 0/1 integers. pandas >= 2.0 otherwise
# returns bool columns, which makes a later mixed-dtype `.values` call produce
# an object array instead of a numeric one.
# NOTE(review): the test sheet must be encoded with the same category levels,
# otherwise its columns will not line up with the training features.
train = pd.get_dummies(
    train,
    columns=['Fuel_Type', 'Transmission', 'Owner_Type'],
    drop_first=True,
    dtype=int,
)

In [22]:
# Preview the encoded frame: every column is numeric from here on.
train.head()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,Fuel_Type_Diesel,Fuel_Type_Electric,Fuel_Type_LPG,Fuel_Type_Petrol,Transmission_Manual,Owner_Type_Fourth & Above,Owner_Type_Second,Owner_Type_Third
0,10,72000,26.6,998.0,58.16,5.0,1.75,0,0,0,0,1,0,0,0
1,5,41000,19.67,1582.0,126.2,5.0,12.5,1,0,0,0,1,0,0,0
2,9,46000,18.2,1199.0,88.7,5.0,4.5,0,0,0,1,1,0,0,0
3,8,87000,20.77,1248.0,88.76,7.0,6.0,1,0,0,0,1,0,0,0
4,7,40670,15.2,1968.0,140.8,5.0,17.74,1,0,0,0,0,0,1,0


### Splitting data for Model training

In [23]:
# Move the target column to the end of the frame: pop() removes Price in
# place and returns it, and re-assigning it appends it as the last column.
# The positional slicing in the next cell relies on this ordering.
col=train.pop('Price')
train['Price']=col

In [24]:
# Features are every column except the last one; the target is the last
# column (Price, which the preceding cell moved to the end).
X, y = train.iloc[:, :-1].to_numpy(), train.iloc[:, -1].to_numpy()

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=50,
)

## Random Forest Regressor (Initial Stage)

In [26]:
from Random_Forests_Model import CustomRandomForest

In [27]:
# Instantiate the project-local random-forest wrapper with no arguments.
regressor = CustomRandomForest()

In [28]:
# Hyper-parameter search on the training split; it prints the best
# parameters and the best score (see the recorded output).
# NOTE(review): presumably the tuned values are stored on the regressor and
# used by the later fit() — confirm in Random_Forests_Model.
regressor.tune_parameters(X_train, y_train)

Best parameters : {'max_depth': 15, 'n_estimators': 75}
Best score : 0.8518737679353752


In [29]:
# Train the model on the training split.
regressor.fit(X_train, y_train)

In [30]:
# Predict prices for the held-out 30% split.
y_pred = regressor.predict(X_test)

In [31]:
# Score the hold-out predictions. Per the recorded output, evaluate() returns
# a table with an "R2 Score" row and a "Mean Squared Error" row.
# NOTE(review): the R2 value shown (~89.9) suggests it is reported as a
# percentage rather than on the usual 0-1 scale — confirm in Random_Forests_Model.
df = regressor.evaluate(y_test, y_pred)
df

Unnamed: 0,Metrics,Score
0,R2 Score,89.876923
1,Mean Squared Error,12.500999
