<a href="https://www.kaggle.com/code/a7madmostafa/used-cars-prediction?scriptVersionId=200031717" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# **Any Feedback is appreciated**


# **Please upvote it if you found it useful**

# Data Description



| Attribute | Description |
|----------|----------|
|Name	|The brand and model of the car
|Location	|The location in which the car is being sold or is available for purchase
|Year	|The year or edition of the model
|Kilometers_Driven	|The total kilometers driven in the car by the previous owner(s) in KM
|Fuel_Type	|The type of fuel used by the car
|Transmission	|The type of transmission used by the car
|Owner_Type	|Whether the ownership is Firsthand, Second hand or other
|Mileage	|The standard mileage offered by the car company in kmpl or km/kg
|Engine	|The displacement volume of the engine in cc
|Power	|The maximum power of the engine in bhp
|Seats	|The number of seats in the car
|New_Price	|Price of new model
|Price	|The price of the used car in INR Lakhs

> Note

        1 INR Lakhs = 100,000 Indian Rupees ~= 1,200 USD

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, OrdinalEncoder


# Reading Data

In [2]:
df = pd.read_csv('/kaggle/input/used-cars-price-prediction/train-data.csv')
df_test = pd.read_csv('/kaggle/input/used-cars-price-prediction/test-data.csv')

# Exploring Data

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              5977 non-null   float64
 12  New_Price          824 non-null    object 
 13  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(9)
memory usage: 658.5+ KB


In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Kilometers_Driven,Seats,Price
count,6019.0,6019.0,6019.0,5977.0,6019.0
mean,3009.0,2013.358199,58738.38,5.278735,9.479468
std,1737.679967,3.269742,91268.84,0.80884,11.187917
min,0.0,1998.0,171.0,0.0,0.44
25%,1504.5,2011.0,34000.0,5.0,3.5
50%,3009.0,2014.0,53000.0,5.0,5.64
75%,4513.5,2016.0,73000.0,5.0,9.95
max,6018.0,2019.0,6500000.0,10.0,160.0


        * Year Range : 1998 - 2019
        * Kilometers_Driven Range : 171 - 650000
        * Seats Range : 0 - 10  (0 is not logical)
        * Price Range : 0.44 - 160

In [6]:
df[df.Seats == 0]

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
3999,3999,Audi A4 3.2 FSI Tiptronic Quattro,Hyderabad,2012,125000,Petrol,Automatic,First,10.5 kmpl,3197 CC,null bhp,0.0,,18.0


In [7]:
df.Seats = df.Seats.replace(0, np.nan)

In [8]:
df.Seats.value_counts().sort_values(ascending=False)

5.0     5014
7.0      674
8.0      134
4.0       99
6.0       31
2.0       16
10.0       5
9.0        3
Name: Seats, dtype: int64

In [9]:
df.describe(include=['O'])

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,New_Price
count,6019,6019,6019,6019,6019,6017,5983,5983,824
unique,1876,11,5,2,4,442,146,372,540
top,Mahindra XUV500 W8 2WD,Mumbai,Diesel,Manual,First,18.9 kmpl,1197 CC,74 bhp,95.13 Lakh
freq,49,790,3205,4299,4929,172,606,235,6


In [10]:
# Scatter Matrix
fig = px.scatter_matrix(df, dimensions=['Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats', 'Price'])
fig.update_layout(width=1500, height=1200, showlegend=False, title_text="Pairplot of Car Price Prediction Dataset")
fig.show()


In [11]:
df.Name.str.split().str[0].nunique()  ## Brand names

31

In [12]:
df.Name.str.contains('Hyundai').sum()

1107

In [13]:
df.Name.str.split().str[1].nunique()  ## Model names

212

In [14]:
df.Location.value_counts()

Mumbai        790
Hyderabad     742
Kochi         651
Coimbatore    636
Pune          622
Delhi         554
Kolkata       535
Chennai       494
Jaipur        413
Bangalore     358
Ahmedabad     224
Name: Location, dtype: int64

In [15]:
df.Fuel_Type.value_counts()

Diesel      3205
Petrol      2746
CNG           56
LPG           10
Electric       2
Name: Fuel_Type, dtype: int64

> Note

        Take care of Electrical cars (only 2) in splitting data

In [16]:
df.Transmission.value_counts()

Manual       4299
Automatic    1720
Name: Transmission, dtype: int64

In [17]:
df.Owner_Type.value_counts()

First             4929
Second             968
Third              113
Fourth & Above       9
Name: Owner_Type, dtype: int64

# Check Missing Values


In [18]:
df.isnull().sum()

Unnamed: 0              0
Name                    0
Location                0
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 36
Power                  36
Seats                  43
New_Price            5195
Price                   0
dtype: int64

In [19]:
df.isnull().mean() * 100

Unnamed: 0            0.000000
Name                  0.000000
Location              0.000000
Year                  0.000000
Kilometers_Driven     0.000000
Fuel_Type             0.000000
Transmission          0.000000
Owner_Type            0.000000
Mileage               0.033228
Engine                0.598106
Power                 0.598106
Seats                 0.714404
New_Price            86.310018
Price                 0.000000
dtype: float64

In [20]:
# Drop New_Price column
df.drop('New_Price', axis=1, inplace=True)

# Some Feature Engineering


In [21]:
# Fixing Numerical columns
def fix_numerical(df, col):
    df[col] = pd.to_numeric(df[col].str.split().str[0], errors='coerce')
    return df[col]

In [22]:
for col in ['Mileage', 'Engine', 'Power']:
    df[col] = fix_numerical(df, col)

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   float64
 9   Engine             5983 non-null   float64
 10  Power              5876 non-null   float64
 11  Seats              5976 non-null   float64
 12  Price              6019 non-null   float64
dtypes: float64(5), int64(3), object(5)
memory usage: 611.4+ KB


In [24]:
df.sample(5)

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
5810,5810,Honda City i DTEC VX,Kolkata,2015,30000,Diesel,Manual,First,25.1,1498.0,98.6,5.0,6.95
3452,3452,Maruti Swift Dzire VXI,Kolkata,2014,33000,Petrol,Manual,First,19.1,1197.0,85.8,5.0,3.99
3103,3103,Nissan Micra XL,Kolkata,2013,45000,Petrol,Manual,First,18.44,1198.0,75.0,5.0,1.75
138,138,Honda Jazz 1.2 V CVT i VTEC,Mumbai,2015,12000,Petrol,Automatic,First,19.0,1199.0,88.7,5.0,6.0
3583,3583,Hyundai Verna 1.6 SX VTVT,Mumbai,2011,58000,Petrol,Manual,Second,17.01,1591.0,121.3,5.0,4.0


In [25]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Price
count,6019.0,6019.0,6019.0,6017.0,5983.0,5876.0,5976.0,6019.0
mean,3009.0,2013.358199,58738.38,18.134961,1621.27645,113.25305,5.279618,9.479468
std,1737.679967,3.269742,91268.84,4.582289,601.355233,53.874957,0.806019,11.187917
min,0.0,1998.0,171.0,0.0,72.0,34.2,2.0,0.44
25%,1504.5,2011.0,34000.0,15.17,1198.0,75.0,5.0,3.5
50%,3009.0,2014.0,53000.0,18.15,1493.0,97.7,5.0,5.64
75%,4513.5,2016.0,73000.0,21.1,1984.0,138.1,5.0,9.95
max,6018.0,2019.0,6500000.0,33.54,5998.0,560.0,10.0,160.0


In [26]:
df[df['Mileage']== 0]

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
14,14,Land Rover Freelander 2 TD4 SE,Pune,2012,85000,Diesel,Automatic,Second,0.0,2179.0,115.0,5.0,17.50
67,67,Mercedes-Benz C-Class Progressive C 220d,Coimbatore,2019,15369,Diesel,Automatic,First,0.0,1950.0,194.0,5.0,35.67
79,79,Hyundai Santro Xing XL,Hyderabad,2005,87591,Petrol,Manual,First,0.0,1086.0,,5.0,1.30
194,194,Honda City 1.5 GXI,Ahmedabad,2007,60006,Petrol,Manual,First,0.0,,,,2.95
229,229,Ford Figo Diesel,Bangalore,2015,70436,Diesel,Manual,First,0.0,1498.0,99.0,,3.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5647,5647,Toyota Qualis Fleet A3,Mumbai,2001,227000,Diesel,Manual,Fourth & Above,0.0,2446.0,,8.0,2.20
5875,5875,Mercedes-Benz C-Class Progressive C 220d,Ahmedabad,2019,4000,Diesel,Automatic,First,0.0,1950.0,194.0,5.0,35.00
5943,5943,Mahindra Jeep MM 540 DP,Chennai,2002,75000,Diesel,Manual,First,0.0,2112.0,,6.0,1.70
5972,5972,Hyundai Santro Xing GL,Mumbai,2008,65000,Petrol,Manual,Second,0.0,1086.0,62.0,5.0,1.39


In [27]:
df.isnull().sum()

Unnamed: 0             0
Name                   0
Location               0
Year                   0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage                2
Engine                36
Power                143
Seats                 43
Price                  0
dtype: int64

In [28]:
#df['Brand'] = df.Name.str.split().str[0]
df['Name'] = df.Name.str.split().str[0:2].str.join(' ')


In [29]:
df['Age'] = df.Year.max() - df.Year
df.drop('Year', axis=1, inplace=True)

In [30]:
df.sample()

Unnamed: 0.1,Unnamed: 0,Name,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price,Age
1434,1434,Honda City,Kolkata,36000,Diesel,Manual,First,26.0,1498.0,98.6,5.0,5.0,5


In [31]:
df.corr()

Unnamed: 0.1,Unnamed: 0,Kilometers_Driven,Mileage,Engine,Power,Seats,Price,Age
Unnamed: 0,1.0,-0.008734,0.023677,-0.004171,-0.012921,-0.010247,-0.020275,-0.002354
Kilometers_Driven,-0.008734,1.0,-0.065253,0.091068,0.033503,0.084209,-0.011493,0.173048
Mileage,0.023677,-0.065253,1.0,-0.597699,-0.537729,-0.311268,-0.306593,-0.321565
Engine,-0.004171,0.091068,-0.597699,1.0,0.866185,0.397848,0.658354,0.052197
Power,-0.012921,0.033503,-0.537729,0.866185,1.0,0.101562,0.772566,-0.014525
Seats,-0.010247,0.084209,-0.311268,0.397848,0.101562,1.0,0.053247,-0.011909
Price,-0.020275,-0.011493,-0.306593,0.658354,0.772566,0.053247,1.0,-0.305327
Age,-0.002354,0.173048,-0.321565,0.052197,-0.014525,-0.011909,-0.305327,1.0


In [32]:
# Heatmap
px.imshow(df.corr(), width=800, height=800, title="Heatmap of Car Price Prediction Dataset")


In [33]:
px.scatter(df, x='Engine', y='Price', trendline='ols', width=800, height=600, title="Scatterplot of Engine vs Price")

In [34]:
px.scatter(df, x='Power', y='Price', trendline='ols', width=800, height=600, title= "Scatterplot of Power vs Price")

# Data Splitting

In [35]:
# Data Splitting into features and target
X = df.drop('Price', axis=1)
y = df['Price']

# Splitting into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=X['Fuel_Type'])

In [36]:
X_train.Fuel_Type.value_counts()

Diesel      2564
Petrol      2197
CNG           45
LPG            8
Electric       1
Name: Fuel_Type, dtype: int64

# Data Preprocessing

In [37]:
numerical_cols = list(df.select_dtypes(include=['int64', 'float64']).columns)
numerical_cols.remove('Price')
numerical_cols

['Unnamed: 0',
 'Kilometers_Driven',
 'Mileage',
 'Engine',
 'Power',
 'Seats',
 'Age']

In [38]:
categorical_cols = list(df.select_dtypes(include=['object']).columns)
categorical_cols

['Name', 'Location', 'Fuel_Type', 'Transmission', 'Owner_Type']

## Handling Missing Values

In [39]:
df.isnull().sum()

Unnamed: 0             0
Name                   0
Location               0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage                2
Engine                36
Power                143
Seats                 43
Price                  0
Age                    0
dtype: int64

In [40]:
missing_cols = ['Mileage', 'Engine', 'Power', 'Seats']
for col in missing_cols:
    fig = px.histogram(df, x=col, width=800, height=400)
    fig.show()

In [41]:
# Impute missing values using SimpleImputer

imputer = SimpleImputer(strategy='median')
X_train[missing_cols] = imputer.fit_transform(X_train[missing_cols])
X_test[missing_cols] = imputer.transform(X_test[missing_cols])


In [42]:
df.isnull().sum()

Unnamed: 0             0
Name                   0
Location               0
Kilometers_Driven      0
Fuel_Type              0
Transmission           0
Owner_Type             0
Mileage                2
Engine                36
Power                143
Seats                 43
Price                  0
Age                    0
dtype: int64

In [43]:
X_train.isnull().sum()

Unnamed: 0           0
Name                 0
Location             0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Age                  0
dtype: int64

In [44]:
X_test.isnull().sum()

Unnamed: 0           0
Name                 0
Location             0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Age                  0
dtype: int64

# Scaling Data

In [45]:
X_train.describe()

Unnamed: 0.1,Unnamed: 0,Kilometers_Driven,Mileage,Engine,Power,Seats,Age
count,4815.0,4815.0,4815.0,4815.0,4815.0,4815.0,4815.0
mean,2992.157217,58881.02,18.153776,1618.088889,112.321981,5.274143,5.668951
std,1739.608741,99969.64,4.576099,595.994205,52.545529,0.79677,3.290287
min,0.0,171.0,0.0,72.0,34.2,2.0,0.0
25%,1473.0,34004.5,15.26,1198.0,78.9,5.0,3.0
50%,2999.0,53000.0,18.16,1493.0,94.0,5.0,5.0
75%,4494.0,73000.0,21.1,1968.0,138.03,5.0,8.0
max,6018.0,6500000.0,33.54,5998.0,560.0,10.0,21.0


In [46]:
# Standardize numerical columns

scaler = MinMaxScaler()
#scaler = StandardScaler()
#scaler = RobustScaler()
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [47]:
X_train.describe()

Unnamed: 0.1,Unnamed: 0,Kilometers_Driven,Mileage,Engine,Power,Seats,Age
count,4815.0,4815.0,4815.0,4815.0,4815.0,4815.0,4815.0
mean,0.497201,0.009033,0.541257,0.260899,0.148577,0.409268,0.26995
std,0.289068,0.01538,0.136437,0.100573,0.099934,0.099596,0.15668
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.244766,0.005205,0.454979,0.19001,0.085013,0.375,0.142857
50%,0.498338,0.008128,0.541443,0.239791,0.113731,0.375,0.238095
75%,0.74676,0.011205,0.6291,0.319946,0.197471,0.375,0.380952
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [48]:
px.scatter(x = X_train['Seats'], y= y_train)

## Handling Categorical Data

In [49]:
df.describe(include=['O'])

Unnamed: 0,Name,Location,Fuel_Type,Transmission,Owner_Type
count,6019,6019,6019,6019,6019
unique,216,11,5,2,4
top,Maruti Swift,Mumbai,Diesel,Manual,First
freq,353,790,3205,4299,4929


In [50]:
X_train.Owner_Type.value_counts()

First             3930
Second             775
Third              101
Fourth & Above       9
Name: Owner_Type, dtype: int64

In [51]:
# Ordinal Encoding for Owner_Type 
ordinal_encoder = OrdinalEncoder(categories=[['First', 'Second', 'Third', 'Fourth & Above']])
X_train['Owner_Type'] = ordinal_encoder.fit_transform(X_train[['Owner_Type']])
X_test['Owner_Type'] = ordinal_encoder.transform(X_test[['Owner_Type']])

In [52]:
X_train['Owner_Type'].value_counts()

0.0    3930
1.0     775
2.0     101
3.0       9
Name: Owner_Type, dtype: int64

In [53]:
X_train.sample(5)

Unnamed: 0.1,Unnamed: 0,Name,Location,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Age
4430,0.736125,Datsun Redi,Bangalore,0.00182,Petrol,Manual,0.0,0.676804,0.12268,0.035755,0.375,0.095238
2313,0.384347,Volkswagen Vento,Mumbai,0.007205,Diesel,Manual,0.0,0.615385,0.240634,0.141308,0.375,0.380952
3388,0.562978,Ford Figo,Coimbatore,0.004733,Petrol,Manual,0.0,0.541443,0.189673,0.100038,0.375,0.095238
2007,0.3335,Hyundai Verna,Kochi,0.005736,Diesel,Manual,0.0,0.665474,0.254809,0.175162,0.375,0.142857
1229,0.204221,Chevrolet Beat,Pune,0.011974,Petrol,Manual,1.0,0.554562,0.190179,0.085964,0.375,0.333333


In [54]:
X_train.Name.nunique()

206

In [55]:
#X_train.Brand.nunique()

In [56]:
X_train.Location.nunique()

11

> Note

        * OneHotEncoder is not used because it will create a lot of columns


In [57]:
#pip install category_encoders

In [58]:
# Handling Categorical columns using Binary Encoding

from category_encoders import BinaryEncoder

encoder = BinaryEncoder(cols=[ 'Name', 'Location', 'Fuel_Type', 'Transmission'])
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [59]:
X_train

Unnamed: 0.1,Unnamed: 0,Name_0,Name_1,Name_2,Name_3,Name_4,Name_5,Name_6,Name_7,Location_0,...,Fuel_Type_1,Fuel_Type_2,Transmission_0,Transmission_1,Owner_Type,Mileage,Engine,Power,Seats,Age
5320,0.884015,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0.0,0.483005,0.190179,0.075694,0.375,0.142857
3406,0.565969,0,0,0,0,0,0,1,0,0,...,1,0,1,0,1.0,0.596303,0.349477,0.139026,0.375,0.238095
2948,0.489864,0,0,0,0,0,0,1,1,0,...,1,0,1,0,0.0,0.574538,0.349477,0.253747,0.375,0.047619
2909,0.483383,0,0,0,0,0,1,0,0,0,...,0,1,0,1,0.0,0.534287,0.171110,0.053062,0.375,0.523810
3599,0.598039,0,0,0,0,0,1,0,1,0,...,0,1,0,1,0.0,0.590638,0.171110,0.064378,0.375,0.523810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3347,0.556165,0,0,0,1,0,0,1,1,0,...,0,1,1,0,0.0,0.551580,0.189841,0.093077,0.375,0.095238
237,0.039382,0,0,0,0,1,0,0,1,1,...,0,1,0,1,0.0,0.518784,0.240466,0.158045,0.375,0.380952
2461,0.408940,0,1,0,1,1,1,0,1,0,...,1,0,0,1,1.0,0.566786,0.234391,0.141308,0.375,0.190476
5868,0.975075,0,1,0,0,0,1,0,0,0,...,1,0,1,0,0.0,0.244484,0.492913,0.400913,0.250,0.238095


In [60]:
px.histogram(y_train)

In [61]:
# Log Transforming the target variable
y_train = np.log1p(y_train)
y_test = np.log1p(y_test)

In [62]:
px.histogram(y_train)

# Model Building

In [63]:
# Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lr = LinearRegression()
lr.fit(X_train, y_train)

print('Training Score: ', lr.score(X_train, y_train))
print('Testing Score: ', lr.score(X_test, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, lr.predict(X_test))))

Training Score:  0.8728975883760148
Testing Score:  0.8937720130890596
RMSE:  0.248759748456633


In [64]:
prices = np.expm1(lr.predict(X_test))
prices

array([19.40183811,  4.70982   , 39.41571724, ..., 22.11831129,
        4.30138841,  6.13379785])

In [65]:
# Random Forest Regressor

from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)

print('Training Score: ', rf.score(X_train, y_train))
print('Testing Score: ', rf.score(X_test, y_test))
print('RMSE: ', np.sqrt(mean_squared_error(y_test, rf.predict(X_test))))

Training Score:  0.9902117634045993
Testing Score:  0.9450156912138995
RMSE:  0.17896990939101043


In [66]:
# Preprocessing Test Data
df_test.drop('New_Price', axis=1, inplace=True)
for col in ['Mileage', 'Engine', 'Power']:
    df_test[col] = fix_numerical(df_test, col)
df_test['Name'] = df_test.Name.str.split().str[0:2].str.join(' ')
df_test['Age'] = df_test.Year.max() - df_test.Year
df_test.drop('Year', axis=1, inplace=True)
df_test[missing_cols] = imputer.transform(df_test[missing_cols])
df_test[numerical_cols] = scaler.transform(df_test[numerical_cols]) 
df_test['Owner_Type'] = ordinal_encoder.transform(df_test[['Owner_Type']])
df_test = encoder.transform(df_test)

df_test.head()

Unnamed: 0.1,Unnamed: 0,Name_0,Name_1,Name_2,Name_3,Name_4,Name_5,Name_6,Name_7,Location_0,...,Fuel_Type_1,Fuel_Type_2,Transmission_0,Transmission_1,Owner_Type,Mileage,Engine,Power,Seats,Age
0,0.0,0,0,1,1,1,1,1,0,1,...,1,1,0,1,0.0,0.961837,0.156261,0.045645,0.25,0.238095
1,0.000166,0,0,1,1,1,1,1,0,0,...,0,1,0,1,1.0,0.736434,0.122173,0.024914,0.375,0.285714
2,0.000332,0,0,0,0,1,1,1,1,0,...,1,0,0,1,0.0,0.407871,0.391664,0.216052,0.625,0.095238
3,0.000499,0,0,1,1,1,0,0,1,0,...,1,0,0,1,0.0,0.703339,0.218022,0.113731,0.375,0.333333
4,0.000665,0,0,1,0,1,0,1,1,0,...,0,1,0,1,0.0,0.55158,0.189841,0.092526,0.375,0.238095


In [67]:
# Predicting on Test Data
prices = np.expm1(rf.predict(df_test))
prices

array([ 2.92029418,  2.4463736 , 18.66387616, ...,  2.63294412,
        5.45711486, 19.54602965])