In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error, root_mean_squared_error

In [3]:
# Read the raw car listings; the path is relative to the current working directory.
df = pd.read_csv('datasets/car_price.csv')

In [5]:
# 'Unnamed: 0' is the name pandas gives a header-less first CSV column
# (presumably a saved row index -- confirm against the file).
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: the second run no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [7]:
# Peek at five random rows (unseeded, so the rows differ on every run).
df.sample(n=5)

Unnamed: 0,car_name,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats
5251,Land Rover Range Rover Vogue SE 4.4 SDV8,39.75 Lakh,"95,004 kms",Diesel,Automatic,1st Owner,2013,1956 cc,5 Seats
408,Mahindra Scorpio SLX 2.6 Turbo 7 Str,3.50 Lakh,"1,55,000 kms",Diesel,Manual,2nd Owner,2006,2609 cc,7 Seats
1656,Hyundai Venue SX Plus Turbo DCT DT,11.95 Lakh,"24,000 kms",Petrol,Automatic,1st Owner,2021,1497 cc,7 Seats
4583,Tata Zest Revotron 1.2T XMS,2.41 Lakh,"1,12,991 kms",Petrol,Manual,3rd Owner,2015,1368 cc,5 Seats
2219,Mahindra XUV500 W8 1.99 mHawk,10.90 Lakh,"68,269 kms",Diesel,Manual,1st Owner,2016,1498 cc,5 Seats


In [9]:
# The first whitespace-separated token of car_name is used as the brand.
df['brand'] = df['car_name'].str.split().str.get(0)

In [11]:
# Count listings per brand once and reuse the result (the counts were
# previously recomputed three times inside a single expression).
brand_counts = df['brand'].value_counts()

# Keep brands with strictly more than 100 and strictly fewer than 1000 listings.
target = brand_counts[(brand_counts > 100) & (brand_counts < 1000)].index.tolist()

In [13]:
# Keep only rows whose brand is in `target`.
# .copy() makes the result an independent frame, so the column assignments
# in later cells do not operate on a slice of the original frame
# (avoids SettingWithCopyWarning on pandas versions without copy-on-write).
df = df[df['brand'].isin(target)].copy()

In [15]:
# (rows, columns) remaining after the brand filter.
df.shape

(2576, 10)

In [17]:
# The second whitespace-separated token of car_name is used as the model;
# names consisting of a single token yield NaN here.
df['model'] = df['car_name'].str.split().str.get(1)

In [20]:
# Number of listings per model, most frequent first.
counts = df['model'].value_counts()

In [22]:
# Models with strictly more than 50 listings.
frequent_models = counts[counts.gt(50)]
target_model = frequent_models.index.tolist()

In [27]:
# Keep only rows whose model is in `target_model`.
# .copy() detaches the result from the frame it was sliced from, so the
# column drop and column assignments in later cells modify an independent
# frame (avoids SettingWithCopyWarning on pandas versions without copy-on-write).
df = df[df['model'].isin(target_model)].copy()

In [29]:
# (rows, columns) remaining after the model filter.
df.shape

(1046, 11)

In [31]:
# car_name has already been split into `brand` and `model`, so drop it.
# Reassigning instead of inplace=True, together with errors='ignore', keeps
# this cell safe to re-run: the second run no longer raises KeyError.
df = df.drop(columns=['car_name'], errors='ignore')

In [33]:
# First rows after car_name was replaced by the brand/model columns.
df.head()

Unnamed: 0,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,brand,model
1,12.83 Lakh,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats,Renault,Duster
4,5.15 Lakh,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats,Volkswagen,Polo
5,7.66 Lakh,"49,719 kms",Petrol,Automatic,1st Owner,2017,1197 cc,5 Seats,Volkswagen,Vento
6,7.58 Lakh,"43,688 kms",Petrol,Automatic,1st Owner,2017,1197 cc,5 Seats,Volkswagen,Vento
8,6.99 Lakh,"21,429 kms",Petrol,Automatic,1st Owner,2015,1497 cc,5 Seats,Honda,City


In [35]:
# Price strings pair an amount with a unit word, e.g. "12.83 Lakh".
# Previously only the amount token was kept and the unit was discarded, so
# every value was later scaled as if it were in Lakh. Here the unit is read
# and the amount is converted to lakh, so the following cell's
# `rupees.astype(float)*100000` still yields rupees.
price_tokens = df['car_prices_in_rupee'].str.replace(',', '', regex=False).str.split()

# Lakh-equivalents per unit (1 Crore = 100 Lakh, 1 Lakh = 100,000 rupees).
# NOTE(review): only "Lakh" appears in the previewed rows; "Crore" and bare
# numbers (treated as plain rupees) are handled defensively -- confirm
# against the full dataset.
lakh_per_unit = {'lakh': 1.0, 'crore': 100.0, 'rupees': 1e-5}
price_units = price_tokens.str[1].fillna('rupees').str.lower()

# Fail loudly on an unexpected unit instead of producing silently wrong prices.
unknown_units = sorted(set(price_units) - set(lakh_per_unit))
if unknown_units:
    raise ValueError(f"Unrecognised price unit(s) in car_prices_in_rupee: {unknown_units}")

rupees = price_tokens.str[0].astype(float) * price_units.map(lakh_per_unit)

In [37]:
# Scale the parsed amount by 100,000 and store it back as the numeric price.
price_amount = rupees.astype(float)
df['car_prices_in_rupee'] = price_amount * 100000

In [39]:
# First rows after the price column was converted to a number.
df.head()

Unnamed: 0,car_prices_in_rupee,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,brand,model
1,1283000.0,"13,248 kms",Petrol,Automatic,1st Owner,2021,1330 cc,5 Seats,Renault,Duster
4,515000.0,"69,414 kms",Petrol,Manual,1st Owner,2016,1199 cc,5 Seats,Volkswagen,Polo
5,766000.0,"49,719 kms",Petrol,Automatic,1st Owner,2017,1197 cc,5 Seats,Volkswagen,Vento
6,758000.0,"43,688 kms",Petrol,Automatic,1st Owner,2017,1197 cc,5 Seats,Volkswagen,Vento
8,699000.0,"21,429 kms",Petrol,Automatic,1st Owner,2015,1497 cc,5 Seats,Honda,City


In [41]:
# "13,248 kms" -> "13248": strip the thousands separators, then keep the
# first whitespace-separated token (the number without its unit).
kms_without_commas = df['kms_driven'].str.replace(',', '', regex=False)
km_driven = kms_without_commas.str.split().str.get(0)

In [43]:
# Replace the text column with the parsed integer values.
df['kms_driven'] = km_driven.astype(int)

In [45]:
# "1330 cc" -> 1330: keep the leading token and store it as an integer.
engine_cc = df['engine'].str.split().str.get(0)
df['engine'] = engine_cc.astype(int)

In [47]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible. Features are every column except the price target.
x_train, x_test, y_train, y_test = train_test_split(
    df.drop(columns=['car_prices_in_rupee']),
    df['car_prices_in_rupee'],
    test_size=0.2,
    random_state=42,
)

In [51]:
# Peek at five random training rows (unseeded, so the rows differ on every run).
x_train.sample(n=5)

Unnamed: 0,kms_driven,fuel_type,transmission,ownership,manufacture,engine,Seats,brand,model
211,21000,Diesel,Automatic,1st Owner,2018,2143,5 Seats,Mercedes-Benz,C-Class
4568,77494,Diesel,Manual,2nd Owner,2012,1461,5 Seats,Mahindra,XUV500
791,80002,Diesel,Manual,1st Owner,2019,1330,5 Seats,Renault,Duster
1811,95000,Diesel,Automatic,1st Owner,2018,1796,5 Seats,Volkswagen,Vento
1695,37000,Petrol,Manual,1st Owner,2019,1498,5 Seats,Honda,City


In [53]:
# Column groups, one per preprocessing branch of the ColumnTransformer.
# Numeric columns (standardised).
num = [
    'kms_driven',
    'manufacture',
    'engine',
]
# Nominal columns (one-hot encoded).
cat = [
    'fuel_type',
    'transmission',
    'brand',
    'model',
]
# Ordered categorical columns (ordinal encoded).
# NOTE: this name shadows the builtin ord(); it is kept because a later
# cell refers to it.
ord = [
    'ownership',
    'Seats',
]

In [60]:
# Numeric branch: standardise each numeric column.
handle_num=Pipeline(steps=[
    ('standardiz',StandardScaler())
])

# Nominal branch: one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit
# as an all-zero row instead of raising at predict time. With the default
# ('error'), any model/brand/fuel value missing from the training split
# would make pipe.predict fail. Encoding of known categories is unchanged.
handle_cat=Pipeline(steps=[
    ("encoding",OneHotEncoder(handle_unknown='ignore'))
])

# Ordinal branch: explicit category order per column, in the same order as
# the `ord` column list (ownership first, then Seats). The encoder assigns
# 0 to the first listed category and increasing integers after it.
ownership_order=['5th Owner','4th Owner','3rd Owner','2nd Owner','1st Owner','0th Owner']
seats_order=['8 Seats', '7 Seats', '6 Seats', '5 Seats', '4 Seats', '2 Seats']
handle_ord=Pipeline(steps=[
    ('ordinal_encoding',OrdinalEncoder(categories=[ownership_order,seats_order]))
])

# Route each column group to its branch; columns not listed in any group
# are dropped (ColumnTransformer's default remainder behaviour).
processor=ColumnTransformer(transformers=[
    ('std',handle_num,num),
    ('encode',handle_cat,cat),
    ('ord_encode',handle_ord,ord)
])

# Full model: preprocessing followed by ordinary least-squares regression.
pipe=Pipeline(steps=[
    ('process',processor),
    ('regression',LinearRegression())
])

In [65]:
# Fit the preprocessing steps and the regression on the training split.
pipe.fit(X=x_train, y=y_train)

In [67]:
# Predicted prices for the held-out rows.
y_pred = pipe.predict(X=x_test)

In [87]:
# Report hold-out metrics. Each label keeps its trailing space; print's
# default separator adds the second space seen in the output.
for label, metric in (
    ("r2_score: ", r2_score),
    ("mean squared error: ", mean_squared_error),
    ("mean absolute error: ", mean_absolute_error),
    ("root mean squared error: ", root_mean_squared_error),
):
    print(label, metric(y_test, y_pred))


r2_score:  0.8175364232292246
mean squared error:  182183244679.8701
mean absolute error:  289104.27995340334
root mean squared error:  426829.2921999029
