# **Importing Libraries**

In [5]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sklearn

In [6]:
print(pd.__version__)
print(np.__version__)
print(sklearn.__version__)

2.2.2
1.26.4
1.6.0


# **Data Collection and Processing**

In [7]:
car_dataset = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Dataset/CAR DETAILS FROM CAR DEKHO.csv")

In [8]:
car_dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [9]:
car_dataset.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [10]:
# Getting information about dataset
car_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [11]:
# Checking for number of rows and columns
car_dataset.shape

(4340, 8)

In [12]:
# Checking for missing value
car_dataset.isnull().sum()

Unnamed: 0,0
name,0
year,0
selling_price,0
km_driven,0
fuel,0
seller_type,0
transmission,0
owner,0


In [13]:
# Counting the value for each columns
print(car_dataset.fuel.value_counts())
print(car_dataset.seller_type.value_counts())
print(car_dataset.transmission.value_counts())
print(car_dataset.owner.value_counts())

fuel
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64
seller_type
Individual          3244
Dealer               994
Trustmark Dealer     102
Name: count, dtype: int64
transmission
Manual       3892
Automatic     448
Name: count, dtype: int64
owner
First Owner             2832
Second Owner            1106
Third Owner              304
Fourth & Above Owner      81
Test Drive Car            17
Name: count, dtype: int64


In [14]:
# Extract the first word by splitting on spaces
car_dataset['car_brand'] = car_dataset['name'].apply(lambda x: x.split(" ")[0])
# Move 'Car Brand' column to the first position
car_dataset.insert(0, 'car_brand', car_dataset.pop('car_brand'))
car_dataset.drop(['name'],axis=1,inplace=True)

# **Adding another column for Age**

In [15]:
# Convert 'year' to 'Age' and drop 'year' column
current_year = pd.Timestamp.now().year
car_dataset['age'] = current_year - car_dataset['year']
car_dataset.drop(['year'], axis=1, inplace=True)

In [16]:
car_dataset.head()

Unnamed: 0,car_brand,selling_price,km_driven,fuel,seller_type,transmission,owner,age
0,Maruti,60000,70000,Petrol,Individual,Manual,First Owner,17
1,Maruti,135000,50000,Petrol,Individual,Manual,First Owner,17
2,Hyundai,600000,100000,Diesel,Individual,Manual,First Owner,12
3,Datsun,250000,46000,Petrol,Individual,Manual,First Owner,7
4,Honda,450000,141000,Diesel,Individual,Manual,Second Owner,10


In [17]:
car_dataset['car_brand'].unique()

array(['Maruti', 'Hyundai', 'Datsun', 'Honda', 'Tata', 'Chevrolet',
       'Toyota', 'Jaguar', 'Mercedes-Benz', 'Audi', 'Skoda', 'Jeep',
       'BMW', 'Mahindra', 'Ford', 'Nissan', 'Renault', 'Fiat',
       'Volkswagen', 'Volvo', 'Mitsubishi', 'Land', 'Daewoo', 'MG',
       'Force', 'Isuzu', 'OpelCorsa', 'Ambassador', 'Kia'], dtype=object)

In [18]:
car_dataset['fuel'].unique()

array(['Petrol', 'Diesel', 'CNG', 'LPG', 'Electric'], dtype=object)

In [19]:
car_dataset['seller_type'].unique()

array(['Individual', 'Dealer', 'Trustmark Dealer'], dtype=object)

In [20]:
car_dataset['transmission'].unique()

array(['Manual', 'Automatic'], dtype=object)

In [21]:
car_dataset['owner'].unique()

array(['First Owner', 'Second Owner', 'Fourth & Above Owner',
       'Third Owner', 'Test Drive Car'], dtype=object)

# **Splitting the data**

 1.   x---> Non-label data
 2.   y---> Label of data



In [22]:
x = car_dataset.drop(['selling_price'],axis=1)
y = car_dataset['selling_price']

# **Split Data into Train-Test**

In [23]:
# Split the dataset with 20% test size
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1)

# **One-Hot-Encoding**

In [24]:
processor = ColumnTransformer(
    transformers=[
        ("ohe", OneHotEncoder(drop="first"),["fuel", "seller_type", "transmission", "owner", "car_brand"]),
        ("stdscl", StandardScaler(),["km_driven","age"])
    ])

# **Model Training**

In [65]:
model = RandomForestRegressor(n_estimators=20)

In [66]:
pipeline = Pipeline(steps=[
    ("processor", processor),
    ("model", model)
])

In [67]:
pipeline.fit(x_train, y_train)

# **Model Evaluation on training data**

In [68]:
# Prediction on training data
training_data_prediction = pipeline.predict(x_train)

In [69]:
# R square error
error_score1 = metrics.r2_score(y_train,training_data_prediction)
print("R square error : ",error_score1)

R square error :  0.9650251003975836


# **Model Evaluation on testing data**

In [70]:
test_data_prediction = pipeline.predict(x_test)

In [71]:
# R square error
error_score2 = metrics.r2_score(y_test,test_data_prediction)
print("R square error : ",error_score2)

R square error :  0.9105974820487512


In [72]:
# Save the pipeline model using pickle
with open('predict_car_price.pkl', 'wb') as file:
    pickle.dump(pipeline, file)