In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the car listings from a CSV file in the current working directory.
dataset = pd.read_csv('car_data.csv')

In [3]:
# Preview the first five rows to sanity-check how the CSV was parsed.
dataset.head()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,Maruti 800 AC,2007,60000,70000,Petrol,Individual,Manual,First Owner
1,Maruti Wagon R LXI Minor,2007,135000,50000,Petrol,Individual,Manual,First Owner
2,Hyundai Verna 1.6 SX,2012,600000,100000,Diesel,Individual,Manual,First Owner
3,Datsun RediGO T Option,2017,250000,46000,Petrol,Individual,Manual,First Owner
4,Honda Amaze VX i-DTEC,2014,450000,141000,Diesel,Individual,Manual,Second Owner


In [4]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

(4340, 8)

In [5]:
# List the column labels and their order.
dataset.columns

Index(['name', 'year', 'selling_price', 'km_driven', 'fuel', 'seller_type',
       'transmission', 'owner'],
      dtype='object')

In [6]:
# Per-column dtype and non-null count, plus the frame's memory footprint.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4340 entries, 0 to 4339
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   name           4340 non-null   object
 1   year           4340 non-null   int64 
 2   selling_price  4340 non-null   int64 
 3   km_driven      4340 non-null   int64 
 4   fuel           4340 non-null   object
 5   seller_type    4340 non-null   object
 6   transmission   4340 non-null   object
 7   owner          4340 non-null   object
dtypes: int64(3), object(5)
memory usage: 271.4+ KB


In [7]:
# Summary statistics; by default describe() covers only the numeric columns.
dataset.describe()

Unnamed: 0,year,selling_price,km_driven
count,4340.0,4340.0,4340.0
mean,2013.090783,504127.3,66215.777419
std,4.215344,578548.7,46644.102194
min,1992.0,20000.0,1.0
25%,2011.0,208749.8,35000.0
50%,2014.0,350000.0,60000.0
75%,2016.0,600000.0,90000.0
max,2020.0,8900000.0,806599.0


In [8]:
# Element-wise missing-value mask (True where a cell is null).
# NOTE(review): this renders the whole boolean frame; `dataset.isnull().sum()`
# would give a per-column count that is much easier to read.
dataset.isnull()

Unnamed: 0,name,year,selling_price,km_driven,fuel,seller_type,transmission,owner
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...
4335,False,False,False,False,False,False,False,False
4336,False,False,False,False,False,False,False,False
4337,False,False,False,False,False,False,False,False
4338,False,False,False,False,False,False,False,False


In [9]:
# Boolean Series flagging rows that repeat an earlier row; with the default
# keep='first', the first occurrence of each row is not flagged.
dataset.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
4335    False
4336    False
4337    False
4338    False
4339    False
Length: 4340, dtype: bool

In [10]:
# Count the fully duplicated rows.
# NOTE(review): no visible cell removes these duplicates, so identical rows can
# end up in both the train and the test split and inflate the test score;
# consider dropping them (e.g. drop_duplicates()) before modelling.
dataset.duplicated().sum()

763

### Data Analysis

In [11]:
# Number of missing values in the selling_price column.
dataset["selling_price"].isnull().sum()

0

In [12]:
# Frequency of each fuel category.
dataset["fuel"].value_counts()

fuel
Diesel      2153
Petrol      2123
CNG           40
LPG           23
Electric       1
Name: count, dtype: int64

In [13]:
# Frequency of each seller_type category.
dataset["seller_type"].value_counts()

seller_type
Individual          3244
Dealer               994
Trustmark Dealer     102
Name: count, dtype: int64

### Selecting Features and Target

In [14]:
# Select the model inputs and the target by column name rather than by
# position, so a reordered or extended CSV cannot silently feed the wrong
# columns into the model.
# The feature order is significant: downstream cells address fuel as X[:, 2]
# and transmission as X[:, 3].
FEATURE_COLUMNS = ["year", "km_driven", "fuel", "transmission"]
TARGET_COLUMN = "selling_price"

X = dataset[FEATURE_COLUMNS].values  # mixed int/str columns -> object-dtype array
y = dataset[TARGET_COLUMN].values

In [15]:
# Inspect the feature matrix.
X

array([[2007, 70000, 'Petrol', 'Manual'],
       [2007, 50000, 'Petrol', 'Manual'],
       [2012, 100000, 'Diesel', 'Manual'],
       ...,
       [2009, 83000, 'Petrol', 'Manual'],
       [2016, 90000, 'Diesel', 'Manual'],
       [2016, 40000, 'Petrol', 'Manual']], dtype=object)

### Label Encoder 

In [16]:
from sklearn.preprocessing import LabelEncoder

# Fit each encoder on the untouched source column of `dataset` rather than on
# X. X is overwritten in place below, so fitting from X made this cell
# non-idempotent: running it a second time re-fitted the encoders on the
# integer codes, after which lb.transform(["Petrol"]) / lb1.transform(["Manual"])
# would fail on "unseen" labels.
# NOTE(review): LabelEncoder is documented by scikit-learn as a target encoder;
# it is kept here so the `lb` / `lb1` objects that are pickled later stay the
# same type.
lb = LabelEncoder().fit(dataset["fuel"])            # encoder for X[:, 2]
lb1 = LabelEncoder().fit(dataset["transmission"])   # encoder for X[:, 3]

# X was built row-for-row from `dataset`, so the encoded columns line up.
X[:, 2] = lb.transform(dataset["fuel"])
X[:, 3] = lb1.transform(dataset["transmission"])

In [17]:
# Inspect the feature matrix again to confirm columns 2 and 3 now hold integer codes.
X


array([[2007, 70000, 4, 1],
       [2007, 50000, 4, 1],
       [2012, 100000, 1, 1],
       ...,
       [2009, 83000, 4, 1],
       [2016, 90000, 1, 1],
       [2016, 40000, 4, 1]], dtype=object)

### Splitting the Data into Train and Test Set

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=0,
)

In [22]:
# Show the training feature matrix (NumPy abbreviates long arrays).
print(X_train)

[[2016 36000 1 1]
 [2014 70000 4 1]
 [2016 23000 4 1]
 ...
 [2016 22000 4 1]
 [2015 70000 1 1]
 [2013 62000 4 1]]


### Training the model

In [24]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 300 trees; the fixed random_state makes training repeatable.
regressor = RandomForestRegressor(
    n_estimators=300,
    random_state=0,
)

In [25]:
# Train the forest on the training split.
regressor.fit(X_train,y_train)

In [26]:
# For a regressor, .score() returns the coefficient of determination (R^2) on
# the given data, not a classification accuracy. The variable name and the '%'
# suffix are kept as written, but the printed figure is R^2 * 100.
accuracy = regressor.score(X_test,y_test)
print(accuracy*100,'%')

85.74236935963135 %


### Performing a Test on given Input

In [28]:
# Build one sample in the training feature order (year, km_driven, fuel,
# transmission), encoding the two categorical values with the fitted encoders.
new_data = [
    2017,
    7000,
    lb.transform(["Petrol"])[0],
    lb1.transform(["Manual"])[0],
]


In [29]:
# Show the encoded sample, then predict its selling price.
# predict() expects a 2-D input, hence the single sample is wrapped in a list.
print(new_data)
regressor.predict([new_data])

[2017, 7000, 4, 1]


array([624428.57142857])

### Saving the Model and Encoder using Pickle

In [31]:
import pickle

In [32]:
# Persist the fitted model and both label encoders (lb: fuel, lb1: transmission).
# Each file is opened in a `with` block so the handle is flushed and closed
# deterministically; the previous bare open(...) calls left the handles open
# until garbage collection, which can leave truncated files behind.
# NOTE: pickle files can execute arbitrary code when loaded; only unpickle
# these artifacts from a trusted source.
with open('regressor.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)
with open('lb', 'wb') as fuel_encoder_file:
    pickle.dump(lb, fuel_encoder_file)
with open('lb1', 'wb') as transmission_encoder_file:
    pickle.dump(lb1, transmission_encoder_file)