In [1]:
#importing packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [2]:
#read the csv in data frame object
df = pd.read_csv("car_data.csv")
df.head()

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,Petrol,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,Diesel,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,Petrol,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,Petrol,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,Diesel,Dealer,Manual,0


In [3]:
df.Fuel_Type.value_counts()  #so we have 239 cars with petrol, 60 cars with Diesel, 2 car with CNG

Petrol    239
Diesel     60
CNG         2
Name: Fuel_Type, dtype: int64

In [4]:
#so will replace the strings with 0,1,2
df.replace({'Fuel_Type': {'Petrol':0, 'Diesel':1, 'CNG':2}}, inplace = True)
df.head() #let's check df now

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,0,Dealer,Manual,0
1,sx4,2013,4.75,9.54,43000,1,Dealer,Manual,0
2,ciaz,2017,7.25,9.85,6900,0,Dealer,Manual,0
3,wagon r,2011,2.85,4.15,5200,0,Dealer,Manual,0
4,swift,2014,4.6,6.87,42450,1,Dealer,Manual,0


In [5]:
df.Seller_Type.value_counts()  #so we have 195 cars with dealers, 106 cars with Individuals

Dealer        195
Individual    106
Name: Seller_Type, dtype: int64

In [6]:
#so will replace the strings with 0,1
df.replace({'Seller_Type': {'Dealer':0, 'Individual':1}}, inplace = True)
df.head() #let's check df now

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,0,0,Manual,0
1,sx4,2013,4.75,9.54,43000,1,0,Manual,0
2,ciaz,2017,7.25,9.85,6900,0,0,Manual,0
3,wagon r,2011,2.85,4.15,5200,0,0,Manual,0
4,swift,2014,4.6,6.87,42450,1,0,Manual,0


In [7]:
df.Transmission.value_counts()  #so we have 261 cars with manual, 40 cars with automatic

Manual       261
Automatic     40
Name: Transmission, dtype: int64

In [8]:
#so will replace the strings with 0,1
df.replace({'Transmission': {'Manual':0, 'Automatic':1}}, inplace = True)
df.head() #let's check df now

Unnamed: 0,Car_Name,Year,Selling_Price,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,ritz,2014,3.35,5.59,27000,0,0,0,0
1,sx4,2013,4.75,9.54,43000,1,0,0,0
2,ciaz,2017,7.25,9.85,6900,0,0,0,0
3,wagon r,2011,2.85,4.15,5200,0,0,0,0
4,swift,2014,4.6,6.87,42450,1,0,0,0


In [9]:
#now all affecting data is in numeric
#but let's check other prameters
df.Owner.value_counts()  #so we have 290 cars with 0 owners, 10 cars with 1 owners, 1 car with 0 owners

0    290
1     10
3      1
Name: Owner, dtype: int64

In [10]:
#now we will set the affecting data in X
X = df.drop(['Car_Name', 'Selling_Price'], axis=1) # not changing the df, keeping a part of df in X
X

Unnamed: 0,Year,Present_Price,Kms_Driven,Fuel_Type,Seller_Type,Transmission,Owner
0,2014,5.59,27000,0,0,0,0
1,2013,9.54,43000,1,0,0,0
2,2017,9.85,6900,0,0,0,0
3,2011,4.15,5200,0,0,0,0
4,2014,6.87,42450,1,0,0,0
...,...,...,...,...,...,...,...
296,2016,11.60,33988,1,0,0,0
297,2015,5.90,60000,0,0,0,0
298,2009,11.00,87934,0,0,0,0
299,2017,12.50,9000,1,0,0,0


In [11]:
#now we will set the resulting data in Y
Y = df['Selling_Price'] # not changing the df, keeping a part of df in Y
Y

0       3.35
1       4.75
2       7.25
3       2.85
4       4.60
       ...  
296     9.50
297     4.00
298     3.35
299    11.50
300     5.30
Name: Selling_Price, Length: 301, dtype: float64

In [12]:
#sklearn.model_selection.train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1, random_state = 2)
#sklearn.linear_model.LinearRegression
Linear_Reg = LinearRegression()
Linear_Reg.fit(X_train, Y_train)

In [13]:
training_data = Linear_Reg.predict(X_train)
#sklearn.metrics.r2_score
err_score = r2_score(Y_train, training_data) 
print(f"Accuracy with test data : {err_score:0.2f}")  #if 1 then best, 0 then worst

Accuracy with test data : 0.88


In [14]:
pd.to_pickle(Linear_Reg,'car_price_prediction_model.pickle') #creating a pickle file for future use of predicition algo