In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import pickle

In [2]:
Car=pd.read_csv("quikr_car.csv")
Car

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel
...,...,...,...,...,...,...
887,Ta,Tara,zest,310000,,
888,Tata Zest XM Diesel,Tata,2018,260000,"27,000 kms",Diesel
889,Mahindra Quanto C8,Mahindra,2013,390000,"40,000 kms",Diesel
890,Honda Amaze 1.2 E i VTEC,Honda,2014,180000,Petrol,


In [3]:
df=Car.copy()
df

Unnamed: 0,name,company,year,Price,kms_driven,fuel_type
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007,80000,"45,000 kms",Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006,425000,40 kms,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018,Ask For Price,"22,000 kms",Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014,325000,"28,000 kms",Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014,575000,"36,000 kms",Diesel
...,...,...,...,...,...,...
887,Ta,Tara,zest,310000,,
888,Tata Zest XM Diesel,Tata,2018,260000,"27,000 kms",Diesel
889,Mahindra Quanto C8,Mahindra,2013,390000,"40,000 kms",Diesel
890,Honda Amaze 1.2 E i VTEC,Honda,2014,180000,Petrol,


In [4]:
df.columns

Index(['name', 'company', 'year', 'Price', 'kms_driven', 'fuel_type'], dtype='object')

In [5]:
df.rename(columns=
    {
        "kms_driven":"km",
        "fuel_type":"fuel"
    },
    inplace=True
)
df.columns

Index(['name', 'company', 'year', 'Price', 'km', 'fuel'], dtype='object')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   name     892 non-null    object
 1   company  892 non-null    object
 2   year     892 non-null    object
 3   Price    892 non-null    object
 4   km       840 non-null    object
 5   fuel     837 non-null    object
dtypes: object(6)
memory usage: 41.9+ KB


#### Dealing with Kilometers

In [7]:
df["km"]

0      45,000 kms
1          40 kms
2      22,000 kms
3      28,000 kms
4      36,000 kms
          ...    
887           NaN
888    27,000 kms
889    40,000 kms
890        Petrol
891        Petrol
Name: km, Length: 892, dtype: object

In [8]:
#km columns has km in it
df["km"]=df["km"].str.split().str.get(0)
df["km"]=df["km"].str.replace(",","")
df["km"]=pd.to_numeric(df["km"],errors="coerce")



#### Dealing with year columns

In [9]:
df["year"]=pd.to_numeric(df["year"],errors="coerce")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     892 non-null    object 
 1   company  892 non-null    object 
 2   year     842 non-null    float64
 3   Price    892 non-null    object 
 4   km       838 non-null    float64
 5   fuel     837 non-null    object 
dtypes: float64(2), object(4)
memory usage: 41.9+ KB


#### Dealing with price

In [10]:

df["Price"]=df["Price"].str.replace(",","")
df["Price"]=pd.to_numeric(df["Price"],errors="coerce")
df

Unnamed: 0,name,company,year,Price,km,fuel
0,Hyundai Santro Xing XO eRLX Euro III,Hyundai,2007.0,80000.0,45000.0,Petrol
1,Mahindra Jeep CL550 MDI,Mahindra,2006.0,425000.0,40.0,Diesel
2,Maruti Suzuki Alto 800 Vxi,Maruti,2018.0,,22000.0,Petrol
3,Hyundai Grand i10 Magna 1.2 Kappa VTVT,Hyundai,2014.0,325000.0,28000.0,Petrol
4,Ford EcoSport Titanium 1.5L TDCi,Ford,2014.0,575000.0,36000.0,Diesel
...,...,...,...,...,...,...
887,Ta,Tara,,310000.0,,
888,Tata Zest XM Diesel,Tata,2018.0,260000.0,27000.0,Diesel
889,Mahindra Quanto C8,Mahindra,2013.0,390000.0,40000.0,Diesel
890,Honda Amaze 1.2 E i VTEC,Honda,2014.0,180000.0,,


In [11]:
df["Price"].unique()

array([  80000.,  425000.,      nan,  325000.,  575000.,  175000.,
        190000.,  830000.,  250000.,  182000.,  315000.,  415000.,
        320000., 1000000.,  500000.,  350000.,  160000.,  310000.,
         75000.,  100000.,  290000.,   95000.,  180000.,  385000.,
        105000.,  650000.,  689999.,  448000.,  549000.,  501000.,
        489999.,  280000.,  349999.,  284999.,  345000.,  499999.,
        235000.,  249999., 1475000.,  395000.,  220000.,  170000.,
         85000.,  200000.,  570000.,  110000.,  448999., 1891111.,
        159500.,  344999.,  449999.,  865000.,  699000.,  375000.,
        224999., 1200000.,  195000.,  351000.,  240000.,   90000.,
        155000.,  600000.,  189500.,  210000.,  390000.,  135000.,
       1600000.,  701000.,  265000.,  525000.,  372000.,  635000.,
        550000.,  485000.,  329500.,  251111.,  569999.,   69999.,
        299999.,  399999.,  450000.,  270000.,  158400.,  179000.,
        125000.,  299000.,  150000.,  275000.,  285000.,  3400

#### Dealing with name


In [12]:
df["name"]=df["name"].str.split(" ").str.slice(0,3).str.join(" ")

#### Outlier Removal

In [13]:
q1=df["Price"].quantile(0.25)
q3=df["Price"].quantile(0.75)
iqr=q3-q1
mini=q1-1.5*iqr
maxi=q3+1.5*iqr
df=df[df["Price"]<=maxi]

#### Drop na values

In [14]:
df.dropna(inplace=True)
df["year"]=df["year"].astype("int64")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["year"]=df["year"].astype("int64")


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 763 entries, 0 to 889
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     763 non-null    object 
 1   company  763 non-null    object 
 2   year     763 non-null    int64  
 3   Price    763 non-null    float64
 4   km       763 non-null    float64
 5   fuel     763 non-null    object 
dtypes: float64(2), int64(1), object(3)
memory usage: 41.7+ KB


#### Saving clean data to csv

In [16]:
df

Unnamed: 0,name,company,year,Price,km,fuel
0,Hyundai Santro Xing,Hyundai,2007,80000.0,45000.0,Petrol
1,Mahindra Jeep CL550,Mahindra,2006,425000.0,40.0,Diesel
3,Hyundai Grand i10,Hyundai,2014,325000.0,28000.0,Petrol
4,Ford EcoSport Titanium,Ford,2014,575000.0,36000.0,Diesel
6,Ford Figo,Ford,2012,175000.0,41000.0,Diesel
...,...,...,...,...,...,...
883,Maruti Suzuki Ritz,Maruti,2011,270000.0,50000.0,Petrol
885,Tata Indica V2,Tata,2009,110000.0,30000.0,Diesel
886,Toyota Corolla Altis,Toyota,2009,300000.0,132000.0,Petrol
888,Tata Zest XM,Tata,2018,260000.0,27000.0,Diesel


In [17]:
df.to_csv("clean_quiker.csv")

#### Splitting I/O columns

In [18]:
y=df["Price"]
X=df.drop("Price",axis=1)

#### Column Transformation

In [19]:
ohe=OneHotEncoder()
ohe.fit(X[["name","fuel","company"]])

In [20]:
ct= make_column_transformer((OneHotEncoder(handle_unknown='ignore'),["name","fuel","company"]),remainder="passthrough")

#### Finding out correct random state for better accuracy

In [21]:
scores=[]
for i in range(1000):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=i)
    lr=LinearRegression()
    pipe=Pipeline([("CT",ct),("LR",lr)])
    pipe.fit(X_train,y_train)
    y_pred=pipe.predict(X_test)
    scores.append(r2_score(y_test,y_pred))


In [22]:
np.argmax(scores)

283

In [23]:
scores[np.argmax(scores)]

0.756287481483584

#### Model Taining

In [24]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=np.argmax(scores))
lr=LinearRegression()
pipe=Pipeline([("CT",ct),("LR",lr)])
pipe.fit(X_train,y_train)
y_pred=pipe.predict(X_test)
r2_score(y_test,y_pred)

0.756287481483584

#### Dumping Model

In [25]:
pickle.dump(pipe,open('LinearRegressionModel.pkl','wb'))

In [26]:
pipe.predict(pd.DataFrame([["Maruti Suzuki Swift", "Maruti", 2019, 100, "Petrol"]],
                          columns=["name", "company", "year", "km", "fuel"]))

array([402517.18734369])