In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score , mean_absolute_error,mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import category_encoders as ce

%matplotlib inline

In [3]:
data = pd.read_excel('Data_Train (1).xlsx')

data.head()

data.columns

data.describe()

data.info()

data.isnull().sum()

# Removing units

data['Power'] = data['Power'].str.split(' ').str[0]
# including nan rows there is data in this column of 'null' value
data.loc[data.Power == 'null','Power'] = np.NaN

data['Mileage'] = data['Mileage'].apply(lambda x: re.sub('[$kmpl|km/kg]','',str(x)))
data['Engine'] = data['Engine'].apply(lambda x: re.sub('[$CC]','',str(x)))
data['Power'] = data['Power'].apply(lambda x: re.sub('[$bhp]','',str(x)))

data.head()

data['Mileage'] = pd.to_numeric(data['Mileage'],errors='coerce')
data['Engine'] = pd.to_numeric(data['Engine'],errors='coerce')
data['Power'] = pd.to_numeric(data['Power'],errors='coerce')

# Removing Outlayers


data.drop(data[data['Kilometers_Driven'] >= 6500000].index, axis=0, inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               6019 non-null   object 
 1   Location           6019 non-null   object 
 2   Year               6019 non-null   int64  
 3   Kilometers_Driven  6019 non-null   int64  
 4   Fuel_Type          6019 non-null   object 
 5   Transmission       6019 non-null   object 
 6   Owner_Type         6019 non-null   object 
 7   Mileage            6017 non-null   object 
 8   Engine             5983 non-null   object 
 9   Power              5983 non-null   object 
 10  Seats              5977 non-null   float64
 11  Price              6019 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 564.4+ KB


In [4]:
data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [5]:
data['Name'] = data['Name'].apply(lambda x: re.sub('Rover','',str(x)))

In [6]:
carnames = data.Name.str.split(expand=True)[[0,1,2]]
carnames.rename(columns={0:'Brand',1:'Model',2:'Type'}, inplace=True)
data = data.join(carnames)
data = data.drop("Name", axis=1)
data.head()
interactions = pd.DataFrame(index=data.index)

In [7]:
from itertools import combinations

object_cols = data.select_dtypes("object").columns
object_cols

low_cardinality_cols = [col for col in object_cols if data[col].nunique() < 15]
low_cardinality_cols.append("Brand")
# Iterate through each pair of features, combine them into interaction features
for features in combinations(low_cardinality_cols,2):
    
    new_interaction = data[features[0]].map(str)+"_"+data[features[1]].map(str)
    
    encoder = LabelEncoder()
    interactions["_".join(features)] = encoder.fit_transform(new_interaction)



data = data.join(interactions) #append to the dataset
data.head(5)

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,...,Location_Fuel_Type,Location_Transmission,Location_Owner_Type,Location_Brand,Fuel_Type_Transmission,Fuel_Type_Owner_Type,Fuel_Type_Brand,Transmission_Owner_Type,Transmission_Brand,Owner_Type_Brand
0,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,...,26,19,28,217,0,0,1,4,38,16
1,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,...,32,21,32,236,2,3,12,4,33,9
2,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,...,8,5,6,48,6,10,43,4,32,8
3,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,...,6,5,6,53,2,3,19,4,38,16
4,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,...,9,6,12,64,1,5,5,2,0,32


In [8]:
features = data.drop(["Price"], axis=1)
target = data["Price"]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=0)

X_train.isnull().sum()

Location                     0
Year                         0
Kilometers_Driven            0
Fuel_Type                    0
Transmission                 0
Owner_Type                   0
Mileage                      1
Engine                      22
Power                      106
Seats                       28
Brand                        0
Model                        0
Type                         0
Location_Fuel_Type           0
Location_Transmission        0
Location_Owner_Type          0
Location_Brand               0
Fuel_Type_Transmission       0
Fuel_Type_Owner_Type         0
Fuel_Type_Brand              0
Transmission_Owner_Type      0
Transmission_Brand           0
Owner_Type_Brand             0
dtype: int64

In [9]:
num_cols = X_train.select_dtypes("number")
null_num_cols = num_cols.columns[num_cols.isnull().any()]
null_num_cols

Index(['Mileage', 'Engine', 'Power', 'Seats'], dtype='object')

In [10]:

for cols in null_num_cols:
    X_train.loc[:,cols] = X_train.loc[:,cols].fillna(X_train.groupby('Brand')[cols].transform('median'))
    X_train.loc[:,cols] = X_train.loc[:,cols].fillna(X_train[cols].mean())

    X_test.loc[:,cols] = X_test.loc[:,cols].fillna(X_test.groupby('Brand')[cols].transform('median'))
    X_test.loc[:,cols] = X_test.loc[:,cols].fillna(X_test[cols].mean())

data.select_dtypes("object").nunique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Location         11
Fuel_Type         5
Transmission      2
Owner_Type        4
Brand            31
Model           214
Type            344
dtype: int64

In [11]:
OH = OneHotEncoder(sparse=False,handle_unknown='ignore')
OHE_cat_features = ["Fuel_Type","Transmission", "Location", "Owner_Type", "Brand"]

OH_cols_train =pd.DataFrame(OH.fit_transform(X_train[OHE_cat_features]))
OH_cols_test = pd.DataFrame(OH.transform(X_test[OHE_cat_features]))

OH_cols_train.index = X_train.index
OH_cols_test.index = X_test.index

In [12]:
OH_cols_train.columns = OH.get_feature_names(OHE_cat_features)
OH_cols_test.columns = OH.get_feature_names(OHE_cat_features)

X_train_enc = X_train.join(OH_cols_train)
X_test_enc = X_test.join(OH_cols_test)
X_train_enc.drop(OHE_cat_features, axis=1, inplace = True)
X_test_enc.drop(OHE_cat_features, axis=1, inplace = True)

In [13]:
X_train_enc.head()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Model,Type,Location_Fuel_Type,Location_Transmission,...,Brand_Mitsubishi,Brand_Nissan,Brand_Porsche,Brand_Renault,Brand_Skoda,Brand_Smart,Brand_Tata,Brand_Toyota,Brand_Volkswagen,Brand_Volvo
245,2007,72000,17.0,1086.0,82.0,5.0,Santro,Xing,30,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2175,2013,70961,25.8,1498.0,98.6,5.0,Amaze,EX,18,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4134,2016,34004,15.4,1497.0,117.3,7.0,BR-V,i-VTEC,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4975,2016,41810,20.73,1373.0,91.1,5.0,Ciaz,ZXi,10,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2473,2018,68737,26.21,1248.0,88.5,5.0,Ciaz,ZDi,9,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
target_cat_features = X_train_enc.select_dtypes('object').columns
target_enc = ce.TargetEncoder(cols=target_cat_features)

In [15]:
target_enc.fit(X_train[target_cat_features], y_train)
X_train_enc = X_train_enc.join(target_enc.transform(X_train[target_cat_features]).add_suffix('_enc'))
X_test_enc = X_test_enc.join(target_enc.transform(X_test[target_cat_features]).add_suffix('_enc'))
X_train_enc.head()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Model,Type,Location_Fuel_Type,Location_Transmission,...,Brand_Porsche,Brand_Renault,Brand_Skoda,Brand_Smart,Brand_Tata,Brand_Toyota,Brand_Volkswagen,Brand_Volvo,Model_enc,Type_enc
245,2007,72000,17.0,1086.0,82.0,5.0,Santro,Xing,30,19,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.791967,1.738571
2175,2013,70961,25.8,1498.0,98.6,5.0,Amaze,EX,18,13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.766282,3.978158
4134,2016,34004,15.4,1497.0,117.3,7.0,BR-V,i-VTEC,4,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.579124,9.060769
4975,2016,41810,20.73,1373.0,91.1,5.0,Ciaz,ZXi,10,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.626034,6.809325
2473,2018,68737,26.21,1248.0,88.5,5.0,Ciaz,ZDi,9,7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.626034,6.349655


In [16]:
object_cols = X_train_enc.select_dtypes('object')
X_train_enc.drop(object_cols, axis=1, inplace = True)
X_test_enc.drop(object_cols, axis=1, inplace = True)

X_train_enc.head()

Unnamed: 0,Year,Kilometers_Driven,Mileage,Engine,Power,Seats,Location_Fuel_Type,Location_Transmission,Location_Owner_Type,Location_Brand,...,Brand_Porsche,Brand_Renault,Brand_Skoda,Brand_Smart,Brand_Tata,Brand_Toyota,Brand_Volkswagen,Brand_Volvo,Model_enc,Type_enc
245,2007,72000,17.0,1086.0,82.0,5.0,30,19,31,212,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.791967,1.738571
2175,2013,70961,25.8,1498.0,98.6,5.0,18,13,19,142,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.766282,3.978158
4134,2016,34004,15.4,1497.0,117.3,7.0,4,3,2,23,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.579124,9.060769
4975,2016,41810,20.73,1373.0,91.1,5.0,10,7,10,77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.626034,6.809325
2473,2018,68737,26.21,1248.0,88.5,5.0,9,7,10,77,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.626034,6.349655
