In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# load the dataset 
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,1.75
1,1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,12.5
2,2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,4.5
3,3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,6.0
4,4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,17.74


In [2]:
# clean the data 

# dectect the missing values 
df.isnull().sum()

Unnamed: 0            0
Name                  0
Location              0
Year                  0
Kilometers_Driven     0
Fuel_Type             0
Transmission          0
Owner_Type            0
Mileage               2
Engine               36
Power                36
Seats                42
Price                 0
dtype: int64

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6019 entries, 0 to 6018
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         6019 non-null   int64  
 1   Name               6019 non-null   object 
 2   Location           6019 non-null   object 
 3   Year               6019 non-null   int64  
 4   Kilometers_Driven  6019 non-null   int64  
 5   Fuel_Type          6019 non-null   object 
 6   Transmission       6019 non-null   object 
 7   Owner_Type         6019 non-null   object 
 8   Mileage            6017 non-null   object 
 9   Engine             5983 non-null   object 
 10  Power              5983 non-null   object 
 11  Seats              6019 non-null   float64
 12  Price              6019 non-null   float64
dtypes: float64(2), int64(3), object(8)
memory usage: 611.4+ KB


In [None]:
# fill the missing value in seat column 
df['Seats'].fillna(df['Seats'].mean() ,inplace = True)

# clean the missing value in Power column
df['Power'] = df['Power'].astype(str)

df['Power'] = df['Power'].str.replace('bhp', '', regex=False).replace(['null', 'null ', 'Null', ''], np.nan)

# Convert the column to numeric
df['Power'] = pd.to_numeric(df['Power'], errors='coerce')

# Fill missing values with mean
df['Power'].fillna(df['Power'].mean(), inplace=True)


# clean the missing value in Engine
df['Engine'] = df['Engine'].str.replace('CC','').astype(float)
# fiil with the mean value 
df['Engine'].fillna(df['Engine'].mean() , inplace = True)

In [3]:
df['Mileage'] = df['Mileage'].astype(str)
df['Mileage'] = df['Mileage'].str.replace(r'kmpl|km/kg','' , regex=True).replace(['null', 'NaN', 'nan', '', ' '], np.nan)
df['Mileage'] = pd.to_numeric(df['Mileage'] , errors='coerce')
df['Mileage'].fillna(df['Mileage'].mean(), inplace = True)
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Mileage'].fillna(df['Mileage'].mean(), inplace = True)


Unnamed: 0           0
Name                 0
Location             0
Year                 0
Kilometers_Driven    0
Fuel_Type            0
Transmission         0
Owner_Type           0
Mileage              0
Engine               0
Power                0
Seats                0
Price                0
dtype: int64

In [4]:
# df.drop(columns=['Unnamed: 0'] , inplace=True)
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6,998.0,58.16,5.0,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67,1582.0,126.2,5.0,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2,1199.0,88.7,5.0,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77,1248.0,88.76,7.0,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2,1968.0,140.8,5.0,17.74


In [14]:
# detect the outliers 

# select only numerical coloums
new_column = df.select_dtypes(include=['int64','float']).columns

# detect the outlier using IQR
q1 = df[new_column].quantile(0.25) 
q3 = df[new_column].quantile(0.75)

IQR = q1 - q3
print(IQR)

Year                 0.0
Kilometers_Driven    0.0
Mileage              0.0
Engine               0.0
Power                0.0
Seats                0.0
Price                0.0
dtype: float64


In [6]:
# define limits for outliers
lower_bound = q1 - 1.5*IQR
upper_bound = q3 + 1.5*IQR

# fix the outliers
for col in new_column:
    median = df[col].median()
    df.loc[df[col] < lower_bound[col], col] = median
    df.loc[df[col] > upper_bound[col], col] = median
# verify the result 
print('fix the outliers successfully :-')
print(df[new_column].describe())

fix the outliers successfully :-
         Year  Kilometers_Driven       Mileage  Engine         Power   Seats  \
count  6019.0             6019.0  6.019000e+03  6019.0  6.019000e+03  6019.0   
mean   2014.0            53000.0  1.815000e+01  1493.0  9.860000e+01     5.0   
std       0.0                0.0  1.712550e-12     0.0  1.117066e-11     0.0   
min    2014.0            53000.0  1.815000e+01  1493.0  9.860000e+01     5.0   
25%    2014.0            53000.0  1.815000e+01  1493.0  9.860000e+01     5.0   
50%    2014.0            53000.0  1.815000e+01  1493.0  9.860000e+01     5.0   
75%    2014.0            53000.0  1.815000e+01  1493.0  9.860000e+01     5.0   
max    2014.0            53000.0  1.815000e+01  1493.0  9.860000e+01     5.0   

              Price  
count  6.019000e+03  
mean   5.640000e+00  
std    4.068195e-13  
min    5.640000e+00  
25%    5.640000e+00  
50%    5.640000e+00  
75%    5.640000e+00  
max    5.640000e+00  


# encoding 

In [7]:
 Le = LabelEncoder()
df['Name'] = Le.fit_transform(df['Name'])
df['Location'] = Le.fit_transform(df['Location'])
df['Year'] = Le.fit_transform(df['Year'])
df['Fuel_Type'] = Le.fit_transform(df['Fuel_Type'])
df['Transmission'] = Le.fit_transform(df['Transmission'])
df['Owner_Type'] = Le.fit_transform(df['Owner_Type'])

In [8]:
input_data = df.iloc[:,:-1]
output_data = df['Price']

# Scalling 

In [19]:
sc = StandardScaler()
input_data = pd.DataFrame(sc.fit_transform(input_data) , columns=input_data.columns)

# Train and Test 

In [10]:
from sklearn.model_selection import train_test_split    

In [11]:
x_train,x_test,y_train,y_test = train_test_split(input_data,output_data, test_size= 0.2 , random_state=42)

In [12]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor

In [13]:
lr = LinearRegression()
lr.fit(input_data,output_data)
lr.score(x_train,y_train)*100 , lr.score(x_test,y_test)*100

(0.0, 0.0)

In [17]:
lor = LinearRegression()
lor.fit(input_data,output_data)
lor.score(x_test,y_test)*100

0.0

In [20]:
df.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,Price
0,1200,9,0,53000,0,1,0,18.15,1493.0,98.6,5.0,5.64
1,512,10,0,53000,1,1,0,18.15,1493.0,98.6,5.0,5.64
2,486,2,0,53000,4,1,0,18.15,1493.0,98.6,5.0,5.64
3,1059,2,0,53000,1,1,0,18.15,1493.0,98.6,5.0,5.64
4,23,3,0,53000,1,0,2,18.15,1493.0,98.6,5.0,5.64
