In [None]:
# Import all the necessary libraries
import pandas as pd
import time
import numpy as np
from scipy import stats
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder,MinMaxScaler

import warnings 
warnings.filterwarnings("ignore")

In [None]:
# Load the car-price CSV from the Kaggle input directory into `dataframe`
# and preview its first rows.
dataframe=pd.read_csv("/kaggle/input/car-price-prediction/CarPrice_Assignment.csv")
dataframe.head()

# Exploratory Data Analysis

In [None]:
# Print column names, dtypes and non-null counts for the dataset
dataframe.info()

In [None]:
# (rows, columns) of the dataset
dataframe.shape

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the dataset
dataframe.describe()

In [None]:
# Count the rows that are exact duplicates of an earlier row
dataframe.duplicated().sum()

In [None]:
# Count the missing (null) values in each column
dataframe.isnull().sum()

In [None]:
# Drop the car_ID column.
# Rebinding with errors="ignore" keeps this cell re-runnable: running it a
# second time no longer raises KeyError because the column is already gone.
dataframe = dataframe.drop(columns="car_ID", errors="ignore")

In [None]:
# Build the correlation matrix from the numeric columns only.
# The categorical columns still hold strings at this point, and pandas >= 2.0
# raises on non-numeric data unless numeric_only=True is passed.
correlation_matrix = dataframe.corr(numeric_only=True)
correlation_matrix

In [None]:
# Render the correlation matrix as an annotated heatmap
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=heatmap_ax)
plt.show()

# Let's separate the categorical columns from the numerical columns

In [None]:
numerical_cols = []
categorical_cols = []

def get_numerical_and_categorical_columns(dataframe):
    """Sort the columns of ``dataframe`` into the module-level lists.

    Every column whose dtype is numeric (per ``pd.api.types.is_numeric_dtype``)
    is recorded in ``numerical_cols``; every other column is recorded in
    ``categorical_cols``. Column order follows ``dataframe.columns``.

    Both lists are emptied first, so calling this function again (for example
    when the cell is re-run) rebuilds them instead of appending duplicates.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    None
        Results are written to ``numerical_cols`` and ``categorical_cols``.
    """
    # Clear in place (not rebind) so existing references to the lists stay valid.
    numerical_cols.clear()
    categorical_cols.clear()

    for column in dataframe.columns:
        if pd.api.types.is_numeric_dtype(dataframe[column]):
            numerical_cols.append(column)
        else:
            categorical_cols.append(column)

In [None]:
# Fill numerical_cols and categorical_cols from the loaded dataframe
get_numerical_and_categorical_columns(dataframe)


In [None]:
# Columns classified as numeric
numerical_cols

In [None]:
# Columns classified as non-numeric (categorical)
categorical_cols

# Let's check the number of unique values in each categorical column

In [None]:
# Report how many distinct values each categorical column holds.
# The label is taken from the column name itself so it can never drift from
# the column actually being counted.
for column in ["CarName", "fueltype", "aspiration", "doornumber", "carbody",
               "drivewheel", "enginelocation", "enginetype", "cylindernumber",
               "fuelsystem"]:
    print(f"No. of unique values in {column}: {dataframe[column].nunique()}")

In [None]:
# List the distinct values of every categorical column in a single pass
# (replaces one copy-pasted cell per column).
for column in ["CarName", "fueltype", "aspiration", "doornumber", "carbody",
               "drivewheel", "enginelocation", "enginetype", "cylindernumber",
               "fuelsystem"]:
    print(f"{column}:")
    print(dataframe[column].unique())
    print()

# Convert the categorical values into numerical values using Label Encoder

In [None]:
# Replace each categorical column with integer codes.
# A single LabelEncoder is refitted per column, so after the loop `le` only
# remembers the classes of the last column ("fuelsystem").
le = LabelEncoder()
for column in ("CarName", "fueltype", "aspiration", "doornumber", "carbody",
               "drivewheel", "enginelocation", "enginetype", "cylindernumber",
               "fuelsystem"):
    dataframe[column] = le.fit_transform(dataframe[column])

# Univariate Analysis

In [None]:
def plot_distribution(frame, column, kind, title):
    """Draw one univariate distribution plot for a single column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to plot from.
    column : str
        Name of the column whose distribution is drawn.
    kind : {"hist", "kde_stack", "kde"}
        "hist" draws a histogram with a KDE overlay and a "Count" y-label,
        "kde_stack" draws a KDE with ``multiple="stack"``,
        "kde" draws a plain KDE curve.
    title : str
        Title placed above the plot.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the three supported values.
    """
    # Same figure size for every plot so the series of charts is comparable.
    plt.figure(figsize=(5, 5))
    if kind == "hist":
        sns.histplot(frame[column], kde=True)
        plt.xlabel(column)
        plt.ylabel("Count")
    elif kind == "kde_stack":
        sns.kdeplot(data=frame, x=column, multiple="stack")
    elif kind == "kde":
        sns.kdeplot(data=frame, x=column)
    else:
        raise ValueError(f"unknown plot kind: {kind!r}")
    plt.title(title, fontsize=14)
    plt.show()


# (column, plot kind, title) for every numeric feature, in display order.
distribution_specs = [
    ("wheelbase", "hist", "wheelbase"),
    ("symboling", "kde_stack", "Symboling"),
    ("carlength", "kde_stack", "Carlength"),
    ("carwidth", "hist", "carwidth"),
    ("curbweight", "hist", "curbweight"),
    ("enginesize", "kde_stack", "Enginesize"),
    ("boreratio", "kde_stack", "boreratio"),
    ("stroke", "hist", "stroke"),
    ("compressionratio", "kde_stack", "compressionratio"),
    ("horsepower", "kde", "horsepower"),
    ("peakrpm", "kde", "peakrpm"),
    ("citympg", "kde_stack", "citympg"),
    ("highwaympg", "kde_stack", "highwaympg"),
]

for column, kind, title in distribution_specs:
    plot_distribution(dataframe, column, kind, title)

# Check the Frequency of Categorical Columns

In [None]:
# One bar chart of value frequencies per categorical column
# (replaces one copy-pasted cell per column).
# NOTE: `dataframe` was label-encoded in an earlier cell, so the bars are
# labelled with the integer codes rather than the original category names.
for column in ["fueltype", "aspiration", "doornumber", "carbody", "drivewheel",
               "enginelocation", "enginetype", "cylindernumber", "fuelsystem"]:
    dataframe[column].value_counts().plot(kind="bar", figsize=(6,4), rot=0)
    plt.title(column, fontsize=14)
    plt.xlabel(column)
    plt.ylabel("Count")
    plt.show()

# Bivariate Analysis
# Detect outliers in the dataset using box plots

In [None]:
# 3x3 grid of box plots: price against each categorical feature.
fig = plt.figure(figsize=(17, 17))

# (column, box colour) in grid order, left-to-right then top-to-bottom.
boxplot_specs = [
    ("fueltype", "c"),
    ("aspiration", "r"),
    ("doornumber", "y"),
    ("drivewheel", "b"),
    ("enginelocation", "#F72585"),
    ("enginetype", "#3A0CA3"),
    ("cylindernumber", "#4CC9F0"),
    ("fuelsystem", "#7209B7"),
    ("carbody", "c"),
]

for position, (column, color) in enumerate(boxplot_specs, start=1):
    ax = fig.add_subplot(3, 3, position)
    sns.boxplot(data=dataframe, x=column, y="price", color=color, ax=ax)
    # Title is derived from the column name so it cannot be misspelled.
    ax.set_title(f"{column} vs price", fontsize=16)

plt.show()

# Read the dataframe again

In [None]:
# Reload the raw CSV; this replaces the label-encoded `dataframe` from the
# earlier analysis cells with the original string-valued columns.
dataframe=pd.read_csv("/kaggle/input/car-price-prediction/CarPrice_Assignment.csv")
dataframe.head()

In [None]:
# Drop the car_ID column from the reloaded data.
# errors="ignore" keeps the cell re-runnable once the column is already gone.
dataframe = dataframe.drop(columns="car_ID", errors="ignore")

# Different methods to detect the outliers
1. Percentile Capping(Winsorization)
2. Z-score Method
3. IQR Method

In [None]:
# (rows, columns) of the reloaded dataset
dataframe.shape

In [None]:
# Keep only the non-categorical columns; these are screened for outliers below
df = dataframe.drop(columns=categorical_cols)
columns = list(df.columns)
columns, df.shape

# Winsorization Method (Percentile Capping)

In [None]:
def Winsorization(columns, dataframe, a, b):
    """Flag rows whose value falls outside the [a, b] percentile band.

    For each column in ``columns`` the ``a``-th and ``b``-th percentiles are
    computed with ``np.percentile``; every row strictly below the lower cut or
    strictly above the upper cut is flagged.

    Parameters
    ----------
    columns : iterable of str
        Column names to examine.
    dataframe : pandas.DataFrame
        Frame holding those columns.
    a, b : float
        Lower and upper percentiles (0-100).

    Returns
    -------
    list of int
        Zero-based row positions of flagged rows, column by column. A row
        flagged in several columns appears once per column.
    """
    flagged = []

    for name in columns:
        values = dataframe[name]
        lower_cut = np.percentile(values, a)
        upper_cut = np.percentile(values, b)

        # Boolean mask -> positional indices, in row order
        outside = ((values > upper_cut) | (values < lower_cut)).to_numpy()
        flagged.extend(np.flatnonzero(outside).tolist())

    return flagged

In [None]:
# Unique row positions outside the 1st-99th percentile band of any column
outliers = list(set(Winsorization(columns, df, a=1, b=99)))

# Z-Score Method

In [None]:

def Z_Score(columns, dataframe):
    """Flag rows whose z-score exceeds 3 in absolute value in any given column.

    For each column the mean and standard deviation are computed with
    ``np.mean`` / ``np.std``; a row is flagged when
    ``abs((value - mean) / std) > 3``.

    Parameters
    ----------
    columns : iterable of str
        Column names to examine.
    dataframe : pandas.DataFrame
        Frame holding those columns.

    Returns
    -------
    list of int
        Zero-based row positions of flagged rows, column by column. A row
        flagged in several columns appears once per column. Empty when
        ``columns`` is empty.
    """
    outliers1 = []
    for col in columns:
        values = dataframe[col]
        mean = np.mean(values)
        std = np.std(values)

        # A constant column gives 0/0 = NaN here; NaN > 3 is False, so such a
        # column simply contributes no outliers.
        z_scores = (values - mean) / std

        exceeds = (np.abs(z_scores) > 3).to_numpy()
        outliers1.extend(np.flatnonzero(exceeds).tolist())

    # Return only after every column has been examined.
    return outliers1

In [None]:
# Unique row positions flagged by the z-score rule
outliers1 = list(set(Z_Score(columns, df)))

# IQR Method

In [None]:

def IQR(columns, dataframe, multiplier=2.5):
    """Flag rows lying outside the inter-quartile-range fences of any column.

    For each column the fences are ``Q1 - multiplier * IQR`` and
    ``Q3 + multiplier * IQR`` where ``IQR = Q3 - Q1``; rows strictly outside
    the fences are flagged.

    Parameters
    ----------
    columns : iterable of str
        Column names to examine.
    dataframe : pandas.DataFrame
        Frame holding those columns.
    multiplier : float, default 2.5
        Width of the fences in units of the IQR. The default keeps the value
        this notebook has always used; 1.5 is the conventional Tukey fence.

    Returns
    -------
    list of int
        Zero-based row positions of flagged rows, column by column. A row
        flagged in several columns appears once per column.
    """
    outliers2 = []
    for col in columns:
        values = dataframe[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        fence = multiplier * (q3 - q1)

        outside = ((values > q3 + fence) | (values < q1 - fence)).to_numpy()
        outliers2.extend(np.flatnonzero(outside).tolist())

    return outliers2

In [None]:
# Unique row positions flagged by the IQR rule
outliers2 = list(set(IQR(columns, df)))

# Ratio of Outliers in dataset using Different Methods

In [None]:
def ratio(dataframe, outliers):
    """Return the number of flagged rows divided by the number of rows.

    Parameters
    ----------
    dataframe : sized
        The data set; only its length is used.
    outliers : sized
        Collection of flagged rows; only its length is used.

    Returns
    -------
    float
        ``len(outliers) / len(dataframe)``.
    """
    return len(outliers) / len(dataframe)

In [None]:
# Print, for each detection method, the share of rows it flagged
# (flagged rows / total rows). One format string keeps the labels consistent.
flagged_by_method = {
    "Winsorization": outliers,
    "Z_Score": outliers1,
    "IQR": outliers2,
}
for method_name, flagged_rows in flagged_by_method.items():
    print(f"Outliers Detected by {method_name} Method: {ratio(df, flagged_rows)}")

# Remove the outliers from the dataframe Detected by Winsorization Method

In [None]:
# Remove the rows flagged by the Winsorization method.
# `outliers` holds row positions computed on `df`, which was derived from
# `dataframe` by dropping columns only and therefore shares its index, so the
# positions are translated to labels through `df.index`. Dropping by label
# with errors="ignore" makes the cell safe to re-run: already-removed rows
# are skipped instead of shifting the positions onto other rows.
dataframe = dataframe.drop(index=df.index[outliers], errors="ignore")

In [None]:
# Preview the data after the outlier rows were dropped
dataframe.head()

# Data Preprocessing

In [None]:
# Separate the predictors from the target, then hold out 30% for testing
features = dataframe.drop(columns="price")
target = dataframe["price"]

x_train, x_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

# X_train encoding

In [None]:
# Fit the one-hot encoder on the training categoricals only.
# handle_unknown="ignore" lets transform() accept categories that were not
# present during fitting.
ohe = OneHotEncoder(handle_unknown="ignore")
x_train_ohe = ohe.fit_transform(x_train[categorical_cols]).toarray()

# Wrap the dense array in a frame carrying the encoder's feature names and the
# training index, so it aligns row-for-row with x_train.
x_train_ohe_df = pd.DataFrame(
    x_train_ohe,
    columns=ohe.get_feature_names_out(list(categorical_cols)),
    index=x_train.index,
)

# Append the encoded columns, then drop the raw categorical ones
x_train = pd.concat([x_train, x_train_ohe_df], axis=1).drop(columns=categorical_cols)

# Checking result
x_train.head()

# X_test Encoding

In [None]:
# Encode the test categoricals with the encoder fitted on the training data
# (transform only, no refit).
x_test_ohe = ohe.transform(x_test[categorical_cols]).toarray()

# Same feature names as the training frame; reuse the test index for alignment
x_test_ohe_df = pd.DataFrame(
    x_test_ohe,
    columns=ohe.get_feature_names_out(list(categorical_cols)),
    index=x_test.index,
)

# Append the encoded columns, then drop the raw categorical ones
x_test = pd.concat([x_test, x_test_ohe_df], axis=1).drop(columns=categorical_cols)

# Checking result
x_test.head()

# Linear Regression Model

In [None]:
# Fit a linear-regression baseline and score it on the held-out split
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

linear_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
linear_r2 = r2_score(y_test, y_pred)
print (f"model : {model} and  rmse score is : {linear_rmse}, r2 score is {linear_r2}")

In [None]:
# Tree ensembles to compare, all seeded for reproducibility
rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
et = ExtraTreesRegressor(random_state=42)

models = [rf, gb, et]
model_names = ['RandomForest','GradientBoost','ExtraTree']

# Test-set scores, collected in the same order as `models`
root_mean_squared_error = []
r2 = []

for model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    mse = mean_squared_error(y_test, y_pred)
    root_mean_squared_error.append(np.sqrt(mse))
    r2.append(r2_score(y_test, y_pred))

# One row per model, indexed by its display name
result_df = pd.DataFrame(
    {'RMSE': root_mean_squared_error, 'R2_score': r2},
    index=model_names,
)
result_df

In [None]:
# Horizontal bar chart of the test RMSE of each ensemble model
rmse_ax = result_df["RMSE"].plot(kind="barh", figsize=(10, 7))
rmse_ax.legend(bbox_to_anchor=(1.0, 1.0));

# Feature Importance with Random Forest

In [None]:
# (rows, columns) of the training matrix after one-hot encoding
x_train.shape

In [None]:
importances = rf.feature_importances_
feature_names = [f"feature {position}" for position in range(x_train.shape[1])]

# Print only the features whose random-forest importance exceeds 0.01,
# pairing each importance with its training column name.
for column_name, importance in zip(x_train.columns, importances):
    if importance > 0.01:
        print(f"{column_name} : {round(importance,3)}")