In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every file found
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Car Price Prediction**

###### ![](https://i.ytimg.com/vi/k5E8wk3uadc/mqdefault.jpg)

#### **Introduction**

* A car's price is a complex outcome shaped by a myriad of factors, ranging from the brand's reputation to the car's specifications, including attributes like horsepower, mileage, and many others. Predicting car prices has emerged as a crucial application within the machine learning domain, reflecting the intricate interplay of these factors. Our project is meticulously crafted to provide you with hands-on experience in developing a car price prediction model, making it an ideal entry point for those eager to explore this intriguing field.

#### **Objective**

* In this project, we aim to build a car price prediction model using machine learning. We have a dataset with detailed information about different cars, including their features and prices. Throughout this project, we'll explore and preprocess the data, train a Linear Regression model, and assess how well it predicts car prices on a held-out portion of this dataset.

#### **Steps Covered:**

* Loading and exploring the dataset to gain insights.
* Preprocessing the data, which includes handling categorical variables.
* Partitioning the dataset into training and testing subsets.
* Selecting, training, and evaluating a Linear Regression model.
* Utilizing the trained model for predicting car prices based on new feature inputs.

**Let's begin by loading the necessary libraries and exploring the dataset.**

## **Understanding the data**

In [None]:
# Importing the libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Read the car price CSV from the Kaggle input directory into a DataFrame

csv_path = "/kaggle/input/car-price/CarPrice_Assignment.csv"
df = pd.read_csv(csv_path)

In [None]:
# Display the dataset: a bare DataFrame as the last expression of a cell is rendered as the cell output

df

In [None]:
# Concise summary of the DataFrame: column names, non-null counts, dtypes and memory usage


df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns of the dataset

df.describe()

In [None]:
# Count the missing (null) values in each column

df.isnull().sum()

In [None]:
# Display the column labels of the DataFrame

df.columns

## **Exploratory Data Analysis**

In [None]:
# Histogram of how many cars fall into each engine type
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x="enginetype", bins=10, ax=ax)
ax.set_title("Engine Type Distribution")
plt.show()

In [None]:
# Scatter plot of horsepower against price, one point per car
fig, ax = plt.subplots()
ax.scatter(df['horsepower'], df['price'])
ax.set(xlabel='Horse Power', ylabel='Price', title="Horse Power X Price")
plt.show()


In [None]:
# Distribution of Numerical Features
numerical_features = [
    'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight',
    'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower',
    'peakrpm', 'citympg', 'highwaympg', 'price',
]

# One histogram (with a KDE overlay) per feature, laid out on a 3 x 5 grid;
# subplot positions are 1-based, hence enumerate(..., start=1).
fig = plt.figure(figsize=(12, 8))
for position, feature in enumerate(numerical_features, start=1):
    ax = fig.add_subplot(3, 5, position)
    sns.histplot(data=df[feature], bins=20, kde=True, ax=ax)
    ax.set_title(feature)
fig.tight_layout()
plt.show()

In [None]:
# Price Analysis: histogram of the target variable using 30 bins

fig, ax = plt.subplots(figsize=(10, 8))
sns.histplot(data=df['price'], bins=30, ax=ax)
ax.set_title('Distribution of Price')
plt.show()

## **Data Preprocessing**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns stored with the object dtype (the string-valued columns of df)
var_mod = df.select_dtypes(include='object').columns

# Replace every string column with integer codes, keeping ONE fitted encoder per
# column. Re-fitting a single shared encoder for each column would discard every
# mapping except the last one, making it impossible to encode a new car's
# categories consistently or to translate codes back to the original labels
# (use encoders[col].transform(...) / encoders[col].inverse_transform(...)).
#
# df is modified in place: once this cell has run, df no longer has object
# columns, so run it once per freshly loaded df.
#
# NOTE(review): integer codes impose an arbitrary ordering on the categories,
# which a linear model interprets as a numeric scale - consider one-hot encoding.
encoders = {}
for col in var_mod:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [None]:
# Splitting into train and test data

from sklearn.model_selection import train_test_split

# Fixed seed: without it the rows are shuffled differently on every run, so the
# fitted model and every reported metric would change between runs.
RANDOM_STATE = 42

# Features are every column except the target.
# NOTE(review): this keeps any identifier-like columns as predictors - confirm
# that every remaining column is a meaningful feature.
X = df.drop(['price'], axis=1)
y = df['price']

# Hold out 30% of the rows for evaluation
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE
)


## **Model Selection and Training**

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split; the fitted estimator is the
# cell's last expression, so it is also shown as the cell output.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

## **Model Evaluation**

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Predict prices for the held-out rows
ypred = model.predict(x_test)

# Compare the predictions with the true prices of the test split
mse = mean_squared_error(y_true=y_test, y_pred=ypred)
r2 = r2_score(y_true=y_test, y_pred=ypred)

print("Mean Squared Error:", mse)
print("R-Squared:", r2)

## **Final Result: Price Prediction of Cars**

In [None]:
# Feature values for one hypothetical car, matched to the training columns purely by position.
# NOTE(review): confirm that these values follow the exact column order of x_train
# (including the integer codes used for the categorical columns).
new_car_features = [4000, 0, 2, 3, 0, 96.0, 172.0, 65.4, 2221, 120, 4, 3.46, 3.19, 9.0, 68, 5500, 31, 38, 0, 0, 0, 0, 0, 0, 0]

# Fail early with a clear message if the vector no longer matches the training schema
if len(new_car_features) != x_train.shape[1]:
    raise ValueError(
        f"Expected {x_train.shape[1]} feature values, got {len(new_car_features)}"
    )

# Wrap the values in a one-row DataFrame carrying the training column names:
# the model was fitted on a DataFrame, and passing a bare list makes
# scikit-learn warn that X does not have valid feature names.
new_car = pd.DataFrame([new_car_features], columns=x_train.columns)

new_car_price = model.predict(new_car)
print("Predicted Price:", new_car_price[0])