In [None]:
# Importing the dependencies
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder

Data collection and processing

In [None]:
# Read the raw CSV into a DataFrame; the path is relative to the notebook's working directory
csv_path = '../input/vehicle-dataset-from-cardekho/car data.csv'
dataset = pd.read_csv(csv_path)

In [None]:
# Preview the data: head() with no argument returns the first 5 rows, and as
# the last expression of the cell the result is rendered as the cell output.
dataset.head()

In [None]:
# Dimensions of the dataset as a (number of rows, number of columns) tuple
dataset.shape

In [None]:
# Number of missing values in each column
dataset.isna().sum()

In [None]:
# Concise summary of the dataset: column names, non-null counts, dtypes and memory usage
dataset.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max);
# with default arguments describe() summarises the numeric columns only.
dataset.describe()

In [None]:
# Frequency of each category in the three categorical feature columns
for column in ('Fuel_Type', 'Seller_Type', 'Transmission'):
    print(dataset[column].value_counts())

In [None]:
# Row counts per Fuel_Type, with each bar split by Transmission
sns.countplot(data=dataset, x='Fuel_Type', hue='Transmission')

In [None]:
# Row counts per Seller_Type, with each bar split by Transmission
sns.countplot(data=dataset, x='Seller_Type', hue='Transmission')

In [None]:
# Partition the columns by dtype: object-dtype columns go to obj_data,
# every other column goes to num_data.
obj_data = dataset.select_dtypes(include=['object'])
num_data = dataset.select_dtypes(exclude=['object'])

In [None]:
# Print the object-dtype columns first, then the remaining columns
for frame in (obj_data, num_data):
    print(frame)

In [None]:
# Encoding the categorical data
# Every object-dtype column is replaced by integer labels. Each column gets its
# own LabelEncoder, stored in `label_encoders` under the column name, so the
# integer codes can later be mapped back with
# label_encoders[column].inverse_transform(...). Reusing a single encoder would
# keep only the mapping of the last column it was fitted on.
label_encoders = {}
encoded_columns = {}
for column in obj_data.columns:
    le = LabelEncoder()
    encoded_columns[column] = le.fit_transform(obj_data[column])
    label_encoders[column] = le

# Build the encoded frame in one step instead of assigning column by column into
# the select_dtypes() result; the original row index and column order are kept.
obj_data = pd.DataFrame(encoded_columns, index=obj_data.index)

In [None]:
# Put the encoded categorical columns and the remaining columns side by side
# (column-wise concatenation, aligned on the row index).
frames = [obj_data, num_data]
car_dataset = pd.concat(frames, axis='columns')

In [None]:
# Preview the first 5 rows of the combined dataset built by the concat step
car_dataset.head()

In [None]:
# Pairwise correlation between the columns of car_dataset,
# printed as a table and then drawn as an annotated heatmap.
corr = car_dataset.corr()
print(corr)
sns.heatmap(corr, cmap='plasma', annot=True)

In [None]:
# Separate the target (Selling_Price) from the features.
# Car_Name is dropped from the features together with the target column.
y = car_dataset['Selling_Price']
x = car_dataset.drop(columns=['Car_Name', 'Selling_Price'])

In [None]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=2,
)

In [None]:
# Create an (unfitted) linear regression model with the estimator's default settings
model = LinearRegression()

In [None]:
# Fit the model on the training features (x_train) and training target (y_train)
model.fit(x_train, y_train)

Evaluation of the model

In [None]:
# Predict the target for the training features with the current `model`
train_data_pred = model.predict(x_train)

In [None]:
# Coefficient of determination (R^2) of the predictions on the training data.
# R^2 is a goodness-of-fit score (1.0 is a perfect fit), not an error measure
# and not a classification accuracy, so it is labelled as a score.
train_score = metrics.r2_score(y_train, train_data_pred)
print("R squared score for train data: ", train_score)

In [None]:
# Scatter plot of predicted price against actual price for the training data
ax = plt.axes()
ax.set_title('Actual price vs. Predicted price')
ax.set_xlabel('Actual price')
ax.set_ylabel('Predicted price')
plt.scatter(x=y_train, y=train_data_pred, s=50, color='blue', marker='X')
plt.show()

In [None]:
# Predict the target for the held-out testing features with the current `model`
test_data_pred = model.predict(x_test)

In [None]:
# Coefficient of determination (R^2) of the predictions on the testing data.
# R^2 is a goodness-of-fit score (1.0 is a perfect fit), not an error measure
# and not a classification accuracy, so it is labelled as a score.
test_score = metrics.r2_score(y_test, test_data_pred)
print("R squared score for test data: ", test_score)

In [None]:
# Scatter plot of predicted price against actual price for the testing data
ax = plt.axes()
ax.set_title('Actual price vs. Predicted price')
ax.set_xlabel('Actual price')
ax.set_ylabel('Predicted price')
plt.scatter(x=y_test, y=test_data_pred, s=50, color='blue', marker='X')
plt.show()

**Loading another model: XGBRegressor**

In [None]:
# Create an (unfitted) XGBRegressor with the estimator's default settings.
# NOTE: this rebinds the name `model`; any estimator previously bound to it is
# no longer reachable through that name, and every later use of `model`
# refers to this XGBRegressor.
model = XGBRegressor()

In [None]:
# Fit the current `model` on the training features (x_train) and training target (y_train)
model.fit(x_train, y_train)

Evaluation of the model

In [None]:
# Predict the target for the training features with the current `model`;
# this overwrites any earlier value of train_data_pred.
train_data_pred = model.predict(x_train)

In [None]:
# Coefficient of determination (R^2) of the predictions on the training data.
# R^2 is a goodness-of-fit score (1.0 is a perfect fit), not an error measure
# and not a classification accuracy, so it is labelled as a score.
train_score = metrics.r2_score(y_train, train_data_pred)
print("R squared score for train data: ", train_score)

In [None]:
# Scatter plot of predicted price against actual price for the training data
ax = plt.axes()
ax.set_title('Actual price vs. Predicted price')
ax.set_xlabel('Actual price')
ax.set_ylabel('Predicted price')
plt.scatter(x=y_train, y=train_data_pred, s=50, color='blue', marker='X')
plt.show()

In [None]:
# Predict the target for the held-out testing features with the current `model`;
# this overwrites any earlier value of test_data_pred.
test_data_pred = model.predict(x_test)

In [None]:
# Coefficient of determination (R^2) of the predictions on the testing data.
# R^2 is a goodness-of-fit score (1.0 is a perfect fit), not an error measure
# and not a classification accuracy, so it is labelled as a score.
test_score = metrics.r2_score(y_test, test_data_pred)
print("R squared score for test data: ", test_score)

In [None]:
# Visualizing the results
# Visualize the actual price and the predicted price on testing data
ax = plt.axes()
ax.set(
       title='Actual price vs. Predicted price',
       xlabel='Actual price',
       ylabel='Predicted price'
       )
# One colour value per plotted point. The count is taken from y_test instead of
# being hard-coded, because scatter() rejects a `c` array whose length differs
# from the number of points. The colours are purely decorative; a fixed seed
# keeps the figure identical from run to run.
N = len(y_test)
colors = np.random.default_rng(2).random(N)
plt.scatter(y_test, test_data_pred, marker='X', c=colors, s=50)
plt.show()