In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# **READING DATASET**

In [None]:
# Load the CarDekho vehicle dataset CSV from the Kaggle input directory.
vehicle_dataset=pd.read_csv("/kaggle/input/vehicle-dataset-from-cardekho/car data.csv")

In [None]:
# (rows, columns) of the dataset as loaded.
vehicle_dataset.shape

In [None]:
# Preview the first rows of the dataset as loaded.
vehicle_dataset.head()

In [None]:
import datetime

# Derive each vehicle's age in years as (current calendar year - Year).
# The timestamp is printed so the run date that Age depends on is visible in the output.
date_time = datetime.datetime.now()
print(date_time)
vehicle_dataset['Age'] = vehicle_dataset['Year'].rsub(date_time.year)

In [None]:
# Preview the data again, now including the Age column added in the previous cell.
vehicle_dataset.head()

In [None]:
# Count how many rows share each distinct Selling_Price value.
class_counts = vehicle_dataset['Selling_Price'].value_counts()

# True only when every distinct value occurs exactly as often as the first entry.
all_classes_equal = class_counts.eq(class_counts.iloc[0]).all()

if not all_classes_equal:
    print("Not all unique classes have an equal number of instances.")
    print("Class Counts:")
    print(class_counts)
else:
    print("All unique classes have an equal number of instances.")

In [None]:
# Bar chart of how often each distinct Selling_Price value occurs.
class_counts = vehicle_dataset['Selling_Price'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(class_counts.index, class_counts.values, color='magenta')
ax.set_xlabel('Unique Classes')
ax.set_ylabel('Number of Instances')
ax.set_title('Class Distribution in the Dataset')
plt.xticks(rotation=45)
plt.show()

# **CHECKING FOR CORRELATED FEATURES USING HEATMAP**

In [None]:
# Keep only the numeric columns whose pairwise correlations are examined next.
vehicle_df = pd.DataFrame(
    vehicle_dataset.loc[:, ['Present_Price', 'Kms_Driven', 'Age', 'Selling_Price']]
)

In [None]:
# Pairwise Pearson correlation matrix of the selected columns, shown as the cell output.
vehicle_corr = vehicle_df.corr(method='pearson')
vehicle_corr

In [None]:
# Render the correlation matrix as a heatmap using the yellow-green-blue colour map.
sns.heatmap(vehicle_corr, cmap = 'YlGnBu')

In [None]:
# (rows, columns) before the missing-value and duplicate checks; includes the added Age column.
vehicle_dataset.shape

In [None]:
# Number of missing values in each column.
vehicle_dataset.isnull().sum()

In [None]:
# Number of rows that exactly repeat an earlier row.
vehicle_dataset.duplicated().sum()

In [None]:
# Remove exact duplicate rows, then renumber the index from 0 so it has no gaps.
vehicle_dataset = (
    vehicle_dataset
    .drop_duplicates()
    .reset_index(drop=True)
)

In [None]:
# Re-count duplicate rows after the drop in the previous cell.
vehicle_dataset.duplicated().sum()

In [None]:
# Missing-value count per object-dtype column.
categorical_missing_counts = (
    vehicle_dataset
    .select_dtypes(include=['object'])
    .isna()
    .sum()
)
print(categorical_missing_counts)

# **ENCODING CATEGORICAL FEATURES**

In [None]:
# Distinct values in Fuel_Type, inspected before it is mapped to integer codes.
vehicle_dataset['Fuel_Type'].unique()

In [None]:
# Distinct values in Seller_Type, inspected before it is mapped to integer codes.
vehicle_dataset['Seller_Type'].unique()

In [None]:
# Distinct values in Transmission, inspected before it is mapped to integer codes.
vehicle_dataset['Transmission'].unique()

In [None]:
# Distinct values in Owner; no mapping is applied to this column in the cells visible here.
vehicle_dataset['Owner'].unique()

In [None]:
# Replace the fuel-type labels with integer codes; any label not listed becomes NaN.
# Not idempotent: re-running this cell maps the already-encoded integers (which are
# not keys of the dict) to NaN, so reload the data before running it again.
vehicle_dataset['Fuel_Type'] = vehicle_dataset['Fuel_Type'].map({
    'CNG': 0,
    'Diesel': 1,
    'Petrol': 2,
})

In [None]:
# Replace the seller-type labels with 0/1 codes; any label not listed becomes NaN.
# Not idempotent: re-running this cell on already-encoded values yields NaN.
vehicle_dataset['Seller_Type'] = vehicle_dataset['Seller_Type'].map({
    'Dealer': 0,
    'Individual': 1,
})

In [None]:
# Replace the transmission labels with 0/1 codes; any label not listed becomes NaN.
# Not idempotent: re-running this cell on already-encoded values yields NaN.
vehicle_dataset['Transmission'] = vehicle_dataset['Transmission'].map({
    'Manual': 0,
    'Automatic': 1,
})

In [None]:
# Preview the data after Fuel_Type, Seller_Type and Transmission were mapped to integer codes.
vehicle_dataset.head()

# **REMOVING SELLING PRICE OUTLIERS**

In [None]:
# Box plot of Selling_Price to spot outliers before filtering.
# The series is passed by keyword: newer seaborn releases interpret the first positional
# argument as `data`, so the positional form is version-dependent.
sns.boxplot(x=vehicle_dataset['Selling_Price'])

In [None]:
# Order every selling price from highest to lowest and print the ten largest.
sorted_price = vehicle_dataset['Selling_Price'].tolist()
sorted_price.sort(reverse=True)
print(sorted_price[0:10])

In [None]:
# Show the rows whose Selling_Price lies in the closed range [33.0, 35.0].
vehicle_dataset[vehicle_dataset['Selling_Price'].between(33.0, 35.0)]

In [None]:
# Drop the rows whose Selling_Price lies in the closed range [33.0, 35.0].
# The complete range test is negated as one unit: `~` binds tighter than `&`, so negating
# only the lower-bound comparison would keep just the rows priced below 33.0 and silently
# discard anything priced above 35.0 as well.
# .copy() makes the result an independent frame, because later cells assign columns on it.
vehicle_dataset = vehicle_dataset[
    ~vehicle_dataset['Selling_Price'].between(33.0, 35.0)
].copy()

In [None]:
# Box plot of Selling_Price again, after the outlier rows were removed.
# The series is passed by keyword: newer seaborn releases interpret the first positional
# argument as `data`, so the positional form is version-dependent.
sns.boxplot(x=vehicle_dataset['Selling_Price'])

# **FEATURE SCALING**

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale three numeric features and the target column (Selling_Price) in place.
# NOTE(review): the scaler is fitted on the whole dataset before the train/test split
# further down, so test-set minima/maxima influence the training data. Consider fitting
# on the training split only.
scaler = MinMaxScaler()
vehicle_dataset[['Age', 'Kms_Driven', 'Selling_Price', 'Present_Price']] = scaler.fit_transform(
    vehicle_dataset.loc[:, ['Age', 'Kms_Driven', 'Selling_Price', 'Present_Price']]
)

In [None]:
# Features: every column except the car name and the target.
# NOTE(review): both Year and Age remain in X, and Age was derived as (constant - Year),
# so the two columns are perfectly collinear. Consider dropping one.
X = vehicle_dataset.drop(columns=["Car_Name", "Selling_Price"])
# Target: the selling price column.
Y = vehicle_dataset.loc[:, "Selling_Price"]

In [None]:
# Plain-text dump of the feature matrix.
print(X)

In [None]:
# Plain-text dump of the target series.
print(Y)

# **TRAIN/TEST SPLIT**

In [None]:
# Hold out 30% of the rows as the test set; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state=42)

# **MODEL 1: DECISION TREE**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision-tree regressor (otherwise default hyper-parameters) on the training split.
# random_state is pinned so the fitted tree, and every score derived from it, is the same
# on each Restart & Run All; 42 matches the seed used for the train/test split.
tree = DecisionTreeRegressor(random_state=42)
tree.fit(X_train, Y_train)

In [None]:
# A scikit-learn regressor's score() returns the coefficient of determination (R squared),
# not a classification accuracy, so the output is labelled accordingly.
print("Training R-squared score of the model is {:.2f}".format(tree.score(X_train, Y_train)))
print("Testing R-squared score of the model is {:.2f}".format(tree.score(X_test, Y_test)))

In [None]:
# R squared of the decision tree on the data it was fitted to.
training_data_predction = tree.predict(X_train)
r_train_score1 = metrics.r2_score(
    y_true=Y_train,
    y_pred=training_data_predction,
)
print("R squared score of Training Set : ", r_train_score1)

In [None]:
# R squared of the decision tree on the held-out test split.
# test_data_predction is reused by the next two metric cells for this model.
test_data_predction = tree.predict(X_test)
r_test_score1 = metrics.r2_score(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("R squared score of Test Set : ", r_test_score1)

In [None]:
# Explained variance on the test split.
# Relies on test_data_predction still holding the decision tree's predictions; that name
# is overwritten by each later model, so run the cells in order.
ex_test_score1 = metrics.explained_variance_score(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("Explained Variance score of Test Set : ", ex_test_score1)

In [None]:
# Mean absolute error on the test split, in the units of the min-max-scaled target.
# Relies on test_data_predction still holding the decision tree's predictions.
mae_test_score1 = metrics.mean_absolute_error(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("Mean Absolute Error score of Test Set : ", mae_test_score1)

# **MODEL 2: LINEAR REGRESSION**

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression with default settings on the training split.
linear = LinearRegression()
linear.fit(X=X_train, y=Y_train)

In [None]:
# A scikit-learn regressor's score() returns the coefficient of determination (R squared),
# not a classification accuracy, so the output is labelled accordingly.
print("Training R-squared score of the model is {:.2f}".format(linear.score(X_train, Y_train)))
print("Testing R-squared score of the model is {:.2f}".format(linear.score(X_test, Y_test)))

In [None]:
# R squared of the linear regression on the data it was fitted to.
training_data_predction = linear.predict(X_train)
r_train_score2 = metrics.r2_score(
    y_true=Y_train,
    y_pred=training_data_predction,
)
print("R squared score of Training Set : ", r_train_score2)

In [None]:
# R squared of the linear regression on the held-out test split.
# test_data_predction is reused by the next two metric cells for this model.
test_data_predction = linear.predict(X_test)
r_test_score2 = metrics.r2_score(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("R squared score of Test Set : ", r_test_score2)

In [None]:
# Explained variance on the test split.
# Relies on test_data_predction still holding the linear regression's predictions.
ex_test_score2 = metrics.explained_variance_score(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("Explained Variance score of Test Set : ", ex_test_score2)

In [None]:
# Mean absolute error on the test split, in the units of the min-max-scaled target.
# Relies on test_data_predction still holding the linear regression's predictions.
mae_test_score2 = metrics.mean_absolute_error(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("Mean Absolute Error score of Test Set : ", mae_test_score2)

# **MODEL 3: GRADIENT BOOSTING REGRESSOR**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient-boosting regressor (otherwise default hyper-parameters) on the training split.
# random_state is pinned so the fitted ensemble, and every score derived from it, is the
# same on each Restart & Run All; 42 matches the seed used for the train/test split.
grad = GradientBoostingRegressor(random_state=42)
grad.fit(X_train,Y_train)

In [None]:
# A scikit-learn regressor's score() returns the coefficient of determination (R squared),
# not a classification accuracy, so the output is labelled accordingly.
print("Training R-squared score of the model is {:.2f}".format(grad.score(X_train, Y_train)))
print("Testing R-squared score of the model is {:.2f}".format(grad.score(X_test, Y_test)))

In [None]:
# R squared of the gradient-boosting model on the data it was fitted to.
training_data_predction = grad.predict(X_train)
r_train_score3 = metrics.r2_score(
    y_true=Y_train,
    y_pred=training_data_predction,
)
print("R squared score of Training Set : ", r_train_score3)

In [None]:
# R squared of the gradient-boosting model on the held-out test split.
# test_data_predction is reused by the next two metric cells for this model.
test_data_predction = grad.predict(X_test)
r_test_score3 = metrics.r2_score(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("R squared score of Test Set : ", r_test_score3)

In [None]:
# Explained variance on the test split.
# Relies on test_data_predction still holding the gradient-boosting model's predictions.
ex_test_score3 = metrics.explained_variance_score(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("Explained Variance score of Test Set : ", ex_test_score3)

In [None]:
# Mean absolute error on the test split, in the units of the min-max-scaled target.
# Relies on test_data_predction still holding the gradient-boosting model's predictions.
mae_test_score3 = metrics.mean_absolute_error(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("Mean Absolute Error score of Test Set : ", mae_test_score3)

# **MODEL 4: RIDGE REGRESSION**

In [None]:
from sklearn.linear_model import Ridge

# Fit a ridge regression with default settings on the training split.
rid = Ridge()
rid.fit(X=X_train, y=Y_train)

In [None]:
# A scikit-learn regressor's score() returns the coefficient of determination (R squared),
# not a classification accuracy, so the output is labelled accordingly.
print("Training R-squared score of the model is {:.2f}".format(rid.score(X_train, Y_train)))
print("Testing R-squared score of the model is {:.2f}".format(rid.score(X_test, Y_test)))

In [None]:
# R squared of the ridge regression on the data it was fitted to.
training_data_predction = rid.predict(X_train)
r_train_score4 = metrics.r2_score(
    y_true=Y_train,
    y_pred=training_data_predction,
)
print("R squared score of Training Set : ", r_train_score4)

In [None]:
# R squared of the ridge regression on the held-out test split.
# test_data_predction is reused by the next two metric cells for this model.
test_data_predction = rid.predict(X_test)
r_test_score4 = metrics.r2_score(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("R squared score of Test Set : ", r_test_score4)

In [None]:
# Explained variance on the test split.
# Relies on test_data_predction still holding the ridge regression's predictions.
ex_test_score4 = metrics.explained_variance_score(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("Explained Variance score of Test Set : ", ex_test_score4)

In [None]:
# Mean absolute error on the test split, in the units of the min-max-scaled target.
# Relies on test_data_predction still holding the ridge regression's predictions.
mae_test_score4 = metrics.mean_absolute_error(
    y_true=Y_test,
    y_pred=test_data_predction,
)
print("Mean Absolute Error score of Test Set : ", mae_test_score4)

# **PERFORMANCE OF EACH MODEL IN BAR CHART**

In [None]:
# Model abbreviations:
#   DT  = DECISION TREE
#   LR  = LINEAR REGRESSION
#   GBR = GRADIENT BOOSTING REGRESSOR
#   RR  = RIDGE REGRESSION

# One row per model with its test-set R squared score.
model_data_1 = pd.DataFrame({
    'Models': ['DT', 'LR', 'GBR', 'RR'],
    "R-squared score": [
        r_test_score1,
        r_test_score2,
        r_test_score3,
        r_test_score4,
    ],
})

In [None]:
# Display the per-model R squared table.
model_data_1

In [None]:
# Bar chart comparing test-set R squared across the four models on a fixed 0-1 axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(
    model_data_1['Models'],
    model_data_1['R-squared score'],
    color=['red', 'green', 'blue', 'yellow'],
)
ax.set_xlabel('Models')
ax.set_ylabel('R-squared Score')
ax.set_title('R-squared Score Comparison for Different Models')
ax.set_ylim(0, 1)
plt.show()

In [None]:
# Model abbreviations:
#   DT  = DECISION TREE
#   LR  = LINEAR REGRESSION
#   GBR = GRADIENT BOOSTING REGRESSOR
#   RR  = RIDGE REGRESSION

# One row per model with its test-set explained variance score.
model_data_2 = pd.DataFrame({
    'Models': ['DT', 'LR', 'GBR', 'RR'],
    "Explained Variance Score": [
        ex_test_score1,
        ex_test_score2,
        ex_test_score3,
        ex_test_score4,
    ],
})

In [None]:
# Display the per-model explained variance table.
model_data_2

In [None]:
# Bar chart comparing test-set explained variance across the four models on a fixed 0-1 axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(
    model_data_2['Models'],
    model_data_2['Explained Variance Score'],
    color=['red', 'green', 'blue', 'yellow'],
)
ax.set_xlabel('Models')
ax.set_ylabel('Explained Variance Score')
ax.set_title('Explained Variance Score Comparison for Different Models')
ax.set_ylim(0, 1)
plt.show()

In [None]:
# Model abbreviations:
#   DT  = DECISION TREE
#   LR  = LINEAR REGRESSION
#   GBR = GRADIENT BOOSTING REGRESSOR
#   RR  = RIDGE REGRESSION

# One row per model with its test-set mean absolute error.
model_data_3 = pd.DataFrame({
    'Models': ['DT', 'LR', 'GBR', 'RR'],
    "Mean Absolute Error score": [
        mae_test_score1,
        mae_test_score2,
        mae_test_score3,
        mae_test_score4,
    ],
})

In [None]:
# Display the per-model mean absolute error table.
model_data_3

In [None]:
# Bar chart comparing test-set mean absolute error across the four models (lower is better).
# The y-axis is anchored at 0 but its upper limit is left to autoscale: MAE is not bounded
# by 1, and because the target was min-max scaled the errors are small, so a fixed
# 0-1 range flattens the bars and hides the differences between models.
plt.figure(figsize=(8, 6))
plt.bar(model_data_3['Models'], model_data_3['Mean Absolute Error score'], color=['red', 'green', 'blue', 'yellow'])
plt.xlabel('Models')
plt.ylabel('Mean Absolute Error score')
plt.title('Mean Absolute Error score Comparison for Different Models')
plt.ylim(bottom=0)
plt.show()