# Loading Data


In [1]:

import pandas as pd

################################# Load the dataset #######################################
file_path = 'data.csv'
data = pd.read_csv(file_path)

################################## Displaying the first few rows of the dataser######################
data.head()

Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Market Category,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,"Factory Tuner,Luxury,High-Performance",Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,High-Performance",Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,"Luxury,Performance",Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Luxury,Compact,Convertible,28,18,3916,34500


# Examining Data Types

In [2]:

data_types = data.dtypes
print(data_types)




Make                  object
Model                 object
Year                   int64
Engine Fuel Type      object
Engine HP            float64
Engine Cylinders     float64
Transmission Type     object
Driven_Wheels         object
Number of Doors      float64
Market Category       object
Vehicle Size          object
Vehicle Style         object
highway MPG            int64
city mpg               int64
Popularity             int64
MSRP                   int64
dtype: object


# Checking Null And Missing Values

In [3]:

missing_values = data.isnull().sum()
print(missing_values)


Make                    0
Model                   0
Year                    0
Engine Fuel Type        3
Engine HP              69
Engine Cylinders       30
Transmission Type       0
Driven_Wheels           0
Number of Doors         6
Market Category      3742
Vehicle Size            0
Vehicle Style           0
highway MPG             0
city mpg                0
Popularity              0
MSRP                    0
dtype: int64


# Dealing With NULL And Missing values


In [4]:
# Check if the "Market Category" column exists before trying to delete it
if "Market Category" in data.columns:
    data = data.drop(columns=["Market Category"])

# 2. Impute missing values in "Engine HP" and "Engine Cylinders" columns with their mean
data["Engine HP"].fillna(data["Engine HP"].mean(), inplace=True)
data["Engine Cylinders"].fillna(data["Engine Cylinders"].mean(), inplace=True)

# 3. Delete rows where "Engine Fuel Type" has missing values
data = data.dropna(subset=["Engine Fuel Type"])

# Display the first few rows to check changes
data.head()


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Engine HP"].fillna(data["Engine HP"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data["Engine Cylinders"].fillna(data["Engine Cylinders"].mean(), inplace=True)


Unnamed: 0,Make,Model,Year,Engine Fuel Type,Engine HP,Engine Cylinders,Transmission Type,Driven_Wheels,Number of Doors,Vehicle Size,Vehicle Style,highway MPG,city mpg,Popularity,MSRP
0,BMW,1 Series M,2011,premium unleaded (required),335.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Coupe,26,19,3916,46135
1,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Convertible,28,19,3916,40650
2,BMW,1 Series,2011,premium unleaded (required),300.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Coupe,28,20,3916,36350
3,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Coupe,28,18,3916,29450
4,BMW,1 Series,2011,premium unleaded (required),230.0,6.0,MANUAL,rear wheel drive,2.0,Compact,Convertible,28,18,3916,34500


# After we Dealt with Missing Values

In [5]:

missing_values_after = data.isnull().sum()
print(missing_values_after)


Make                 0
Model                0
Year                 0
Engine Fuel Type     0
Engine HP            0
Engine Cylinders     0
Transmission Type    0
Driven_Wheels        0
Number of Doors      6
Vehicle Size         0
Vehicle Style        0
highway MPG          0
city mpg             0
Popularity           0
MSRP                 0
dtype: int64


# Encoding

In [6]:

transmission_mapping = {
    'MANUAL': 0,
    'AUTOMATIC': 1,
    'AUTOMATED_MANUAL': 2,
    'DIRECT_DRIVE': 3,
    'UNKNOWN': 4
}

driven_wheels_mapping = {
    'rear wheel drive': 0,
    'front wheel drive': 1,
    'all wheel drive': 2,
    'four wheel drive': 3
}


data['Transmission Type'] = data['Transmission Type'].map(transmission_mapping).fillna(-1).astype(int)
data['Driven_Wheels'] = data['Driven_Wheels'].map(driven_wheels_mapping).fillna(-1).astype(int)




## Installing Library for visualization

In [7]:
!pip install plotly




# Finding Correlation of dependent variable with independent variable (Finding Patterns)

In [8]:
import plotly.express as px



correlation_matrix = data[['Engine HP', 'Engine Cylinders','Transmission Type', 'Driven_Wheels','city mpg','highway MPG','MSRP']].corr()

##################################### Getting correlations with 'MSRP' (excluding MSRP's correlation with itself)##############################
target_corr = correlation_matrix['MSRP'].sort_values(ascending=False)

################################## Displaying the correlations #####################################
print("Correlations with MSRP:\n", target_corr)

######################################## Bar chart ###########################################
fig = px.bar(target_corr[1:],  # Exclude the MSRP-MSRP correlation (first element)
             x=target_corr.index[1:],
             y=target_corr.values[1:],
             labels={'x': 'Features', 'y': 'Correlation with MSRP'},
             title='Correlation of Engine HP and Engine Cylinders with MSRP')

# Show the plot
fig.show()


Correlations with MSRP:
 MSRP                 1.000000
Engine HP            0.661638
Engine Cylinders     0.531303
Transmission Type    0.193734
Driven_Wheels       -0.030462
city mpg            -0.157708
highway MPG         -0.160063
Name: MSRP, dtype: float64


## Heat map

In [9]:
import plotly.express as px

correlation_matrix = data[['Engine HP', 'Engine Cylinders', 'Transmission Type', 'Driven_Wheels', 'city mpg', 'highway MPG', 'MSRP']].corr()

############################ Generating the heatmap #####################################
fig = px.imshow(correlation_matrix,
                labels=dict(x="Features", y="Features", color="Correlation"),
                title="Correlation Heatmap of Features with MSRP",
                color_continuous_scale="Viridis")

# Show the plot
fig.show()


# Dimension Reduction

In [10]:
from sklearn.decomposition import PCA

# Assuming 'data' is your DataFrame

# Select only the columns you want
df_selected = data[['Engine HP', 'Engine Cylinders','city mpg','highway MPG','MSRP']]

# Display the new DataFrame
print(df_selected.head())

   Engine HP  Engine Cylinders  city mpg  highway MPG   MSRP
0      335.0               6.0        19           26  46135
1      300.0               6.0        19           28  40650
2      300.0               6.0        20           28  36350
3      230.0               6.0        18           28  29450
4      230.0               6.0        18           28  34500


## XGB Model Training

In [12]:
%pip install xgboost
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import pandas as pd

Collecting xgboostNote: you may need to restart the kernel to use updated packages.

  Downloading xgboost-2.1.4-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB 259.2 kB/s eta 0:08:02
   ---------------------------------------- 0.0/124.9 MB 279.3 kB/s eta 0:07:28
   ---------------------------------------- 0.1/124.9 MB 521.8 kB/s eta 0:04:00
   ---------------------------------------- 0.1/124.9 MB 708.1 kB/s eta 0:02:57
   ---------------------------------------- 0.1/124.9 MB 708.1 kB/s eta 0:02:57
   ---------------------------------------- 0.1/124.9 MB 708.1 kB/s eta 0:02:57
   ---------------------------------------- 0.3/124.9 MB 778.1 kB/s eta 0:02:41
   ---------------------------------------- 0.3/124.9 MB 778.1 kB/s eta 0:02:41
   -

## Selecting Dependent and Independent Variables

In [13]:

df_selected = data[['Engine HP', 'Engine Cylinders', 'MSRP']]
X = df_selected[['Engine HP', 'Engine Cylinders']]  # Independent Variable
y = df_selected['MSRP']  # Dependent Variable

## Splitting data into training and testing

In [14]:
################################# Spliting the data into training and testing sets (80% train, 20% test)   #############################
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

################################## Initialize the XGB Regressor model #######################################
model = xgb.XGBRegressor(objective='reg:squarederror',
                         n_estimators=100,
                         max_depth=3,
                         learning_rate=0.1,
                         random_state=42)



## Training model

In [15]:
################################## Training the model on the training data ###########################################
model.fit(X_train, y_train)

################################# Making predictions on the test data ##################################
y_pred = model.predict(X_test)

## Evaluating model

In [16]:
#####################################  MSE, MAE, R2 and RMSE ###############################
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Display the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R²): {r2}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Squared Error (MSE): 530149797.08417964
Mean Absolute Error (MAE): 10993.930320185871
R-squared (R²): 0.8324112865119683
Root Mean Squared Error (RMSE): 23024.98202136496


## Visualizing Regression line

In [17]:
########################################## Visualizing Actual vs Predicted values using Plotly ######################################
fig = go.Figure()


fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers', name='Predicted vs Actual',
                         marker=dict(color='blue', size=8)))


fig.add_trace(go.Scatter(x=[min(y_test), max(y_test)], y=[min(y_test), max(y_test)],
                         mode='lines', name='Perfect Prediction Line', line=dict(color='red', dash='dash')))

fig.update_layout(
    title="Actual vs Predicted MSRP using XGBoost",
    xaxis_title="Actual MSRP",
    yaxis_title="Predicted MSRP",
    showlegend=True
)

fig.show()

## Random Forest

In [18]:
from sklearn.ensemble import RandomForestRegressor


## Selecting Features

In [19]:
df_selected = data[['Engine HP', 'Engine Cylinders', 'MSRP']]
X = df_selected[['Engine HP', 'Engine Cylinders']]  # Independent Variables
y = df_selected['MSRP']  # Dependent Variable

################################ Split the data into training and testing sets (80% train, 20% test) #################################
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ##############################Initializing the RandomForestRegressor model  ########################################

rf_model = RandomForestRegressor(n_estimators=100,  # Number of trees
                                 max_depth=5,  # Maximum depth of trees
                                 random_state=42)


## Splitting Data

In [20]:
############################################ Training the model on the training data ################################
rf_model.fit(X_train, y_train)

############################################ Making predictions on the test data ####################################
y_pred = rf_model.predict(X_test)


## Evaluation Matrices

In [21]:
############################# MSE,MAE,R2,RMSE ###################################
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Display the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R²): {r2}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Squared Error (MSE): 488820250.9719469
Mean Absolute Error (MAE): 11758.442660051327
R-squared (R²): 0.8454762079739557
Root Mean Squared Error (RMSE): 22109.27974792365


## Visualizing Regression Line

In [22]:
################################## Visualizing Actual vs Predicted values using Plotly ########################################
fig = go.Figure()


fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers', name='Predicted vs Actual',
                         marker=dict(color='blue', size=8)))


fig.add_trace(go.Scatter(x=[min(y_test), max(y_test)], y=[min(y_test), max(y_test)],
                         mode='lines', name='Perfect Prediction Line', line=dict(color='red', dash='dash')))


fig.update_layout(
    title="Actual vs Predicted MSRP using Random Forest",
    xaxis_title="Actual MSRP",
    yaxis_title="Predicted MSRP",
    showlegend=True
)

fig.show()

## Gradient Boosting Model

In [23]:
from sklearn.ensemble import GradientBoostingRegressor


## Selecting Features

In [24]:

df_selected = data[['Engine HP', 'Engine Cylinders', 'MSRP']]
X = df_selected[['Engine HP', 'Engine Cylinders']] # independent variable
y = df_selected['MSRP']  #  dependent variable

################################## Spliting data in i.e (80% -> training  and 20% -> testing ) ###################################

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

################################## Initializing the GradientBoostingRegressor model ##################################

gb_model = GradientBoostingRegressor(n_estimators=100,
                                     max_depth=3,
                                     learning_rate=0.1,
                                     random_state=42)



## Training Model

In [25]:
#################################### Training the model on the training data ############################
gb_model.fit(X_train, y_train)

################################### Making predictions on the test data ##################################
y_pred = gb_model.predict(X_test)

## Evaluation Matrices

In [26]:
########################   Calculating MSE, MAE, r2 and RMSE ###############################
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

# Display the evaluation metrics
print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R-squared (R²): {r2}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Mean Squared Error (MSE): 486371715.70793426
Mean Absolute Error (MAE): 10675.979027043539
R-squared (R²): 0.8462502285943216
Root Mean Squared Error (RMSE): 22053.836757080033


## Visualizing Regression Line

In [27]:
################################## Visualizing Actual vs Predicted values using Plotly ########################################

fig = go.Figure()

fig.add_trace(go.Scatter(x=y_test, y=y_pred, mode='markers', name='Predicted vs Actual',
                         marker=dict(color='blue', size=8)))


fig.add_trace(go.Scatter(x=[min(y_test), max(y_test)], y=[min(y_test), max(y_test)],
                         mode='lines', name='Perfect Prediction Line', line=dict(color='red', dash='dash')))


fig.update_layout(
    title="Actual vs Predicted MSRP using Gradient Boosting",
    xaxis_title="Actual MSRP",
    yaxis_title="Predicted MSRP",
    showlegend=True
)

fig.show()

## Now saving the best Model i.e (Grading Boosting Model)

In [28]:

import joblib

# Save the trained model to a file
joblib.dump(gb_model, 'gradient_boosting_model.pkl')
print("Model saved successfully!")


Model saved successfully!
