# MODEL SELECTION MACHINE LEARNING PROJECT- CAR SELLING PRICE

# Task 1: Data Preprocessing

In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('https://raw.githubusercontent.com/ankitsaini-alps/Car-data/main/car%20data.csv')

# Check for missing values
missing_values = data.isnull().sum()

# Handle missing values (if any)
# Example: If there are missing values in any column, you can choose to drop or impute them.

# Categorical encoding (One-Hot Encoding for categorical variables)
data1 = pd.get_dummies(data, columns=['Fuel_Type', 'Seller_Type', 'Transmission'])

# Feature Scaling (if necessary)
# Example: You can use Min-Max scaling or Standardization.

# Display the preprocessed data
print(data1.head())

print("Orignal Data")
print(data.head())


  Car_Name  Year  Selling_Price  Present_Price  Kms_Driven  Owner  \
0     ritz  2014           3.35           5.59       27000      0   
1      sx4  2013           4.75           9.54       43000      0   
2     ciaz  2017           7.25           9.85        6900      0   
3  wagon r  2011           2.85           4.15        5200      0   
4    swift  2014           4.60           6.87       42450      0   

   Fuel_Type_CNG  Fuel_Type_Diesel  Fuel_Type_Petrol  Seller_Type_Dealer  \
0              0                 0                 1                   1   
1              0                 1                 0                   1   
2              0                 0                 1                   1   
3              0                 0                 1                   1   
4              0                 1                 0                   1   

   Seller_Type_Individual  Transmission_Automatic  Transmission_Manual  
0                       0                       0      

In [2]:
print (data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 301 entries, 0 to 300
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Car_Name       301 non-null    object 
 1   Year           301 non-null    int64  
 2   Selling_Price  301 non-null    float64
 3   Present_Price  301 non-null    float64
 4   Kms_Driven     301 non-null    int64  
 5   Fuel_Type      301 non-null    object 
 6   Seller_Type    301 non-null    object 
 7   Transmission   301 non-null    object 
 8   Owner          301 non-null    int64  
dtypes: float64(2), int64(3), object(4)
memory usage: 21.3+ KB
None


# Task 2: Feature Selection and Engineering

In [3]:
# Consider domain knowledge or conduct feature analysis to determine relevant features.
# For this example, let's assume 'Year', 'Present_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type', and 'Transmission' are relevant.

selected_features = ['Year', 'Selling_Price', 'Kms_Driven', 'Fuel_Type', 'Seller_Type', 'Transmission']

# Create a new DataFrame with selected features
data_selected = data[selected_features].copy()

# Perform feature engineering (if needed)
# Example: Create a new feature 'Age' by subtracting the 'Year' from the current year (2023).
current_year = 2023
data_selected['Age'] = current_year - data['Year']


# Display the updated dataset with selected features and engineered features
print(data_selected.head())


   Year  Selling_Price  Kms_Driven Fuel_Type Seller_Type Transmission  Age
0  2014           3.35       27000    Petrol      Dealer       Manual    9
1  2013           4.75       43000    Diesel      Dealer       Manual   10
2  2017           7.25        6900    Petrol      Dealer       Manual    6
3  2011           2.85        5200    Petrol      Dealer       Manual   12
4  2014           4.60       42450    Diesel      Dealer       Manual    9


# Task 3: Algorithms

# Random Forest Regressor:
Suitability: Random Forest is an ensemble learning method that is robust and can handle both numerical and categorical features well. It's suitable for regression tasks like car price prediction due to its ability to capture complex relationships in the data and handle feature importance.

In [4]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Read the dataset
data = pd.read_csv('car data.csv')

# Encode categorical variables using one-hot encoding
data_encoded = pd.get_dummies(data, columns=['Fuel_Type', 'Seller_Type', 'Transmission'], drop_first=True)

# Define the selected features
selected_features = ['Selling_Price', 'Year', 'Present_Price', 'Kms_Driven', 'Fuel_Type_Diesel', 'Fuel_Type_Petrol', 'Seller_Type_Individual', 'Transmission_Manual']

# Create a new DataFrame with selected features
data_selected = data_encoded[selected_features]

# Split the data into features (X) and target (y)
X = data_selected.drop('Selling_Price', axis=1)
y = data_selected['Selling_Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=42)

# Train the model
rf_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_regressor.predict(X_test)

# Evaluate the model
mse_rf = mean_squared_error(y_test, y_pred)
r2_rf = r2_score(y_test, y_pred)

print(f"Random Forest Mean Squared Error: {mse_rf}")
print(f"Random Forest R-squared: {r2_rf}")


Random Forest Mean Squared Error: 0.8092164740983593
Random Forest R-squared: 0.964871024940323


# Mean Squared Error (MSE) of approximately 0.809 indicates how close the predicted values are to the actual values. Lower MSE values are better.
R-squared (R²) of approximately 0.965 represents the proportion of the variance in the dependent variable (Selling_Price) that is predictable from the independent variables. Higher R² values are better.

# Gradient Boosting
is another ensemble method that performs well in regression tasks. It builds multiple decision trees sequentially, each correcting the errors of the previous one. It's suitable when you want high predictive accuracy.

In [5]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into features (X) and target (y)
X = data_selected.drop('Selling_Price', axis=1)
y = data_selected['Selling_Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a Gradient Boosting Regressor
gb_regressor = GradientBoostingRegressor(random_state=42)

# Train the model
gb_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_gb = gb_regressor.predict(X_test)

# Evaluate the model
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f"Gradient Boosting Mean Squared Error: {mse_gb}")
print(f"Gradient Boosting R-squared: {r2_gb}")


Gradient Boosting Mean Squared Error: 0.7152134580183913
Gradient Boosting R-squared: 0.9689517990138945


# Linear Regression:

Linear Regression is a simple and interpretable algorithm. It's suitable for regression tasks when the relationship between features and the target variable is approximately linear.

In [6]:
from sklearn.linear_model import LinearRegression

# Create a Linear Regression model
lr_regressor = LinearRegression()

# Train the model
lr_regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred_lr = lr_regressor.predict(X_test)

# Evaluate the model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print(f"Linear Regression Mean Squared Error: {mse_lr}")
print(f"Linear Regression R-squared: {r2_lr}")


Linear Regression Mean Squared Error: 3.3602215525053554
Linear Regression R-squared: 0.8541290953765206


# Based on these metrics, it appears that the Gradient Boosting model is the best-performing among the three algorithms 

# Task 4: Training, Evaluation & Selection

In [7]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into features (X) and target (y)
X = data_selected.drop('Selling_Price', axis=1)
y = data_selected['Selling_Price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize models
rf_regressor = RandomForestRegressor(random_state=42)
gb_regressor = GradientBoostingRegressor(random_state=42)
lr_regressor = LinearRegression()

# Train the Random Forest model
rf_regressor.fit(X_train, y_train)
y_pred_rf = rf_regressor.predict(X_test)

# Evaluate the Random Forest model
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print("Random Forest:")
print(f"Mean Squared Error: {mse_rf}")
print(f"R-squared: {r2_rf}")
print("")

# Train the Gradient Boosting model
gb_regressor.fit(X_train, y_train)
y_pred_gb = gb_regressor.predict(X_test)

# Evaluate the Gradient Boosting model
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print("Gradient Boosting:")
print(f"Mean Squared Error: {mse_gb}")
print(f"R-squared: {r2_gb}")
print("")

# Train the Linear Regression model
lr_regressor.fit(X_train, y_train)
y_pred_lr = lr_regressor.predict(X_test)

# Evaluate the Linear Regression model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression:")
print(f"Mean Squared Error: {mse_lr}")
print(f"R-squared: {r2_lr}")
print("")

# Select the best-performing model based on R-squared
best_model = None
best_r2 = -1

if r2_rf > best_r2:
    best_model = rf_regressor
    best_r2 = r2_rf

if r2_gb > best_r2:
    best_model = gb_regressor
    best_r2 = r2_gb

if r2_lr > best_r2:
    best_model = lr_regressor
    best_r2 = r2_lr

print("Best Model:")
print(best_model)


Random Forest:
Mean Squared Error: 0.8092164740983593
R-squared: 0.964871024940323

Gradient Boosting:
Mean Squared Error: 0.7152134580183913
R-squared: 0.9689517990138945

Linear Regression:
Mean Squared Error: 3.3602215525053554
R-squared: 0.8541290953765206

Best Model:
GradientBoostingRegressor(random_state=42)
