In [None]:
# Mount Google Drive at /content/drive; the dataset and the saved model used
# by later cells are read from and written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
import pandas as pd
from sklearn.model_selection import RandomizedSearchCV

In [None]:
import pandas as pd

# Read the CarDekho listings CSV stored on the mounted Google Drive
file_path = '/content/drive/MyDrive/Car/cardekho_dataset.csv'
df = pd.read_csv(file_path)

# Reference year used to turn a vehicle's age into its year of manufacture
current_year = 2023

# Cleaning chain: discard the unused columns, derive year_of_manufacture from
# vehicle_age, then remove vehicle_age itself
df_cleaned = (
    df.drop(columns=["Unnamed: 0", "car_name"])
      .assign(year_of_manufacture=lambda frame: current_year - frame['vehicle_age'])
      .drop(columns=['vehicle_age'])
)

# Keep only rows whose model appears more than 300 times in the cleaned data
model_counts = df_cleaned['model'].value_counts()
popular_models = model_counts.loc[model_counts.gt(300)].index
df_filtered = df_cleaned.loc[df_cleaned['model'].isin(popular_models)]

# Target (y) is the selling price; features (X) are everything else except
# seller_type, which is excluded from the model inputs
y = df_filtered['selling_price']
X = df_filtered.drop(columns=['selling_price', 'seller_type'])

# Text-valued feature columns that must be encoded before modelling
categorical_cols = ['fuel_type', 'transmission_type', 'brand', 'model']



In [None]:
# Preview the first rows of the cleaned frame (this is df_cleaned, i.e. before
# the popular-model filter that produces df_filtered is applied).
df_cleaned.head()



Unnamed: 0,brand,model,km_driven,seller_type,fuel_type,transmission_type,mileage,engine,max_power,seats,selling_price,year_of_manufacture
0,Maruti,Alto,120000,Individual,Petrol,Manual,19.7,796,46.3,5,120000,2014
1,Hyundai,Grand,20000,Individual,Petrol,Manual,18.9,1197,82.0,5,550000,2018
2,Hyundai,i20,60000,Individual,Petrol,Manual,17.0,1197,80.0,5,215000,2012
3,Maruti,Alto,37000,Individual,Petrol,Manual,20.92,998,67.1,5,226000,2014
4,Ford,Ecosport,30000,Dealer,Diesel,Manual,22.77,1498,98.59,5,570000,2017


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encoder for the categorical columns: categories not seen during
# fitting are ignored at transform time, and the output is a dense array
one_hot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Apply the encoder to the categorical columns only; every other column is
# passed through unchanged
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, categorical_cols)],
    remainder='passthrough',
)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# Chain the preprocessing step and a random-forest regressor into one estimator
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42)),
])

# Candidate hyperparameter values; the "model__" prefix routes each setting to
# the pipeline step named 'model'
param_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
)

# Randomized search: 10 sampled candidates, each evaluated with 5-fold
# cross-validation, using all available cores (n_jobs=-1)
random_search = RandomizedSearchCV(
    pipeline,
    param_grid,
    n_iter=10,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Take the best pipeline found by the search and score it on the held-out split
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f'R² Score: {r2}')


Fitting 5 folds for each of 10 candidates, totalling 50 fits
R² Score: 0.9254601911684139


In [None]:
import joblib

# Persist the tuned pipeline (preprocessing + model) on Google Drive so that
# later cells can reload it without retraining
model_path = '/content/drive/MyDrive/Car/best_car_price_model.pkl'
joblib.dump(value=best_model, filename=model_path)

print(f"Model saved to {model_path}")


Model saved to /content/drive/MyDrive/Car/best_car_price_model.pkl


In [None]:
import pandas as pd
import joblib

# Load the complete pipeline (preprocessing + model) from Google Drive.
# NOTE: joblib.load unpickles arbitrary Python objects, so only load files
# that you produced yourself or otherwise trust.
model_path = '/content/drive/MyDrive/Car/best_car_price_model.pkl'
loaded_pipeline = joblib.load(model_path)

# New sample test data with brand and model first
test_data = [
    ('Hyundai', 'i20', 2021, 10000, 'Petrol', 'Manual', 21.0, 1197, 83.0, 5),     # Hyundai i20, 2021
    ('BMW', 'X5', 2020, 5000, 'Diesel', 'Automatic', 15.0, 2993, 265.0, 5),      # BMW X5, 2020
    ('Toyota', 'Fortuner', 2019, 12000, 'Diesel', 'Manual', 14.0, 2755, 177.0, 7),# Toyota Fortuner, 2019
    ('Volkswagen', 'Polo', 2022, 8000, 'Petrol', 'Manual', 18.0, 999, 75.0, 5),   # VW Polo, 2022
    ('Kia', 'Seltos', 2023, 500, 'Petrol', 'Automatic', 16.5, 1497, 115.0, 5)     # Kia Seltos, 2023
]

# Function to test the model
def test_model(test_data):
    """Print the predicted selling price for each sample car.

    Parameters
    ----------
    test_data : iterable of 10-item sequences
        Each record is ordered as (brand, model, year_of_manufacture,
        km_driven, fuel_type, transmission_type, mileage, engine, max_power,
        seats).

    Notes
    -----
    All records are placed in a single DataFrame and sent through
    ``loaded_pipeline`` in one ``predict`` call instead of one call per
    record. The frame's columns are labelled by name, which is how the
    pipeline's preprocessing step was configured to pick its categorical
    columns.
    """
    print("Testing Model Predictions:")

    records = list(test_data)
    if not records:
        # Nothing to predict; avoid handing an empty frame to the pipeline.
        return

    field_names = ['brand', 'model', 'year_of_manufacture', 'km_driven', 'fuel_type',
                   'transmission_type', 'mileage', 'engine', 'max_power', 'seats']
    categorical_fields = ['fuel_type', 'transmission_type', 'brand', 'model']

    # One row per record, columns labelled with the feature names
    input_data = pd.DataFrame(records, columns=field_names)

    # Ensure categorical columns are treated as strings
    input_data[categorical_fields] = input_data[categorical_fields].astype(str)

    # Predict every row at once using the complete pipeline
    predicted_prices = loaded_pipeline.predict(input_data)

    # Output the results in the same order as the input records
    for record, predicted_price in zip(records, predicted_prices):
        brand, model = record[0], record[1]
        print(f"Predicted Price for {brand} {model}: {predicted_price:.2f}")

# Run the test
test_model(test_data)


Testing Model Predictions:
Predicted Price for Hyundai i20: 656002.24
Predicted Price for BMW X5: 1644229.10
Predicted Price for Toyota Fortuner: 1806681.42
Predicted Price for Volkswagen Polo: 695432.80
Predicted Price for Kia Seltos: 972177.24


In [None]:
import shap
import pandas as pd
import joblib

# Load the model pipeline.
# NOTE: joblib.load unpickles arbitrary Python objects, so only load files
# that you produced yourself or otherwise trust.
model_pipeline = joblib.load('/content/drive/MyDrive/Car/best_car_price_model.pkl')

# New sample test data with different cars and attributes (brand and model first)
test_data = [
    ('Hyundai', 'i20', 2021, 10000, 'Petrol', 'Manual', 21.0, 1197, 83.0, 5),     # Hyundai i20, 2021
    ('BMW', 'X5', 2020, 5000, 'Diesel', 'Automatic', 15.0, 2993, 265.0, 5),      # BMW X5, 2020
    ('Toyota', 'Fortuner', 2019, 12000, 'Diesel', 'Manual', 14.0, 2755, 177.0, 7),# Toyota Fortuner, 2019
    ('Volkswagen', 'Polo', 2022, 8000, 'Petrol', 'Manual', 18.0, 999, 75.0, 5),   # VW Polo, 2022
    ('Kia', 'Seltos', 2023, 500, 'Petrol', 'Automatic', 16.5, 1497, 115.0, 5)     # Kia Seltos, 2023
]

# Function to predict price, find highest contributing attribute and percentage
def test_model(test_data):
    """Print each sample car's predicted price and its top SHAP feature.

    Parameters
    ----------
    test_data : iterable of 10-item sequences
        Each record is ordered as (brand, model, year_of_manufacture,
        km_driven, fuel_type, transmission_type, mileage, engine, max_power,
        seats).

    Notes
    -----
    * This definition replaces the earlier ``test_model`` in the notebook
      namespace, since both cells use the same function name.
    * The "highest contributing feature" is the one with the largest
      *positive* SHAP value (``idxmax`` on the raw values), not the largest
      magnitude; features that push the price down are never selected.
    * The reported percentage is that SHAP value divided by the predicted
      price.
    """
    print("Testing Model Predictions with SHAP breakdown:")

    preprocessor_step = model_pipeline.named_steps['preprocessor']
    model_step = model_pipeline.named_steps['model']

    # The explainer and the transformed feature names depend only on the
    # fitted pipeline, so build them once instead of once per record.
    explainer = shap.Explainer(model_step)
    feature_names = preprocessor_step.get_feature_names_out()

    categorical_fields = ['fuel_type', 'transmission_type', 'brand', 'model']

    for data in test_data:
        brand, model, year_of_manufacture, km_driven, fuel_type, transmission_type, mileage, engine, max_power, seats = data

        # Create a DataFrame with brand and model first, followed by other features
        input_data = pd.DataFrame([[brand, model, year_of_manufacture, km_driven, fuel_type, transmission_type, mileage, engine, max_power, seats]],
                                  columns=['brand', 'model', 'year_of_manufacture', 'km_driven', 'fuel_type', 'transmission_type', 'mileage', 'engine', 'max_power', 'seats'])

        # Ensure categorical columns are treated as strings
        input_data[categorical_fields] = input_data[categorical_fields].astype(str)

        # Transform input using the preprocessing pipeline
        input_transformed = preprocessor_step.transform(input_data)

        # Predict the price using the model
        predicted_price = model_step.predict(input_transformed)[0]

        # Calculate SHAP values for this single transformed row
        shap_values = explainer(input_transformed)

        # Convert SHAP values to DataFrame for easy breakdown
        shap_values_df = pd.DataFrame(shap_values.values, columns=feature_names)

        # Find the highest (most positive) contributing feature
        row_contributions = shap_values_df.iloc[0]
        max_contrib_feature = row_contributions.idxmax()
        max_contrib_value = row_contributions.max()

        # Calculate percentage contribution relative to the predicted price
        contribution_percentage = (max_contrib_value / predicted_price) * 100

        # Output the results
        print(f"\nPredicted Price for {brand} {model}: {predicted_price:.2f}")
        print(f"Highest Contributing Feature: {max_contrib_feature}")
        print(f"Percentage Contribution: {contribution_percentage:.2f}%")
        print("-" * 50)

# Run the test
test_model(test_data)




Testing Model Predictions with SHAP breakdown:

Predicted Price for Hyundai i20: 656002.24
Highest Contributing Feature: remainder__year_of_manufacture
Percentage Contribution: 24.36%
--------------------------------------------------

Predicted Price for BMW X5: 1644229.10
Highest Contributing Feature: remainder__max_power
Percentage Contribution: 44.95%
--------------------------------------------------

Predicted Price for Toyota Fortuner: 1806681.42
Highest Contributing Feature: remainder__max_power
Percentage Contribution: 45.81%
--------------------------------------------------

Predicted Price for Volkswagen Polo: 695432.80
Highest Contributing Feature: remainder__year_of_manufacture
Percentage Contribution: 25.19%
--------------------------------------------------

Predicted Price for Kia Seltos: 972177.24
Highest Contributing Feature: remainder__year_of_manufacture
Percentage Contribution: 23.84%
--------------------------------------------------
