<a href="https://colab.research.google.com/github/ashloshaju24-sys/USER-CENTRIC-PREDICTIVE-MODELING-OF-BMW-CAR-PRICES-WITH-PYTHON-BASED-GUI-INTEGRATION-/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================
# CELL 0: UPLOAD DATA FILE (RUN THIS FIRST)
# ===============================
# Colab-only step: prompts for a file upload and keeps the result in
# `uploaded`. Nothing later in this notebook reads `uploaded`; the dataset is
# loaded from disk by path instead.
from google.colab import files
uploaded = files.upload()

# Make sure the file name is exactly:
# bmw_pricing_challenge.csv
# (the load step reads "/content/bmw_pricing_challenge.csv", so any other
# name will not be found)


# ===============================
# CELL 1: IMPORTS (NO NUMPY DOWNGRADE – COLAB SAFE)
# ===============================
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import math

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

import gradio as gr


# ===============================
# CELL 3: TRY IMPORTING XGBOOST (OPTIONAL)
# ===============================
# XGBoost is an optional extra model. XGB_AVAILABLE records whether the import
# worked so later cells can decide whether to add it to the candidates.
#
# `except Exception` (instead of a bare `except:`) keeps the best-effort
# fallback for any import-time failure while letting KeyboardInterrupt and
# SystemExit propagate, so interrupting the cell is not silently swallowed.
try:
    from xgboost import XGBRegressor
except Exception:
    XGB_AVAILABLE = False
else:
    XGB_AVAILABLE = True


# ===============================
# CELL 4: LOAD DATASET
# ===============================
df = pd.read_csv("/content/bmw_pricing_challenge.csv")

# Parse both date columns; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
for date_column in ('registration_date', 'sold_at'):
    df[date_column] = pd.to_datetime(df[date_column], errors='coerce')

# Car age at sale time, in years. Negative spans (sold before registration)
# are clipped to 0.
time_on_road = df['sold_at'] - df['registration_date']
df['car_age_years'] = (time_on_road.dt.days / 365.25).clip(lower=0)


# ===============================
# CELL 5: FEATURE SELECTION
# ===============================
# Column groups. The numeric and categorical groups get their own
# preprocessing; the feature_1 .. feature_8 columns are used as they are.
numeric_features = ['mileage', 'engine_power', 'car_age_years']
categorical_features = ['fuel', 'paint_color', 'car_type']
flag_features = [f'feature_{i}' for i in range(1, 9)]

# Model input columns, in the order: numeric, categorical, feature flags.
features = numeric_features + categorical_features + flag_features

# Design matrix and regression target.
X = df[features]
y = df['price']


# ===============================
# CELL 6: PREPROCESSING PIPELINE
# ===============================
# Numeric columns: fill missing values with the median, then standardise.
numeric_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fitting are ignored rather
# than raising at prediction time.
categorical_steps = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Columns not listed in either group are passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numeric_features),
        ('cat', categorical_steps, categorical_features),
    ],
    remainder='passthrough',
)


# ===============================
# CELL 7: MODELS
# ===============================
# Imported here because only this cell needs it.
from sklearn.base import clone

# Candidate models, keyed by display name. Every pipeline gets its OWN copy of
# the preprocessing transformer: a Pipeline fits its steps in place, so sharing
# one `preprocessor` instance would mean fitting any pipeline silently refits
# the transformer inside all the others (including an already-selected or
# saved one).
models = {
    "LinearRegression": Pipeline([
        ('prep', clone(preprocessor)),
        ('model', LinearRegression())
    ]),

    "RandomForest": Pipeline([
        ('prep', clone(preprocessor)),
        ('model', RandomForestRegressor(n_estimators=150, random_state=42))
    ])
}

# XGBoost is only added when its import succeeded.
if XGB_AVAILABLE:
    models["XGBoost"] = Pipeline([
        ('prep', clone(preprocessor)),
        ('model', XGBRegressor(n_estimators=200, learning_rate=0.08))
    ])


# ===============================
# CELL 8: TRAIN / TEST SPLIT
# ===============================
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


# ===============================
# CELL 9: TRAIN MODELS & SELECT BEST
# ===============================
# Fit every candidate on the training split and record its mean absolute
# error on the held-out split.
errors = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    errors[name] = mean_absolute_error(y_test, model.predict(X_test))

# The winner is the candidate with the smallest held-out MAE.
best_model_name = min(errors, key=lambda candidate: errors[candidate])
best_pipeline = models[best_model_name]

# Persist the fitted winner (preprocessing + model) next to the notebook.
joblib.dump(best_pipeline, "best_model.pkl")

print("\nBest Model Selected:", best_model_name)


# ===============================
# CELL 10: PREDICTION FUNCTION (WITH GRAPHS)
# ===============================
def _require_known_category(column, value):
    """Return the sorted known values of ``df[column]``; reject an unknown ``value``.

    Raises ``gr.Error`` (shown to the user by Gradio) when ``value`` is None or
    not present in the data, instead of failing later with an opaque
    ValueError/KeyError while plotting.
    """
    known_values = sorted(df[column].dropna().unique())
    if value not in known_values:
        raise gr.Error(f"Please choose a valid {column} (got {value!r}).")
    return known_values


def _save_scatter_graph(column, x_value, prediction, sample_size, title, path):
    """Save a scatter of ``column`` vs price with the prediction marked in red."""
    # Cap the sample size: DataFrame.sample raises if asked for more rows than
    # the frame holds.
    background = df.sample(min(sample_size, len(df)))
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(data=background, x=column, y='price', alpha=0.4, ax=ax)
    ax.scatter(x_value, prediction, color='red', s=150)
    ax.text(x_value, prediction, f"₹{prediction:,.0f}", color='red')
    ax.set_title(title)
    fig.savefig(path)
    plt.close(fig)
    return path


def _save_fuel_boxplot(fuel, fuel_order, prediction, title, path):
    """Save a price-by-fuel boxplot with the prediction marked on the chosen fuel."""
    fig, ax = plt.subplots(figsize=(6, 4))
    # Pin the box order explicitly. The marker position below is an index into
    # `fuel_order`; without `order=` the boxes follow the order in which fuels
    # appear in the data, so the marker could land on the wrong box.
    sns.boxplot(data=df, x='fuel', y='price', order=fuel_order, ax=ax)
    fuel_position = fuel_order.index(fuel)
    ax.scatter(fuel_position, prediction, s=150, color='red')
    ax.text(fuel_position, prediction, f"₹{prediction:,.0f}", color='red')
    ax.set_title(title)
    fig.savefig(path)
    plt.close(fig)
    return path


def _save_category_bar_graph(category_means, selected, title, path):
    """Save a bar chart of mean price per category with ``selected`` marked in red."""
    fig, ax = plt.subplots(figsize=(7, 4))
    category_means.plot(kind='bar', ax=ax)
    # Bars are drawn in index order, so the bar's position is its index rank.
    bar_position = list(category_means.index).index(selected)
    ax.scatter(bar_position, category_means[selected], s=150, color='red')
    ax.set_title(title)
    fig.savefig(path)
    plt.close(fig)
    return path


def predict_price(
    mileage, engine_power, car_age_years,
    fuel, paint_color, car_type,
    feature_1, feature_2, feature_3, feature_4,
    feature_5, feature_6, feature_7, feature_8
):
    """Predict a car's price and render six comparison graphs.

    Args:
        mileage, engine_power, car_age_years: numeric inputs for the model.
        fuel, paint_color, car_type: categorical inputs; each must be a value
            present in the corresponding column of ``df``.
        feature_1 .. feature_8: values for the ``feature_N`` model columns.

    Returns:
        A tuple: the summary text followed by the six image paths
        ("g1.png" .. "g6.png") in graph order.

    Raises:
        gr.Error: if ``fuel``, ``paint_color`` or ``car_type`` is missing or
            not found in the data.

    NOTE(review): the graphs are written to fixed file names in the working
    directory, so overlapping requests would overwrite each other's images.
    """
    # Validate up front so bad input fails with a readable message rather
    # than midway through plotting.
    fuel_order = _require_known_category('fuel', fuel)
    _require_known_category('paint_color', paint_color)
    _require_known_category('car_type', car_type)

    row = pd.DataFrame([{
        'mileage': mileage,
        'engine_power': engine_power,
        'car_age_years': car_age_years,
        'fuel': fuel,
        'paint_color': paint_color,
        'car_type': car_type,
        'feature_1': feature_1,
        'feature_2': feature_2,
        'feature_3': feature_3,
        'feature_4': feature_4,
        'feature_5': feature_5,
        'feature_6': feature_6,
        'feature_7': feature_7,
        'feature_8': feature_8
    }])

    prediction = best_pipeline.predict(row)[0]

    # Reference statistics from cars similar to the request. A window that
    # matches no rows yields NaN, which is rendered as "nan".
    mean_mileage = df[df['mileage'].between(mileage-10000, mileage+10000)]['price'].mean()
    median_fuel = df[df['fuel'] == fuel]['price'].median()
    mean_engine = df[df['engine_power'].between(engine_power-20, engine_power+20)]['price'].mean()
    mean_age = df[df['car_age_years'].between(car_age_years-1, car_age_years+1)]['price'].mean()
    paint_mean = df.groupby('paint_color')['price'].mean().sort_values()
    ct_mean = df.groupby('car_type')['price'].mean().sort_values()

    images = [
        # Graph 1: Mileage vs Price
        _save_scatter_graph(
            'mileage', mileage, prediction, 700,
            f"Mileage vs Price | Similar Avg: ₹{mean_mileage:,.0f}", "g1.png"
        ),
        # Graph 2: Fuel Type
        _save_fuel_boxplot(
            fuel, fuel_order, prediction,
            f"Fuel Comparison | {fuel} Median: ₹{median_fuel:,.0f}", "g2.png"
        ),
        # Graph 3: Engine Power
        _save_scatter_graph(
            'engine_power', engine_power, prediction, 600,
            f"Engine Power vs Price | Similar Avg: ₹{mean_engine:,.0f}", "g3.png"
        ),
        # Graph 4: Car Age
        _save_scatter_graph(
            'car_age_years', car_age_years, prediction, 600,
            f"Car Age vs Price | Similar Avg: ₹{mean_age:,.0f}", "g4.png"
        ),
        # Graph 5: Paint Color
        _save_category_bar_graph(
            paint_mean, paint_color,
            f"Paint Color Influence | {paint_color}: ₹{paint_mean[paint_color]:,.0f}", "g5.png"
        ),
        # Graph 6: Car Type
        _save_category_bar_graph(
            ct_mean, car_type,
            f"Car Type Influence | {car_type}: ₹{ct_mean[car_type]:,.0f}", "g6.png"
        ),
    ]

    return (
        f"""
**Predicted Price:** ₹ {prediction:,.0f}

**Numerical Insights**
• Similar Mileage Avg: ₹ {mean_mileage:,.0f}
• Median {fuel} Price: ₹ {median_fuel:,.0f}
• Similar Engine Power Avg: ₹ {mean_engine:,.0f}
• Similar Age Avg: ₹ {mean_age:,.0f}
• Mean {paint_color} Color Price: ₹ {paint_mean[paint_color]:,.0f}
• Mean {car_type} Type Price: ₹ {ct_mean[car_type]:,.0f}
""",
        *images
    )


# ===============================
# CELL 11: GRADIO GUI
# ===============================
def _category_dropdown(column, label):
    """Build a dropdown over the distinct values of ``df[column]``.

    The first choice is preselected so that submitting the form without
    touching the dropdown does not pass None to the prediction function.
    Missing values are dropped before sorting so they never become a choice.
    """
    choices = sorted(df[column].dropna().unique())
    return gr.Dropdown(choices, value=choices[0] if choices else None, label=label)


# Input order must match the parameter order of predict_price; the outputs are
# the summary text followed by the six graphs it returns.
gui = gr.Interface(
    fn=predict_price,
    inputs=[
        gr.Slider(0, 250000, 60000, label="Mileage"),
        gr.Slider(50, 400, 150, label="Engine Power"),
        gr.Slider(0, 20, 7, label="Car Age (years)"),
        _category_dropdown('fuel', "Fuel Type"),
        _category_dropdown('paint_color', "Paint Color"),
        _category_dropdown('car_type', "Car Type"),
        *[gr.Checkbox(label=f"feature_{i}") for i in range(1, 9)]
    ],
    outputs=[
        gr.Text(label="Prediction Summary"),
        *[gr.Image(label=f"Graph {i}") for i in range(1, 7)]
    ],
    title="BMW Price Prediction GUI with Visual & Numerical Analysis",
    description="Enter car attributes → Get predicted price + graph-based reasoning"
)

gui.launch(debug=True)

Saving bmw_pricing_challenge.csv to bmw_pricing_challenge.csv

Best Model Selected: RandomForest
It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://29618d0ee8918b2361.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
