In [None]:
# Install all required packages if not already installed
!pip install pandas matplotlib seaborn scikit-learn xgboost joblib -q

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    matthews_corrcoef, roc_auc_score,
    classification_report, confusion_matrix
)

## 1. Data Loading & Exploration

In [None]:
# Read the car listings CSV into a DataFrame
car_df = pd.read_csv('global_cars_enhanced.csv')

# Preview the top of the table: a header line followed by the first rows
print('First 5 rows:', car_df.head(), sep='\n')

In [None]:
# Dataset info
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
car_df.info()

In [None]:
# Count the null entries in each column
print('Missing values:', car_df.isna().sum(), sep='\n')

In [None]:
# Descriptive statistics as reported by DataFrame.describe()
print('Summary statistics:', car_df.describe(), sep='\n')

In [None]:
# Bar chart of how many rows fall into each Price_Category value
plt.figure(figsize=(10, 6))
sns.countplot(data=car_df, x='Price_Category')
plt.title('Distribution of Car Price Category')
plt.show()

# The same distribution as exact counts per category
print(car_df.loc[:, 'Price_Category'].value_counts())

## 2. Preprocessing

In [None]:
# Car_ID is removed because it is not useful for classification
car_df = car_df.drop(columns='Car_ID')

# Price_USD is removed when present, since keeping it would leak the target
if 'Price_USD' in car_df.columns:
    car_df = car_df.drop(columns='Price_USD')

# Label-encode every remaining object-dtype column except the target itself.
# Each fitted encoder is stored under its column name so the same mapping
# can be reused later.
label_encoders = {}
for column in [
    name
    for name in car_df.select_dtypes(include=['object']).columns
    if name != 'Price_Category'
]:
    encoder = LabelEncoder()
    car_df[column] = encoder.fit_transform(car_df[column])
    label_encoders[column] = encoder
    print(f'Encoded column: {column}')

print('\nDataFrame after encoding:', car_df.head(), sep='\n')

In [None]:
# Separate the label column (y) from the predictor columns (X)
y = car_df['Price_Category']
X = car_df.drop(columns='Price_Category')

# Map the category names to integer codes; the fitted encoder is kept so the
# original names remain available through `class_names`
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)
class_names = le_target.classes_

print(f'Target classes: {class_names}')
print(f'Encoded values: {np.unique(y_encoded)}')

In [None]:
# Hold out 20% of the rows for testing, stratified on the encoded target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# Fit the scaler on the training split only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Column order of the feature matrix, kept alongside the scaled arrays
feature_names = X.columns.to_list()

# Report the shape of every split
for split_label, split_data in (
    ('X_train', X_train_scaled),
    ('X_test', X_test_scaled),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{split_label} shape: {split_data.shape}')

In [None]:
# Make sure the output folder exists before anything is written into it
os.makedirs('Models', exist_ok=True)

# Persist the scaler, the per-column label encoders, the target encoder and
# the feature order for the Streamlit app
for artifact_path, artifact in (
    ('Models/scaler.joblib', scaler),
    ('Models/label_encoders.joblib', label_encoders),
    ('Models/target_encoder.joblib', le_target),
    ('Models/feature_names.joblib', feature_names),
):
    joblib.dump(artifact, artifact_path)

print('Preprocessing artifacts saved to Models/ folder.')

## 3. Helper Function for Evaluation Metrics

In [None]:
# Dictionary to store results for comparison (model name -> metric dict)
results = {}

def evaluate_model(model_name, model, X_test, y_test):
    """
    Evaluate a trained classifier, print and plot its metrics, and record them.

    Parameters
    ----------
    model_name : str
        Display name of the model; also the key under which the metrics are
        stored in the module-level ``results`` dictionary.
    model : fitted estimator
        Must provide ``predict`` and ``predict_proba``.
    X_test : array-like
        Feature matrix passed to ``predict`` / ``predict_proba``.
    y_test : array-like
        True labels as integer codes. They are assumed to be the codes
        produced by the target LabelEncoder, i.e. indices into ``class_names``.

    Returns
    -------
    dict
        Metric values under the keys 'Accuracy', 'AUC Score', 'Precision',
        'Recall', 'F1 Score' and 'MCC Score'.

    Notes
    -----
    Reads the module-level ``class_names`` for report and heatmap labels and
    writes the metrics into the module-level ``results``. Also prints a
    summary and classification report and shows a confusion-matrix heatmap.
    """
    y_pred = model.predict(X_test)
    y_pred_proba = np.asarray(model.predict_proba(X_test))

    # Integer codes of all known classes, in the same order as class_names.
    # Passing them explicitly keeps the report rows and the confusion-matrix
    # rows/columns aligned with class_names even when a class happens to be
    # absent from both y_test and y_pred.
    label_ids = np.arange(len(class_names))

    # 1. Accuracy
    acc = accuracy_score(y_test, y_pred)

    # 2. AUC Score
    if y_pred_proba.shape[1] == 2:
        # Binary target: roc_auc_score expects the 1-D score of the positive
        # (second) class rather than the full two-column probability matrix.
        auc = roc_auc_score(y_test, y_pred_proba[:, 1])
    else:
        # Multi-class target: One-vs-Rest, weighted by class support.
        auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr', average='weighted')

    # 3. Precision (weighted average for multi-class)
    prec = precision_score(y_test, y_pred, average='weighted')

    # 4. Recall (weighted average for multi-class)
    rec = recall_score(y_test, y_pred, average='weighted')

    # 5. F1 Score (weighted average for multi-class)
    f1 = f1_score(y_test, y_pred, average='weighted')

    # 6. Matthews Correlation Coefficient
    mcc = matthews_corrcoef(y_test, y_pred)

    # Print results
    print("=" * 55)
    print(f"  {model_name} - EVALUATION RESULTS")
    print("=" * 55)
    print(f"  Accuracy  : {acc:.4f} ({acc*100:.2f}%)")
    print(f"  AUC Score : {auc:.4f}")
    print(f"  Precision : {prec:.4f}")
    print(f"  Recall    : {rec:.4f}")
    print(f"  F1 Score  : {f1:.4f}")
    print(f"  MCC Score : {mcc:.4f}")
    print("=" * 55)

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, labels=label_ids,
                                target_names=class_names))

    # Confusion Matrix Heatmap (rows = actual class, columns = predicted class)
    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f'Confusion Matrix - {model_name}')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # Store results so the comparison cells can read them later
    results[model_name] = {
        'Accuracy': acc,
        'AUC Score': auc,
        'Precision': prec,
        'Recall': rec,
        'F1 Score': f1,
        'MCC Score': mcc
    }

    return results[model_name]

## 4. Model 1 - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression (lbfgs solver, up to 1000 iterations) on the
# standardized training features
log_reg = LogisticRegression(solver='lbfgs', max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)

# Persist the fitted model next to the preprocessing artifacts
joblib.dump(log_reg, 'Models/Logistic_Regression.joblib')
print('Model saved to Models/Logistic_Regression.joblib')

# Score on the held-out split; the returned metric dict is the cell output
evaluate_model('Logistic Regression', log_reg, X_test_scaled, y_test)

## 5. Model 2 - Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fit a Gini decision tree whose growth is limited by depth and by minimum
# split / leaf sizes
dt_clf = DecisionTreeClassifier(
    criterion='gini', max_depth=10,
    min_samples_split=10, min_samples_leaf=5,
    random_state=42,
).fit(X_train_scaled, y_train)

# Persist the fitted model next to the preprocessing artifacts
joblib.dump(dt_clf, 'Models/Decision_Tree.joblib')
print('Model saved to Models/Decision_Tree.joblib')

# Score on the held-out split; the returned metric dict is the cell output
evaluate_model('Decision Tree', dt_clf, X_test_scaled, y_test)

## 6. Model 3 - K-Nearest Neighbor Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 5-nearest-neighbour classifier with uniform votes and the Minkowski
# metric on the standardized training features
knn_clf = KNeighborsClassifier(n_neighbors=5, weights='uniform', metric='minkowski').fit(
    X_train_scaled, y_train
)

# Persist the fitted model next to the preprocessing artifacts
joblib.dump(knn_clf, 'Models/KNN.joblib')
print('Model saved to Models/KNN.joblib')

# Score on the held-out split; the returned metric dict is the cell output
evaluate_model('K-Nearest Neighbors', knn_clf, X_test_scaled, y_test)

## 7. Model 4 - Naive Bayes Classifier (Gaussian)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian Naive Bayes with its default settings
gnb_clf = GaussianNB().fit(X_train_scaled, y_train)

# Persist the fitted model next to the preprocessing artifacts
joblib.dump(gnb_clf, 'Models/Gaussian_Naive_Bayes.joblib')
print('Model saved to Models/Gaussian_Naive_Bayes.joblib')

# Score on the held-out split; the returned metric dict is the cell output
evaluate_model('Gaussian Naive Bayes', gnb_clf, X_test_scaled, y_test)

## 8. Model 5 - Ensemble: Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fit a 100-tree random forest with capped depth and minimum split / leaf
# sizes; the fixed seed keeps the fit repeatable
rf_clf = RandomForestClassifier(
    n_estimators=100, max_depth=15,
    min_samples_split=5, min_samples_leaf=2,
    random_state=42,
).fit(X_train_scaled, y_train)

# Persist the fitted model next to the preprocessing artifacts
joblib.dump(rf_clf, 'Models/Random_Forest.joblib')
print('Model saved to Models/Random_Forest.joblib')

# Score on the held-out split; the returned metric dict is the cell output
evaluate_model('Random Forest', rf_clf, X_test_scaled, y_test)

## 9. Model 6 - Ensemble: XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees evaluated with multi-class log loss.
# `use_label_encoder` is intentionally not passed: it is a deprecated argument
# that recent XGBoost releases no longer use (they warn about an unused
# parameter). The target is already integer-encoded before the split, so no
# internal label encoding is needed.
xgb_clf = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    eval_metric='mlogloss',
    random_state=42
)
xgb_clf.fit(X_train_scaled, y_train)

# Save model
joblib.dump(xgb_clf, 'Models/XGBoost.joblib')
print('Model saved to Models/XGBoost.joblib')

# Score on the held-out split; the returned metric dict is the cell output
evaluate_model('XGBoost', xgb_clf, X_test_scaled, y_test)

## 10. Model Comparison Summary

In [None]:
# Build a table with one row per model and one column per metric,
# rounded to four decimals
comparison_df = pd.DataFrame(results).transpose().round(4)

# Print the table framed by 80-character rules
print(f"\n{'=' * 80}")
print("  MODEL COMPARISON - ALL 6 CLASSIFIERS")
print("=" * 80)
print(comparison_df.to_string())
print("=" * 80)

In [None]:
# Visualize the comparison: one horizontal bar chart per metric in a 2x3 grid
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Model Comparison Across All Metrics', fontsize=16, fontweight='bold')

metrics = ['Accuracy', 'AUC Score', 'Precision', 'Recall', 'F1 Score', 'MCC Score']
colors = ['#2196F3', '#4CAF50', '#FF9800', '#E91E63', '#9C27B0', '#00BCD4']

# The model order is the same for every panel, so it is computed once
model_names = list(results.keys())

# axes.flat walks the grid row by row, matching the order of `metrics`
for ax, metric, color in zip(axes.flat, metrics, colors):
    values = [results[model][metric] for model in model_names]
    bars = ax.barh(model_names, values, color=color, alpha=0.8)
    ax.set_title(metric, fontsize=12, fontweight='bold')

    # Scores normally lie in [0, 1], but MCC can be negative. The left limit
    # is widened only when a negative score is present, so that such bars and
    # their labels stay visible instead of being clipped at 0.
    lowest = min(values, default=0)
    ax.set_xlim(lowest - 0.15 if lowest < 0 else 0, 1.05)

    for bar, val in zip(bars, values):
        # Put the value just past the end of the bar: to the right of
        # non-negative bars, to the left of negative ones.
        is_negative = val < 0
        ax.text(bar.get_width() + (-0.01 if is_negative else 0.01),
                bar.get_y() + bar.get_height()/2,
                f'{val:.4f}', va='center',
                ha='right' if is_negative else 'left', fontsize=9)

plt.tight_layout()
plt.show()

In [None]:
# For every metric, report the model with the highest score
# (on ties the model that was evaluated first is reported)
print("\nBest Model per Metric:")
print("-" * 40)
for metric in metrics:
    best_model = max(results, key=lambda name: results[name][metric])
    best_value = results[best_model][metric]
    print(f"  {metric:12s} : {best_model} ({best_value:.4f})")

# Overall best model: the one with the highest plain mean of its raw metric
# values (note: this averages scores, it does not average per-metric ranks)
print(f"\n{'=' * 40}")
avg_scores = {
    name: np.mean(list(scores.values()))
    for name, scores in results.items()
}
best_overall = max(avg_scores, key=lambda name: avg_scores[name])
print(f"Best Overall Model: {best_overall} (avg score: {avg_scores[best_overall]:.4f})")
print("=" * 40)

In [None]:
# List all saved models.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# make the printed listing the same on every run.
print("\nSaved Models in 'Models/' folder:")
for filename in sorted(os.listdir('Models')):
    print(f"  - {filename}")