In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import warnings
# --- Notebook-wide settings --------------------------------------------------
# Blanket-silences every warning raised from this point on.
warnings.filterwarnings('ignore')
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# --- Load the dataset ---------------------------------------------------------
df = pd.read_csv('cars_cleaned_encoded.csv')

# Report the frame's dimensions; the "Features" figure is the column count
# minus one (i.e. every column except a single target column).
row_count, column_count = df.shape
print("✓ Data loaded successfully!")
print(f"  Shape: {df.shape}")
print(f"  Features: {column_count - 1}")
print(f"  Samples: {row_count}")

✓ Data loaded successfully!
  Shape: (6018, 18)
  Features: 17
  Samples: 6018


In [9]:
# 'Price' is the regression target; every other column is used as a predictor.
y = df['Price']
X = df.drop(columns='Price')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


def _scaled(regressor, *leading_steps):
    """Build a Pipeline: optional leading steps -> StandardScaler -> regressor."""
    return Pipeline([
        *leading_steps,
        ('scaler', StandardScaler()),
        ('regressor', regressor),
    ])


# Candidate estimators keyed by display name (insertion order is the training
# order used later).
# NOTE(review): 'Simple Linear Regression' is fitted on all columns of X, so it
# differs from 'Multiple Linear Regression' only by the absence of the scaler.
models = {
    'Simple Linear Regression': LinearRegression(),
    'Multiple Linear Regression': _scaled(LinearRegression()),
    'Polynomial Regression (Degree 2)': _scaled(
        LinearRegression(),
        ('poly', PolynomialFeatures(degree=2, include_bias=False)),
    ),
    'Ridge Regression': _scaled(Ridge(alpha=1.0, random_state=42)),
    'Lasso Regression': _scaled(Lasso(alpha=1.0, random_state=42)),
    'ElasticNet Regression': _scaled(
        ElasticNet(alpha=1.0, l1_ratio=0.5, random_state=42)
    ),
}

In [10]:
def _fit_quality(y_true, y_pred):
    """Return the tuple (R², RMSE, MAE) for one set of predictions."""
    return (
        r2_score(y_true, y_pred),
        np.sqrt(mean_squared_error(y_true, y_pred)),
        mean_absolute_error(y_true, y_pred),
    )


# Maps model display name -> fitted estimator, its predictions and its metrics.
results = {}

for label, estimator in models.items():
    print(f"\nTraining {label}...", end=' ')

    # Fit on the training split only, then predict both splits.
    estimator.fit(X_train, y_train)
    fitted_train = estimator.predict(X_train)
    fitted_test = estimator.predict(X_test)

    train_r2, train_rmse, train_mae = _fit_quality(y_train, fitted_train)
    test_r2, test_rmse, test_mae = _fit_quality(y_test, fitted_test)

    results[label] = {
        'model': estimator,
        'train_predictions': fitted_train,
        'test_predictions': fitted_test,
        'train_r2': train_r2,
        'test_r2': test_r2,
        # "Accuracy" is simply R² expressed as a percentage.
        'train_accuracy': train_r2 * 100,
        'test_accuracy': test_r2 * 100,
        'train_rmse': train_rmse,
        'test_rmse': test_rmse,
        'train_mae': train_mae,
        'test_mae': test_mae
    }

    print("✓ Done")



Training Simple Linear Regression... ✓ Done

Training Multiple Linear Regression... ✓ Done

Training Polynomial Regression (Degree 2)... ✓ Done

Training Ridge Regression... ✓ Done

Training Lasso Regression... ✓ Done

Training ElasticNet Regression... ✓ Done


In [11]:
divider = "=" * 70
print(" MODEL PERFORMANCE COMPARISON")
print(divider)

# Table column -> key inside each `results[...]` entry. Column order here is
# the column order of the printed table.
metric_columns = {
    'Train_Accuracy(%)': 'train_accuracy',
    'Test_Accuracy(%)': 'test_accuracy',
    'Train_R2': 'train_r2',
    'Test_R2': 'test_r2',
    'Test_RMSE': 'test_rmse',
    'Test_MAE': 'test_mae',
}

# One row per trained model.
comparison_df = pd.DataFrame({
    'Model': list(results),
    **{
        column: [scores[key] for scores in results.values()]
        for column, key in metric_columns.items()
    },
})

# Best test accuracy first.
comparison_df = comparison_df.sort_values('Test_Accuracy(%)', ascending=False)

print("\n" + comparison_df.to_string(index=False))

# The top row after sorting is reported as the best model.
# NOTE(review): the winner is chosen by its test-split score, so the test
# metrics printed for it are also the selection criterion.
leader = comparison_df.iloc[0]
best_model_name = leader['Model']
best_test_accuracy = leader['Test_Accuracy(%)']
best_test_r2 = leader['Test_R2']
best_scores = results[best_model_name]

print(f"\n{divider}")
print(f"🏆 BEST MODEL: {best_model_name}")
print(f"   Test Accuracy: {best_test_accuracy:.2f}%")
print(f"   Test R² Score: {best_test_r2:.4f}")
print(f"   Test RMSE: {best_scores['test_rmse']:.2f}")
print(f"   Test MAE: {best_scores['test_mae']:.2f}")
print(divider)


 MODEL PERFORMANCE COMPARISON

                           Model  Train_Accuracy(%)  Test_Accuracy(%)  Train_R2  Test_R2  Test_RMSE  Test_MAE
Polynomial Regression (Degree 2)          88.400164         86.631528  0.884002 0.866315   2.076085  1.401068
                Ridge Regression          83.563189         82.223786  0.835632 0.822238   2.393999  1.741412
        Simple Linear Regression          83.563204         82.220781  0.835632 0.822208   2.394201  1.741557
      Multiple Linear Regression          83.563204         82.220781  0.835632 0.822208   2.394201  1.741557
           ElasticNet Regression          74.942680         74.281276  0.749427 0.742813   2.879579  2.192910
                Lasso Regression          74.861586         73.773118  0.748616 0.737731   2.907887  2.181854

🏆 BEST MODEL: Polynomial Regression (Degree 2)
   Test Accuracy: 86.63%
   Test R² Score: 0.8663
   Test RMSE: 2.08
   Test MAE: 1.40


In [12]:
print("Cross-Validation (5-Fold) for All Models")
print("=" * 70)

# Maps model display name -> {'mean': ..., 'std': ...} of the per-fold R² scores.
cv_results = {}

for label, estimator in models.items():
    # Print the name first so progress is visible while the folds run.
    print(f"\n{label}:", end=' ')

    # 5-fold CV on the training split only; each fold is scored with R².
    fold_r2 = cross_val_score(
        estimator, X_train, y_train,
        cv=5, scoring='r2', n_jobs=-1,
    )

    fold_summary = {'mean': fold_r2.mean(), 'std': fold_r2.std()}
    cv_results[label] = fold_summary
    print(f"Mean R² = {fold_summary['mean']:.4f} (±{fold_summary['std']:.4f})")

Cross-Validation (5-Fold) for All Models

Simple Linear Regression: Mean R² = 0.8335 (±0.0100)

Multiple Linear Regression: Mean R² = 0.8335 (±0.0100)

Polynomial Regression (Degree 2): Mean R² = 0.8400 (±0.0458)

Ridge Regression: Mean R² = 0.8335 (±0.0101)

Lasso Regression: Mean R² = 0.7468 (±0.0068)

ElasticNet Regression: Mean R² = 0.7481 (±0.0043)


In [13]:
print(" Creating Visualizations")
print("=" * 70)

# Plot 1: grouped bars contrasting train and test "accuracy" (R² x 100) per model
fig, ax = plt.subplots(figsize=(14, 8))
bar_width = 0.35
positions = np.arange(len(comparison_df))

# Train bars sit half a width to the left of each tick, test bars to the right.
train_bars = ax.bar(positions - bar_width/2, comparison_df['Train_Accuracy(%)'], bar_width,
                    label='Train Accuracy', color='lightblue', alpha=0.8)
test_bars = ax.bar(positions + bar_width/2, comparison_df['Test_Accuracy(%)'], bar_width,
                   label='Test Accuracy', color='coral', alpha=0.8)

ax.set_xlabel('Models', fontweight='bold', fontsize=12)
ax.set_ylabel('Accuracy (%)', fontweight='bold', fontsize=12)
ax.set_title('Model Accuracy Comparison (Train vs Test)', fontweight='bold', fontsize=14)
ax.set_xticks(positions)
ax.set_xticklabels(comparison_df['Model'], rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

# Write each bar's height just above it: all train bars first, then test bars.
for bar in (*train_bars, *test_bars):
    bar_top = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., bar_top,
            f'{bar_top:.1f}%', ha='center', va='bottom', fontsize=9)

fig.tight_layout()
fig.savefig('accuracy_comparison.png', dpi=300, bbox_inches='tight')
print("✓ Saved: accuracy_comparison.png")
plt.close(fig)

# Plot 2: horizontal bars of test-set R² per model, best model drawn in gold
fig, ax = plt.subplots(figsize=(12, 7))
test_r2_values = comparison_df['Test_R2']
colors = [
    'gold' if is_best else 'skyblue'
    for is_best in comparison_df['Model'] == best_model_name
]
bars = ax.barh(comparison_df['Model'], test_r2_values,
               color=colors, alpha=0.8, edgecolor='black', linewidth=1.2)
ax.set_xlabel('R² Score (Test Set)', fontweight='bold', fontsize=12)
ax.set_title('Model Comparison - R² Score', fontweight='bold', fontsize=14)
ax.set_xlim(0, 1)

# Print each score just to the right of the end of its bar.
for bar, score in zip(bars, test_r2_values):
    bar_middle = bar.get_y() + bar.get_height()/2
    ax.text(score + 0.01, bar_middle,
            f'{score:.4f}', va='center', fontweight='bold', fontsize=10)

ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
fig.savefig('r2_score_comparison.png', dpi=300, bbox_inches='tight')
print("✓ Saved: r2_score_comparison.png")
plt.close(fig)

# Plot 3: RMSE Comparison
# Horizontal bars of test-set RMSE per model (lower is better); the best model
# is drawn in gold and every bar is annotated with its value.
fig, ax = plt.subplots(figsize=(12, 7))
rmse_values = comparison_df['Test_RMSE']
colors = ['gold' if model == best_model_name else 'lightcoral'
          for model in comparison_df['Model']]
bars = ax.barh(comparison_df['Model'], rmse_values,
               color=colors, alpha=0.8, edgecolor='black', linewidth=1.2)
ax.set_xlabel('RMSE (Lower is Better)', fontweight='bold', fontsize=12)
ax.set_title('Model Comparison - Root Mean Squared Error', fontweight='bold', fontsize=14)

# Add value labels.
# The gap between bar end and label is a fraction of the longest bar, so it
# adapts to whatever scale the target is on. A fixed offset of 1000 put the
# labels far outside the plotted range when RMSE values are single digits.
longest_bar = rmse_values.max()
label_gap = 0.01 * longest_bar
for bar, value in zip(bars, rmse_values):
    # Two decimals, matching the precision used when RMSE is printed in the
    # best-model summary; zero decimals made distinct models look identical.
    ax.text(value + label_gap, bar.get_y() + bar.get_height()/2,
            f'{value:.2f}', va='center', fontweight='bold', fontsize=10)

# Leave head-room on the right so the labels sit inside the axes.
if longest_bar > 0:
    ax.set_xlim(0, longest_bar * 1.12)

ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('rmse_comparison.png', dpi=300, bbox_inches='tight')
print("✓ Saved: rmse_comparison.png")
plt.close()

# Plot 4: test-set predictions of the best model against the true prices
fig, ax = plt.subplots(figsize=(10, 8))
best_predictions = results[best_model_name]['test_predictions']
ax.scatter(y_test, best_predictions, alpha=0.6, edgecolors='k', linewidth=0.5, s=50)

# Diagonal y = x across the observed price range: points on it are exact hits.
price_span = [y_test.min(), y_test.max()]
ax.plot(price_span, price_span, 'r--', lw=3, label='Perfect Prediction')

ax.set_xlabel('Actual Price', fontweight='bold', fontsize=12)
ax.set_ylabel('Predicted Price', fontweight='bold', fontsize=12)
ax.set_title(f'Actual vs Predicted - {best_model_name}\nR² = {best_test_r2:.4f}',
             fontweight='bold', fontsize=14)
ax.legend(fontsize=11)
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('actual_vs_predicted_best.png', dpi=300, bbox_inches='tight')
print("✓ Saved: actual_vs_predicted_best.png")
plt.close(fig)

# Plot 5: residuals (actual minus predicted) of the best model on the test set
residuals = y_test - best_predictions

fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(best_predictions, residuals, alpha=0.6, edgecolors='k', linewidth=0.5, s=50)
# Zero line: points above it were under-predicted, points below over-predicted.
ax.axhline(y=0, color='r', linestyle='--', linewidth=2)

ax.set_xlabel('Predicted Price', fontweight='bold', fontsize=12)
ax.set_ylabel('Residuals', fontweight='bold', fontsize=12)
ax.set_title(f'Residual Plot - {best_model_name}', fontweight='bold', fontsize=14)
ax.grid(True, alpha=0.3)

fig.tight_layout()
fig.savefig('residuals_plot_best.png', dpi=300, bbox_inches='tight')
print("✓ Saved: residuals_plot_best.png")
plt.close(fig)

 Creating Visualizations
✓ Saved: accuracy_comparison.png
✓ Saved: r2_score_comparison.png
✓ Saved: rmse_comparison.png
✓ Saved: actual_vs_predicted_best.png
✓ Saved: residuals_plot_best.png


In [14]:
print("Grid Search - Hyperparameter Tuning for Ridge, Lasso & ElasticNet")
print("=" * 70)


def _run_grid_search(heading, label, pipeline, param_grid):
    """Tune `pipeline` over `param_grid` and print a short report.

    Runs a 5-fold, R²-scored GridSearchCV on the training split, then prints
    the heading, the selected alpha (plus l1_ratio when it was part of the
    grid), the best mean CV R² and the R² on the test split.

    Returns:
        (fitted GridSearchCV object, test-split R²)
    """
    print(heading)
    search = GridSearchCV(pipeline, param_grid, cv=5,
                          scoring='r2', n_jobs=-1, verbose=0)
    search.fit(X_train, y_train)

    chosen = search.best_params_
    print(f"✓ Best {label} Alpha: {chosen['regressor__alpha']}")
    if 'regressor__l1_ratio' in chosen:
        print(f"✓ Best {label} L1_ratio: {chosen['regressor__l1_ratio']}")
    print(f"✓ Best CV R² Score: {search.best_score_:.4f}")

    held_out_r2 = search.score(X_test, y_test)
    print(f"✓ Test R² Score: {held_out_r2:.4f}")
    return search, held_out_r2


# Grid Search for Ridge
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Ridge(random_state=42))
])
ridge_params = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
ridge_grid, ridge_test_score = _run_grid_search(
    "\nA. Grid Search for Ridge Regression...", "Ridge",
    ridge_pipeline, ridge_params,
)

# Grid Search for Lasso
lasso_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', Lasso(random_state=42))
])
lasso_params = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10, 100]
}
lasso_grid, lasso_test_score = _run_grid_search(
    "\nB. Grid Search for Lasso Regression...", "Lasso",
    lasso_pipeline, lasso_params,
)

# Grid Search for ElasticNet (searches alpha and the L1/L2 mixing ratio jointly)
elastic_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', ElasticNet(random_state=42))
])
elastic_params = {
    'regressor__alpha': [0.001, 0.01, 0.1, 1, 10],
    'regressor__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
}
elastic_grid, elastic_test_score = _run_grid_search(
    "\nC. Grid Search for ElasticNet Regression...", "ElasticNet",
    elastic_pipeline, elastic_params,
)

# Side-by-side test-split scores of the three tuned models
print("\n" + "=" * 70)
print("Grid Search Results Summary")
print("=" * 70)
tuned_scores = [ridge_test_score, lasso_test_score, elastic_test_score]
grid_comparison = pd.DataFrame({
    'Model': ['Ridge (Tuned)', 'Lasso (Tuned)', 'ElasticNet (Tuned)'],
    'Test_R2': tuned_scores,
    'Test_Accuracy(%)': [score * 100 for score in tuned_scores]
})
print("\n" + grid_comparison.to_string(index=False))


Grid Search - Hyperparameter Tuning for Ridge, Lasso & ElasticNet

A. Grid Search for Ridge Regression...
✓ Best Ridge Alpha: 10
✓ Best CV R² Score: 0.8335
✓ Test R² Score: 0.8225

B. Grid Search for Lasso Regression...
✓ Best Lasso Alpha: 0.01
✓ Best CV R² Score: 0.8335
✓ Test R² Score: 0.8225

C. Grid Search for ElasticNet Regression...
✓ Best ElasticNet Alpha: 0.01
✓ Best ElasticNet L1_ratio: 0.9
✓ Best CV R² Score: 0.8335
✓ Test R² Score: 0.8225

Grid Search Results Summary

             Model  Test_R2  Test_Accuracy(%)
     Ridge (Tuned) 0.822469         82.246941
     Lasso (Tuned) 0.822492         82.249159
ElasticNet (Tuned) 0.822541         82.254060


In [15]:
# Persist the best model plus the metadata needed to use it later.
# No separate scaler file is written: for the pipeline-based models the scaler
# is a step inside the saved estimator object.
import pickle
import joblib


def _write_pickle(payload, path):
    """Serialize `payload` to the file at `path` using pickle."""
    with open(path, 'wb') as handle:
        pickle.dump(payload, handle)


best_model = results[best_model_name]['model']
best_scores = results[best_model_name]

# The fitted estimator is stored twice: once with pickle, once with joblib.
_write_pickle(best_model, 'best_model.pkl')
print("\n✓ Best model saved as: best_model.pkl")
print(f"  Model: {best_model_name}")

joblib.dump(best_model, 'best_model.joblib')
print("✓ Best model also saved as: best_model.joblib")

# Column names of X, in the order the model was trained on.
feature_names = X.columns.tolist()
_write_pickle(feature_names, 'feature_names.pkl')
print("✓ Feature names saved as: feature_names.pkl")

# Headline test metrics and the feature list, bundled for later reference.
model_info = {
    'model_name': best_model_name,
    'test_accuracy': best_test_accuracy,
    'test_r2': best_test_r2,
    'test_rmse': best_scores['test_rmse'],
    'test_mae': best_scores['test_mae'],
    'feature_names': feature_names,
    'n_features': len(feature_names)
}
_write_pickle(model_info, 'model_info.pkl')
print("✓ Model info saved as: model_info.pkl")

print("\n" + "=" * 70)
print("✓ ALL FILES SAVED SUCCESSFULLY!")
print("=" * 70)
print("\nFiles created:")
for summary_line in (
    "  1. best_model.pkl - Your trained model",
    "  2. best_model.joblib - Alternative format",
    "  3. feature_names.pkl - Column names for prediction",
    "  4. model_info.pkl - Model metadata",
):
    print(summary_line)


✓ Best model saved as: best_model.pkl
  Model: Polynomial Regression (Degree 2)
✓ Best model also saved as: best_model.joblib
✓ Feature names saved as: feature_names.pkl
✓ Model info saved as: model_info.pkl

✓ ALL FILES SAVED SUCCESSFULLY!

Files created:
  1. best_model.pkl - Your trained model
  2. best_model.joblib - Alternative format
  3. feature_names.pkl - Column names for prediction
  4. model_info.pkl - Model metadata
