# Link of the model on Kaggle 
https://www.kaggle.com/datasets/msnbehdani/mock-dataset-of-second-hand-car-sales

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('car_sales_data.csv')

In [3]:
df.isnull().sum()

Manufacturer           0
Model                  0
Engine size            0
Fuel type              0
Year of manufacture    0
Mileage                0
Price                  0
dtype: int64

In [4]:
df.head()

Unnamed: 0,Manufacturer,Model,Engine size,Fuel type,Year of manufacture,Mileage,Price
0,Ford,Fiesta,1.0,Petrol,2002,127300,3074
1,Porsche,718 Cayman,4.0,Petrol,2016,57850,49704
2,Ford,Mondeo,1.6,Diesel,2014,39190,24072
3,Toyota,RAV4,1.8,Hybrid,1988,210814,1705
4,VW,Polo,1.0,Petrol,2006,127869,4101


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Manufacturer         50000 non-null  object 
 1   Model                50000 non-null  object 
 2   Engine size          50000 non-null  float64
 3   Fuel type            50000 non-null  object 
 4   Year of manufacture  50000 non-null  int64  
 5   Mileage              50000 non-null  int64  
 6   Price                50000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 2.7+ MB


In [None]:
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, r2_score
import os

In [None]:
# --- 2. Feature Engineering & Preprocessing Setup ---
print("\nStep 2: Setting up feature engineering and preprocessing...")

# Create 'Age' feature, which is often more predictive than 'Year of manufacture'
df['Age'] = 2024 - df['Year of manufacture']

# Define features (X) and target (y)
# We also drop the original 'Year of manufacture' column in favor of 'Age'.
X = df.drop(columns=['Price', 'Year of manufacture'])
y = df['Price']

# Identify numerical and categorical columns
numerical_features = ['Engine size', 'Mileage', 'Age']
categorical_features = ['Manufacturer', 'Fuel type','Model']

# Create a preprocessing pipeline for columns
# For numerical features, we'll scale them.
# For categorical features, we'll one-hot encode them.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns if any (none in this case)
)



Step 2: Setting up feature engineering and preprocessing...


In [23]:

# --- 3. Define, Train, and Evaluate Models ---
print("\nStep 3: Defining, training, and evaluating models...")

# Define the models we want to train
models = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a directory to save models
output_dir = 'saved_models'
os.makedirs(output_dir, exist_ok=True)
print(f"Models and preprocessors will be saved in the '{output_dir}' directory.")

# Loop through models, create a pipeline, train, evaluate, and save
for name, model in models.items():
    print(f"\n--- Training {name} ---")
    
    # Create the full pipeline by combining the preprocessor and the model
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # Train the model
    pipeline.fit(X_train, y_train)
    
    # Make predictions on the test set
    y_pred = pipeline.predict(X_test)
    
    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"Evaluation for {name}:")
    print(f"  Mean Absolute Error (MAE): {mae:.2f}")
    print(f"  R-squared (R²): {r2:.4f}")
    
    # --- 4. Save the Model Pipeline ---
    # We save the entire pipeline. This file now contains all the
    # logic for scaling, encoding, and predicting.
    file_path = os.path.join(output_dir, f'{name}_pipeline.joblib')
    joblib.dump(pipeline, file_path)
    print(f"  ✅ Saved pipeline for {name} to {file_path}")

print("\nAll models have been trained, evaluated, and saved.")


Step 3: Defining, training, and evaluating models...
Models and preprocessors will be saved in the 'saved_models' directory.

--- Training LinearRegression ---
Evaluation for LinearRegression:
  Mean Absolute Error (MAE): 5786.31
  R-squared (R²): 0.7102
  ✅ Saved pipeline for LinearRegression to saved_models/LinearRegression_pipeline.joblib

--- Training Ridge ---
Evaluation for Ridge:
  Mean Absolute Error (MAE): 5786.14
  R-squared (R²): 0.7102
  ✅ Saved pipeline for Ridge to saved_models/Ridge_pipeline.joblib

--- Training RandomForest ---
Evaluation for RandomForest:
  Mean Absolute Error (MAE): 286.04
  R-squared (R²): 0.9986
  ✅ Saved pipeline for RandomForest to saved_models/RandomForest_pipeline.joblib

--- Training GradientBoosting ---
Evaluation for GradientBoosting:
  Mean Absolute Error (MAE): 1037.39
  R-squared (R²): 0.9899
  ✅ Saved pipeline for GradientBoosting to saved_models/GradientBoosting_pipeline.joblib

All models have been trained, evaluated, and saved.


In [None]:
import pandas as pd
import joblib
import os

# Define the directory where models are saved
output_dir = 'saved_models'

print("\n--- Loading and using the improved RandomForest model ---")

try:
    # Load the pipeline that was trained WITH the 'Model' column
    # Make sure the filename is correct
    model_path = os.path.join(output_dir, 'RandomForest_pipeline.joblib')
    loaded_pipeline = joblib.load(model_path)
    print(f"✅ Successfully loaded model from: {model_path}")

    # --- KEY CHANGE IS HERE ---
    # Create new data that includes the 'Model' column.
    # The columns must match the data the model was trained on.
    new_data = pd.DataFrame({
        'Manufacturer': ['BMW', 'Ford', 'Toyota'],
        'Model': ['X5', 'Focus', 'Camry'], 
        'Engine size': [4.4, 1.6, 2.5],
        'Fuel type': ['Gasoline', 'Gasoline', 'Hybrid'],
        'Mileage': [35000, 80000, 15000],
        'Age': [4, 7, 1] # Remember to use 'Age', not 'Year of manufacture'
    })

    print("\nNew unseen data (with 'Model' column):")
    print(new_data)

    # Use the loaded pipeline to make predictions.
    # The pipeline handles all the scaling and encoding automatically.
    predicted_prices = loaded_pipeline.predict(new_data)

    print("\nPredicted prices for the new data:")
    # Combine the input data with the predictions for a nice display
    new_data['Predicted Price'] = predicted_prices
    
    # Format the price for better readability
    new_data['Predicted Price'] = new_data['Predicted Price'].map("${:,.2f}".format)
    print(new_data)

except FileNotFoundError:
    print(f"\n❌ ERROR: Could not find the model file at '{model_path}'.")
    print("Please ensure the training script ran successfully and the file exists.")
except Exception as e:
    print(f"\n❌ An error occurred: {e}")


--- Loading and using the improved RandomForest model ---
✅ Successfully loaded model from: saved_models/RandomForest_pipeline.joblib

New unseen data (with 'Model' column):
  Manufacturer  Model  Engine size Fuel type  Mileage  Age
0          BMW     X5          4.4  Gasoline    35000    4
1         Ford  Focus          1.6  Gasoline    80000    7
2       Toyota  Camry          2.5    Hybrid    15000    1

Predicted prices for the new data:
  Manufacturer  Model  Engine size Fuel type  Mileage  Age Predicted Price
0          BMW     X5          4.4  Gasoline    35000    4      $61,267.99
1         Ford  Focus          1.6  Gasoline    80000    7      $23,767.89
2       Toyota  Camry          2.5    Hybrid    15000    1      $72,083.01
