In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score
from scipy.stats import randint

In [2]:
# Read the source CSV into a DataFrame.
# The path is relative, so it is resolved against the notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='NewCopy15cleanCarsCopy1df_filtered.csv')

In [3]:
# Map the dataset's original column headers to the shorter names used by the rest of the notebook
rename_dict = {
    'Company Names': 'Make',
    'Cars Names': 'Model',
    'Cars Prices': 'Price',
    'Fuel Types': 'Fuel Type',
}

# DataFrame.rename ignores mapping keys that are not present in the frame, so check explicitly
# instead of reporting success unconditionally. A column counts as missing only when neither
# its old nor its new name exists, which keeps this cell safe to re-run after the rename.
missing_columns = [
    old_name
    for old_name, new_name in rename_dict.items()
    if old_name not in df.columns and new_name not in df.columns
]

# Rebind to the renamed frame rather than mutating the loaded frame in place
df = df.rename(columns=rename_dict)

if missing_columns:
    print(f"Warning: columns not found and not renamed: {missing_columns}")
else:
    print("Columns renamed successfully.")

Columns renamed successfully.


In [4]:
# Column to predict, and the predictor columns used to predict it
target = 'Price'
features = ['Make', 'Capacity', 'Fuel Type', 'Seats']

# Keep only the modelling columns and discard every row that has a missing value in any of
# them. The trailing copy() keeps df_selected an independent frame, separate from df.
df_selected = df[[*features, target]].dropna(axis='index', how='any').copy()

In [5]:
    # Use One-Hot Encoding for 'Make' and 'Fuel Type'
# drop_first=True leaves out the first category of each encoded column, so every
# categorical column contributes one fewer indicator column than it has categories.
df_processed = pd.get_dummies(
    data=df_selected,
    columns=['Make', 'Fuel Type'],
    drop_first=True,
)
print("\nApplied One-Hot Encoding to 'Make' and 'Fuel Type'.")


Applied One-Hot Encoding to 'Make' and 'Fuel Type'.


In [6]:
# Separate the encoded frame into the target series (y) and the predictor matrix (X),
# which is every remaining column once the target has been removed
y = df_processed[target]
X = df_processed.drop(columns=target)

In [7]:
    # Split data into training and testing sets
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(f"Training samples: {len(X_train)}, Testing samples: {len(X_test)}")

Training samples: 805, Testing samples: 202


In [8]:
 # --- Step 4: Hyperparameter Tuning with RandomizedSearchCV ---
print("\nStarting hyperparameter tuning with RandomizedSearchCV...")

# Search space for the random forest. scipy's randint(low, high) draws integers from low
# up to, but not including, high; the plain lists enumerate discrete choices.
param_dist = dict(
    n_estimators=randint(100, 500),
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=randint(2, 11),
    min_samples_leaf=randint(1, 5),
    max_features=['sqrt', 'log2'],
)


Starting hyperparameter tuning with RandomizedSearchCV...


In [9]:
# Base estimator whose hyperparameters are tuned; seeded so its forests are repeatable
rf = RandomForestRegressor(random_state=42)

# Randomized search: 100 sampled candidates, each scored with 5-fold cross-validation on the
# training split. The score is negated MAE, so a higher score means a lower error.
# fit() returns the search object itself, so construction and fitting can be chained.
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=100,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,  # run the fits on all available cores
    cv=5,
    random_state=42,
).fit(X_train, y_train)

# With scikit-learn's default refit=True, best_estimator_ has already been retrained on all
# of X_train using the winning parameter combination.
best_rf_model = random_search.best_estimator_
print(f"\nBest hyperparameters found: {random_search.best_params_}")

  _data = np.array(data, dtype=dtype, copy=copy,



Best hyperparameters found: {'max_depth': 20, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 271}


In [10]:
# --- Step 5: Evaluate the Best Model ---
# Predict on the held-out test split, which played no part in the hyperparameter search
y_pred = best_rf_model.predict(X_test)

# Both metrics compare the true test targets against the predictions
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One print call with a newline separator emits the same three report lines
print(
    "\n--- Tuned RandomForestRegressor Model Evaluation ---",
    f"R-squared (R2) Score on Test Set: {r2:.4f}",
    f"Mean Absolute Error (MAE) on Test Set: ${mae:,.2f}",
    sep="\n",
)


--- Tuned RandomForestRegressor Model Evaluation ---
R-squared (R2) Score on Test Set: 0.7027
Mean Absolute Error (MAE) on Test Set: $84,403.87
