In [24]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [25]:
# Load dataset
df = pd.read_csv('NewCopy15cleanCarsCopy1df_filtered.csv',encoding="ISO-8859-1")

In [26]:
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,Capacity,HorsePower,Total Speed,Performance,Cars Prices,Fuel Types,Seats,Torque
0,FERRARI,SF90 STRADALE,V8,3990,963.0,340,2.5,1100000.0,Hybrid,2,800.0
1,ROLLS ROYCE,PHANTOM,V12,6749,563.0,250,5.3,460000.0,Petrol,5,900.0
2,Ford,KA+,1.2L Petrol,1200,77.5,165,10.5,13500.0,Petrol,5,120.0
3,MERCEDES,GT 63 S,V8,3982,630.0,250,3.2,161000.0,Petrol,4,900.0
4,AUDI,AUDI R8 Gt,V10,5204,602.0,320,3.6,253290.0,Petrol,2,560.0


In [27]:
df.shape

(1007, 11)

In [28]:
df.isna().sum()

Company Names    0
Cars Names       0
Engines          0
Capacity         0
HorsePower       0
Total Speed      0
Performance      0
Cars Prices      0
Fuel Types       0
Seats            0
Torque           0
dtype: int64

In [29]:
# Select categorical columns (object or string types)
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()

# Print unique categories for each column
for col in cat_cols:
    print(f"\n Column: {col}")
    print(df[col].unique().tolist())   # convert numpy array to list for readability
    print(f"Total unique categories: {df[col].nunique()}")


 Column: Company Names
['FERRARI', 'ROLLS ROYCE', 'Ford', 'MERCEDES', 'AUDI', 'BMW', 'ASTON MARTIN', 'BENTLEY', 'LAMBORGHINI', 'TOYOTA', 'NISSAN', 'ROLLS ROYCE ', 'KIA', 'HONDA', 'KIA  ', 'HYUNDAI', 'MAHINDRA', 'MARUTI SUZUKI', 'Volkswagen', 'Nissan', 'Porsche', 'Cadillac', 'Tata Motors', 'Jeep', 'Mazda', 'Chevrolet', 'GMC', 'Kia', 'Bugatti', 'Volvo', 'Jaguar Land Rover', 'Acura', 'Peugeot', 'Mitsubishi', 'Toyota']
Total unique categories: 35

 Column: Cars Names
['SF90 STRADALE', 'PHANTOM', 'KA+', ' GT 63 S', 'AUDI R8 Gt', 'Mclaren 720s', 'VANTAGE F1', 'Continental GT Azure', 'VENENO ROADSTER', 'F8 TRIBUTO', '812 GTS', 'PORTOFINO', 'ROMA', 'MONZA SP2', 'F8 SPIDER', 'PORTOFINO M', 'ROMA SPIDER', 'GR SUPRA', 'TOYOTA 86', 'TOYOTA  GR86', 'TOYOTA LAND CRUISER', 'TOYOTA SEQUOIA', 'GT-R', '370Z', 'Z PROTO', 'ALTIMA', 'MAXIMA', 'SENTRA', 'ROGUE', 'PATHFINDER', 'FRONTIER', 'TITAN', 'VALKYRIE', 'VALHALLA', 'DBS SUPERLEGGERA', 'DB11', 'VANTAGE ', 'DBX', 'RAPIDE AMR', 'VANQUISH', 'LAGONDA TARAF

In [30]:
####label-encoding

In [31]:
# Encode categorical variables
cat_cols = ["Company Names", "Cars Names", "Engines", "Fuel Types"]
encoder = LabelEncoder()
for col in cat_cols:
    df[col] = encoder.fit_transform(df[col])

In [32]:
df.head()

Unnamed: 0,Company Names,Cars Names,Engines,Capacity,HorsePower,Total Speed,Performance,Cars Prices,Fuel Types,Seats,Torque
0,8,724,266,3990,963.0,340,2.5,1100000.0,2,2,800.0
1,28,594,259,6749,563.0,250,5.3,460000.0,4,5,900.0
2,9,493,17,1200,77.5,165,10.5,13500.0,4,5,120.0
3,21,0,266,3982,630.0,250,3.2,161000.0,4,4,900.0
4,1,126,258,5204,602.0,320,3.6,253290.0,4,2,560.0


In [33]:
# Select categorical columns (object or string types)
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()

# Print unique categories for each column
for col in cat_cols:
    print(f"\n Column: {col}")
    print(df[col].unique().tolist())   # convert numpy array to list for readability
    print(f"Total unique categories: {df[col].nunique()}")

In [34]:
df[col].unique()

array([2, 4, 1, 3, 0])

In [35]:
# Features and target
X = df.drop("Cars Prices", axis=1)
y = df["Cars Prices"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# 1. Decision Tree
dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train, y_train)

In [37]:
# 2. Random Forest
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)


In [38]:
# 3. Gradient Boosting
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb.fit(X_train, y_train)

In [39]:
# Evaluate models
models = {"Decision Tree": dt, "Random Forest": rf, "Gradient Boosting": gb}
for name, model in models.items():
    preds = model.predict(X_test)
    print(f"\n{name} Performance:")
    print("MAE:", mean_absolute_error(y_test, preds))
    print("RMSE:", mean_squared_error(y_test, preds, squared=False))
    print("R2 Score:", r2_score(y_test, preds))


Decision Tree Performance:
MAE: 112099.52475247525
RMSE: 683549.5280393382
R2 Score: 0.06969454836208933

Random Forest Performance:
MAE: 96121.75904290429
RMSE: 529445.4942129838
R2 Score: 0.4418793341502506

Gradient Boosting Performance:
MAE: 99782.5744339095
RMSE: 609502.9890348883
R2 Score: 0.2603312513861098




In [40]:
#### one-hot encoding

In [41]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Define categorical columns
cat_cols = ["Company Names", "Cars Names", "Engines", "Fuel Types"]

# Preprocessing: OneHotEncoding for categorical variables
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ],
    remainder="passthrough"  # Keep the other (numeric) columns as is
)

# Now you can create pipelines for each model

# 1. Decision Tree
dt_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", DecisionTreeRegressor(random_state=42))
])
dt_pipeline.fit(X_train, y_train)

# 2. Random Forest
rf_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(n_estimators=100, random_state=42))
])
rf_pipeline.fit(X_train, y_train)

# 3. Gradient Boosting
gb_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42))
])
gb_pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [42]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Use the pipeline models
models = {
    "Decision Tree": dt_pipeline,
    "Random Forest": rf_pipeline,
    "Gradient Boosting": gb_pipeline
}

# Evaluate each model
for name, model in models.items():
    preds = model.predict(X_test)
    print(f"\n{name} Performance:")
    print("MAE:", mean_absolute_error(y_test, preds))
    print("RMSE:", mean_squared_error(y_test, preds, squared=False))
    print("R2 Score:", r2_score(y_test, preds))



Decision Tree Performance:
MAE: 50688.66831683168
RMSE: 395386.20716345677
R2 Score: 0.6887361960745193

Random Forest Performance:
MAE: 62376.42625412542
RMSE: 371426.5507660978
R2 Score: 0.7253171903843478

Gradient Boosting Performance:
MAE: 78106.643683269
RMSE: 423696.18040786264
R2 Score: 0.6425669655083293




In [43]:
#Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

param_grid = {
    "model__n_estimators": [100, 200, 500],
    "model__max_depth": [None, 10, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4],
    "model__max_features": ["sqrt", "log2"]
}

grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    scoring="r2"
)

grid_search.fit(X_train, y_train)
print("Best Params:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

Best Params: {'model__max_depth': 10, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 2, 'model__n_estimators': 500}
Best R2 Score: 0.5849251746784466


In [44]:
#Model Evaluation with Cross-Validation
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf_pipeline, X, y, cv=5, scoring="r2")
print("Cross-validated R2:", scores.mean())


Cross-validated R2: 0.2567600191735243


In [45]:
#Feature Importance
import pandas as pd

# Extract the trained model inside the pipeline
rf_model = rf_pipeline.named_steps["model"]

# One-hot encoder feature names
ohe = rf_pipeline.named_steps["preprocessor"].named_transformers_["cat"]
feature_names = ohe.get_feature_names_out(cat_cols)
all_features = list(feature_names) + [col for col in X.columns if col not in cat_cols]

# Feature importances
importances = rf_model.feature_importances_
feat_importances = pd.DataFrame({"Feature": all_features, "Importance": importances})
feat_importances = feat_importances.sort_values(by="Importance", ascending=False)

print(feat_importances.head(10))


              Feature  Importance
427    Cars Names_499    0.389224
1076      Total Speed    0.289693
1075       HorsePower    0.130565
1078            Seats    0.049843
100     Cars Names_82    0.031948
958       Engines_148    0.021160
5     Company Names_5    0.020624
1036      Engines_233    0.017629
1074         Capacity    0.007806
292    Cars Names_328    0.006878


In [46]:
# ColumnTransformer to OneHotEncode categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
    ],
    remainder="passthrough"
)


In [47]:
from sklearn.linear_model import BayesianRidge

# Bayesian Ridge Regression pipeline
br_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("model", BayesianRidge())
])

# Fit on training data
br_pipeline.fit(X_train, y_train)


The format of the columns of the 'remainder' transformer in ColumnTransformer.transformers_ will change in version 1.7 to match the format of the other transformers.
At the moment the remainder columns are stored as indices (of type int). With the same ColumnTransformer configuration, in the future they will be stored as column names (of type str).



In [48]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

models = {
    "Decision Tree": dt_pipeline,
    "Random Forest": rf_pipeline,
    "Gradient Boosting": gb_pipeline,
    "Bayesian Ridge": br_pipeline
}

for name, model in models.items():
    preds = model.predict(X_test)
    print(f"\n{name} Performance:")
    print("MAE:", mean_absolute_error(y_test, preds))
    print("RMSE:", mean_squared_error(y_test, preds, squared=False))
    print("R2 Score:", r2_score(y_test, preds))



Decision Tree Performance:
MAE: 50688.66831683168
RMSE: 395386.20716345677
R2 Score: 0.6887361960745193

Random Forest Performance:
MAE: 62376.42625412542
RMSE: 371426.5507660978
R2 Score: 0.7253171903843478

Gradient Boosting Performance:
MAE: 78106.643683269
RMSE: 423696.18040786264
R2 Score: 0.6425669655083293

Bayesian Ridge Performance:
MAE: 163859.33450652185
RMSE: 359581.40949472
R2 Score: 0.7425576143420209


