In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error

# Read the used-car data from CSV and discard every row containing a missing value
df = pd.read_csv('used_cars.csv').dropna()

# MSRP is the regression target; every remaining column is used as a feature
y = df['MSRP']
X = df.drop(columns=['MSRP'])

# Hold out 30% of the rows for testing (random_state=1 keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# One default-configured ensemble regressor per entry, each seeded with random_state=1
models = dict(
    BaggingRegressor=BaggingRegressor(random_state=1),
    RandomForestRegressor=RandomForestRegressor(random_state=1),
    GradientBoostingRegressor=GradientBoostingRegressor(random_state=1),
    AdaBoostRegressor=AdaBoostRegressor(random_state=1),
)

# Fit every regressor on the training split and record its score on the test split
scores = {}
for name, model in models.items():
    # fit() returns the fitted estimator itself, so score() can be chained directly
    scores[name] = model.fit(X_train, y_train).score(X_test, y_test)

# Report the test-set score of each model, formatted to four decimals
print(f"Score on Test Set using BaggingRegressor: {scores['BaggingRegressor']:.4f}")
print(f"Score on Test Set using RandomForestRegressor: {scores['RandomForestRegressor']:.4f}")
print(f"Score on Test Set using GradientBoostingRegressor: {scores['GradientBoostingRegressor']:.4f}")
print(f"Score on Test Set using AdaBoostRegressor: {scores['AdaBoostRegressor']:.4f}")

Score on Test Set using BaggingRegressor: 0.7920
Score on Test Set using RandomForestRegressor: 0.8358
Score on Test Set using GradientBoostingRegressor: 0.8270
Score on Test Set using AdaBoostRegressor: 0.6985


In [4]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import AdaBoostRegressor

# Reload the used-car data from CSV, keeping only rows with no missing values
df = pd.read_csv('used_cars.csv').dropna()

# Target is the MSRP column; the features are all the other columns
y = df['MSRP']
X = df.drop(columns=['MSRP'])

# 70/30 train/test partition, seeded with random_state=1 so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

# Base estimator whose hyperparameters are tuned below (seeded with random_state=1)
model = AdaBoostRegressor(random_state=1)

# Candidate values to try for each hyperparameter
param_grid = dict(
    n_estimators=[100, 500, 1000],
    learning_rate=[0.5, 1, 2],
)

# Exhaustive search over the grid with 4-fold cross-validation; candidates are
# ranked by negated mean squared error and n_jobs=-1 lets the fits run in parallel
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=4,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Winning hyperparameter combination found by the search
best_params = grid_search.best_params_
best_n_estimators = best_params['n_estimators']
best_learning_rate = best_params['learning_rate']

# Best estimator from the search, scored (R^2) on the held-out test split
best_model = grid_search.best_estimator_
test_score = best_model.score(X_test, y_test)

# Report the test score and the selected hyperparameters
print(f"Score on Test Set: {test_score:.4f}")
print(f"Best n_estimators: {best_n_estimators}")
print(f"Best learning_rate: {best_learning_rate}")

Score on Test Set: 0.7092
Best n_estimators: 500
Best learning_rate: 2
