27/03/2024
# CSC354 – Assignment 2 – ML – Decision Trees
# Aimah Siddique
# Fa21-bse-092
# This notebook performs a regression task on the given dataset using a decision tree. A baseline model with default parameters is trained first to obtain an initial error value; grid search and randomized search are then used to improve the model by trying different hyperparameters.


In [20]:
import pandas as pd
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor


In [2]:
# Load the raw dataset into a DataFrame.
# NOTE(review): the absolute '/content/...' path looks Colab-specific — adjust it when running elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/cars-dataset.csv')

In [15]:
# Column to predict, and the predictor columns fed to the models.
# These names must exist in `df`; a missing one raises KeyError on selection.
target = 'selling_price'
features = ['year', 'km_driven', 'fuel', 'seller_type', 'transmission', 'owner']

# Feature matrix (DataFrame) and target vector (Series)
X = df.loc[:, features]
y = df.loc[:, target]


In [16]:

# --- Baseline: an untuned decision tree gives the reference error value ---

# One-hot encode the categorical columns (first level of each is dropped);
# the columns not listed here are forwarded unchanged via remainder='passthrough'.
categorical_cols = ['fuel', 'seller_type', 'transmission', 'owner']
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_cols)],
    remainder='passthrough',
)

# NOTE(review): the encoder is fitted on all rows before the split, so
# categories that occur only in the held-out rows still get their own column.
X_transformed = preprocessor.fit_transform(X)

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42
)

# Default-parameter tree, trained on the training portion (fit returns the estimator itself)
baseline_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Score the held-out rows with mean squared error
baseline_predictions = baseline_model.predict(X_test)
baseline_mse = mean_squared_error(y_test, baseline_predictions)
print("Baseline Mean Squared Error:", baseline_mse)

Baseline Mean Squared Error: 204769024475.5166


In [19]:
# --- Hyperparameter tuning with an exhaustive grid search ---

# Re-create the encoded features and the train/test split so this cell is
# self-contained. The seed is fixed, so the split is reproducible.
categorical_cols = ['fuel', 'seller_type', 'transmission', 'owner']
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='passthrough'  # pass through columns not specified
)
X_transformed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42
)

# Candidate values: 4 * 3 * 3 * 2 = 72 combinations, each scored with 5-fold CV
param_grid = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

dt_regressor = DecisionTreeRegressor(random_state=42)

# Rank candidates by (negated) mean squared error so that model selection
# optimises the same metric that is reported below. Without `scoring`, the
# search falls back to the estimator's own score, which is R^2 for regressors.
grid_search = GridSearchCV(
    estimator=dt_regressor,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid_search.fit(X_train, y_train)

# Best combination found and the estimator refitted with it on the full training set
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the tuned model on the held-out rows
best_predictions = best_model.predict(X_test)
best_mse = mean_squared_error(y_test, best_predictions)
print("Best Mean Squared Error (Grid Search):", best_mse)
print("Best Parameters:", best_params)

Best Mean Squared Error (Grid Search): 164325309079.34628
Best Parameters: {'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10}


In [22]:
# --- Hyperparameter tuning with a randomized search ---

# Re-create the encoded features and the train/test split so this cell is
# self-contained. The seed is fixed, so the split is reproducible.
categorical_cols = ['fuel', 'seller_type', 'transmission', 'owner']
categorical_transformer = OneHotEncoder(drop='first', sparse_output=False)
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='passthrough'  # pass through columns not specified
)
X_transformed = preprocessor.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X_transformed, y, test_size=0.2, random_state=42
)

# Sampling space. scipy's randint(low, high) excludes `high`, so
# min_samples_split is drawn from 2..19 and min_samples_leaf from 1..9.
param_dist = {
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2']
}

dt_regressor = DecisionTreeRegressor(random_state=42)

# Draw 100 random configurations, each scored with 5-fold CV. Candidates are
# ranked by (negated) mean squared error so that model selection optimises the
# same metric that is reported below. Without `scoring`, the search falls back
# to the estimator's own score, which is R^2 for regressors.
random_search = RandomizedSearchCV(
    estimator=dt_regressor,
    param_distributions=param_dist,
    n_iter=100,
    scoring='neg_mean_squared_error',
    cv=5,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best sampled combination and the estimator refitted with it on the full training set
best_params = random_search.best_params_
best_model = random_search.best_estimator_

# Evaluate the tuned model on the held-out rows
best_predictions = best_model.predict(X_test)
best_mse = mean_squared_error(y_test, best_predictions)
print("Best Mean Squared Error (Random Search):", best_mse)
print("Best Parameters:", best_params)

Best Mean Squared Error (Random Search): 171039310889.13156
Best Parameters: {'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 9}
