In [46]:
# Standard library imports
import numpy as np
import pandas as pd


# Third-party imports
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import VotingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge

## Problem One: Blending scratch mounting

Training the individual models to be compared with the blended models

In [47]:
# Load the dataset
data = pd.read_csv("train.csv")

# Select the columns for explanatory variables and the target variable
features = data[['GrLivArea', 'YearBuilt']]
target = data['SalePrice']

# Split the dataset into training and validation sets (80% training, 20% validation)
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Initialize and fit the linear regression model
linear_regression_model = LinearRegression()
linear_regression_model.fit(X_train, y_train)

# Evaluate the linear regression model
train_score = linear_regression_model.score(X_train, y_train)
y_pred_lr = linear_regression_model.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)

print("Linear Regression Model Score (R^2):", train_score)
print("Linear Regression Mean Squared Error (MSE) on Test Set:", mse_lr)

Linear Regression Model Score (R^2): 0.6466337010342488
Linear Regression Mean Squared Error (MSE) on Test Set: 2495554898.6683207


In [48]:
# Initialize and fit the SVR model
svr_model = SVR()
svr_model.fit(X_train, y_train)

# Make predictions using the SVR model
y_pred_svr = svr_model.predict(X_test)

# Evaluate the SVR model
svr_score = svr_model.score(X_test, y_test)
mse_svr = mean_squared_error(y_test, y_pred_svr)
mae_svr = mean_absolute_error(y_test, y_pred_svr)
rmse_svr = mean_squared_error(y_test, y_pred_svr, squared=False)

print("SVR Model Score (R^2):", svr_score)
print("SVR Mean Squared Error (MSE):", mse_svr)
print("SVR Mean Absolute Error (MAE):", mae_svr)
print("SVR Root Mean Squared Error (RMSE):", rmse_svr)

SVR Model Score (R^2): -0.02265677064868643
SVR Mean Squared Error (MSE): 7844111028.863974
SVR Mean Absolute Error (MAE): 59452.55819034524
SVR Root Mean Squared Error (RMSE): 88566.98611143982


In [49]:
# Initialize and fit the Decision Tree Regressor model
decision_tree_regressor = DecisionTreeRegressor()
decision_tree_regressor.fit(X_train, y_train)

# Make predictions using the Decision Tree Regressor model
y_pred_tree = decision_tree_regressor.predict(X_test)

# Calculate evaluation metrics for the Decision Tree Regressor model
mae_tree = mean_absolute_error(y_test, y_pred_tree)
mse_tree = mean_squared_error(y_test, y_pred_tree)
rmse_tree = mean_squared_error(y_test, y_pred_tree, squared=False)
r2_tree = r2_score(y_test, y_pred_tree)

print("Decision Tree Regressor Mean Absolute Error (MAE):", mae_tree)
print("Decision Tree Regressor Mean Squared Error (MSE):", mse_tree)
print("Decision Tree Regressor Root Mean Squared Error (RMSE):", rmse_tree)
print("Decision Tree Regressor R-squared (R2) Score:", r2_tree)

Decision Tree Regressor Mean Absolute Error (MAE): 32199.97317351598
Decision Tree Regressor Mean Squared Error (MSE): 2157809574.624524
Decision Tree Regressor Root Mean Squared Error (RMSE): 46452.228952166806
Decision Tree Regressor R-squared (R2) Score: 0.7186808596741755


Now we train the blended models so we can compare their performances in contrast to the individual models

In [50]:
# Train diverse models
linear_regression_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor()
random_forest_model = RandomForestRegressor()

linear_regression_model.fit(X_train, y_train)
decision_tree_model.fit(X_train, y_train)
random_forest_model.fit(X_train, y_train)

# Generate predictions for the validation data
pred_lr = linear_regression_model.predict(X_test)
pred_dt = decision_tree_model.predict(X_test)
pred_rf = random_forest_model.predict(X_test)

# Combine predictions using blending (average)
blended_pred = (pred_lr + pred_dt + pred_rf) / 3

# Evaluate blended predictions
mse_blended = mean_squared_error(y_test, blended_pred)
mae_blended = mean_absolute_error(y_test, blended_pred)
rmse_blended = mean_squared_error(y_test, blended_pred, squared=False)
r2_blended = r2_score(y_test, blended_pred)

print("Blended Model Mean Absolute Error (MAE):", mae_blended)
print("Blended Model Mean Squared Error (MSE):", mse_blended)
print("Blended Model Root Mean Squared Error (RMSE):", rmse_blended)
print("Blended Model R-squared (R2) Score:", r2_blended)

Blended Model Mean Absolute Error (MAE): 28474.657591678533
Blended Model Mean Squared Error (MSE): 1692940084.552014
Blended Model Root Mean Squared Error (RMSE): 41145.35313437004
Blended Model R-squared (R2) Score: 0.7792870813022629


In [51]:
# Train diverse models using SVR, Ridge, and Lasso
svr_model = SVR()
ridge_model = Ridge()
lasso_model = Lasso()

# Fit the models
svr_model.fit(X_train, y_train)
ridge_model.fit(X_train, y_train)
lasso_model.fit(X_train, y_train)

# Generate predictions for the test data
pred_svr = svr_model.predict(X_test)
pred_ridge = ridge_model.predict(X_test)
pred_lasso = lasso_model.predict(X_test)

# Combine predictions using blending (average)
blended_predictions = (pred_svr + pred_ridge + pred_lasso) / 3

# Evaluate blended predictions
mse_blended = mean_squared_error(y_test, blended_predictions)
mae_blended = mean_absolute_error(y_test, blended_predictions)
rmse_blended = mean_squared_error(y_test, blended_predictions, squared=False)
r2_blended = r2_score(y_test, blended_predictions)

print("Blended Model Mean Absolute Error (MAE):", mae_blended)
print("Blended Model Mean Squared Error (MSE):", mse_blended)
print("Blended Model Root Mean Squared Error (RMSE):", rmse_blended)
print("Blended Model R-squared (R2) Score:", r2_blended)

Blended Model Mean Absolute Error (MAE): 34596.97911401083
Blended Model Mean Squared Error (MSE): 3330704630.293318
Blended Model Root Mean Squared Error (RMSE): 57712.25719284698
Blended Model R-squared (R2) Score: 0.5657675383906835


In [52]:
#voting classifier for regression

In [53]:
# Initialize individual regressors
linear_regression_model = LinearRegression()
decision_tree_model = DecisionTreeRegressor()
random_forest_model = RandomForestRegressor()

# Create a list of tuples, where each tuple contains a name and the regressor
estimators = [
    ('linear', linear_regression_model),
    ('decision_tree', decision_tree_model),
    ('random_forest', random_forest_model)
]

# Initialize the voting regressor
voting_regressor = VotingRegressor(estimators)

# Fit the voting regressor on the training data
voting_regressor.fit(X_train, y_train)

# Generate predictions for the test data
y_pred_voting = voting_regressor.predict(X_test)

# Evaluate the performance of the voting regressor
mse_voting = mean_squared_error(y_test, y_pred_voting)
mae_voting = mean_absolute_error(y_test, y_pred_voting)
rmse_voting = mean_squared_error(y_test, y_pred_voting, squared=False)
r2_voting = r2_score(y_test, y_pred_voting)

print("Voting Regressor Mean Squared Error (MSE):", mse_voting)
print("Voting Regressor Mean Absolute Error (MAE):", mae_voting)
print("Voting Regressor Root Mean Squared Error (RMSE):", rmse_voting)
print("Voting Regressor R-squared (R2) Score:", r2_voting)

Voting Regressor Mean Squared Error (MSE): 1737699385.943962
Voting Regressor Mean Absolute Error (MAE): 28676.5185846732
Voting Regressor Root Mean Squared Error (RMSE): 41685.721607571606
Voting Regressor R-squared (R2) Score: 0.7734516969674992


## Problem Two: Scratch mounting of bagging

In [54]:
# Define the number of base models (trees) to train
n_estimators = 10

# Initialize a list to store the base models
base_models = []

# Initialize a list to store the predictions of each base model
predictions = []

# Train multiple instances of the base model on different subsets of the training data
for i in range(n_estimators):
    # Sample with replacement from the training data to create a bootstrap sample
    bootstrap_indices = np.random.choice(len(X_train), size=len(X_train), replace=True)
    X_bootstrap = X_train.iloc[bootstrap_indices]
    y_bootstrap = y_train.iloc[bootstrap_indices]
    
    # Initialize and train the base model (Decision Tree Regressor)
    base_model = DecisionTreeRegressor()
    base_model.fit(X_bootstrap, y_bootstrap)
    
    # Make predictions on the test data
    y_pred_base = base_model.predict(X_test)
    
    # Store the base model and its predictions
    base_models.append(base_model)
    predictions.append(y_pred_base)

# Combine the predictions of all base models by averaging
bagged_predictions = np.mean(predictions, axis=0)

# Evaluate the performance of bagging
mse_bagging = mean_squared_error(y_test, bagged_predictions)
mae_bagging = mean_absolute_error(y_test, bagged_predictions)
rmse_bagging = mean_squared_error(y_test, bagged_predictions, squared=False)
r2_bagging = r2_score(y_test, bagged_predictions)

print("Bagging Mean Absolute Error (MAE):", mae_bagging)
print("Bagging Mean Squared Error (MSE):", mse_bagging)
print("Bagging Root Mean Squared Error (RMSE):", rmse_bagging)
print("Bagging R-squared (R2) Score:", r2_bagging)

Bagging Mean Absolute Error (MAE): 28550.185803979126
Bagging Mean Squared Error (MSE): 1611641783.6645823
Bagging Root Mean Squared Error (RMSE): 40145.2585452452
Bagging R-squared (R2) Score: 0.7898861482378067


There are notable differences across the various metrics of the individual models in contrast to the ensembled models. The ensembled models seem to perform better on the test data, we can confirm this by their lower error numbers across the evaluation metrics.

## problem Three: Stacking scratch mounting

In [55]:
# Define the number of base models to train
n_estimators = 3

# Initialize lists to store base models and their predictions
base_models = []
base_predictions = []

# Split the data into training and validation sets for base models
X_train_base, X_valid_base, y_train_base, y_valid_base = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Train multiple base models
for i in range(n_estimators):
    # Initialize a base model (Decision Tree Regressor)
    base_model = DecisionTreeRegressor()
    
    # Fit the base model to the training data
    base_model.fit(X_train_base, y_train_base)
    
    # Make predictions on the validation data
    base_pred = base_model.predict(X_valid_base)
    
    # Store the base model and its predictions
    base_models.append(base_model)
    base_predictions.append(base_pred)

# Stack the predictions of base models along with the original features
X_stacked = np.column_stack(base_predictions)

# Train a meta-model (Linear Regression) on the stacked predictions
meta_model = LinearRegression()
meta_model.fit(X_stacked, y_valid_base)

# Generate predictions for the validation data using the meta-model
meta_predictions = meta_model.predict(X_stacked)

# Evaluate the performance of stacking
mse_stacking = mean_squared_error(y_valid_base, meta_predictions)
mae_stacking = mean_absolute_error(y_valid_base, meta_predictions)
rmse_stacking = mean_squared_error(y_valid_base, meta_predictions, squared=False)
r2_stacking = r2_score(y_valid_base, meta_predictions)

print("Stacking Mean Absolute Error (MAE):", mae_stacking)
print("Stacking Mean Squared Error (MSE):", mse_stacking)
print("Stacking Root Mean Squared Error (RMSE):", rmse_stacking)
print("Stacking R-squared (R2) Score:", r2_stacking)

Stacking Mean Absolute Error (MAE): 33381.17483049444
Stacking Mean Squared Error (MSE): 2487751697.93969
Stacking Root Mean Squared Error (RMSE): 49877.36658986408
Stacking R-squared (R2) Score: 0.6151261228127817
