Clone the repository

In [None]:
# Shell escape: clone the pypelines sources into ./pypelines under the current working directory.
!git clone https://github.com/Zerve-AI/pypelines.git

Installing the pypelines package

In [None]:
import os
# Directory that contains the cloned `pypelines` checkout; leave empty when the
# repository was cloned into the current working directory.
folder = ''
# os.path.join drops an empty prefix, so the default resolves to the relative
# path 'pypelines' instead of the filesystem-root path '/pypelines'.
os.chdir(os.path.join(folder, 'pypelines'))

In [None]:
!pip install .

LIST OF MODELS

MODELS FOR REGRESSION PROBLEM

In [None]:
import pypelines.supervised_pipeline as pipe
from pypelines import utils


# Ask pypelines which models it supports for model_type='regression'.
utils.list_supported_models(model_type='regression')

REGRESSION

Loading the library

In [4]:
import pypelines.supervised_pipeline as pipe
from pypelines import utils
import pandas as pd
# Load the housing sample data; the path is relative to the current working directory.
housing_csv_path = "pypelines/datasets/regression/housing.csv"
housing = pd.read_csv(housing_csv_path)

SINGLE REGRESSION

Data Load and Model Selection

In [5]:


# Configure a regression pipeline restricted to a single candidate model.
# The same frame is passed as training data and as the data to generate predictions for.
reg_pypelines_all = pipe.SupervisedPipeline(
    data=housing,
    target='median_house_value',
    predictions_data=housing,
    model_type='regression',
    models=['Random Forest Regression'],
    nfolds=5,
)

Default Hyperparameters

In [6]:
# NOTE(review): a notebook cell only auto-displays the value of its LAST expression, so
# whatever get_hyperparameters() returns is not shown here unless the method prints it
# itself -- confirm, or wrap the call in display().
reg_pypelines_all.get_hyperparameters()
# Presumably copies the generated training code to the system clipboard (inferred from
# the method name; verify against the pypelines documentation).
reg_pypelines_all.code_to_clipboard()

Printing Hyperparameters

In [None]:
# Fetch the grid-search settings currently registered for the random forest, then print them.
rf_search_settings = reg_pypelines_all.model_grid_search_settings(model_name='Random Forest Regression')
print(rf_search_settings)

Updating Hyperparameters

In [None]:
# (name, min, max, step) for every numerical hyperparameter that should be searched.
numerical_search_ranges = [
    ('n_estimators', 50, 150, 35),
    ('max_depth', 5, 50, 10),
    ('min_samples_leaf', 1, 50, 20),
]

# Settings dictionary in the layout passed to set_model_grid_search_settings:
# one list of numerical ranges and one list of categorical choices.
hyperparameter = {
    'numerical': [
        {'search': True, 'name': param_name, 'min': lower, 'max': upper, 'step': step}
        for param_name, lower, upper, step in numerical_search_ranges
    ],
    'categorical': [
        # 'search' is False here, so only the 'selected' value is listed as active.
        {'search': False, 'name': 'bootstrap', 'selected': [True], 'values': [True, False]},
    ],
}

# Register the custom search settings for the random forest and print what the call returns.
updated_rf_settings = reg_pypelines_all.set_model_grid_search_settings(
    hyperparam_dict=hyperparameter,
    model_name='Random Forest Regression',
)
print(updated_rf_settings)

Model training code generation

Training code for single regression model

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# target dataframe: housing
target = "median_house_value"
features = list(housing.columns.drop("median_house_value"))

# Rows that the best model is asked to score at the end of the run.
prediction_df = housing

# get numerical and categorical columns
# Boolean features are cast to 0/1 BEFORE the feature snapshot is taken: `housing[features]`
# is a copy, so casting afterwards would leave the snapshot's columns as bool, they would
# match none of the dtype filters below, and the ColumnTransformer (which drops columns it
# was not given) would silently discard them.
bool_cols = housing[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    housing[bool_cols] = housing[bool_cols].astype(int)
feature_df = housing[features]
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()


sample_size = np.min([10000, housing.shape[0]])
unique_theshold = np.min([100, sample_size/10])

# Measure the cardinality of every string-like column once, on a seeded sample, so that
# both reassignment rules below judge a column by the same number and re-runs agree.
column_cardinality = {
    col: housing[col].sample(sample_size, random_state=42).nunique()
    for col in categorical_cols + text_cols
}

# check categorical columns for high cardinality and make it text column
high_cardinality_cols = [col for col in categorical_cols if column_cardinality[col] > unique_theshold]

# check text columns for low cardinality and make it categorical columns
low_cardinality_cols = [col for col in text_cols if column_cardinality[col] < unique_theshold]

# Build the final lists in one step instead of removing items from a list while looping
# over it, which skips the element that follows every removal.
categorical_cols = [col for col in categorical_cols if col not in high_cardinality_cols] + low_cardinality_cols
text_cols = [col for col in text_cols if col not in low_cardinality_cols] + high_cardinality_cols

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# define numeric transformer steps
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler())]
)

# define categorical transformer steps
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# define text transformer steps
text_transformer = Pipeline(
    steps=[
        ('text', TfidfVectorizer())
    ]
)

# create the preprocessing pipelines for numeric, categorical and text data
# Each text column gets its own entry and is selected by a single column name (not a
# list), so the vectorizer receives a 1-D column of strings.
preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer , numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        *[(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols]]
)

# train test split
X = housing[features]
y = housing[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Every model section appends its metrics dataframe here for the final comparison.
model_comparison_list = []

##### End of Data Processing Pipeline #####


##### Model Pipeline for Random Forest Regression #####

from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
# Search grid; each key is "<pipeline step name>__<estimator parameter>".
random_forest_regression_param_grid = {
    "random_forest_regression__n_estimators": np.arange(50, 150, 35),
    "random_forest_regression__max_depth": np.arange(5, 50, 10),
    "random_forest_regression__min_samples_leaf": np.arange(1, 50, 20),
}


# Create the pipeline
random_forest_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('random_forest_regression', RandomForestRegressor())
])

# Create the grid search
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric must be
# negated; scoring with a plain make_scorer(mean_squared_error) selects the worst candidate.
random_forest_regression_grid_search = GridSearchCV(
    estimator=random_forest_regression_pipe,
    param_grid=random_forest_regression_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=3,
)
random_forest_regression_grid_search.fit(X_train, y_train)

# Get the best hyperparameters
random_forest_regression_best_estimator = random_forest_regression_grid_search.best_estimator_

# Store results as a dataframe
random_forest_regression_search_results = pd.DataFrame(random_forest_regression_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe is a view of the same values)
random_forest_regression_predictions = random_forest_regression_best_estimator.predict(X_test)
random_forest_regression_predictions_df = pd.DataFrame(random_forest_regression_predictions)

# Generate Model Metrics
random_forest_regression_r2_score = r2_score(y_test, random_forest_regression_predictions_df.iloc[:,0])
random_forest_regression_mean_squared_error = mean_squared_error(y_test, random_forest_regression_predictions_df.iloc[:,0])
random_forest_regression_explained_variance_score = explained_variance_score(y_test, random_forest_regression_predictions_df.iloc[:,0])
random_forest_regression_performance_metrics = [['random_forest_regression','r2_score', random_forest_regression_r2_score],
                                  ['random_forest_regression','mean_squared_error',random_forest_regression_mean_squared_error],
                                  ['random_forest_regression','explained_variance_score', random_forest_regression_explained_variance_score]]
random_forest_regression_performance_metrics = pd.DataFrame(random_forest_regression_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
random_forest_regression_actual_predicted_plot, random_forest_regression_actual_predicted_plot_ax = plt.subplots()
# NOTE(review): this rebinds the figure variable to the scatter artist, as the generated code always did.
random_forest_regression_actual_predicted_plot = random_forest_regression_actual_predicted_plot_ax.scatter(x=y_test, y=random_forest_regression_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line
random_forest_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
random_forest_regression_actual_predicted_plot_ax.set_xlabel('Actual')
random_forest_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
random_forest_regression_actual_predicted_plot_ax.set_title('random_forest_regression Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Calculate the decile edges of the predictions. Eleven edges (0th..100th percentile)
# delimit ten bins; stopping at the 90th percentile would leave the top decile out.
random_forest_regression_deciles = np.percentile(random_forest_regression_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
random_forest_regression_mean_actual = []
random_forest_regression_mean_predicted = []
for i in range(len(random_forest_regression_deciles) - 1):
    mask = random_forest_regression_predictions >= random_forest_regression_deciles[i]
    if i == len(random_forest_regression_deciles) - 2:
        # The last bin is closed on the right so the largest prediction is counted.
        mask &= random_forest_regression_predictions <= random_forest_regression_deciles[i + 1]
    else:
        mask &= random_forest_regression_predictions < random_forest_regression_deciles[i + 1]
    random_forest_regression_mean_actual.append(np.mean(y_test[mask]))
    random_forest_regression_mean_predicted.append(np.mean(random_forest_regression_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
random_forest_regression_lift_plot, random_forest_regression_lift_plot_ax = plt.subplots()
random_forest_regression_lift_plot_ax.bar(np.arange(len(random_forest_regression_mean_actual)), random_forest_regression_mean_actual, label='Actual')
random_forest_regression_lift_plot_ax.plot(np.arange(len(random_forest_regression_mean_predicted)), random_forest_regression_mean_predicted, color='red', linewidth=2, label='Predicted')
random_forest_regression_lift_plot_ax.set_xlabel('Deciles')
random_forest_regression_lift_plot_ax.set_ylabel('Mean')
random_forest_regression_lift_plot_ax.set_title('random_forest_regression Decile Analysis Chart')
random_forest_regression_lift_plot_ax.legend()
plt.show(block=False)


# Register this model's metrics for the final comparison.
model_comparison_list.append(random_forest_regression_performance_metrics)
##### End of Model Pipeline for Random Forest Regression #####
##### Model Comparison #####

# Stack every model's metric rows, order them by value (largest first) and keep only the
# r2_score rows, so row 0 belongs to the model with the highest R^2.
table = (
    pd.concat(model_comparison_list)
    .sort_values(by=['value'], ascending=False)
    .loc[lambda metrics: metrics['metric'] == 'r2_score']
)
print(table)

best_model_name = table['model'].iloc[0]
print(f"The best model is {best_model_name} with {table['value'].iloc[0]} as {table['metric'].iloc[0]}")


# Predict test data using the best model
# Each model section stores its tuned pipeline in a variable named "<model>_best_estimator";
# eval resolves that variable from its name.
best_estimator_variable = best_model_name + "_best_estimator"
test_predictions = eval(best_estimator_variable).predict(prediction_df)
print('Predictions from best model are stored in test_predictions')


Multiple Regression

In [23]:

# Configure a regression pipeline that compares two candidate models.
# The same frame is passed as training data and as the data to generate predictions for.
reg_pypelines_all = pipe.SupervisedPipeline(
    data=housing,
    target='median_house_value',
    predictions_data=housing,
    model_type='regression',
    models=['Linear Regression', 'AdaBoost Regression'],
    nfolds=5,
)

Default Hyperparameters

In [None]:
# Last expression of the cell, so its return value (if any) is displayed by the notebook.
reg_pypelines_all.get_hyperparameters()

Model training code generation

In [25]:
# Presumably copies the generated training code for the selected models to the system
# clipboard (inferred from the method name; verify against the pypelines documentation).
reg_pypelines_all.code_to_clipboard()

Training code for multiple regression model

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# target dataframe: housing
target = "median_house_value"
features = list(housing.columns.drop("median_house_value"))

# Rows that the best model is asked to score at the end of the run.
prediction_df = housing

# get numerical and categorical columns
# Boolean features are cast to 0/1 BEFORE the feature snapshot is taken: `housing[features]`
# is a copy, so casting afterwards would leave the snapshot's columns as bool, they would
# match none of the dtype filters below, and the ColumnTransformer (which drops columns it
# was not given) would silently discard them.
bool_cols = housing[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    housing[bool_cols] = housing[bool_cols].astype(int)
feature_df = housing[features]
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()


sample_size = np.min([10000, housing.shape[0]])
unique_theshold = np.min([100, sample_size/10])

# Measure the cardinality of every string-like column once, on a seeded sample, so that
# both reassignment rules below judge a column by the same number and re-runs agree.
column_cardinality = {
    col: housing[col].sample(sample_size, random_state=42).nunique()
    for col in categorical_cols + text_cols
}

# check categorical columns for high cardinality and make it text column
high_cardinality_cols = [col for col in categorical_cols if column_cardinality[col] > unique_theshold]

# check text columns for low cardinality and make it categorical columns
low_cardinality_cols = [col for col in text_cols if column_cardinality[col] < unique_theshold]

# Build the final lists in one step instead of removing items from a list while looping
# over it, which skips the element that follows every removal.
categorical_cols = [col for col in categorical_cols if col not in high_cardinality_cols] + low_cardinality_cols
text_cols = [col for col in text_cols if col not in low_cardinality_cols] + high_cardinality_cols

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# define numeric transformer steps
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler())]
)

# define categorical transformer steps
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# define text transformer steps
text_transformer = Pipeline(
    steps=[
        ('text', TfidfVectorizer())
    ]
)

# create the preprocessing pipelines for numeric, categorical and text data
# Each text column gets its own entry and is selected by a single column name (not a
# list), so the vectorizer receives a 1-D column of strings.
preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer , numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        *[(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols]]
)

# train test split
X = housing[features]
y = housing[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Every model section appends its metrics dataframe here for the final comparison.
model_comparison_list = []

##### End of Data Processing Pipeline #####


##### Model Pipeline for Linear Regression #####

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
# Linear regression has no hyperparameters to search; the empty grid makes the grid
# search evaluate the single default configuration.
lin_reg_param_grid = {
}


# Create the pipeline
lin_reg_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('lin_reg', LinearRegression())
])

# Create the grid search
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric must be
# negated; scoring with a plain make_scorer(mean_squared_error) selects the worst candidate.
lin_reg_grid_search = GridSearchCV(
    estimator=lin_reg_pipe,
    param_grid=lin_reg_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=3,
)
lin_reg_grid_search.fit(X_train, y_train)

# Get the best hyperparameters
lin_reg_best_estimator = lin_reg_grid_search.best_estimator_

# Store results as a dataframe
lin_reg_search_results = pd.DataFrame(lin_reg_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe is a view of the same values)
lin_reg_predictions = lin_reg_best_estimator.predict(X_test)
lin_reg_predictions_df = pd.DataFrame(lin_reg_predictions)

# Generate Model Metrics
lin_reg_r2_score = r2_score(y_test, lin_reg_predictions_df.iloc[:,0])
lin_reg_mean_squared_error = mean_squared_error(y_test, lin_reg_predictions_df.iloc[:,0])
lin_reg_explained_variance_score = explained_variance_score(y_test, lin_reg_predictions_df.iloc[:,0])
lin_reg_performance_metrics = [['lin_reg','r2_score', lin_reg_r2_score],
                                  ['lin_reg','mean_squared_error',lin_reg_mean_squared_error],
                                  ['lin_reg','explained_variance_score', lin_reg_explained_variance_score]]
lin_reg_performance_metrics = pd.DataFrame(lin_reg_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
lin_reg_actual_predicted_plot, lin_reg_actual_predicted_plot_ax = plt.subplots()
# NOTE(review): this rebinds the figure variable to the scatter artist, as the generated code always did.
lin_reg_actual_predicted_plot = lin_reg_actual_predicted_plot_ax.scatter(x=y_test, y=lin_reg_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line
lin_reg_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
lin_reg_actual_predicted_plot_ax.set_xlabel('Actual')
lin_reg_actual_predicted_plot_ax.set_ylabel('Predicted')
lin_reg_actual_predicted_plot_ax.set_title('lin_reg Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Calculate the decile edges of the predictions. Eleven edges (0th..100th percentile)
# delimit ten bins; stopping at the 90th percentile would leave the top decile out.
lin_reg_deciles = np.percentile(lin_reg_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
lin_reg_mean_actual = []
lin_reg_mean_predicted = []
for i in range(len(lin_reg_deciles) - 1):
    mask = lin_reg_predictions >= lin_reg_deciles[i]
    if i == len(lin_reg_deciles) - 2:
        # The last bin is closed on the right so the largest prediction is counted.
        mask &= lin_reg_predictions <= lin_reg_deciles[i + 1]
    else:
        mask &= lin_reg_predictions < lin_reg_deciles[i + 1]
    lin_reg_mean_actual.append(np.mean(y_test[mask]))
    lin_reg_mean_predicted.append(np.mean(lin_reg_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
lin_reg_lift_plot, lin_reg_lift_plot_ax = plt.subplots()
lin_reg_lift_plot_ax.bar(np.arange(len(lin_reg_mean_actual)), lin_reg_mean_actual, label='Actual')
lin_reg_lift_plot_ax.plot(np.arange(len(lin_reg_mean_predicted)), lin_reg_mean_predicted, color='red', linewidth=2, label='Predicted')
lin_reg_lift_plot_ax.set_xlabel('Deciles')
lin_reg_lift_plot_ax.set_ylabel('Mean')
lin_reg_lift_plot_ax.set_title('lin_reg Decile Analysis Chart')
lin_reg_lift_plot_ax.legend()
plt.show(block=False)


# Register this model's metrics for the final comparison.
model_comparison_list.append(lin_reg_performance_metrics)
##### End of Model Pipeline for Linear Regression #####
##### Model Pipeline for AdaBoost Regression #####

from sklearn.ensemble import AdaBoostRegressor 
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt
# Search grid; each key is "<pipeline step name>__<estimator parameter>".
adaboost_regression_param_grid = {
    "adaboost_regression__n_estimators": np.arange(10, 100, 20),
}


# Create the pipeline
adaboost_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('adaboost_regression', AdaBoostRegressor())
])

# Create the grid search
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric must be
# negated; scoring with a plain make_scorer(mean_squared_error) selects the worst candidate.
adaboost_regression_grid_search = GridSearchCV(
    estimator=adaboost_regression_pipe,
    param_grid=adaboost_regression_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=3,
)
adaboost_regression_grid_search.fit(X_train, y_train)

# Get the best hyperparameters
adaboost_regression_best_estimator = adaboost_regression_grid_search.best_estimator_

# Store results as a dataframe
adaboost_regression_search_results = pd.DataFrame(adaboost_regression_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe is a view of the same values)
adaboost_regression_predictions = adaboost_regression_best_estimator.predict(X_test)
adaboost_regression_predictions_df = pd.DataFrame(adaboost_regression_predictions)

# Generate Model Metrics
adaboost_regression_r2_score = r2_score(y_test, adaboost_regression_predictions_df.iloc[:,0])
adaboost_regression_mean_squared_error = mean_squared_error(y_test, adaboost_regression_predictions_df.iloc[:,0])
adaboost_regression_explained_variance_score = explained_variance_score(y_test, adaboost_regression_predictions_df.iloc[:,0])
adaboost_regression_performance_metrics = [['adaboost_regression','r2_score', adaboost_regression_r2_score],
                                  ['adaboost_regression','mean_squared_error',adaboost_regression_mean_squared_error],
                                  ['adaboost_regression','explained_variance_score', adaboost_regression_explained_variance_score]]
adaboost_regression_performance_metrics = pd.DataFrame(adaboost_regression_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
adaboost_regression_actual_predicted_plot, adaboost_regression_actual_predicted_plot_ax = plt.subplots()
# NOTE(review): this rebinds the figure variable to the scatter artist, as the generated code always did.
adaboost_regression_actual_predicted_plot = adaboost_regression_actual_predicted_plot_ax.scatter(x=y_test, y=adaboost_regression_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line
adaboost_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
adaboost_regression_actual_predicted_plot_ax.set_xlabel('Actual')
adaboost_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
adaboost_regression_actual_predicted_plot_ax.set_title('adaboost_regression Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Calculate the decile edges of the predictions. Eleven edges (0th..100th percentile)
# delimit ten bins; stopping at the 90th percentile would leave the top decile out.
adaboost_regression_deciles = np.percentile(adaboost_regression_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
adaboost_regression_mean_actual = []
adaboost_regression_mean_predicted = []
for i in range(len(adaboost_regression_deciles) - 1):
    mask = adaboost_regression_predictions >= adaboost_regression_deciles[i]
    if i == len(adaboost_regression_deciles) - 2:
        # The last bin is closed on the right so the largest prediction is counted.
        mask &= adaboost_regression_predictions <= adaboost_regression_deciles[i + 1]
    else:
        mask &= adaboost_regression_predictions < adaboost_regression_deciles[i + 1]
    adaboost_regression_mean_actual.append(np.mean(y_test[mask]))
    adaboost_regression_mean_predicted.append(np.mean(adaboost_regression_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
adaboost_regression_lift_plot, adaboost_regression_lift_plot_ax = plt.subplots()
adaboost_regression_lift_plot_ax.bar(np.arange(len(adaboost_regression_mean_actual)), adaboost_regression_mean_actual, label='Actual')
adaboost_regression_lift_plot_ax.plot(np.arange(len(adaboost_regression_mean_predicted)), adaboost_regression_mean_predicted, color='red', linewidth=2, label='Predicted')
adaboost_regression_lift_plot_ax.set_xlabel('Deciles')
adaboost_regression_lift_plot_ax.set_ylabel('Mean')
adaboost_regression_lift_plot_ax.set_title('adaboost_regression Decile Analysis Chart')
adaboost_regression_lift_plot_ax.legend()
plt.show(block=False)


# Register this model's metrics for the final comparison.
model_comparison_list.append(adaboost_regression_performance_metrics)
##### End of Model Pipeline for AdaBoost Regression #####
##### Model Comparison #####

# Stack every model's metric rows, order them by value (largest first) and keep only the
# r2_score rows, so row 0 belongs to the model with the highest R^2.
table = (
    pd.concat(model_comparison_list)
    .sort_values(by=['value'], ascending=False)
    .loc[lambda metrics: metrics['metric'] == 'r2_score']
)
print(table)

best_model_name = table['model'].iloc[0]
print(f"The best model is {best_model_name} with {table['value'].iloc[0]} as {table['metric'].iloc[0]}")


# Predict test data using the best model
# Each model section stores its tuned pipeline in a variable named "<model>_best_estimator";
# eval resolves that variable from its name.
best_estimator_variable = best_model_name + "_best_estimator"
test_predictions = eval(best_estimator_variable).predict(prediction_df)
print('Predictions from best model are stored in test_predictions')


REGRESSION MODEL - DEFAULT RUN

In [14]:
# Configure a regression pipeline without a `models` argument, leaving the choice of
# candidate models to the library.
reg_pypelines_all = pipe.SupervisedPipeline(
    data=housing,
    target='median_house_value',
    predictions_data=housing,
    model_type='regression',
    nfolds=5,
)

Default Hyperparameters

In [None]:
# NOTE(review): a notebook cell only auto-displays the value of its LAST expression, so
# whatever get_hyperparameters() returns is not shown here unless the method prints it
# itself -- confirm, or wrap the call in display().
reg_pypelines_all.get_hyperparameters()
# Last expression of the cell: its return value (if any) is what the notebook displays.
reg_pypelines_all.model_list()

Model training code generation for regression default run

In [16]:
# Presumably copies the generated training code for the default model set to the system
# clipboard (inferred from the method name; verify against the pypelines documentation).
reg_pypelines_all.code_to_clipboard()

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


# target dataframe: housing
target = "median_house_value"
features = list(housing.columns.drop("median_house_value"))

# Rows that the best model is asked to score at the end of the run.
prediction_df = housing

# get numerical and categorical columns
# Boolean features are cast to 0/1 BEFORE the feature snapshot is taken: `housing[features]`
# is a copy, so casting afterwards would leave the snapshot's columns as bool, they would
# match none of the dtype filters below, and the ColumnTransformer (which drops columns it
# was not given) would silently discard them.
bool_cols = housing[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    housing[bool_cols] = housing[bool_cols].astype(int)
feature_df = housing[features]
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()


sample_size = np.min([10000, housing.shape[0]])
unique_theshold = np.min([100, sample_size/10])

# Measure the cardinality of every string-like column once, on a seeded sample, so that
# both reassignment rules below judge a column by the same number and re-runs agree.
column_cardinality = {
    col: housing[col].sample(sample_size, random_state=42).nunique()
    for col in categorical_cols + text_cols
}

# check categorical columns for high cardinality and make it text column
high_cardinality_cols = [col for col in categorical_cols if column_cardinality[col] > unique_theshold]

# check text columns for low cardinality and make it categorical columns
low_cardinality_cols = [col for col in text_cols if column_cardinality[col] < unique_theshold]

# Build the final lists in one step instead of removing items from a list while looping
# over it, which skips the element that follows every removal.
categorical_cols = [col for col in categorical_cols if col not in high_cardinality_cols] + low_cardinality_cols
text_cols = [col for col in text_cols if col not in low_cardinality_cols] + high_cardinality_cols

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# define numeric transformer steps
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", MinMaxScaler())]
)

# define categorical transformer steps
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# define text transformer steps
text_transformer = Pipeline(
    steps=[
        ('text', TfidfVectorizer())
    ]
)

# create the preprocessing pipelines for numeric, categorical and text data
# Each text column gets its own entry and is selected by a single column name (not a
# list), so the vectorizer receives a 1-D column of strings.
preprocessor = ColumnTransformer(
        transformers=[('num', numeric_transformer , numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
        *[(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols]]
)

# train test split
X = housing[features]
y = housing[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Every model section appends its metrics dataframe here for the final comparison.
model_comparison_list = []

##### End of Data Processing Pipeline #####


##### Model Pipeline for Elastic Net Regression #####

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt
# Search grid; each key is "<pipeline step name>__<estimator parameter>".
elastic_net_regression_param_grid = {
    "elastic_net_regression__alpha": np.arange(0.1, 2.0, 0.5),
    "elastic_net_regression__l1_ratio": np.arange(0.1, 1.0, 0.3),
}


# Create the pipeline
elastic_net_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('elastic_net_regression', ElasticNet())
])

# Create the grid search
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric must be
# negated; scoring with a plain make_scorer(mean_squared_error) selects the worst candidate.
elastic_net_regression_grid_search = GridSearchCV(
    estimator=elastic_net_regression_pipe,
    param_grid=elastic_net_regression_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=3,
)
elastic_net_regression_grid_search.fit(X_train, y_train)

# Get the best hyperparameters
elastic_net_regression_best_estimator = elastic_net_regression_grid_search.best_estimator_

# Store results as a dataframe
elastic_net_regression_search_results = pd.DataFrame(elastic_net_regression_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe is a view of the same values)
elastic_net_regression_predictions = elastic_net_regression_best_estimator.predict(X_test)
elastic_net_regression_predictions_df = pd.DataFrame(elastic_net_regression_predictions)

# Generate Model Metrics
elastic_net_regression_r2_score = r2_score(y_test, elastic_net_regression_predictions_df.iloc[:,0])
elastic_net_regression_mean_squared_error = mean_squared_error(y_test, elastic_net_regression_predictions_df.iloc[:,0])
elastic_net_regression_explained_variance_score = explained_variance_score(y_test, elastic_net_regression_predictions_df.iloc[:,0])
elastic_net_regression_performance_metrics = [['elastic_net_regression','r2_score', elastic_net_regression_r2_score],
                                  ['elastic_net_regression','mean_squared_error',elastic_net_regression_mean_squared_error],
                                  ['elastic_net_regression','explained_variance_score', elastic_net_regression_explained_variance_score]]
elastic_net_regression_performance_metrics = pd.DataFrame(elastic_net_regression_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
elastic_net_regression_actual_predicted_plot, elastic_net_regression_actual_predicted_plot_ax = plt.subplots()
# NOTE(review): this rebinds the figure variable to the scatter artist, as the generated code always did.
elastic_net_regression_actual_predicted_plot = elastic_net_regression_actual_predicted_plot_ax.scatter(x=y_test, y=elastic_net_regression_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line
elastic_net_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
elastic_net_regression_actual_predicted_plot_ax.set_xlabel('Actual')
elastic_net_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
elastic_net_regression_actual_predicted_plot_ax.set_title('elastic_net_regression Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Calculate the decile edges of the predictions. Eleven edges (0th..100th percentile)
# delimit ten bins; stopping at the 90th percentile would leave the top decile out.
elastic_net_regression_deciles = np.percentile(elastic_net_regression_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
elastic_net_regression_mean_actual = []
elastic_net_regression_mean_predicted = []
for i in range(len(elastic_net_regression_deciles) - 1):
    mask = elastic_net_regression_predictions >= elastic_net_regression_deciles[i]
    if i == len(elastic_net_regression_deciles) - 2:
        # The last bin is closed on the right so the largest prediction is counted.
        mask &= elastic_net_regression_predictions <= elastic_net_regression_deciles[i + 1]
    else:
        mask &= elastic_net_regression_predictions < elastic_net_regression_deciles[i + 1]
    elastic_net_regression_mean_actual.append(np.mean(y_test[mask]))
    elastic_net_regression_mean_predicted.append(np.mean(elastic_net_regression_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
elastic_net_regression_lift_plot, elastic_net_regression_lift_plot_ax = plt.subplots()
elastic_net_regression_lift_plot_ax.bar(np.arange(len(elastic_net_regression_mean_actual)), elastic_net_regression_mean_actual, label='Actual')
elastic_net_regression_lift_plot_ax.plot(np.arange(len(elastic_net_regression_mean_predicted)), elastic_net_regression_mean_predicted, color='red', linewidth=2, label='Predicted')
elastic_net_regression_lift_plot_ax.set_xlabel('Deciles')
elastic_net_regression_lift_plot_ax.set_ylabel('Mean')
elastic_net_regression_lift_plot_ax.set_title('elastic_net_regression Decile Analysis Chart')
elastic_net_regression_lift_plot_ax.legend()
plt.show(block=False)


# Register this model's metrics for the final comparison.
model_comparison_list.append(elastic_net_regression_performance_metrics)
##### End of Model Pipeline for Elastic Net Regression #####
##### Model Pipeline for Linear Regression #####

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
# Linear regression has no hyperparameters to search; the empty grid makes the grid
# search evaluate the single default configuration.
lin_reg_param_grid = {
}


# Create the pipeline
lin_reg_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('lin_reg', LinearRegression())
])

# Create the grid search
# GridSearchCV keeps the candidate with the HIGHEST score, so an error metric must be
# negated; scoring with a plain make_scorer(mean_squared_error) selects the worst candidate.
lin_reg_grid_search = GridSearchCV(
    estimator=lin_reg_pipe,
    param_grid=lin_reg_param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=3,
)
lin_reg_grid_search.fit(X_train, y_train)

# Get the best hyperparameters
lin_reg_best_estimator = lin_reg_grid_search.best_estimator_

# Store results as a dataframe
lin_reg_search_results = pd.DataFrame(lin_reg_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe is a view of the same values)
lin_reg_predictions = lin_reg_best_estimator.predict(X_test)
lin_reg_predictions_df = pd.DataFrame(lin_reg_predictions)

# Generate Model Metrics
lin_reg_r2_score = r2_score(y_test, lin_reg_predictions_df.iloc[:,0])
lin_reg_mean_squared_error = mean_squared_error(y_test, lin_reg_predictions_df.iloc[:,0])
lin_reg_explained_variance_score = explained_variance_score(y_test, lin_reg_predictions_df.iloc[:,0])
lin_reg_performance_metrics = [['lin_reg','r2_score', lin_reg_r2_score],
                                  ['lin_reg','mean_squared_error',lin_reg_mean_squared_error],
                                  ['lin_reg','explained_variance_score', lin_reg_explained_variance_score]]
lin_reg_performance_metrics = pd.DataFrame(lin_reg_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
lin_reg_actual_predicted_plot, lin_reg_actual_predicted_plot_ax = plt.subplots()
# NOTE(review): this rebinds the figure variable to the scatter artist, as the generated code always did.
lin_reg_actual_predicted_plot = lin_reg_actual_predicted_plot_ax.scatter(x=y_test, y=lin_reg_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line
lin_reg_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
lin_reg_actual_predicted_plot_ax.set_xlabel('Actual')
lin_reg_actual_predicted_plot_ax.set_ylabel('Predicted')
lin_reg_actual_predicted_plot_ax.set_title('lin_reg Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Calculate the decile edges of the predictions. Eleven edges (0th..100th percentile)
# delimit ten bins; stopping at the 90th percentile would leave the top decile out.
lin_reg_deciles = np.percentile(lin_reg_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
lin_reg_mean_actual = []
lin_reg_mean_predicted = []
for i in range(len(lin_reg_deciles) - 1):
    mask = lin_reg_predictions >= lin_reg_deciles[i]
    if i == len(lin_reg_deciles) - 2:
        # The last bin is closed on the right so the largest prediction is counted.
        mask &= lin_reg_predictions <= lin_reg_deciles[i + 1]
    else:
        mask &= lin_reg_predictions < lin_reg_deciles[i + 1]
    lin_reg_mean_actual.append(np.mean(y_test[mask]))
    lin_reg_mean_predicted.append(np.mean(lin_reg_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
lin_reg_lift_plot, lin_reg_lift_plot_ax = plt.subplots()
lin_reg_lift_plot_ax.bar(np.arange(len(lin_reg_mean_actual)), lin_reg_mean_actual, label='Actual')
lin_reg_lift_plot_ax.plot(np.arange(len(lin_reg_mean_predicted)), lin_reg_mean_predicted, color='red', linewidth=2, label='Predicted')
lin_reg_lift_plot_ax.set_xlabel('Deciles')
lin_reg_lift_plot_ax.set_ylabel('Mean')
lin_reg_lift_plot_ax.set_title('lin_reg Decile Analysis Chart')
lin_reg_lift_plot_ax.legend()
plt.show(block=False)


# Register this model's metrics for the final comparison.
model_comparison_list.append(lin_reg_performance_metrics)
##### End of Model Pipeline for Linear Regression #####
##### Model Pipeline for Lasso Regression #####

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt

# Hyperparameter grid; keys use the "<pipeline step>__<parameter>" form.
# NOTE(review): the grid includes alpha=0.0; scikit-learn advises against
# alpha=0 for Lasso (it is plain least squares) -- confirm this is intended.
lasso_regression_param_grid = {
    "lasso_regression__alpha": np.arange(0.0, 2.0, 0.5),
}


# Create the pipeline: shared preprocessor followed by the estimator
lasso_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso_regression', Lasso())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so the MSE scorer
# must be built with greater_is_better=False (the score is negated); a plain
# make_scorer(mean_squared_error) would select the largest-error candidate.
lasso_regression_grid_search = GridSearchCV(
    estimator=lasso_regression_pipe,
    param_grid=lasso_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=3,
)
lasso_regression_grid_search.fit(X_train, y_train)

# Get the best pipeline found by the search
lasso_regression_best_estimator = lasso_regression_grid_search.best_estimator_

# Store results as a dataframe
lasso_regression_search_results = pd.DataFrame(lasso_regression_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe wraps the same values)
lasso_regression_predictions = lasso_regression_best_estimator.predict(X_test)
lasso_regression_predictions_df = pd.DataFrame(lasso_regression_predictions)

# Generate Model Metrics
lasso_regression_r2_score = r2_score(y_test, lasso_regression_predictions_df.iloc[:,0])
lasso_regression_mean_squared_error = mean_squared_error(y_test, lasso_regression_predictions_df.iloc[:,0])
lasso_regression_explained_variance_score = explained_variance_score(y_test, lasso_regression_predictions_df.iloc[:,0])
# Long-format metrics table: one row per (model, metric, value)
lasso_regression_performance_metrics = [['lasso_regression','r2_score', lasso_regression_r2_score],
                                  ['lasso_regression','mean_squared_error',lasso_regression_mean_squared_error],
                                  ['lasso_regression','explained_variance_score', lasso_regression_explained_variance_score]]
lasso_regression_performance_metrics = pd.DataFrame(lasso_regression_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
lasso_regression_actual_predicted_plot, lasso_regression_actual_predicted_plot_ax = plt.subplots()
# Note: the figure name is rebound to the scatter artist returned by Axes.scatter
lasso_regression_actual_predicted_plot = lasso_regression_actual_predicted_plot_ax.scatter(x=y_test, y=lasso_regression_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect-prediction reference spanning the range of y_test)
lasso_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
lasso_regression_actual_predicted_plot_ax.set_xlabel('Actual')
lasso_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
lasso_regression_actual_predicted_plot_ax.set_title('lasso_regression Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Decile edges of the predictions: 0th..100th percentile in steps of 10 gives
# 11 edges, i.e. 10 bins (stopping at the 90th percentile would drop the top decile).
lasso_regression_deciles = np.percentile(lasso_regression_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
lasso_regression_mean_actual = []
lasso_regression_mean_predicted = []
for i in range(len(lasso_regression_deciles) - 1):
    if i < len(lasso_regression_deciles) - 2:
        # Interior bins are half-open: [lower edge, upper edge)
        mask = (lasso_regression_predictions >= lasso_regression_deciles[i]) & (lasso_regression_predictions < lasso_regression_deciles[i + 1])
    else:
        # The last bin is closed on the right so the maximum prediction is counted
        mask = (lasso_regression_predictions >= lasso_regression_deciles[i]) & (lasso_regression_predictions <= lasso_regression_deciles[i + 1])
    lasso_regression_mean_actual.append(np.mean(y_test[mask]))
    lasso_regression_mean_predicted.append(np.mean(lasso_regression_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
lasso_regression_lift_plot, lasso_regression_lift_plot_ax = plt.subplots()
lasso_regression_lift_plot_ax.bar(np.arange(len(lasso_regression_mean_actual)), lasso_regression_mean_actual, label='Actual')
lasso_regression_lift_plot_ax.plot(np.arange(len(lasso_regression_mean_predicted)), lasso_regression_mean_predicted, color='red', linewidth=2, label='Predicted')
lasso_regression_lift_plot_ax.set_xlabel('Deciles')
lasso_regression_lift_plot_ax.set_ylabel('Mean')
lasso_regression_lift_plot_ax.set_title('lasso_regression Decile Analysis Chart')
lasso_regression_lift_plot_ax.legend()
plt.show(block=False)


# Collect this model's metrics for the final model comparison
model_comparison_list.append(lasso_regression_performance_metrics)
##### End of Model Pipeline for Lasso Regression #####
##### Model Pipeline for Ridge Regression #####

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt

# Hyperparameter grid; keys use the "<pipeline step>__<parameter>" form
ridge_regression_param_grid = {
    "ridge_regression__alpha": np.arange(0.1, 2.0, 0.5),
}


# Create the pipeline: shared preprocessor followed by the estimator
ridge_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge_regression', Ridge())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so the MSE scorer
# must be built with greater_is_better=False (the score is negated); a plain
# make_scorer(mean_squared_error) would select the largest-error candidate.
ridge_regression_grid_search = GridSearchCV(
    estimator=ridge_regression_pipe,
    param_grid=ridge_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=3,
)
ridge_regression_grid_search.fit(X_train, y_train)

# Get the best pipeline found by the search
ridge_regression_best_estimator = ridge_regression_grid_search.best_estimator_

# Store results as a dataframe
ridge_regression_search_results = pd.DataFrame(ridge_regression_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe wraps the same values)
ridge_regression_predictions = ridge_regression_best_estimator.predict(X_test)
ridge_regression_predictions_df = pd.DataFrame(ridge_regression_predictions)

# Generate Model Metrics
ridge_regression_r2_score = r2_score(y_test, ridge_regression_predictions_df.iloc[:,0])
ridge_regression_mean_squared_error = mean_squared_error(y_test, ridge_regression_predictions_df.iloc[:,0])
ridge_regression_explained_variance_score = explained_variance_score(y_test, ridge_regression_predictions_df.iloc[:,0])
# Long-format metrics table: one row per (model, metric, value)
ridge_regression_performance_metrics = [['ridge_regression','r2_score', ridge_regression_r2_score],
                                  ['ridge_regression','mean_squared_error',ridge_regression_mean_squared_error],
                                  ['ridge_regression','explained_variance_score', ridge_regression_explained_variance_score]]
ridge_regression_performance_metrics = pd.DataFrame(ridge_regression_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
ridge_regression_actual_predicted_plot, ridge_regression_actual_predicted_plot_ax = plt.subplots()
# Note: the figure name is rebound to the scatter artist returned by Axes.scatter
ridge_regression_actual_predicted_plot = ridge_regression_actual_predicted_plot_ax.scatter(x=y_test, y=ridge_regression_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect-prediction reference spanning the range of y_test)
ridge_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
ridge_regression_actual_predicted_plot_ax.set_xlabel('Actual')
ridge_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
ridge_regression_actual_predicted_plot_ax.set_title('ridge_regression Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Decile edges of the predictions: 0th..100th percentile in steps of 10 gives
# 11 edges, i.e. 10 bins (stopping at the 90th percentile would drop the top decile).
ridge_regression_deciles = np.percentile(ridge_regression_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
ridge_regression_mean_actual = []
ridge_regression_mean_predicted = []
for i in range(len(ridge_regression_deciles) - 1):
    if i < len(ridge_regression_deciles) - 2:
        # Interior bins are half-open: [lower edge, upper edge)
        mask = (ridge_regression_predictions >= ridge_regression_deciles[i]) & (ridge_regression_predictions < ridge_regression_deciles[i + 1])
    else:
        # The last bin is closed on the right so the maximum prediction is counted
        mask = (ridge_regression_predictions >= ridge_regression_deciles[i]) & (ridge_regression_predictions <= ridge_regression_deciles[i + 1])
    ridge_regression_mean_actual.append(np.mean(y_test[mask]))
    ridge_regression_mean_predicted.append(np.mean(ridge_regression_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
ridge_regression_lift_plot, ridge_regression_lift_plot_ax = plt.subplots()
ridge_regression_lift_plot_ax.bar(np.arange(len(ridge_regression_mean_actual)), ridge_regression_mean_actual, label='Actual')
ridge_regression_lift_plot_ax.plot(np.arange(len(ridge_regression_mean_predicted)), ridge_regression_mean_predicted, color='red', linewidth=2, label='Predicted')
ridge_regression_lift_plot_ax.set_xlabel('Deciles')
ridge_regression_lift_plot_ax.set_ylabel('Mean')
ridge_regression_lift_plot_ax.set_title('ridge_regression Decile Analysis Chart')
ridge_regression_lift_plot_ax.legend()
plt.show(block=False)


# Collect this model's metrics for the final model comparison
model_comparison_list.append(ridge_regression_performance_metrics)
##### End of Model Pipeline for Ridge Regression #####
##### Model Pipeline for Random Forest Regression #####

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Hyperparameter grid; keys use the "<pipeline step>__<parameter>" form
random_forest_regression_param_grid = {
    "random_forest_regression__n_estimators": np.arange(50, 150, 35),
    "random_forest_regression__max_depth": np.arange(5, 50, 10),
    "random_forest_regression__min_samples_leaf": np.arange(1, 50, 20),
}


# Create the pipeline: shared preprocessor followed by the estimator
random_forest_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('random_forest_regression', RandomForestRegressor())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so the MSE scorer
# must be built with greater_is_better=False (the score is negated); a plain
# make_scorer(mean_squared_error) would select the largest-error candidate.
random_forest_regression_grid_search = GridSearchCV(
    estimator=random_forest_regression_pipe,
    param_grid=random_forest_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=3,
)
random_forest_regression_grid_search.fit(X_train, y_train)

# Get the best pipeline found by the search
random_forest_regression_best_estimator = random_forest_regression_grid_search.best_estimator_

# Store results as a dataframe
random_forest_regression_search_results = pd.DataFrame(random_forest_regression_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe wraps the same values)
random_forest_regression_predictions = random_forest_regression_best_estimator.predict(X_test)
random_forest_regression_predictions_df = pd.DataFrame(random_forest_regression_predictions)

# Generate Model Metrics
random_forest_regression_r2_score = r2_score(y_test, random_forest_regression_predictions_df.iloc[:,0])
random_forest_regression_mean_squared_error = mean_squared_error(y_test, random_forest_regression_predictions_df.iloc[:,0])
random_forest_regression_explained_variance_score = explained_variance_score(y_test, random_forest_regression_predictions_df.iloc[:,0])
# Long-format metrics table: one row per (model, metric, value)
random_forest_regression_performance_metrics = [['random_forest_regression','r2_score', random_forest_regression_r2_score],
                                  ['random_forest_regression','mean_squared_error',random_forest_regression_mean_squared_error],
                                  ['random_forest_regression','explained_variance_score', random_forest_regression_explained_variance_score]]
random_forest_regression_performance_metrics = pd.DataFrame(random_forest_regression_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
random_forest_regression_actual_predicted_plot, random_forest_regression_actual_predicted_plot_ax = plt.subplots()
# Note: the figure name is rebound to the scatter artist returned by Axes.scatter
random_forest_regression_actual_predicted_plot = random_forest_regression_actual_predicted_plot_ax.scatter(x=y_test, y=random_forest_regression_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect-prediction reference spanning the range of y_test)
random_forest_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
random_forest_regression_actual_predicted_plot_ax.set_xlabel('Actual')
random_forest_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
random_forest_regression_actual_predicted_plot_ax.set_title('random_forest_regression Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Decile edges of the predictions: 0th..100th percentile in steps of 10 gives
# 11 edges, i.e. 10 bins (stopping at the 90th percentile would drop the top decile).
random_forest_regression_deciles = np.percentile(random_forest_regression_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
random_forest_regression_mean_actual = []
random_forest_regression_mean_predicted = []
for i in range(len(random_forest_regression_deciles) - 1):
    if i < len(random_forest_regression_deciles) - 2:
        # Interior bins are half-open: [lower edge, upper edge)
        mask = (random_forest_regression_predictions >= random_forest_regression_deciles[i]) & (random_forest_regression_predictions < random_forest_regression_deciles[i + 1])
    else:
        # The last bin is closed on the right so the maximum prediction is counted
        mask = (random_forest_regression_predictions >= random_forest_regression_deciles[i]) & (random_forest_regression_predictions <= random_forest_regression_deciles[i + 1])
    random_forest_regression_mean_actual.append(np.mean(y_test[mask]))
    random_forest_regression_mean_predicted.append(np.mean(random_forest_regression_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
random_forest_regression_lift_plot, random_forest_regression_lift_plot_ax = plt.subplots()
random_forest_regression_lift_plot_ax.bar(np.arange(len(random_forest_regression_mean_actual)), random_forest_regression_mean_actual, label='Actual')
random_forest_regression_lift_plot_ax.plot(np.arange(len(random_forest_regression_mean_predicted)), random_forest_regression_mean_predicted, color='red', linewidth=2, label='Predicted')
random_forest_regression_lift_plot_ax.set_xlabel('Deciles')
random_forest_regression_lift_plot_ax.set_ylabel('Mean')
random_forest_regression_lift_plot_ax.set_title('random_forest_regression Decile Analysis Chart')
random_forest_regression_lift_plot_ax.legend()
plt.show(block=False)


# Collect this model's metrics for the final model comparison
model_comparison_list.append(random_forest_regression_performance_metrics)
##### End of Model Pipeline for Random Forest Regression #####
##### Model Pipeline for Decision Tree Regression #####

from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt

# Hyperparameter grid; keys use the "<pipeline step>__<parameter>" form.
# max_features='auto' was deprecated and later removed from scikit-learn; for
# DecisionTreeRegressor it meant "use all features", which is what None does.
decision_tree_regression_param_grid = {
    "decision_tree_regression__max_depth": np.arange(1, 10, 3),
    "decision_tree_regression__max_features": [None],
}


# Create the pipeline: shared preprocessor followed by the estimator
decision_tree_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('decision_tree_regression', DecisionTreeRegressor())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so the MSE scorer
# must be built with greater_is_better=False (the score is negated); a plain
# make_scorer(mean_squared_error) would select the largest-error candidate.
decision_tree_regression_grid_search = GridSearchCV(
    estimator=decision_tree_regression_pipe,
    param_grid=decision_tree_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=3,
)
decision_tree_regression_grid_search.fit(X_train, y_train)

# Get the best pipeline found by the search
decision_tree_regression_best_estimator = decision_tree_regression_grid_search.best_estimator_

# Store results as a dataframe
decision_tree_regression_search_results = pd.DataFrame(decision_tree_regression_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe wraps the same values)
decision_tree_regression_predictions = decision_tree_regression_best_estimator.predict(X_test)
decision_tree_regression_predictions_df = pd.DataFrame(decision_tree_regression_predictions)

# Generate Model Metrics
decision_tree_regression_r2_score = r2_score(y_test, decision_tree_regression_predictions_df.iloc[:,0])
decision_tree_regression_mean_squared_error = mean_squared_error(y_test, decision_tree_regression_predictions_df.iloc[:,0])
decision_tree_regression_explained_variance_score = explained_variance_score(y_test, decision_tree_regression_predictions_df.iloc[:,0])
# Long-format metrics table: one row per (model, metric, value)
decision_tree_regression_performance_metrics = [['decision_tree_regression','r2_score', decision_tree_regression_r2_score],
                                  ['decision_tree_regression','mean_squared_error',decision_tree_regression_mean_squared_error],
                                  ['decision_tree_regression','explained_variance_score', decision_tree_regression_explained_variance_score]]
decision_tree_regression_performance_metrics = pd.DataFrame(decision_tree_regression_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
decision_tree_regression_actual_predicted_plot, decision_tree_regression_actual_predicted_plot_ax = plt.subplots()
# Note: the figure name is rebound to the scatter artist returned by Axes.scatter
decision_tree_regression_actual_predicted_plot = decision_tree_regression_actual_predicted_plot_ax.scatter(x=y_test, y=decision_tree_regression_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect-prediction reference spanning the range of y_test)
decision_tree_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
decision_tree_regression_actual_predicted_plot_ax.set_xlabel('Actual')
decision_tree_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
decision_tree_regression_actual_predicted_plot_ax.set_title('decision_tree_regression Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Decile edges of the predictions: 0th..100th percentile in steps of 10 gives
# 11 edges, i.e. 10 bins (stopping at the 90th percentile would drop the top decile).
decision_tree_regression_deciles = np.percentile(decision_tree_regression_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
decision_tree_regression_mean_actual = []
decision_tree_regression_mean_predicted = []
for i in range(len(decision_tree_regression_deciles) - 1):
    if i < len(decision_tree_regression_deciles) - 2:
        # Interior bins are half-open: [lower edge, upper edge)
        mask = (decision_tree_regression_predictions >= decision_tree_regression_deciles[i]) & (decision_tree_regression_predictions < decision_tree_regression_deciles[i + 1])
    else:
        # The last bin is closed on the right so the maximum prediction is counted
        mask = (decision_tree_regression_predictions >= decision_tree_regression_deciles[i]) & (decision_tree_regression_predictions <= decision_tree_regression_deciles[i + 1])
    decision_tree_regression_mean_actual.append(np.mean(y_test[mask]))
    decision_tree_regression_mean_predicted.append(np.mean(decision_tree_regression_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
decision_tree_regression_lift_plot, decision_tree_regression_lift_plot_ax = plt.subplots()
decision_tree_regression_lift_plot_ax.bar(np.arange(len(decision_tree_regression_mean_actual)), decision_tree_regression_mean_actual, label='Actual')
decision_tree_regression_lift_plot_ax.plot(np.arange(len(decision_tree_regression_mean_predicted)), decision_tree_regression_mean_predicted, color='red', linewidth=2, label='Predicted')
decision_tree_regression_lift_plot_ax.set_xlabel('Deciles')
decision_tree_regression_lift_plot_ax.set_ylabel('Mean')
decision_tree_regression_lift_plot_ax.set_title('decision_tree_regression Decile Analysis Chart')
decision_tree_regression_lift_plot_ax.legend()
plt.show(block=False)


# Collect this model's metrics for the final model comparison
model_comparison_list.append(decision_tree_regression_performance_metrics)
##### End of Model Pipeline for Decision Tree Regression #####
##### Model Pipeline for GBT Regression #####

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt

# Hyperparameter grid; keys use the "<pipeline step>__<parameter>" form.
# NOTE(review): per the scikit-learn docs, GradientBoostingRegressor's `alpha`
# only applies to loss='huber' or loss='quantile'; the estimator below uses the
# default loss, so searching over alpha likely only multiplies the fit count -- confirm.
gbt_regression_param_grid = {
    "gbt_regression__n_estimators": np.arange(25, 200, 50),
    "gbt_regression__max_depth": np.arange(1, 10, 3),
    "gbt_regression__alpha": np.arange(0.1, 1.0, 0.5),
}


# Create the pipeline: shared preprocessor followed by the estimator
gbt_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('gbt_regression', GradientBoostingRegressor())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so the MSE scorer
# must be built with greater_is_better=False (the score is negated); a plain
# make_scorer(mean_squared_error) would select the largest-error candidate.
gbt_regression_grid_search = GridSearchCV(
    estimator=gbt_regression_pipe,
    param_grid=gbt_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=3,
)
gbt_regression_grid_search.fit(X_train, y_train)

# Get the best pipeline found by the search
gbt_regression_best_estimator = gbt_regression_grid_search.best_estimator_

# Store results as a dataframe
gbt_regression_search_results = pd.DataFrame(gbt_regression_grid_search.cv_results_)

# Model metrics

# Generate Predictions (predict once; the dataframe wraps the same values)
gbt_regression_predictions = gbt_regression_best_estimator.predict(X_test)
gbt_regression_predictions_df = pd.DataFrame(gbt_regression_predictions)

# Generate Model Metrics
gbt_regression_r2_score = r2_score(y_test, gbt_regression_predictions_df.iloc[:,0])
gbt_regression_mean_squared_error = mean_squared_error(y_test, gbt_regression_predictions_df.iloc[:,0])
gbt_regression_explained_variance_score = explained_variance_score(y_test, gbt_regression_predictions_df.iloc[:,0])
# Long-format metrics table: one row per (model, metric, value)
gbt_regression_performance_metrics = [['gbt_regression','r2_score', gbt_regression_r2_score],
                                  ['gbt_regression','mean_squared_error',gbt_regression_mean_squared_error],
                                  ['gbt_regression','explained_variance_score', gbt_regression_explained_variance_score]]
gbt_regression_performance_metrics = pd.DataFrame(gbt_regression_performance_metrics, columns=['model','metric', 'value'])

# Generate Actual vs Predicted Plot
gbt_regression_actual_predicted_plot, gbt_regression_actual_predicted_plot_ax = plt.subplots()
# Note: the figure name is rebound to the scatter artist returned by Axes.scatter
gbt_regression_actual_predicted_plot = gbt_regression_actual_predicted_plot_ax.scatter(x=y_test, y=gbt_regression_predictions_df.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect-prediction reference spanning the range of y_test)
gbt_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
gbt_regression_actual_predicted_plot_ax.set_xlabel('Actual')
gbt_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
gbt_regression_actual_predicted_plot_ax.set_title('gbt_regression Actual vs. Predicted')
plt.show(block=False)

# Generate Decile Lift Chart
# Decile edges of the predictions: 0th..100th percentile in steps of 10 gives
# 11 edges, i.e. 10 bins (stopping at the 90th percentile would drop the top decile).
gbt_regression_deciles = np.percentile(gbt_regression_predictions, np.arange(0, 101, 10))
# Calculate the mean actual and predicted values for each decile
gbt_regression_mean_actual = []
gbt_regression_mean_predicted = []
for i in range(len(gbt_regression_deciles) - 1):
    if i < len(gbt_regression_deciles) - 2:
        # Interior bins are half-open: [lower edge, upper edge)
        mask = (gbt_regression_predictions >= gbt_regression_deciles[i]) & (gbt_regression_predictions < gbt_regression_deciles[i + 1])
    else:
        # The last bin is closed on the right so the maximum prediction is counted
        mask = (gbt_regression_predictions >= gbt_regression_deciles[i]) & (gbt_regression_predictions <= gbt_regression_deciles[i + 1])
    gbt_regression_mean_actual.append(np.mean(y_test[mask]))
    gbt_regression_mean_predicted.append(np.mean(gbt_regression_predictions[mask]))

# Create a bar chart of the mean actual and predicted values for each decile
gbt_regression_lift_plot, gbt_regression_lift_plot_ax = plt.subplots()
gbt_regression_lift_plot_ax.bar(np.arange(len(gbt_regression_mean_actual)), gbt_regression_mean_actual, label='Actual')
gbt_regression_lift_plot_ax.plot(np.arange(len(gbt_regression_mean_predicted)), gbt_regression_mean_predicted, color='red', linewidth=2, label='Predicted')
gbt_regression_lift_plot_ax.set_xlabel('Deciles')
gbt_regression_lift_plot_ax.set_ylabel('Mean')
gbt_regression_lift_plot_ax.set_title('gbt_regression Decile Analysis Chart')
gbt_regression_lift_plot_ax.legend()
plt.show(block=False)


# Collect this model's metrics for the final model comparison
model_comparison_list.append(gbt_regression_performance_metrics)
##### End of Model Pipeline for GBT Regression #####
##### Model Comparison #####

# Stack the per-model metric tables, keep only the r2_score rows and rank them
# from highest to lowest, so the first row is the best model.
table = pd.concat(model_comparison_list)
table = table[table['metric'] == 'r2_score']
table = table.sort_values(by=['value'], ascending=False)
print(table)
best_model_name = table['model'].iloc[0]
print(f"The best model is {best_model_name} with {table['value'].iloc[0]} as {table['metric'].iloc[0]}")


# Predict test data using the best model
# Each model pipeline stores its fitted estimator in a top-level variable named
# "<model>_best_estimator"; fetch it by name from the namespace rather than
# eval()-ing a constructed string.
test_predictions = globals()[best_model_name + "_best_estimator"].predict(prediction_df)
print('Predictions from best model are stored in test_predictions')