In [None]:
import pypelines.supervised_pipeline as pipe
from pypelines import utils

### regression

In [None]:
# Ask pypelines which models it supports for model_type='regression';
# as the last expression in the cell, the return value is displayed.
utils.list_supported_models(model_type='regression')

In [None]:
import pandas as pd
# Load the regression example dataset. The path is relative, so the notebook
# must be run from a directory that contains the `pypelines/` folder.
housing = pd.read_csv("pypelines/datasets/regression/housing.csv")

### regression - all models

In [None]:
# Build the pypelines code generator for the housing regression task.
# No `models` argument is passed here; an explicit subset such as
# ['Linear Regression', 'Random Forest Regression'] can presumably be given
# instead (verify against the pypelines documentation).
reg_pypelines_all = pipe.SupervisedPipeline(
    data=housing,
    target='median_house_value',
    model_type='regression',
    nfolds=5,
)

In [None]:
# Show the hyperparameters configured on the regression pipeline object
# (exact return format is defined by pypelines - not visible in this notebook).
reg_pypelines_all.get_hyperparameters()

In [None]:
# Show the models this pipeline object will generate code for
# (presumably every supported regressor, since no subset was requested).
reg_pypelines_all.model_list()

In [None]:
# Retrieve the generated scikit-learn code for the regression pipeline
# (presumably the script pasted into the long cell further down - TODO confirm).
reg_pypelines_all.get_code()

In [None]:
# Copy the generated code to the system clipboard (judging by the method name);
# NOTE(review): this likely needs a desktop clipboard, so it may fail on a headless kernel.
reg_pypelines_all.code_to_clipboard()

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go


# target dataframe: housing
target = "median_house_value"
features = list(housing.columns.drop("median_house_value"))

# Cast boolean features to int in `housing` BEFORE taking the feature frame.
# `housing[features]` is a copy, so converting afterwards would leave the copy
# with bool dtype and the columns would match none of the groups below
# (and then be dropped by the ColumnTransformer).
bool_cols = housing[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    housing[bool_cols] = housing[bool_cols].astype(int)
feature_df = housing[features]

# get numerical, categorical and text columns
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()

# Cardinality is estimated on at most 10,000 rows; a column counts as
# "high cardinality" above min(100, sample_size / 10) distinct values.
sample_size = np.min([10000, housing.shape[0]])
unique_theshold = np.min([100, sample_size/10])

# check categorical columns for high cardinality and make it text column
# (iterate over a snapshot: removing from the list being iterated skips the
# element that follows each removal; the sample is seeded for reproducibility)
for col in list(categorical_cols):
    if housing[col].sample(sample_size, random_state=42).nunique() > unique_theshold:
        text_cols.append(col)
        categorical_cols.remove(col)


# check text columns for low cardinality and make it categorical columns
for col in list(text_cols):
    if housing[col].sample(sample_size, random_state=42).nunique() < unique_theshold:
        categorical_cols.append(col)
        text_cols.remove(col)

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# Numeric features: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" tolerates categories not seen during fit.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Text features: TF-IDF vectorisation.
text_transformer = Pipeline([('text', TfidfVectorizer())])

# Route each column group to its transformer. Every text column gets its own
# 'text_<column>' entry and is passed as a single column name (not a list).
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
    + [(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols]
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X, y = housing[features], housing[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

##### End of Data Processing Pipeline #####



##### Model Pipeline for Elastic Net Regression #####

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt

# Hyperparameter grid; keys follow the "<pipeline step>__<parameter>" convention
elastic_net_regression_param_grid = {
    "elastic_net_regression__alpha": np.arange(0.1, 2.0, 0.5),
    "elastic_net_regression__l1_ratio": np.arange(0.1, 1.0, 0.3),
}


# Create the pipeline: shared preprocessing followed by the estimator
elastic_net_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('elastic_net_regression', ElasticNet())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so MSE must be
# registered as a loss: greater_is_better=False negates it. With the default
# (greater_is_better=True) the search would pick the worst hyperparameters.
elastic_net_regression_grid_search = GridSearchCV(
    estimator=elastic_net_regression_pipe,
    param_grid=elastic_net_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=1,
)
elastic_net_regression_grid_search.fit(X_train, y_train)

# Best pipeline found by the search
elastic_net_regression_best_estimator = elastic_net_regression_grid_search.best_estimator_

# Store results as a dataframe (test-score columns hold negated MSE)
elastic_net_regression_search_results = pd.DataFrame(elastic_net_regression_grid_search.cv_results_)

# Model metrics on the held-out test split

elastic_net_regression_predictions = pd.DataFrame(elastic_net_regression_best_estimator.predict(X_test))
elastic_net_regression_r2_score = r2_score(y_test, elastic_net_regression_predictions.iloc[:,0])
elastic_net_regression_mean_squared_error = mean_squared_error(y_test, elastic_net_regression_predictions.iloc[:,0])
elastic_net_regression_explained_variance_score = explained_variance_score(y_test, elastic_net_regression_predictions.iloc[:,0])
# Long-format table: one row per (model, metric)
elastic_net_regression_performance_metrics = [['elastic_net_regression','r2_score', elastic_net_regression_r2_score],
                                  ['elastic_net_regression','mean_squared_error',elastic_net_regression_mean_squared_error],
                                  ['elastic_net_regression','explained_variance_score', elastic_net_regression_explained_variance_score]]
elastic_net_regression_performance_metrics = pd.DataFrame(elastic_net_regression_performance_metrics, columns=['model','metric', 'value'])


# Actual vs. predicted scatter. Note: the *_plot name is rebound from the
# Figure to the scatter artist on the second line (kept as generated).
elastic_net_regression_actual_predicted_plot, elastic_net_regression_actual_predicted_plot_ax = plt.subplots()
elastic_net_regression_actual_predicted_plot = elastic_net_regression_actual_predicted_plot_ax.scatter(x=y_test, y=elastic_net_regression_predictions.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect predictions would fall on it)
elastic_net_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
elastic_net_regression_actual_predicted_plot_ax.set_xlabel('Actual')
elastic_net_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
elastic_net_regression_actual_predicted_plot_ax.set_title('elastic_net_regression_Actual vs. Predicted')



##### Model Metrics Elastic Net Regression #####

print(elastic_net_regression_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for Elastic Net Regression #####

##### Model Pipeline for Linear Regression #####

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# No hyperparameters are tuned: the empty grid yields a single candidate, so
# the grid search below only provides 5-fold cross-validated scores.
lin_reg_param_grid = {
}


# Create the pipeline: shared preprocessing followed by the estimator
lin_reg_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('lin_reg', LinearRegression())
])

# Create the grid search.
# MSE is a loss, so it is registered with greater_is_better=False (scores are
# negated). With one candidate this does not change the selection, but it
# keeps cv_results_ on the "higher is better" scale GridSearchCV assumes.
lin_reg_grid_search = GridSearchCV(
    estimator=lin_reg_pipe,
    param_grid=lin_reg_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=1,
)
lin_reg_grid_search.fit(X_train, y_train)

# Best pipeline found by the search
lin_reg_best_estimator = lin_reg_grid_search.best_estimator_

# Store results as a dataframe (test-score columns hold negated MSE)
lin_reg_search_results = pd.DataFrame(lin_reg_grid_search.cv_results_)

# Model metrics on the held-out test split

lin_reg_predictions = pd.DataFrame(lin_reg_best_estimator.predict(X_test))
lin_reg_r2_score = r2_score(y_test, lin_reg_predictions.iloc[:,0])
lin_reg_mean_squared_error = mean_squared_error(y_test, lin_reg_predictions.iloc[:,0])
lin_reg_explained_variance_score = explained_variance_score(y_test, lin_reg_predictions.iloc[:,0])
# Long-format table: one row per (model, metric)
lin_reg_performance_metrics = [['lin_reg','r2_score', lin_reg_r2_score],
                                  ['lin_reg','mean_squared_error',lin_reg_mean_squared_error],
                                  ['lin_reg','explained_variance_score', lin_reg_explained_variance_score]]
lin_reg_performance_metrics = pd.DataFrame(lin_reg_performance_metrics, columns=['model','metric', 'value'])


# Actual vs. predicted scatter. Note: the *_plot name is rebound from the
# Figure to the scatter artist on the second line (kept as generated).
lin_reg_actual_predicted_plot, lin_reg_actual_predicted_plot_ax = plt.subplots()
lin_reg_actual_predicted_plot = lin_reg_actual_predicted_plot_ax.scatter(x=y_test, y=lin_reg_predictions.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect predictions would fall on it)
lin_reg_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
lin_reg_actual_predicted_plot_ax.set_xlabel('Actual')
lin_reg_actual_predicted_plot_ax.set_ylabel('Predicted')
lin_reg_actual_predicted_plot_ax.set_title('lin_reg_Actual vs. Predicted')



##### Model Metrics Linear Regression #####

print(lin_reg_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for Linear Regression #####

##### Model Pipeline for Lasso Regression #####

from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt

# Hyperparameter grid. The range starts at 0.1 rather than 0.0: scikit-learn
# advises against alpha=0 for Lasso (it degenerates to ordinary least squares
# with poor coordinate-descent convergence); that case is already covered by
# the Linear Regression section. This matches the Ridge / Elastic Net grids.
lasso_regression_param_grid = {
    "lasso_regression__alpha": np.arange(0.1, 2.0, 0.5),
}


# Create the pipeline: shared preprocessing followed by the estimator
lasso_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('lasso_regression', Lasso())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so MSE must be
# registered as a loss: greater_is_better=False negates it. With the default
# (greater_is_better=True) the search would pick the worst hyperparameters.
lasso_regression_grid_search = GridSearchCV(
    estimator=lasso_regression_pipe,
    param_grid=lasso_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=1,
)
lasso_regression_grid_search.fit(X_train, y_train)

# Best pipeline found by the search
lasso_regression_best_estimator = lasso_regression_grid_search.best_estimator_

# Store results as a dataframe (test-score columns hold negated MSE)
lasso_regression_search_results = pd.DataFrame(lasso_regression_grid_search.cv_results_)

# Model metrics on the held-out test split

lasso_regression_predictions = pd.DataFrame(lasso_regression_best_estimator.predict(X_test))
lasso_regression_r2_score = r2_score(y_test, lasso_regression_predictions.iloc[:,0])
lasso_regression_mean_squared_error = mean_squared_error(y_test, lasso_regression_predictions.iloc[:,0])
lasso_regression_explained_variance_score = explained_variance_score(y_test, lasso_regression_predictions.iloc[:,0])
# Long-format table: one row per (model, metric)
lasso_regression_performance_metrics = [['lasso_regression','r2_score', lasso_regression_r2_score],
                                  ['lasso_regression','mean_squared_error',lasso_regression_mean_squared_error],
                                  ['lasso_regression','explained_variance_score', lasso_regression_explained_variance_score]]
lasso_regression_performance_metrics = pd.DataFrame(lasso_regression_performance_metrics, columns=['model','metric', 'value'])


# Actual vs. predicted scatter. Note: the *_plot name is rebound from the
# Figure to the scatter artist on the second line (kept as generated).
lasso_regression_actual_predicted_plot, lasso_regression_actual_predicted_plot_ax = plt.subplots()
lasso_regression_actual_predicted_plot = lasso_regression_actual_predicted_plot_ax.scatter(x=y_test, y=lasso_regression_predictions.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect predictions would fall on it)
lasso_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
lasso_regression_actual_predicted_plot_ax.set_xlabel('Actual')
lasso_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
lasso_regression_actual_predicted_plot_ax.set_title('lasso_regression_Actual vs. Predicted')



##### Model Metrics Lasso Regression #####

print(lasso_regression_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for Lasso Regression #####

##### Model Pipeline for Ridge Regression #####

from sklearn.linear_model import Ridge 
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt

# Hyperparameter grid; keys follow the "<pipeline step>__<parameter>" convention
ridge_regression_param_grid = {
    "ridge_regression__alpha": np.arange(0.1, 2.0, 0.5),
}


# Create the pipeline: shared preprocessing followed by the estimator
ridge_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('ridge_regression', Ridge())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so MSE must be
# registered as a loss: greater_is_better=False negates it. With the default
# (greater_is_better=True) the search would pick the worst hyperparameters.
ridge_regression_grid_search = GridSearchCV(
    estimator=ridge_regression_pipe,
    param_grid=ridge_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=1,
)
ridge_regression_grid_search.fit(X_train, y_train)

# Best pipeline found by the search
ridge_regression_best_estimator = ridge_regression_grid_search.best_estimator_

# Store results as a dataframe (test-score columns hold negated MSE)
ridge_regression_search_results = pd.DataFrame(ridge_regression_grid_search.cv_results_)

# Model metrics on the held-out test split

ridge_regression_predictions = pd.DataFrame(ridge_regression_best_estimator.predict(X_test))
ridge_regression_r2_score = r2_score(y_test, ridge_regression_predictions.iloc[:,0])
ridge_regression_mean_squared_error = mean_squared_error(y_test, ridge_regression_predictions.iloc[:,0])
ridge_regression_explained_variance_score = explained_variance_score(y_test, ridge_regression_predictions.iloc[:,0])
# Long-format table: one row per (model, metric)
ridge_regression_performance_metrics = [['ridge_regression','r2_score', ridge_regression_r2_score],
                                  ['ridge_regression','mean_squared_error',ridge_regression_mean_squared_error],
                                  ['ridge_regression','explained_variance_score', ridge_regression_explained_variance_score]]
ridge_regression_performance_metrics = pd.DataFrame(ridge_regression_performance_metrics, columns=['model','metric', 'value'])


# Actual vs. predicted scatter. Note: the *_plot name is rebound from the
# Figure to the scatter artist on the second line (kept as generated).
ridge_regression_actual_predicted_plot, ridge_regression_actual_predicted_plot_ax = plt.subplots()
ridge_regression_actual_predicted_plot = ridge_regression_actual_predicted_plot_ax.scatter(x=y_test, y=ridge_regression_predictions.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect predictions would fall on it)
ridge_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
ridge_regression_actual_predicted_plot_ax.set_xlabel('Actual')
ridge_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
ridge_regression_actual_predicted_plot_ax.set_title('ridge_regression_Actual vs. Predicted')



##### Model Metrics Ridge Regression #####

print(ridge_regression_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for Ridge Regression #####

##### Model Pipeline for Random Forest Regression #####

from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

# Hyperparameter grid; keys follow the "<pipeline step>__<parameter>" convention
random_forest_regression_param_grid = {
    "random_forest_regression__n_estimators": np.arange(50, 150, 35),
    "random_forest_regression__max_depth": np.arange(5, 50, 10),
    "random_forest_regression__min_samples_leaf": np.arange(1, 50, 20),
}


# Create the pipeline: shared preprocessing followed by the estimator.
# The forest is seeded (same seed as the train/test split) so that reruns
# produce the same model and metrics.
random_forest_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('random_forest_regression', RandomForestRegressor(random_state=42))
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so MSE must be
# registered as a loss: greater_is_better=False negates it. With the default
# (greater_is_better=True) the search would pick the worst hyperparameters.
random_forest_regression_grid_search = GridSearchCV(
    estimator=random_forest_regression_pipe,
    param_grid=random_forest_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=1,
)
random_forest_regression_grid_search.fit(X_train, y_train)

# Best pipeline found by the search
random_forest_regression_best_estimator = random_forest_regression_grid_search.best_estimator_

# Store results as a dataframe (test-score columns hold negated MSE)
random_forest_regression_search_results = pd.DataFrame(random_forest_regression_grid_search.cv_results_)

# Model metrics on the held-out test split

random_forest_regression_predictions = pd.DataFrame(random_forest_regression_best_estimator.predict(X_test))
random_forest_regression_r2_score = r2_score(y_test, random_forest_regression_predictions.iloc[:,0])
random_forest_regression_mean_squared_error = mean_squared_error(y_test, random_forest_regression_predictions.iloc[:,0])
random_forest_regression_explained_variance_score = explained_variance_score(y_test, random_forest_regression_predictions.iloc[:,0])
# Long-format table: one row per (model, metric)
random_forest_regression_performance_metrics = [['random_forest_regression','r2_score', random_forest_regression_r2_score],
                                  ['random_forest_regression','mean_squared_error',random_forest_regression_mean_squared_error],
                                  ['random_forest_regression','explained_variance_score', random_forest_regression_explained_variance_score]]
random_forest_regression_performance_metrics = pd.DataFrame(random_forest_regression_performance_metrics, columns=['model','metric', 'value'])


# Actual vs. predicted scatter. Note: the *_plot name is rebound from the
# Figure to the scatter artist on the second line (kept as generated).
random_forest_regression_actual_predicted_plot, random_forest_regression_actual_predicted_plot_ax = plt.subplots()
random_forest_regression_actual_predicted_plot = random_forest_regression_actual_predicted_plot_ax.scatter(x=y_test, y=random_forest_regression_predictions.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect predictions would fall on it)
random_forest_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
random_forest_regression_actual_predicted_plot_ax.set_xlabel('Actual')
random_forest_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
random_forest_regression_actual_predicted_plot_ax.set_title('random_forest_regression_Actual vs. Predicted')



##### Model Metrics Random Forest Regression #####

print(random_forest_regression_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for Random Forest Regression #####

##### Model Pipeline for Decision Tree Regression #####

from sklearn.tree import DecisionTreeRegressor 
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt

# Hyperparameter grid. max_features=None replaces the former 'auto': for
# regressors 'auto' meant "use all features" (same as None), and the 'auto'
# spelling is rejected by scikit-learn >= 1.3.
decision_tree_regression_param_grid = {
    "decision_tree_regression__max_depth": np.arange(1, 10, 3),
    "decision_tree_regression__max_features": [None],
}


# Create the pipeline: shared preprocessing followed by the estimator
decision_tree_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('decision_tree_regression', DecisionTreeRegressor())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so MSE must be
# registered as a loss: greater_is_better=False negates it. With the default
# (greater_is_better=True) the search would pick the worst hyperparameters.
decision_tree_regression_grid_search = GridSearchCV(
    estimator=decision_tree_regression_pipe,
    param_grid=decision_tree_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=1,
)
decision_tree_regression_grid_search.fit(X_train, y_train)

# Best pipeline found by the search
decision_tree_regression_best_estimator = decision_tree_regression_grid_search.best_estimator_

# Store results as a dataframe (test-score columns hold negated MSE)
decision_tree_regression_search_results = pd.DataFrame(decision_tree_regression_grid_search.cv_results_)

# Model metrics on the held-out test split

decision_tree_regression_predictions = pd.DataFrame(decision_tree_regression_best_estimator.predict(X_test))
decision_tree_regression_r2_score = r2_score(y_test, decision_tree_regression_predictions.iloc[:,0])
decision_tree_regression_mean_squared_error = mean_squared_error(y_test, decision_tree_regression_predictions.iloc[:,0])
decision_tree_regression_explained_variance_score = explained_variance_score(y_test, decision_tree_regression_predictions.iloc[:,0])
# Long-format table: one row per (model, metric)
decision_tree_regression_performance_metrics = [['decision_tree_regression','r2_score', decision_tree_regression_r2_score],
                                  ['decision_tree_regression','mean_squared_error',decision_tree_regression_mean_squared_error],
                                  ['decision_tree_regression','explained_variance_score', decision_tree_regression_explained_variance_score]]
decision_tree_regression_performance_metrics = pd.DataFrame(decision_tree_regression_performance_metrics, columns=['model','metric', 'value'])


# Actual vs. predicted scatter. Note: the *_plot name is rebound from the
# Figure to the scatter artist on the second line (kept as generated).
decision_tree_regression_actual_predicted_plot, decision_tree_regression_actual_predicted_plot_ax = plt.subplots()
decision_tree_regression_actual_predicted_plot = decision_tree_regression_actual_predicted_plot_ax.scatter(x=y_test, y=decision_tree_regression_predictions.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect predictions would fall on it)
decision_tree_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
decision_tree_regression_actual_predicted_plot_ax.set_xlabel('Actual')
decision_tree_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
decision_tree_regression_actual_predicted_plot_ax.set_title('decision_tree_regression_Actual vs. Predicted')



##### Model Metrics Decision Tree Regression #####

print(decision_tree_regression_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for Decision Tree Regression #####

##### Model Pipeline for GBT Regression #####

from sklearn.ensemble import GradientBoostingRegressor 
from sklearn.metrics import mean_squared_error,make_scorer,r2_score,explained_variance_score
import matplotlib.pyplot as plt

# Hyperparameter grid. `alpha` is intentionally not tuned: it is only used by
# the 'huber' and 'quantile' losses, and the estimator below keeps the default
# squared-error loss, so varying it would just double the number of fits for
# identical models.
gbt_regression_param_grid = {
    "gbt_regression__n_estimators": np.arange(25, 200, 50),
    "gbt_regression__max_depth": np.arange(1, 10, 3),
}


# Create the pipeline: shared preprocessing followed by the estimator
gbt_regression_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('gbt_regression', GradientBoostingRegressor())
])

# Create the grid search.
# GridSearchCV keeps the candidate with the HIGHEST score, so MSE must be
# registered as a loss: greater_is_better=False negates it. With the default
# (greater_is_better=True) the search would pick the worst hyperparameters.
gbt_regression_grid_search = GridSearchCV(
    estimator=gbt_regression_pipe,
    param_grid=gbt_regression_param_grid,
    cv=5,
    scoring=make_scorer(mean_squared_error, greater_is_better=False),
    verbose=1,
)
gbt_regression_grid_search.fit(X_train, y_train)

# Best pipeline found by the search
gbt_regression_best_estimator = gbt_regression_grid_search.best_estimator_

# Store results as a dataframe (test-score columns hold negated MSE)
gbt_regression_search_results = pd.DataFrame(gbt_regression_grid_search.cv_results_)

# Model metrics on the held-out test split

gbt_regression_predictions = pd.DataFrame(gbt_regression_best_estimator.predict(X_test))
gbt_regression_r2_score = r2_score(y_test, gbt_regression_predictions.iloc[:,0])
gbt_regression_mean_squared_error = mean_squared_error(y_test, gbt_regression_predictions.iloc[:,0])
gbt_regression_explained_variance_score = explained_variance_score(y_test, gbt_regression_predictions.iloc[:,0])
# Long-format table: one row per (model, metric)
gbt_regression_performance_metrics = [['gbt_regression','r2_score', gbt_regression_r2_score],
                                  ['gbt_regression','mean_squared_error',gbt_regression_mean_squared_error],
                                  ['gbt_regression','explained_variance_score', gbt_regression_explained_variance_score]]
gbt_regression_performance_metrics = pd.DataFrame(gbt_regression_performance_metrics, columns=['model','metric', 'value'])


# Actual vs. predicted scatter. Note: the *_plot name is rebound from the
# Figure to the scatter artist on the second line (kept as generated).
gbt_regression_actual_predicted_plot, gbt_regression_actual_predicted_plot_ax = plt.subplots()
gbt_regression_actual_predicted_plot = gbt_regression_actual_predicted_plot_ax.scatter(x=y_test, y=gbt_regression_predictions.iloc[:,0], alpha=0.5)
# Add diagonal line (perfect predictions would fall on it)
gbt_regression_actual_predicted_plot_ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', alpha=0.5)
# Set axis labels and title
gbt_regression_actual_predicted_plot_ax.set_xlabel('Actual')
gbt_regression_actual_predicted_plot_ax.set_ylabel('Predicted')
gbt_regression_actual_predicted_plot_ax.set_title('gbt_regression_Actual vs. Predicted')



##### Model Metrics GBT Regression #####

print(gbt_regression_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for GBT Regression #####

### classification

In [None]:
# Load the classification example dataset. Relies on `pd` having been imported
# by an earlier cell; the relative path requires running from a directory that
# contains the `pypelines/` folder.
titanic = pd.read_csv("pypelines/datasets/classification/titanic.csv")

In [None]:
# Build the pypelines code generator for the Titanic 'Survived' classification task.
# No `models` argument is passed here; an explicit subset such as
# ['Logistic Regression', 'Random Forest Classifier'] can presumably be given
# instead (verify against the pypelines documentation).
clf_pypelines_all = pipe.SupervisedPipeline(
    data=titanic,
    target='Survived',
    model_type='classification',
    nfolds=5,
)

In [None]:
# Show the hyperparameters configured on the classification pipeline object
# (exact return format is defined by pypelines - not visible in this notebook).
clf_pypelines_all.get_hyperparameters()

In [None]:
# Copy the generated code to the system clipboard (judging by the method name);
# NOTE(review): this likely needs a desktop clipboard, so it may fail on a headless kernel.
clf_pypelines_all.code_to_clipboard()

In [None]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go


# target dataframe: titanic
target = "Survived"
features = list(titanic.columns.drop("Survived"))

# Cast boolean features to int in `titanic` BEFORE taking the feature frame.
# `titanic[features]` is a copy, so converting afterwards would leave the copy
# with bool dtype and the columns would match none of the groups below
# (and then be dropped by the ColumnTransformer).
bool_cols = titanic[features].select_dtypes(include=['bool']).columns.tolist()
if bool_cols:
    titanic[bool_cols] = titanic[bool_cols].astype(int)
feature_df = titanic[features]

# get numerical, categorical and text columns
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()

# Cardinality is estimated on at most 10,000 rows; a column counts as
# "high cardinality" above min(100, sample_size / 10) distinct values.
sample_size = np.min([10000, titanic.shape[0]])
unique_theshold = np.min([100, sample_size/10])

# check categorical columns for high cardinality and make it text column
# (iterate over a snapshot: removing from the list being iterated skips the
# element that follows each removal; the sample is seeded for reproducibility)
for col in list(categorical_cols):
    if titanic[col].sample(sample_size, random_state=42).nunique() > unique_theshold:
        text_cols.append(col)
        categorical_cols.remove(col)


# check text columns for low cardinality and make it categorical columns
for col in list(text_cols):
    if titanic[col].sample(sample_size, random_state=42).nunique() < unique_theshold:
        categorical_cols.append(col)
        text_cols.remove(col)

print(numerical_cols)
print(categorical_cols)
print(text_cols)

# Numeric features: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown="ignore" tolerates categories not seen during fit.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Text features: TF-IDF vectorisation.
text_transformer = Pipeline([('text', TfidfVectorizer())])

# Route each column group to its transformer. Every text column gets its own
# 'text_<column>' entry and is passed as a single column name (not a list).
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
    + [(f'text_{t_col}', text_transformer, t_col) for t_col in text_cols]
)

# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X, y = titanic[features], titanic[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

##### End of Data Processing Pipeline #####



##### Model Pipeline for Decision Tree Classifier #####

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt

# Hyperparameter grid; keys follow the "<pipeline step>__<parameter>" convention.
# max_leaf_nodes starts at 2: scikit-learn requires it to be None or >= 2, and
# the former range (starting at 1) made every candidate with max_leaf_nodes=1
# fail to fit - half of the grid.
dt_classifier_param_grid = {
    "dt_classifier__max_depth": np.arange(2, 10, 5),
    "dt_classifier__min_samples_split": np.arange(2, 10, 5),
    "dt_classifier__min_samples_leaf": np.arange(1, 10, 5),
    "dt_classifier__min_weight_fraction_leaf": np.arange(0.0, 0.5, 0.25),
    "dt_classifier__max_leaf_nodes": np.arange(2, 10, 5),
    "dt_classifier__min_impurity_decrease": np.arange(0.0, 0.5, 0.25),
}


# Create the pipeline: shared preprocessing followed by the estimator
dt_classifier_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('dt_classifier', DecisionTreeClassifier())
])

# Create the grid search (accuracy is a score, so the default
# greater-is-better scorer is correct here)
dt_classifier_grid_search = GridSearchCV(estimator=dt_classifier_pipe, param_grid=dt_classifier_param_grid, cv=5, scoring=make_scorer(accuracy_score), verbose=1)
dt_classifier_grid_search.fit(X_train, y_train)

# Best pipeline found by the search
dt_classifier_best_estimator = dt_classifier_grid_search.best_estimator_

# Store results as a dataframe
dt_classifier_search_results = pd.DataFrame(dt_classifier_grid_search.cv_results_)

# Model metrics on the held-out test split

dt_classifier_predictions = pd.DataFrame(dt_classifier_best_estimator.predict(X_test))
dt_classifier_predictions_prob = dt_classifier_best_estimator.predict_proba(X_test)
# One probability column per class label; only the first two classes are
# used, i.e. this section assumes a binary target.
dt_classifier_predictions_prob_df = pd.DataFrame()
dt_classifier_predictions_prob_df[dt_classifier_grid_search.classes_[0]] = dt_classifier_predictions_prob[:,0]
dt_classifier_predictions_prob_df[dt_classifier_grid_search.classes_[1]] = dt_classifier_predictions_prob[:,1]
dt_classifier_accuracy = accuracy_score(y_test, dt_classifier_predictions.iloc[:,0])
dt_classifier_f1_score = f1_score(y_test, dt_classifier_predictions.iloc[:,0])
dt_classifier_precision = precision_score(y_test, dt_classifier_predictions.iloc[:,0])
dt_classifier_recall = recall_score(y_test, dt_classifier_predictions.iloc[:,0])
# ROC AUC is computed from the probability of the second class (classes_[1])
dt_classifier_roc_auc_score = roc_auc_score(y_test, dt_classifier_predictions_prob_df[dt_classifier_grid_search.classes_[1]])
# Long-format table: one row per (model, metric)
dt_classifier_performance_metrics = [['dt_classifier','accuracy',dt_classifier_accuracy],
                                  ['dt_classifier','f1_score',dt_classifier_f1_score],
                                  ['dt_classifier','precision', dt_classifier_precision],
                                  ['dt_classifier','recall', dt_classifier_recall],
                                  ['dt_classifier','roc_auc_score', dt_classifier_roc_auc_score]]
dt_classifier_performance_metrics = pd.DataFrame(dt_classifier_performance_metrics, columns=['model','metric', 'value'])
# NOTE: fpr / tpr / thresholds / roc_auc are shared names that every
# classifier section overwrites.
fpr, tpr, thresholds = roc_curve(y_test, dt_classifier_predictions_prob_df[dt_classifier_grid_search.classes_[1]])
roc_auc = auc(fpr, tpr)
# Create plot
dt_classifier_roc_auc_plot, dt_classifier_roc_auc_plot_ax = plt.subplots()
dt_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
dt_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')

# Set axis labels and title
dt_classifier_roc_auc_plot_ax.set_xlabel('False Positive Rate')
dt_classifier_roc_auc_plot_ax.set_ylabel('True Positive Rate')
dt_classifier_roc_auc_plot_ax.set_title('ROC Curve')
# Add legend
dt_classifier_roc_auc_plot_ax.legend()


##### Model Metrics Decision Tree Classifier #####

print(dt_classifier_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for Decision Tree Classifier #####

##### Model Pipeline for Logistic Regression #####

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, make_scorer, f1_score, precision_score,
    recall_score, roc_auc_score, roc_curve, auc,
)
import matplotlib.pyplot as plt

# Candidate values for C, from 0.1 up to (but excluding) 1.0 in steps of 0.1
log_reg_param_grid = {"log_reg__C": np.arange(0.1, 1.0, 0.1)}

# Shared preprocessing followed by the classifier
log_reg_pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('log_reg', LogisticRegression()),
])

# 5-fold grid search scored by accuracy
log_reg_grid_search = GridSearchCV(
    estimator=log_reg_pipe,
    param_grid=log_reg_param_grid,
    cv=5,
    scoring=make_scorer(accuracy_score),
    verbose=1,
)
log_reg_grid_search.fit(X_train, y_train)

# Winning pipeline and the full table of cross-validation results
log_reg_best_estimator = log_reg_grid_search.best_estimator_
log_reg_search_results = pd.DataFrame(log_reg_grid_search.cv_results_)

# Hard predictions and class probabilities on the held-out test split
log_reg_predictions = pd.DataFrame(log_reg_best_estimator.predict(X_test))
log_reg_predictions_prob = log_reg_best_estimator.predict_proba(X_test)

# One probability column per class label (first two classes only)
log_reg_predictions_prob_df = pd.DataFrame()
log_reg_predictions_prob_df[log_reg_grid_search.classes_[0]] = log_reg_predictions_prob[:, 0]
log_reg_predictions_prob_df[log_reg_grid_search.classes_[1]] = log_reg_predictions_prob[:, 1]

# Label-based metrics use the hard predictions; ROC AUC uses the probability
# of the second class (classes_[1])
log_reg_accuracy = accuracy_score(y_test, log_reg_predictions.iloc[:, 0])
log_reg_f1_score = f1_score(y_test, log_reg_predictions.iloc[:, 0])
log_reg_precision = precision_score(y_test, log_reg_predictions.iloc[:, 0])
log_reg_recall = recall_score(y_test, log_reg_predictions.iloc[:, 0])
log_reg_roc_auc_score = roc_auc_score(
    y_test, log_reg_predictions_prob_df[log_reg_grid_search.classes_[1]]
)

# Long-format metrics table: one row per (model, metric)
log_reg_performance_metrics = pd.DataFrame(
    [
        ['log_reg', 'accuracy', log_reg_accuracy],
        ['log_reg', 'f1_score', log_reg_f1_score],
        ['log_reg', 'precision', log_reg_precision],
        ['log_reg', 'recall', log_reg_recall],
        ['log_reg', 'roc_auc_score', log_reg_roc_auc_score],
    ],
    columns=['model', 'metric', 'value'],
)

# ROC curve points and the area under them
fpr, tpr, thresholds = roc_curve(
    y_test, log_reg_predictions_prob_df[log_reg_grid_search.classes_[1]]
)
roc_auc = auc(fpr, tpr)

# Draw the ROC curve against the chance diagonal
log_reg_roc_auc_plot, log_reg_roc_auc_plot_ax = plt.subplots()
log_reg_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
log_reg_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
log_reg_roc_auc_plot_ax.set_xlabel('False Positive Rate')
log_reg_roc_auc_plot_ax.set_ylabel('True Positive Rate')
log_reg_roc_auc_plot_ax.set_title('ROC Curve')
log_reg_roc_auc_plot_ax.legend()


##### Model Metrics Logistic Regression #####

print(log_reg_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for Logistic Regression #####

##### Model Pipeline for Random Forest Classifier #####

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt
# Search space for the 'random_forest_classifier' pipeline step.
# Each np.arange(start, stop, step) excludes `stop`; the expanded values are noted inline.
random_forest_classifier_param_grid = {
    "random_forest_classifier__n_estimators": np.arange(10, 100, 35),           # 10, 45, 80
    "random_forest_classifier__max_depth": np.arange(2, 10, 5),                 # 2, 7
    "random_forest_classifier__min_samples_split": np.arange(0.5, 1.0, 0.5),    # 0.5 only
    "random_forest_classifier__min_samples_leaf": np.arange(1, 10, 5),          # 1, 6
}


# Estimator under test: the shared preprocessor followed by a default-configured random forest.
random_forest_classifier_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('random_forest_classifier', RandomForestClassifier()),
    ]
)

# Cross-validated grid search (cv=5) over the parameter grid, scored on accuracy.
random_forest_classifier_grid_search = GridSearchCV(
    estimator=random_forest_classifier_pipe,
    param_grid=random_forest_classifier_param_grid,
    cv=5,
    scoring=make_scorer(accuracy_score),
    verbose=1,
)
random_forest_classifier_grid_search.fit(X_train, y_train)

# Keep the winning pipeline and the per-candidate CV results table.
random_forest_classifier_best_estimator = random_forest_classifier_grid_search.best_estimator_
random_forest_classifier_search_results = pd.DataFrame(
    random_forest_classifier_grid_search.cv_results_
)

# Hold-out predictions: hard labels (single-column frame) and class probabilities.
random_forest_classifier_predictions = pd.DataFrame(
    random_forest_classifier_best_estimator.predict(X_test)
)
random_forest_classifier_predictions_prob = random_forest_classifier_best_estimator.predict_proba(X_test)

# Probability columns keyed by class label; only the first two classes are copied.
random_forest_classifier_predictions_prob_df = pd.DataFrame()
random_forest_classifier_predictions_prob_df[
    random_forest_classifier_grid_search.classes_[0]
] = random_forest_classifier_predictions_prob[:, 0]
random_forest_classifier_predictions_prob_df[
    random_forest_classifier_grid_search.classes_[1]
] = random_forest_classifier_predictions_prob[:, 1]

# Label-based metrics on the hold-out split.
random_forest_classifier_accuracy = accuracy_score(
    y_test, random_forest_classifier_predictions.iloc[:, 0]
)
random_forest_classifier_f1_score = f1_score(
    y_test, random_forest_classifier_predictions.iloc[:, 0]
)
random_forest_classifier_precision = precision_score(
    y_test, random_forest_classifier_predictions.iloc[:, 0]
)
random_forest_classifier_recall = recall_score(
    y_test, random_forest_classifier_predictions.iloc[:, 0]
)

# Ranking metric computed from the probability column of classes_[1].
random_forest_classifier_roc_auc_score = roc_auc_score(
    y_test,
    random_forest_classifier_predictions_prob_df[random_forest_classifier_grid_search.classes_[1]],
)

# Long-format summary table: one (model, metric, value) row per metric.
random_forest_classifier_performance_metrics = pd.DataFrame(
    [
        ['random_forest_classifier', 'accuracy', random_forest_classifier_accuracy],
        ['random_forest_classifier', 'f1_score', random_forest_classifier_f1_score],
        ['random_forest_classifier', 'precision', random_forest_classifier_precision],
        ['random_forest_classifier', 'recall', random_forest_classifier_recall],
        ['random_forest_classifier', 'roc_auc_score', random_forest_classifier_roc_auc_score],
    ],
    columns=['model', 'metric', 'value'],
)

# ROC curve points; fpr / tpr / thresholds / roc_auc are shared names reused by every model section.
fpr, tpr, thresholds = roc_curve(
    y_test,
    random_forest_classifier_predictions_prob_df[random_forest_classifier_grid_search.classes_[1]],
)
roc_auc = auc(fpr, tpr)

# ROC figure: model curve, diagonal reference line, labels and legend.
random_forest_classifier_roc_auc_plot, random_forest_classifier_roc_auc_plot_ax = plt.subplots()
random_forest_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
random_forest_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
random_forest_classifier_roc_auc_plot_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
random_forest_classifier_roc_auc_plot_ax.legend()


##### Model Metrics Random Forest Classifier #####

# Print the metric table and render the figure without blocking.
print(random_forest_classifier_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for Random Forest Classifier #####

##### Model Pipeline for XGBoost Classifier #####

from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt
# Search space for the 'xgboost_classifier' pipeline step.
# Each np.arange(start, stop, step) excludes `stop`; the expanded values are noted inline.
xgboost_classifier_param_grid = {
    "xgboost_classifier__learning_rate": np.arange(0.1, 1.0, 0.25),      # 0.1, 0.35, 0.6, 0.85
    "xgboost_classifier__n_estimators": np.arange(100, 500, 250),        # 100, 350
    "xgboost_classifier__max_depth": np.arange(2, 10, 5),                # 2, 7
    "xgboost_classifier__gamma": np.arange(0.0, 0.5, 0.25),              # 0.0, 0.25
    "xgboost_classifier__subsample": np.arange(0.1, 1.0, 0.25),          # 0.1, 0.35, 0.6, 0.85
    "xgboost_classifier__colsample_bytree": np.arange(0.5, 1.0, 0.25),   # 0.5, 0.75
}


# Estimator under test: the shared preprocessor followed by a default-configured XGBoost classifier.
xgboost_classifier_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('xgboost_classifier', XGBClassifier()),
    ]
)

# Cross-validated grid search (cv=5) over the parameter grid, scored on accuracy.
xgboost_classifier_grid_search = GridSearchCV(
    estimator=xgboost_classifier_pipe,
    param_grid=xgboost_classifier_param_grid,
    cv=5,
    scoring=make_scorer(accuracy_score),
    verbose=1,
)
xgboost_classifier_grid_search.fit(X_train, y_train)

# Keep the winning pipeline and the per-candidate CV results table.
xgboost_classifier_best_estimator = xgboost_classifier_grid_search.best_estimator_
xgboost_classifier_search_results = pd.DataFrame(
    xgboost_classifier_grid_search.cv_results_
)

# Hold-out predictions: hard labels (single-column frame) and class probabilities.
xgboost_classifier_predictions = pd.DataFrame(
    xgboost_classifier_best_estimator.predict(X_test)
)
xgboost_classifier_predictions_prob = xgboost_classifier_best_estimator.predict_proba(X_test)

# Probability columns keyed by class label; only the first two classes are copied.
xgboost_classifier_predictions_prob_df = pd.DataFrame()
xgboost_classifier_predictions_prob_df[
    xgboost_classifier_grid_search.classes_[0]
] = xgboost_classifier_predictions_prob[:, 0]
xgboost_classifier_predictions_prob_df[
    xgboost_classifier_grid_search.classes_[1]
] = xgboost_classifier_predictions_prob[:, 1]

# Label-based metrics on the hold-out split.
xgboost_classifier_accuracy = accuracy_score(
    y_test, xgboost_classifier_predictions.iloc[:, 0]
)
xgboost_classifier_f1_score = f1_score(
    y_test, xgboost_classifier_predictions.iloc[:, 0]
)
xgboost_classifier_precision = precision_score(
    y_test, xgboost_classifier_predictions.iloc[:, 0]
)
xgboost_classifier_recall = recall_score(
    y_test, xgboost_classifier_predictions.iloc[:, 0]
)

# Ranking metric computed from the probability column of classes_[1].
xgboost_classifier_roc_auc_score = roc_auc_score(
    y_test,
    xgboost_classifier_predictions_prob_df[xgboost_classifier_grid_search.classes_[1]],
)

# Long-format summary table: one (model, metric, value) row per metric.
xgboost_classifier_performance_metrics = pd.DataFrame(
    [
        ['xgboost_classifier', 'accuracy', xgboost_classifier_accuracy],
        ['xgboost_classifier', 'f1_score', xgboost_classifier_f1_score],
        ['xgboost_classifier', 'precision', xgboost_classifier_precision],
        ['xgboost_classifier', 'recall', xgboost_classifier_recall],
        ['xgboost_classifier', 'roc_auc_score', xgboost_classifier_roc_auc_score],
    ],
    columns=['model', 'metric', 'value'],
)

# ROC curve points; fpr / tpr / thresholds / roc_auc are shared names reused by every model section.
fpr, tpr, thresholds = roc_curve(
    y_test,
    xgboost_classifier_predictions_prob_df[xgboost_classifier_grid_search.classes_[1]],
)
roc_auc = auc(fpr, tpr)

# ROC figure: model curve, diagonal reference line, labels and legend.
xgboost_classifier_roc_auc_plot, xgboost_classifier_roc_auc_plot_ax = plt.subplots()
xgboost_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
xgboost_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
xgboost_classifier_roc_auc_plot_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
xgboost_classifier_roc_auc_plot_ax.legend()


##### Model Metrics XGBoost Classifier #####

# Print the metric table and render the figure without blocking.
print(xgboost_classifier_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for XGBoost Classifier #####

##### Model Pipeline for GBT Classifier #####

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,make_scorer,f1_score,precision_score,recall_score,roc_auc_score,roc_curve,auc
import matplotlib.pyplot as plt
# Search space for the 'gbt_classifier' pipeline step.
# Each np.arange(start, stop, step) excludes `stop`; the expanded values are noted inline.
gbt_classifier_param_grid = {
    # Start above zero: a learning rate of 0.0 adds nothing per boosting stage (and is
    # rejected outright by older scikit-learn releases), so it only wastes fits.
    "gbt_classifier__learning_rate": np.arange(0.1, 1.0, 0.5),       # 0.1, 0.6
    "gbt_classifier__n_estimators": np.arange(1000, 10000, 5000),    # 1000, 6000
    "gbt_classifier__subsample": np.arange(0.1, 1.0, 0.5),           # 0.1, 0.6
    # Keep depths in a range where each value yields a different tree: depths in the
    # thousands all behave like "unbounded" and merely multiply the number of fits.
    "gbt_classifier__max_depth": np.arange(1, 10, 4),                # 1, 5, 9
}


# Estimator under test: the shared preprocessor followed by a default-configured gradient boosting classifier.
gbt_classifier_pipe = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('gbt_classifier', GradientBoostingClassifier()),
    ]
)

# Cross-validated grid search (cv=5) over the parameter grid, scored on accuracy.
gbt_classifier_grid_search = GridSearchCV(
    estimator=gbt_classifier_pipe,
    param_grid=gbt_classifier_param_grid,
    cv=5,
    scoring=make_scorer(accuracy_score),
    verbose=1,
)
gbt_classifier_grid_search.fit(X_train, y_train)

# Keep the winning pipeline and the per-candidate CV results table.
gbt_classifier_best_estimator = gbt_classifier_grid_search.best_estimator_
gbt_classifier_search_results = pd.DataFrame(
    gbt_classifier_grid_search.cv_results_
)

# Hold-out predictions: hard labels (single-column frame) and class probabilities.
gbt_classifier_predictions = pd.DataFrame(
    gbt_classifier_best_estimator.predict(X_test)
)
gbt_classifier_predictions_prob = gbt_classifier_best_estimator.predict_proba(X_test)

# Probability columns keyed by class label; only the first two classes are copied.
gbt_classifier_predictions_prob_df = pd.DataFrame()
gbt_classifier_predictions_prob_df[
    gbt_classifier_grid_search.classes_[0]
] = gbt_classifier_predictions_prob[:, 0]
gbt_classifier_predictions_prob_df[
    gbt_classifier_grid_search.classes_[1]
] = gbt_classifier_predictions_prob[:, 1]

# Label-based metrics on the hold-out split.
gbt_classifier_accuracy = accuracy_score(
    y_test, gbt_classifier_predictions.iloc[:, 0]
)
gbt_classifier_f1_score = f1_score(
    y_test, gbt_classifier_predictions.iloc[:, 0]
)
gbt_classifier_precision = precision_score(
    y_test, gbt_classifier_predictions.iloc[:, 0]
)
gbt_classifier_recall = recall_score(
    y_test, gbt_classifier_predictions.iloc[:, 0]
)

# Ranking metric computed from the probability column of classes_[1].
gbt_classifier_roc_auc_score = roc_auc_score(
    y_test,
    gbt_classifier_predictions_prob_df[gbt_classifier_grid_search.classes_[1]],
)

# Long-format summary table: one (model, metric, value) row per metric.
gbt_classifier_performance_metrics = pd.DataFrame(
    [
        ['gbt_classifier', 'accuracy', gbt_classifier_accuracy],
        ['gbt_classifier', 'f1_score', gbt_classifier_f1_score],
        ['gbt_classifier', 'precision', gbt_classifier_precision],
        ['gbt_classifier', 'recall', gbt_classifier_recall],
        ['gbt_classifier', 'roc_auc_score', gbt_classifier_roc_auc_score],
    ],
    columns=['model', 'metric', 'value'],
)

# ROC curve points; fpr / tpr / thresholds / roc_auc are shared names reused by every model section.
fpr, tpr, thresholds = roc_curve(
    y_test,
    gbt_classifier_predictions_prob_df[gbt_classifier_grid_search.classes_[1]],
)
roc_auc = auc(fpr, tpr)

# ROC figure: model curve, diagonal reference line, labels and legend.
gbt_classifier_roc_auc_plot, gbt_classifier_roc_auc_plot_ax = plt.subplots()
gbt_classifier_roc_auc_plot_ax.plot(fpr, tpr, label=f'ROC curve (AUC = {roc_auc:.4f})')
gbt_classifier_roc_auc_plot_ax.plot([0, 1], [0, 1], 'r--', label='Random guess')
gbt_classifier_roc_auc_plot_ax.set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curve',
)
gbt_classifier_roc_auc_plot_ax.legend()


##### Model Metrics GBT Classifier #####

# Print the metric table and render the figure without blocking.
print(gbt_classifier_performance_metrics)
plt.show(block=False)

##### End of Model Pipeline for GBT Classifier #####

### classification - all models

### Update grid search for a model

In [None]:
# Query the pipeline object's grid-search settings for the Random Forest Classifier;
# as the last expression in the cell, the result is displayed.
# NOTE(review): presumably this returns a dict with 'numerical' / 'categorical' lists like
# the one edited in the following cell — confirm against the pypelines API.
clf_pypelines_all.model_grid_search_settings(model_name="Random Forest Classifier")

In [None]:
# Grid-search settings to apply to the Random Forest Classifier.
# 'numerical' entries describe a min / max / step range; 'categorical' entries list the
# selected option(s) next to the available values. 'search' flags whether the parameter is searched.
rf_updated_dict = {
    'numerical': [
        {'search': True, 'name': 'n_estimators', 'min': 100, 'max': 1000, 'step': 20},
        {'search': True, 'name': 'max_depth', 'min': 2, 'max': 10, 'step': 2},
        {'search': True, 'name': 'min_samples_split', 'min': 0.50, 'max': 1, 'step': 0.1},
        {'search': True, 'name': 'min_samples_leaf', 'min': 1, 'max': 10, 'step': 2},
    ],
    'categorical': [
        {'search': False, 'name': 'criterion', 'selected': ['gini'], 'values': ['gini', 'entropy']},
        {'search': False, 'name': 'max_features', 'selected': ['sqrt'], 'values': ['auto', 'sqrt', 'log2']},
        {'search': False, 'name': 'bootstrap', 'selected': [True], 'values': [True, False]},
        {'search': True, 'name': 'oob_score', 'selected': [True], 'values': [True, False]},
        {'search': False, 'name': 'warm_start', 'selected': [False], 'values': [True, False]},
        {'search': False, 'name': 'class_weight', 'selected': ['balanced'], 'values': ['balanced', 'balanced_subsample']},
    ],
}

In [None]:
# Settings dictionaries to normalise (a single entry here).
dict_list = [rf_updated_dict]

# Rebuild every entry so each parameter record carries exactly the expected keys, in a fixed
# order; a record missing one of them raises KeyError.
# NOTE(review): nothing visible in this notebook reads dict_list afterwards — the following
# cell passes rf_updated_dict itself — confirm whether this normalised copy is meant to be used.
for i, rf_dict in enumerate(dict_list):
    formatted_dict = {
        section: [
            {key: param[key] for key in keys}
            for param in rf_dict[section]
        ]
        for section, keys in (
            ('numerical', ('search', 'name', 'min', 'max', 'step')),
            ('categorical', ('search', 'name', 'selected', 'values')),
        )
    }
    # Swap the normalised copy in at the same position.
    dict_list[i] = formatted_dict


In [None]:
# Hand rf_updated_dict to the pipeline object as the grid-search settings for the
# Random Forest Classifier.
# NOTE(review): this passes rf_updated_dict directly, not the reformatted copy held in
# dict_list — confirm that is intended.
clf_pypelines_all.set_model_grid_search_settings(hyperparam_dict=rf_updated_dict,model_name = 'Random Forest Classifier')