In [4]:
import pypelines.supervised_pipeline as pipe
from pypelines import utils

### Regression

In [5]:
# Show the model names that pypelines reports as supported for model_type='regression'.
# These strings are what the `models=` argument of SupervisedPipeline is given further down.
utils.list_supported_models(model_type='regression')

['Elastic Net Regression',
 'Linear Regression',
 'Lasso Regression',
 'Ridge Regression',
 'SGD Regressor Regression',
 'Histogram Gradient Boost Regression',
 'Random Forest Regression',
 'AdaBoost Regression',
 'Poisson Regression',
 'Decision Tree Regression',
 'GBT Regression',
 'ExtraTree Regression',
 'GPR Regression',
 'Bayesian ARD Regression',
 'Bayesian Ridge Regression',
 'Quantile Regression',
 'Huber Regression',
 'TheilSen Regression',
 'Passive Aggressive Regression',
 'Gamma Regression',
 'Tweedie Regression',
 'OMP Regression',
 'LassoLars Regression',
 'RANSAC Regression']

In [6]:
import pandas as pd
# Load the bundled housing data set. The path is relative, so it only resolves when the
# notebook's working directory is the folder that contains the `pypelines/` directory.
housing = pd.read_csv("pypelines/datasets/regression/housing.csv")

### Regression - all models

In [None]:
# Configure a regression pipeline on the housing frame, predicting 'median_house_value'
# and restricted to two of the supported regression models.
# NOTE(review): nfolds is presumably the number of cross-validation folds — confirm
# against the pypelines documentation.
reg_pypelines_all = pipe.SupervisedPipeline(
    data=housing,
    target='median_house_value',
    model_type='regression',
    models=['Linear Regression', 'Random Forest Regression'],
    nfolds=5,
)

In [None]:
# Inspect the hyperparameter search settings held by the regression pipeline.
# No output was saved for this cell; the classification call of the same method later in
# this notebook shows a dict keyed by model name.
reg_pypelines_all.get_hyperparameters()

In [None]:
# NOTE(review): presumably returns the models configured on this pipeline — no output was
# saved for this cell, so confirm against the pypelines documentation.
reg_pypelines_all.model_list()

In [None]:
# NOTE(review): presumably returns/displays the generated pipeline source code — no output
# was saved for this cell, so confirm against the pypelines documentation.
reg_pypelines_all.get_code()

In [None]:
# NOTE(review): judging by the name, this copies the generated code to the system
# clipboard, which would need a clipboard-capable (non-headless) environment — confirm.
reg_pypelines_all.code_to_clipboard()

### Classification

In [8]:
# Load the bundled titanic data set. The path is relative, so it only resolves when the
# notebook's working directory is the folder that contains the `pypelines/` directory.
# `pd` comes from the `import pandas as pd` in the earlier housing cell.
titanic = pd.read_csv("pypelines/datasets/classification/titanic.csv")

In [9]:
# Configure a classification pipeline on the titanic frame, predicting 'Survived'
# and restricted to two classifiers.
# NOTE(review): nfolds is presumably the number of cross-validation folds — confirm
# against the pypelines documentation.
clf_pypelines_all = pipe.SupervisedPipeline(
    data=titanic,
    target='Survived',
    model_type='classification',
    models=['Logistic Regression', 'Random Forest Classifier'],
    nfolds=5,
)

In [10]:
# Inspect the hyperparameter search settings for every model on the pipeline.
# The result is a dict keyed by model name; each value has a 'numerical' list
# (search/name/min/max/step) and a 'categorical' list (search/name/selected/values).
clf_pypelines_all.get_hyperparameters()

{'Logistic Regression': {'numerical': [{'search': True,
    'name': 'C',
    'min': 0.1,
    'max': 1,
    'step': 0.1}],
  'categorical': [{'search': False,
    'name': 'penalty',
    'selected': ['l2'],
    'values': ['l2', 'elasticnet', 'none']}]},
 'Random Forest Classifier': {'numerical': [{'search': True,
    'name': 'n_estimators',
    'min': 10,
    'max': 100,
    'step': 20},
   {'search': True, 'name': 'max_depth', 'min': 2, 'max': 10, 'step': 2},
   {'search': True,
    'name': 'min_samples_split',
    'min': 0.5,
    'max': 1,
    'step': 0.1},
   {'search': True,
    'name': 'min_samples_leaf',
    'min': 1,
    'max': 10,
    'step': 2}],
  'categorical': [{'search': False,
    'name': 'criterion',
    'selected': ['gini'],
    'values': ['gini', 'entropy']},
   {'search': False,
    'name': 'max_features',
    'selected': ['auto'],
    'values': ['auto', 'sqrt', 'log2']},
   {'search': False,
    'name': 'bootstrap',
    'selected': [True],
    'values': [True, False]},

In [11]:
# NOTE(review): judging by the name, this copies the generated code to the system
# clipboard, which would need a clipboard-capable (non-headless) environment — confirm.
clf_pypelines_all.code_to_clipboard()

### Update grid search for a model

In [13]:
# Fetch the current grid-search settings for a single model. The displayed dict has the
# same 'numerical' / 'categorical' layout as one entry of get_hyperparameters() and is
# the template edited in the next cell.
clf_pypelines_all.model_grid_search_settings(model_name="Random Forest Classifier")

{'numerical': [{'search': True,
   'name': 'n_estimators',
   'min': 10,
   'max': 100,
   'step': 20},
  {'search': True, 'name': 'max_depth', 'min': 2, 'max': 10, 'step': 2},
  {'search': True,
   'name': 'min_samples_split',
   'min': 0.5,
   'max': 1,
   'step': 0.1},
  {'search': True,
   'name': 'min_samples_leaf',
   'min': 1,
   'max': 10,
   'step': 2}],
 'categorical': [{'search': False,
   'name': 'criterion',
   'selected': ['gini'],
   'values': ['gini', 'entropy']},
  {'search': False,
   'name': 'max_features',
   'selected': ['auto'],
   'values': ['auto', 'sqrt', 'log2']},
  {'search': False,
   'name': 'bootstrap',
   'selected': [True],
   'values': [True, False]},
  {'search': False,
   'name': 'oob_score',
   'selected': [True],
   'values': [True, False]},
  {'search': False,
   'name': 'warm_start',
   'selected': [False],
   'values': [True, False]},
  {'search': False,
   'name': 'class_weight',
   'selected': ['balanced'],
   'values': ['balanced', 'balanced_s

In [17]:
# Customised grid-search settings for the Random Forest Classifier, in the same layout
# that model_grid_search_settings() displays. Compared with the settings shown above:
#   * n_estimators range is 100..1000 instead of 10..100 (step unchanged at 20)
#   * max_features has 'sqrt' selected instead of 'auto'
#   * oob_score has 'search' switched on
rf_updated_dict = {
    # Numerical hyperparameters: (name, min, max, step); all are searched.
    'numerical': [
        {'search': True, 'name': param, 'min': lower, 'max': upper, 'step': stride}
        for param, lower, upper, stride in [
            ('n_estimators', 100, 1000, 20),
            ('max_depth', 2, 10, 2),
            ('min_samples_split', 0.50, 1, 0.1),
            ('min_samples_leaf', 1, 10, 2),
        ]
    ],
    # Categorical hyperparameters: (search?, name, selected value, allowed values).
    'categorical': [
        {'search': do_search, 'name': param, 'selected': [chosen], 'values': allowed}
        for do_search, param, chosen, allowed in [
            (False, 'criterion', 'gini', ['gini', 'entropy']),
            (False, 'max_features', 'sqrt', ['auto', 'sqrt', 'log2']),
            (False, 'bootstrap', True, [True, False]),
            (True, 'oob_score', True, [True, False]),
            (False, 'warm_start', False, [True, False]),
            (False, 'class_weight', 'balanced', ['balanced', 'balanced_subsample']),
        ]
    ],
}

In [19]:
# Hand the edited settings dict to the pipeline for the Random Forest Classifier entry.
clf_pypelines_all.set_model_grid_search_settings(
    hyperparam_dict=rf_updated_dict,
    model_name='Random Forest Classifier',
)


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go


# target dataframe: titanic
target = "Survived"
features = list(titanic.columns.drop(target))
# Selecting a list of columns yields a frame separate from `titanic`; the explicit
# .copy() makes that clear and keeps the assignment below free of copy warnings.
feature_df = titanic[features].copy()

# get numerical and categorical columns
# Cast boolean features to int in feature_df itself. Previously the cast was written only
# to `titanic`, so feature_df kept its bool dtypes and those columns were dropped by the
# int/float selection below. `titanic` is still updated so that code reading either
# frame sees the same values.
bool_cols = feature_df.select_dtypes(include=['bool']).columns.tolist()
if bool_cols:  # nothing to cast when the data has no boolean features
    feature_df[bool_cols] = feature_df[bool_cols].astype(int)
    titanic[bool_cols] = feature_df[bool_cols]
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(