In [8]:
import pypelines.supervised_pipeline as pipe
from pypelines import utils

In [9]:
utils.list_supported_models(model_type='classification')

['Decision Tree',
 'Logistic Regression',
 'Random Forest',
 'SVC',
 'XGBoost',
 'MLP',
 'Ridge Classifier',
 'HistGBT Classifier',
 'Perceptron Classifier',
 'SGD Classifier',
 'GBT Classifier',
 'ADABoost Classifier',
 'ExtraTrees Classifier',
 'PassiveAggressive Classifier',
 'LDA Classifier',
 'QDA Classifier',
 'NuSVC Classifier',
 'GaussianNB Classifier',
 'MultinomialNB Classifier',
 'ComplementNB Classifier',
 'BernoulliNB Classifier',
 'CategoricalNB Classifier']

In [15]:
import pandas as pd
titanic = pd.read_csv("pypelines/datasets/classification/titanic.csv")

### regression - all models

In [20]:
# code output
reg_pypelines_all = pipe.SupervisedPipeline(data = titanic,target = 'Survived'
                            , model_type = 'regression'
#                            , models = ['Logistic Regression','Random Forest']
                            , nfolds = 5)

In [21]:
reg_pypelines_all.get_hyperparameters()

{'Elastic Net Regression': {'numerical': [{'name': 'alpha',
    'min': 0.1,
    'max': 1,
    'step': 0.5},
   {'name': 'l1_ratio', 'min': 0.0, 'max': 1.0, 'step': 0.1},
   {'name': 'max_iter', 'min': 500, 'max': 1000, 'step': 100}],
  'categorical': [{'name': 'fit_intercept',
    'selected': [True],
    'values': [True, False]},
   {'name': 'precompute', 'selected': [False], 'values': [True, False]},
   {'name': 'selection',
    'selected': ['cyclic'],
    'values': ['cyclic', 'random']}]},
 'Linear Regression': {'numerical': [{'name': 'n_jobs',
    'min': 1,
    'max': 10,
    'step': 1}],
  'categorical': [{'name': 'fit_intercept',
    'selected': [True],
    'values': [True, False]},
   {'name': 'normalize', 'selected': [True], 'values': [True, False]}]},
 'Lasso Regression': {'numerical': [{'name': 'alpha',
    'min': 10,
    'max': 100,
    'step': 10},
   {'name': 'max_iter', 'min': 100, 'max': 1000, 'step': 100}],
  'categorical': [{'name': 'fit_intercept',
    'selected': [Tru

In [18]:
reg_pypelines_all.model_list()

['Logistic Regression', 'Random Forest']


In [19]:
reg_pypelines_all.get_code()


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go


# target dataframe: titanic
titanic = pd.read_csv("./titanic.csv")
target = "Survived"
features = list(titanic.columns.drop("Survived"))
feature_df = titanic[features]

# get numerical and categorical columns
bool_cols = feature_df.select_dtypes(include=['bool']).columns.tolist()
titanic[bool_cols] = feature_df[bool_cols].astype(int)
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tol

In [6]:
reg_pypelines_all.code_to_clipboard()

In [7]:
reg_pypelines_all.code_to_file(path='./code_output/')

'model files saved to ./code_output/'

### regression - selected models

In [8]:
reg_pypelines_sel = pipe.SupervisedPipeline(data = "titanic",target = 'Survived'
                            , model_type = 'regression'
                            , models = ['Linear Regression','Elastic Net']
                            , nfolds = 5)

In [9]:
reg_pypelines_sel.get_hyperparameters()

{'Linear Regression': {'numerical': [{'name': 'n_jobs',
    'min': 1,
    'max': 10,
    'step': 1}],
  'categorical': [{'name': 'fit_intercept',
    'selected': [True],
    'values': [True, False]},
   {'name': 'normalize', 'selected': [True], 'values': [True, False]}]}}

In [10]:
reg_pypelines_sel.model_list()

['Linear Regression', 'Elastic Net']


In [11]:
reg_pypelines_sel.get_code()


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go


# target dataframe: titanic
titanic = pd.read_csv("./titanic.csv")
target = "Survived"
features = list(titanic.columns.drop("Survived"))
feature_df = titanic[features]

# get numerical and categorical columns
bool_cols = feature_df.select_dtypes(include=['bool']).columns.tolist()
titanic[bool_cols] = feature_df[bool_cols].astype(int)
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tol

In [12]:
reg_pypelines_sel.code_to_clipboard()

In [13]:
reg_pypelines_sel.code_to_file(path='./code_output/')

'model files saved to ./code_output/'

### classification - all models

In [22]:
clf_pypelines_all = pipe.SupervisedPipeline(data = "titanic",target = 'Survived'
                            , model_type = 'classification'
                           , models = ['Logistic Regression','Random Forest']
                            , nfolds = 5)

In [23]:
clf_pypelines_all.get_hyperparameters()

{'Logistic Regression': {'numerical': [{'name': 'C',
    'min': 0.1,
    'max': 1,
    'step': 0.1}],
  'categorical': [{'name': 'penalty',
    'selected': ['l2'],
    'values': ['l2', 'elasticnet', 'none']}]},
 'Random Forest': {'numerical': [{'name': 'n_estimators',
    'min': 10,
    'max': 100,
    'step': 20},
   {'name': 'max_depth', 'min': 2, 'max': 10, 'step': 2},
   {'name': 'min_samples_split', 'min': 0.5, 'max': 1, 'step': 0.1},
   {'name': 'min_samples_leaf', 'min': 1, 'max': 10, 'step': 2}],
  'categorical': [{'name': 'criterion',
    'selected': ['gini'],
    'values': ['gini', 'entropy']},
   {'name': 'max_features',
    'selected': ['auto'],
    'values': ['auto', 'sqrt', 'log2']},
   {'name': 'bootstrap', 'selected': [True], 'values': [True, False]},
   {'name': 'oob_score', 'selected': [True], 'values': [True, False]},
   {'name': 'warm_start', 'selected': [False], 'values': [True, False]},
   {'name': 'class_weight',
    'selected': ['balanced'],
    'values': ['bala

In [24]:
clf_pypelines_all.model_list()

['Logistic Regression', 'Random Forest']


In [25]:
clf_pypelines_all.get_code()


from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go


# target dataframe: titanic
titanic = pd.read_csv("./titanic.csv")
target = "Survived"
features = list(titanic.columns.drop("Survived"))
feature_df = titanic[features]

# get numerical and categorical columns
bool_cols = feature_df.select_dtypes(include=['bool']).columns.tolist()
titanic[bool_cols] = feature_df[bool_cols].astype(int)
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist(

In [18]:
clf_pypelines_all.code_to_clipboard()

In [19]:
clf_pypelines_all.code_to_file(path='./code_output/')

'model files saved to ./code_output/'

### classification - selected models

In [20]:


clf_pypelines_sel = pipe.SupervisedPipeline(data = "titanic",target = 'Survived'
                            , model_type = 'classification'
                            , models = ['Logistic Regression','Random Forest']
                            , nfolds = 5)

In [21]:
clf_pypelines_sel.get_hyperparameters()


{'Logistic Regression': {'numerical': [{'name': 'C',
    'min': 0.1,
    'max': 1,
    'step': 0.1}],
  'categorical': [{'name': 'penalty',
    'selected': ['l2'],
    'values': ['l2', 'elasticnet', 'none']}]},
 'Random Forest': {'numerical': [{'name': 'n_estimators',
    'min': 10,
    'max': 100,
    'step': 20},
   {'name': 'max_depth', 'min': 2, 'max': 10, 'step': 2},
   {'name': 'min_samples_split', 'min': 0.5, 'max': 1, 'step': 0.1},
   {'name': 'min_samples_leaf', 'min': 1, 'max': 10, 'step': 2}],
  'categorical': [{'name': 'criterion',
    'selected': ['gini'],
    'values': ['gini', 'entropy']},
   {'name': 'max_features',
    'selected': ['auto'],
    'values': ['auto', 'sqrt', 'log2']},
   {'name': 'bootstrap', 'selected': [True], 'values': [True, False]},
   {'name': 'oob_score', 'selected': [True], 'values': [True, False]},
   {'name': 'warm_start', 'selected': [False], 'values': [True, False]},
   {'name': 'class_weight',
    'selected': ['balanced'],
    'values': ['bala

In [22]:
clf_pypelines_sel.model_list()


['Logistic Regression', 'Random Forest']


In [23]:
clf_pypelines_sel.code_to_clipboard()

In [24]:
clf_pypelines_sel.code_to_file(path='./code_output/')

'model files saved to ./code_output/'

In [25]:
clf_pypelines_sel.grid_search_for_model('Logistic Regression')

{'Logistic Regression': {'numerical': [{'name': 'C',
    'min': 0.1,
    'max': 1,
    'step': 0.1}],
  'categorical': [{'name': 'penalty',
    'selected': ['l2'],
    'values': ['l2', 'elasticnet', 'none']}]}}

In [27]:

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import plotly.express as px
import plotly.graph_objects as go


# target dataframe: titanic
#titanic = pd.read_csv("./titanic.csv")
target = "Survived"
features = list(titanic.columns.drop("Survived"))
feature_df = titanic[features]


In [30]:
feature_df.dtypes

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [28]:
# get numerical and categorical columns
bool_cols = feature_df.select_dtypes(include=['bool']).columns.tolist()
titanic[bool_cols] = feature_df[bool_cols].astype(int)
numerical_cols = feature_df.select_dtypes(include=['int', 'float']).columns.tolist()
categorical_cols = feature_df.select_dtypes(include=['object']).columns.tolist()
text_cols = feature_df.select_dtypes(include=['string']).columns.tolist()
# check categorical columns for high cardinality

sample_size = np.min([10000, titanic.shape[0]])
unique_theshold = np.min([100, sample_size/10])

# check categorical columns for high cardinality
for col in categorical_cols:
    if titanic[col].sample(sample_size).nunique() > unique_theshold:
        categorical_cols.remove(col)
        text_cols.append(col)

# check text columns for low cardinality
for col in text_cols:
    if titanic[col].sample(sample_size).nunique() < unique_theshold:
        categorical_cols.append(col)
        text_cols.remove(col)

# define numeric transformer steps
numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")), 
        ("scaler", StandardScaler())]
)

# define categorical transformer steps
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")), 
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

# define text transformer steps
text_transformer = Pipeline(
    steps=[
        ('text', TfidfVectorizer())
    ]
)

[]