### Experiment 4: AutoML

In [18]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go
import missingno
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

cf.go_offline()

%matplotlib inline

In [19]:
import autosklearn.classification
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler 
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbpipeline

In [20]:
# Load the raw Telco customer-churn export.
churn = pd.read_csv(r'Telco-Customer-Churn.csv')

# TotalCharges is converted to a numeric column; values that cannot be parsed become NaN.
churn['TotalCharges'] = pd.to_numeric(churn['TotalCharges'], errors='coerce')

# Any cell that is empty or whitespace-only is turned into NaN.
churn_clean = churn.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

# Target: Churn encoded as 1 ('Yes') / 0 ('No'); features: every other column.
y = churn_clean['Churn'].replace(('Yes', 'No'), (1, 0))
X = churn_clean.drop(columns=['Churn'])

print(X.shape, y.shape)

(7043, 20) (7043,)


In [21]:
# Stratified train/test split: 33% of the rows are held out for testing and the
# class proportions of y are preserved in both parts (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [22]:
#reusable function to build a pipeline
def build_pipeline(model, withOverSampling, withPCA=False):
    """Assemble a preprocessing + classifier pipeline.

    The pipeline always starts with a ColumnTransformer that drops a fixed
    set of columns, min-max scales the numeric columns and one-hot encodes
    the categorical columns. Optional steps are then added in this order:
    SMOTE oversampling, PCA, and finally the classifier.

    Parameters
    ----------
    model : estimator
        Object used as the final ``'classifier'`` step.
    withOverSampling : bool
        When true, a ``SMOTE(random_state=88)`` step is inserted after the
        column transformer.
    withPCA : bool, default False
        When true, a ``PCA(n_components=10)`` step is inserted just before
        the classifier. SMOTE is only included when ``withOverSampling`` is
        also true (previously the PCA variant always forced SMOTE on).

    Returns
    -------
    imblearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    feature_to_drop = ['customerID', 'gender', 'MultipleLines', 'PaymentMethod', 
                       'PaperlessBilling', 'StreamingTV', 'StreamingMovies']
    num_feat = ['tenure','MonthlyCharges', 'TotalCharges']
    cat_feat = ['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'InternetService',
           'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
           'Contract']

    #scale the numeric features
    num_pipeline = MinMaxScaler()

    #perform onehot encoding on the categorical features
    cat_pipeline = OneHotEncoder(handle_unknown='ignore')

    col_transform = ColumnTransformer(
        transformers=[
            ('drop_feat', 'drop', feature_to_drop),
            ('proc_num_feat', num_pipeline, num_feat),
            ('proc_cat_feat', cat_pipeline, cat_feat)
        ], remainder='passthrough'
    )

    # Build the step list incrementally so each flag controls exactly one step.
    steps = [['col_transform', col_transform]]

    if withOverSampling:
        # Imported here so the function does not rely on an earlier cell
        # having put SMOTE into the kernel namespace.
        from imblearn.over_sampling import SMOTE
        steps.append(['smote', SMOTE(random_state=88)])

    if withPCA:
        # Imported here for the same reason as SMOTE above.
        from sklearn.decomposition import PCA
        steps.append(['pca', PCA(n_components=10)])

    steps.append(['classifier', model])

    # The imblearn Pipeline is used in every case; it also accepts plain
    # sklearn steps when no sampler is present.
    return imbpipeline(steps=steps)

In [23]:
def fit_pipeline(pipelines, X=None, y=None):
    """Fit every pipeline in ``pipelines`` on the same training data.

    Parameters
    ----------
    pipelines : iterable
        Objects exposing ``fit(X, y)``; each is fitted in place, in order.
    X, y : optional
        Training features and target. When omitted (``None``), the
        notebook-level ``X_train`` / ``y_train`` variables are used, which
        matches the original single-argument behaviour.

    Returns
    -------
    None
    """
    # Fall back to the notebook globals only when no data was passed in, so
    # the function can also be used without relying on hidden kernel state.
    features = X_train if X is None else X
    target = y_train if y is None else y

    for pipe in pipelines:
        pipe.fit(features, target)

In [24]:
# Configure the auto-sklearn search:
#   - time_left_for_this_task / per_run_time_limit: time budgets passed to auto-sklearn
#   - n_jobs: number of threads to use
#   - include_estimators: candidate classifiers; others that could be added if
#     considered: "adaboost", "gradient_boosting", "sgd"
automl_model = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    per_run_time_limit=30,
    n_jobs=4,
    include_estimators=[
        "liblinear_svc",
        "k_nearest_neighbors",
        "decision_tree",
        "libsvm_svc",
        "gaussian_nb",
        "random_forest",
        "mlp",
    ],
    exclude_estimators=None,
    include_preprocessors=["no_preprocessing"],
    exclude_preprocessors=None,
)

# Wrap the AutoML classifier in the shared preprocessing pipeline, without oversampling.
automl = build_pipeline(automl_model, False)

pipeLines = [automl]

In [25]:
# Fit every pipeline collected in pipeLines on the training split.
fit_pipeline(pipeLines)

In [26]:
# Print what the fitted auto-sklearn step reports from show_models(); in the
# output below each entry pairs a weight with a pipeline configuration.
print(automl['classifier'].show_models())

[(0.160000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'random_forest', 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding', 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'minority_coalescer', 'data_preprocessing:numerical_transformer:imputation:strategy': 'most_frequent', 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'quantile_transformer', 'feature_preprocessor:__choice__': 'no_preprocessing', 'classifier:random_forest:bootstrap': 'False', 'classifier:random_forest:criterion': 'entropy', 'classifier:random_forest:max_depth': 'None', 'classifier:random_forest:max_features': 0.8194447905213179, 'classifier:random_forest:max_leaf_nodes': 'None', 'classifier:random_forest:min_impurity_decrease': 0.0, 'classifier:random_forest:min_samples_leaf': 20, 'classifier:random_forest:min_samples_split': 2, 'classifier:random_forest:min_weight_fraction_leaf': 0.0, 'da

In [27]:
# Print the auto-sklearn run summary (metric, best validation score, run counts).
print(automl['classifier'].sprint_statistics())

auto-sklearn results:
  Dataset name: 5ea6cca8-ed66-11eb-bb03-a5076bdad228
  Metric: accuracy
  Best validation score: 0.789981
  Number of target algorithm runs: 29
  Number of successful target algorithm runs: 24
  Number of crashed target algorithm runs: 0
  Number of target algorithms that exceeded the time limit: 5
  Number of target algorithms that exceeded the memory limit: 0



In [28]:
# Look up the parameter set of the run with the highest mean test score.
search_results = automl['classifier'].cv_results_
best_run_index = np.argmax(search_results['mean_test_score'])
search_results['params'][best_run_index]

{'balancing:strategy': 'none',
 'classifier:__choice__': 'random_forest',
 'data_preprocessing:categorical_transformer:categorical_encoding:__choice__': 'no_encoding',
 'data_preprocessing:categorical_transformer:category_coalescence:__choice__': 'no_coalescense',
 'data_preprocessing:numerical_transformer:imputation:strategy': 'mean',
 'data_preprocessing:numerical_transformer:rescaling:__choice__': 'robust_scaler',
 'feature_preprocessor:__choice__': 'no_preprocessing',
 'classifier:random_forest:bootstrap': 'True',
 'classifier:random_forest:criterion': 'gini',
 'classifier:random_forest:max_depth': 'None',
 'classifier:random_forest:max_features': 0.8201576390625605,
 'classifier:random_forest:max_leaf_nodes': 'None',
 'classifier:random_forest:min_impurity_decrease': 0.0,
 'classifier:random_forest:min_samples_leaf': 3,
 'classifier:random_forest:min_samples_split': 5,
 'classifier:random_forest:min_weight_fraction_leaf': 0.0,
 'data_preprocessing:numerical_transformer:rescaling:r

### The `classifier:__choice__` field above indicates the best-performing model (here, `random_forest`)