In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
# New in scikit-learn 0.21: estimator reprs print only the parameters
# that were changed from their defaults.
sklearn.set_config(print_changed_only=True)

# Roadmap and Directions

Also see https://scikit-learn.org/dev/roadmap.html

## Feature Names

In [6]:
from sklearn.model_selection import train_test_split

# Load the churn data, drop the customerID column from the features and use
# the Churn column as the target.
churn = pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
X = churn.drop(['customerID', 'Churn'], axis=1)
y = churn.Churn
# Blank TotalCharges entries (" ") are treated as missing values.
# Assign the result back to the column instead of calling
# `replace(..., inplace=True)` on an attribute access: the in-place form may
# operate on a temporary object and leave X unchanged (pandas Copy-on-Write).
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
X['TotalCharges'] = X['TotalCharges'].replace(" ", np.nan)
continuous_columns = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Every remaining feature column is handled as categorical.
categorical_columns = [c for c in X.columns if c not in continuous_columns]
# Stratify on y so both splits keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

In [17]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Continuous columns: scale first, then fill in missing values.
cont_preprocessing = make_pipeline(StandardScaler(), SimpleImputer())

# Route each group of columns to its own preprocessing step.
ct = ColumnTransformer(
    transformers=[
        ("continuous", cont_preprocessing, continuous_columns),
        ("categorical", OneHotEncoder(), categorical_columns),
    ]
)
X_train_pre = ct.fit_transform(X_train)

In [18]:
# The data passed to the ColumnTransformer is a pandas DataFrame.
type(X_train)

pandas.core.frame.DataFrame

In [19]:
# Rows and columns of the training split before preprocessing.
X_train.shape

(5282, 19)

In [20]:
# The input DataFrame carries its column names.
X_train.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [21]:
# Shape after preprocessing: same rows, but more columns than the input
# because of the one-hot encoding of the categorical columns.
X_train_pre.shape

(5282, 46)

In [22]:
# The preprocessed output is a plain array, so the column names are no
# longer attached to the data.
type(X_train_pre)

numpy.ndarray

In [23]:
# This uses the branch at https://github.com/scikit-learn/scikit-learn/pull/13307
# to list one name per output column of the fitted ColumnTransformer.
# NOTE(review): this call depends on that branch; confirm the method name
# (e.g. `get_feature_names_out`) against the installed scikit-learn version.
ct.get_feature_names()

['continuous__tenure',
 'continuous__MonthlyCharges',
 'continuous__TotalCharges',
 'categorical__gender_Female',
 'categorical__gender_Male',
 'categorical__SeniorCitizen_0',
 'categorical__SeniorCitizen_1',
 'categorical__Partner_No',
 'categorical__Partner_Yes',
 'categorical__Dependents_No',
 'categorical__Dependents_Yes',
 'categorical__PhoneService_No',
 'categorical__PhoneService_Yes',
 'categorical__MultipleLines_No',
 'categorical__MultipleLines_No phone service',
 'categorical__MultipleLines_Yes',
 'categorical__InternetService_DSL',
 'categorical__InternetService_Fiber optic',
 'categorical__InternetService_No',
 'categorical__OnlineSecurity_No',
 'categorical__OnlineSecurity_No internet service',
 'categorical__OnlineSecurity_Yes',
 'categorical__OnlineBackup_No',
 'categorical__OnlineBackup_No internet service',
 'categorical__OnlineBackup_Yes',
 'categorical__DeviceProtection_No',
 'categorical__DeviceProtection_No internet service',
 'categorical__DeviceProtection_Yes',


## Categorical and Missing values

In [35]:
# Toy frame with one missing value in the string column ('species') and one
# in the numeric column ('police district').
# `np.nan` is used instead of the `np.NaN` alias, which was removed in NumPy 2.0.
X = pd.DataFrame({'species': ['Ornamental Pear', np.nan, 'Red Maple', 'Flaxleaf Paperbark'],
                  'police district': [3, 8, None, 3]})
# Left disabled: casting the district column to pandas' nullable "Int64" dtype.
#X['police district'] = X['police district'].astype("Int64")
X

Unnamed: 0,species,police district
0,Ornamental Pear,3.0
1,,8.0
2,Red Maple,
3,Flaxleaf Paperbark,3.0


In [36]:
# A single constant fill value is applied to every column, so the numeric
# 'police district' column also receives the string 'missing'.
imputer = SimpleImputer(strategy='constant', fill_value='missing')
imputer.fit_transform(X)

array([['Ornamental Pear', 3.0],
       ['missing', 8.0],
       ['Red Maple', 'missing'],
       ['Flaxleaf Paperbark', 3.0]], dtype=object)

## Hyper-parameter tuning

### Successive Halving et al

https://github.com/scikit-learn/scikit-learn/pull/13900

### Redundant computation in Pipelines and Grid Search

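dask-ml's hyper-parameter search avoids this redundant work by sharing the computation of identical pipeline steps across parameter candidates: https://dask-ml.readthedocs.io/en/latest/hyper-parameter-search.html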



In [38]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Text classification pipeline: token counts -> tf-idf weighting -> linear classifier.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Parameter grid keyed by "<step name>__<parameter>". Only the last two steps
# vary, so the 'vect' step is identical for every candidate.
grid = dict(
    vect__ngram_range=[(1, 1)],
    tfidf__norm=['l1', 'l2'],
    clf__alpha=[1e-3, 1e-4, 1e-5],
)

![grid-graph](unmerged_grid_search_graph.svg)

![grid-graph-simplified](merged_grid_search_graph.svg)

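(Caching fitted transformers, e.g. via the `memory` parameter of `Pipeline`, sometimes helps avoid this redundant computation.)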

## Warm-starting in Grid-Search

## Model Diagnostics & Plotting
https://scikit-learn.org/dev/auto_examples/inspection/plot_partial_dependence.html

https://github.com/scikit-learn/scikit-learn/pull/13146

![partial dependence](sphx_glr_plot_partial_dependence_001.png)