In [61]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
import evalml
import woodwork as ww
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from featuretools.selection import remove_low_information_features, remove_highly_null_features, remove_single_value_features, remove_highly_correlated_features

In [62]:
def custom(name, previous=None):
    """Run a binary EvalML search for one target and record feature importances.

    The function reads ``ca.csv``, fills missing values column by column with
    scikit-learn's ``constant`` imputation strategy, keeps the columns at
    positions 1..24 as candidate features, prunes them with the featuretools
    selection helpers, fits an ``AutoMLSearch`` on a 75% training split, saves
    the best pipeline to ``<name>_best_pipeline`` and adds that pipeline's
    feature-importance table to the accumulated results.

    Parameters
    ----------
    name : str
        Column of ``ca.csv`` used as the binary target. It is also used as the
        prefix of the saved pipeline file and of the two output columns
        (``<name>_feature`` and ``<name>_importance``).
    previous : pandas.DataFrame, optional
        Results table to extend. When omitted, the module-level ``result``
        frame is used, which is how existing callers drive this function.

    Returns
    -------
    pandas.DataFrame
        ``previous`` (or the module-level ``result``) with the two new columns
        aligned on the row index. Rows are never dropped: positions missing on
        either side are filled with NaN.

    Raises
    ------
    KeyError
        If ``name`` is not a column of ``ca.csv``.
    NameError
        If ``previous`` is omitted and no module-level ``result`` exists.
    """
    df = pd.read_csv('ca.csv')

    # ``np.nan`` is the supported spelling; the ``np.NaN`` alias was removed
    # in NumPy 2.0.
    imputer = SimpleImputer(missing_values=np.nan, strategy='constant')

    # Impute one column at a time so each column is fitted independently.
    for col in df.columns:
        df[col] = imputer.fit_transform(df[col].values.reshape(-1, 1))

    y = df[name]

    # Candidate features are selected purely by position (columns 1..24).
    df = df[df.columns[1:25]]

    # Guard against target leakage: if the target happens to sit inside the
    # positional slice it must not be offered to the model as a feature.
    if name in df.columns:
        df = df.drop(columns=[name])

    df = remove_low_information_features(df)
    df = remove_highly_null_features(df)
    df = remove_single_value_features(df)
    df = remove_highly_correlated_features(df)

    X = df

    problem_type = 'binary'
    objective = 'auto'

    # Only the training portion is used; the held-out 25% is discarded here.
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.25, random_state=42)

    automl = evalml.automl.AutoMLSearch(X_train, y_train, problem_type=problem_type, objective=objective)
    automl.search()

    best_pipeline = automl.best_pipeline
    best_pipeline.save(name + '_best_pipeline')
    # NOTE(review): this reloads the saved pipeline through the AutoMLSearch
    # loader; kept as-is because it doubles as a check that the file can be
    # read back. Confirm whether a pipeline-specific loader is preferred.
    best_pipeline = automl.load(name + '_best_pipeline')

    # Read the importance table once and rename its columns per target.
    importance = best_pipeline.feature_importance
    temp = pd.DataFrame({
        name + '_feature': importance['feature'],
        name + '_importance': importance['importance'],
    })

    # Fall back to the module-level accumulator for existing callers.
    base = result if previous is None else previous

    # Index-aligned outer merge: an inner merge would silently truncate every
    # target's importances to the shortest table seen so far.
    return base.merge(temp, left_index=True, right_index=True, how='outer')

In [65]:
# Target columns; one AutoML search is run for each entry.
cols = [
    'DAST_2',
    'OP_NMU_EVER',
    'BENZ_NMU_EVER',
    'STIM_NMU_EVER',
    'GABA_NMU_EVER',
]

# Column-less frame with rows 0..31 that seeds the accumulated results.
# `custom` reads this module-level `result`, so it must be rebound before
# each subsequent call.
result = pd.DataFrame(index=pd.RangeIndex(32))

for col in cols:
    result = custom(col)

Using default limit of max_batches=1.

Generating pipelines to search over...
Column 'QLANG' is 95.0% or more correlated with the target
Column 'DEM_GENDER' is 95.0% or more correlated with the target
Column 'DEM_ABOR' is 95.0% or more correlated with the target
Column 'DEM_ABOR_TYPE' is 95.0% or more correlated with the target
Column 'DEM_STDNT' is 95.0% or more correlated with the target
Column 'DEM_VET' is 95.0% or more correlated with the target
Column 'DEM_HEALTH' is 95.0% or more correlated with the target
Column 'DEM_PREG' is 95.0% or more correlated with the target
Column 'DEM_PREGMNTH' is 95.0% or more correlated with the target
Column 'HEALTH_SETTING_PRE' is 95.0% or more correlated with the target
Column 'HEALTH_SETTING_HOS' is 95.0% or more correlated with the target
Column 'HEALTH_SETTING_OUT' is 95.0% or more correlated with the target
Column 'HEALTH_SETTING_HOME' is 95.0% or more correlated with the target
Column 'HEALTH_SETTING_PRIV' is 95.0% or more correlated with the

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 1.528
Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.635
Batch 1: (3/9) LightGBM Classifier w/ Imputer + One ... Elapsed:00:01
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.264
Batch 1: (4/9) Extra Trees Classifier w/ Imputer + O... Elapsed:00:04
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.235
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + O... Elapsed:00:06
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.294
Batch 1: (6/9) CatBoost Classifier w/ Imputer           Elapsed:00:08
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.503
Batch 1: (7/9) XGBoost Classifier w/ Imputer + One H... Elapsed:00:09
	Start

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 10.240
Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.616
Batch 1: (3/9) LightGBM Classifier w/ Imputer + One ... Elapsed:00:01
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.598
Batch 1: (4/9) Extra Trees Classifier w/ Imputer + O... Elapsed:00:04
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.583
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + O... Elapsed:00:06
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.608
Batch 1: (6/9) CatBoost Classifier w/ Imputer           Elapsed:00:08
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.643
Batch 1: (7/9) XGBoost Classifier w/ Imputer + One H... Elapsed:00:09
	Star

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.686
Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.376
High coefficient of variation (cv >= 0.2) within cross validation scores. Decision Tree Classifier w/ Imputer + One Hot Encoder may not perform as estimated on unseen data.
Batch 1: (3/9) LightGBM Classifier w/ Imputer + One ... Elapsed:00:01
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.172
High coefficient of variation (cv >= 0.2) within cross validation scores. LightGBM Classifier w/ Imputer + One Hot Encoder may not perform as estimated on unseen data.
Batch 1: (4/9) Extra Trees Classifier w/ Imputer + O... Elapsed:00:04
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.147
High coefficient of variation (cv >= 0.2) w

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.548
Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.166
High coefficient of variation (cv >= 0.2) within cross validation scores. Decision Tree Classifier w/ Imputer + One Hot Encoder may not perform as estimated on unseen data.
Batch 1: (3/9) LightGBM Classifier w/ Imputer + One ... Elapsed:00:01
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.099
Batch 1: (4/9) Extra Trees Classifier w/ Imputer + O... Elapsed:00:04
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.076
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + O... Elapsed:00:06
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.104
Batch 1: (6/9) CatBoost Classifier w/ Imputer           E

FigureWidget({
    'data': [{'mode': 'lines+markers',
              'name': 'Best Score',
              'type'…

Batch 1: (1/9) Mode Baseline Binary Classification P... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 1.054
Batch 1: (2/9) Decision Tree Classifier w/ Imputer +... Elapsed:00:00
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.455
High coefficient of variation (cv >= 0.2) within cross validation scores. Decision Tree Classifier w/ Imputer + One Hot Encoder may not perform as estimated on unseen data.
Batch 1: (3/9) LightGBM Classifier w/ Imputer + One ... Elapsed:00:01
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.192
Batch 1: (4/9) Extra Trees Classifier w/ Imputer + O... Elapsed:00:03
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.190
Batch 1: (5/9) Elastic Net Classifier w/ Imputer + O... Elapsed:00:05
	Starting cross validation
	Finished cross validation - mean Log Loss Binary: 0.280
Batch 1: (6/9) CatBoost Classifier w/ Imputer           E

In [66]:
# Preview the first five rows of the combined feature-importance table.
result.head(n=5)

Unnamed: 0,DAST_2_feature,DAST_2_importance,OP_NMU_EVER_feature,OP_NMU_EVER_importance,BENZ_NMU_EVER_feature,BENZ_NMU_EVER_importance,STIM_NMU_EVER_feature,STIM_NMU_EVER_importance,GABA_NMU_EVER_feature,GABA_NMU_EVER_importance
0,DEM_ABOR_TYPE,0.228798,QLANG,0.258288,DEM_AGE,0.19103,DEM_AGE,0.369285,DEM_ABOR_TYPE,0.225976
1,DEM_AGE,0.20017,DEM_LOCATION,0.1546,DEM_INCOME,0.186637,DEM_STDNT,0.092306,DEM_ABOR,0.161933
2,DEM_ABOR,0.189792,DEM_REGION,0.124403,DEM_MARITAL,0.139865,DEM_MARITAL,0.086849,DEM_AGE,0.129678
3,DEM_INCOME,0.053897,DEM_AGE,0.100812,DEM_LOCATION,0.079856,DEM_INCOME,0.082243,DEM_POSTAL_T5J,0.086125
4,DEM_LOCATION,0.05374,DEM_INCOME,0.078896,DEM_EDU,0.065056,DEM_EDU,0.067844,DEM_LOCATION,0.063604


In [67]:
# Write the combined table to disk, omitting the row index.
result.to_csv(path_or_buf='feature_importance.csv', index=False)