In [43]:
%load_ext autoreload
%autoreload 2
import numpy as np
import random
import pandas as pd
import sklearn
import importlib
import imodels
from mrules.api.modeling import fit_models
from imblearn.over_sampling import RandomOverSampler

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# get data

In [44]:
# Project to analyse; this identifier also selects the dataset module imported below.
project_id = 'iai_pecarn'
print('fitting on', project_id)
# Resolve `mrules.projects.<project_id>.dataset` by name so that switching
# projects only requires editing `project_id`.
project_module_name = f'mrules.projects.{project_id}.dataset'
module = importlib.import_module(project_module_name)
dset = module.Dataset()
# The three returned frames are unpacked as the train / tune / test splits.
# NOTE(review): `load_csvs=True` presumably reads previously saved CSVs instead
# of rebuilding the data -- confirm against Dataset.get_data.
df_train, df_tune, df_test = dset.get_data(load_csvs=True)

fitting on iai_pecarn


In [45]:
# Balance the classes with random over-sampling (seeded so the resample is repeatable).
over_sampler = RandomOverSampler(random_state=42)
features_train = df_train.drop(columns='outcome')
features_resampled, y_train = over_sampler.fit_resample(features_train, df_train['outcome'])

# Re-attach the resampled labels as the last column and show the class counts.
df_train = features_resampled.assign(outcome=y_train)
df_train['outcome'].value_counts()

0    7111
1    7111
Name: outcome, dtype: int64

# fit models

In [3]:
# Pin both the stdlib and NumPy RNGs before fitting.
random.seed(0)
np.random.seed(0)

# Fit with `interpretable=True`, then score every fitted model on the tuning split.
predictor = fit_models(df_train, df_tune, interpretable=True)
predictor.leaderboard(
    df_tune,
    silent=True,
    extra_metrics=['accuracy', 'precision', 'recall'],
)

Presets specified: ['interpretable']
Beginning AutoGluon training ... Time limit = 30s
AutoGluon will save models to "/Volumes/GoogleDrive/My Drive/research/medical_rules/medical-rules/data/autogluon_cache/"
AutoGluon Version:  0.3.2b20211006
Train Data Rows:    7226
Train Data Columns: 58
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    5977.77 MB
	Train Data (Original)  Memory Usage: 3.35 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to 

corels py feats [] <class 'list'>


	0.5	 = Validation score   (roc_auc)
	0.3s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: OptimalTree ... Training model for up to 26.04s of the 26.04s of remaining time.
	0.6116	 = Validation score   (roc_auc)
	0.05s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: BoostedRules ... Training model for up to 25.97s of the 25.97s of remaining time.
	0.9203	 = Validation score   (roc_auc)
	0.1s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ... Training model for up to 29.76s of the 25.81s of remaining time.
	0.9558	 = Validation score   (roc_auc)
	0.58s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 4.87s ...
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/Volumes/GoogleDrive/My Drive/research/medical_rules/medical-rules/data/autogluon_cache/")
  _warn_prf(average, modifier, msg_start, len(result))


                        model_types  model_performance  complexity  \
RuleFit                RuleFitModel           0.955755         8.0   
BoostedRules      BoostedRulesModel           0.920300        20.0   
GreedyTree          GreedyTreeModel           0.611639   4718592.0   
OptimalTree   GlobalSparseTreeModel           0.611639   4718592.0   
RuleList        CorelsRuleListModel           0.500000         1.0   

                       model_best  \
RuleFit       WeightedEnsemble_L2   
BoostedRules  WeightedEnsemble_L2   
GreedyTree    WeightedEnsemble_L2   
OptimalTree   WeightedEnsemble_L2   
RuleList      WeightedEnsemble_L2   

                                                    model_paths  \
RuleFit       /Volumes/GoogleDrive/My Drive/research/medical...   
BoostedRules  /Volumes/GoogleDrive/My Drive/research/medical...   
GreedyTree    /Volumes/GoogleDrive/My Drive/research/medical...   
OptimalTree   /Volumes/GoogleDrive/My Drive/research/medical...   
RuleList      /Volume

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,model,score_test,accuracy,precision,recall,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,RuleFit,0.891218,0.982565,0.0,0.0,0.955755,0.016057,0.01302,3.26289,0.016057,0.01302,3.26289,1,True,1
1,WeightedEnsemble_L2,0.891218,0.982565,0.0,0.0,0.955755,0.018421,0.014553,3.83804,0.002364,0.001533,0.57515,2,True,6
2,BoostedRules,0.872739,0.982565,0.0,0.0,0.9203,0.005269,0.003805,0.096408,0.005269,0.003805,0.096408,1,True,5
3,OptimalTree,0.568371,0.969697,0.155556,0.166667,0.611639,0.001561,0.002515,0.05077,0.001561,0.002515,0.05077,1,True,4
4,GreedyTree,0.568371,0.969697,0.155556,0.166667,0.611639,0.00166,0.002276,0.061109,0.00166,0.002276,0.061109,1,True,2
5,RuleList,0.5,0.982565,0.0,0.0,0.5,0.034182,0.013216,0.296712,0.034182,0.013216,0.296712,1,True,3


In [7]:
# Pin both the stdlib and NumPy RNGs before fitting.
random.seed(0)
np.random.seed(0)

# Same pipeline with `interpretable=False`; note that this rebinds `predictor`.
predictor = fit_models(df_train, df_tune, interpretable=False)
predictor.leaderboard(
    df_tune,
    silent=True,
    extra_metrics=['roc_auc', 'precision', 'recall'],
)

Beginning AutoGluon training ... Time limit = 30s
AutoGluon will save models to "/Volumes/GoogleDrive/My Drive/research/medical_rules/medical-rules/data/autogluon_cache/"
AutoGluon Version:  0.3.2b20211006
Train Data Rows:    14222
Train Data Columns: 58
Preprocessing data ...
AutoGluon infers your prediction problem is: 'binary' (because only two unique label-values observed).
	2 unique label values:  [0, 1]
	If 'binary' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Selected class <--> label mapping:  class 1 = 1, class 0 = 0
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    6539.22 MB
	Train Data (Original)  Memory Usage: 6.6 MB (0.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of th

Unnamed: 0,model,score_test,roc_auc,precision,recall,score_val,pred_time_test,pred_time_val,fit_time,pred_time_test_marginal,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,0.865879,0.865879,0.081967,0.119048,0.99963,0.971324,0.438764,18.790404,0.013218,0.001703,2.028207,2,True,9
1,ExtraTreesGini,0.856137,0.856137,0.055556,0.071429,0.999608,0.253104,0.108927,0.990921,0.253104,0.108927,0.990921,1,True,6
2,RandomForestEntr,0.852707,0.852707,0.056604,0.071429,0.999628,0.252163,0.104736,0.978276,0.252163,0.104736,0.978276,1,True,4
3,ExtraTreesEntr,0.845711,0.845711,0.070175,0.095238,0.999628,0.158604,0.109566,0.976095,0.158604,0.109566,0.976095,1,True,7
4,RandomForestGini,0.842618,0.842618,0.039216,0.047619,0.999606,0.275251,0.108669,0.993718,0.275251,0.108669,0.993718,1,True,3
5,CatBoost,0.784905,0.784905,0.147541,0.214286,0.999333,0.018984,0.005163,12.823187,0.018984,0.005163,12.823187,1,True,5
6,NeuralNetFastAI,0.725044,0.725044,0.166667,0.238095,0.998059,0.0845,0.054997,11.446886,0.0845,0.054997,11.446886,1,True,8
7,KNeighborsUnif,0.551034,0.551034,0.031933,0.452381,0.84977,0.110525,0.109296,0.025935,0.110525,0.109296,0.025935,1,True,1
8,KNeighborsDist,0.461972,0.461972,0.026549,0.357143,0.851523,0.115476,0.108786,0.033132,0.115476,0.108786,0.033132,1,True,2


In [9]:
# `predictor` is whichever `fit_models` result was assigned most recently, so this
# reports the best model of the last fit cell that was executed.
predictor.get_model_best()

'WeightedEnsemble_L2'

# examine predictions

In [99]:
# Baseline "model": predict the negative class (0) for every training sample, so
# the misclassified samples are exactly the positive cases.
# To explain a real model's errors instead, assign that model's predictions for
# the same rows to `predictions` (for example, a fitted logistic regression).
d = df_train
predictions = 0 * d['outcome']

# Pass `predictions` itself rather than recomputing the all-zero vector inline,
# so that changing `predictions` above actually changes what gets explained.
cls, columns = imodels.explain_classification_errors(d.drop(columns='outcome'),
                                                     predictions, d['outcome'],
                                                     classifier=imodels.CorelsRuleListClassifier(c=0.005))

CorelsClassifier:

   7111 / 14222 (positive class / total)
	↓ 
[96mIf GCSScore_1                         [00m → 4670 / 11444 (40.8%)
	↓ 
   2441 /  2778	 
	↓ 
[96mIf GCSScore_1  & LtCostalTender_0 == 0[00m →   0 /    0 (0.0%)
	↓ 
   2441 /  2778	 

