# Model selection

In this notebook, we implement functionality similar to that of the example `003_evaluation_one_dataset`, but using the `model_selection` function, which simplifies the workflow by returning the oversampler and classifier combination that provides the highest score.

In [1]:
import os.path

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

import smote_variants as sv

import sklearn.datasets as datasets

In [2]:
# The model_selection procedure uses the cache_path directory for caching.
# The directory lives under the current user's home directory.

cache_path = os.path.join(os.path.expanduser('~'), 'smote_test')

# exist_ok=True makes the creation idempotent and avoids the race between a
# separate existence check and the makedirs call.
os.makedirs(cache_path, exist_ok=True)

In [3]:
# Specifying the dataset. The model selection functions use a 'name' field
# for labelling. Datasets loaded from the imbalanced_learning package already
# carry it, but those loaded from sklearn.datasets do not, so we build a plain
# dictionary and add the 'name' entry manually.

breast_cancer = datasets.load_breast_cancer()
dataset = {
    'data': breast_cancer['data'],
    'target': breast_cancer['target'],
    'name': 'breast_cancer',
}

In [4]:
# Candidate classifiers for the model selection, both instantiated with their
# default parameters.

knn_classifier = KNeighborsClassifier()
dt_classifier = DecisionTreeClassifier()

In [5]:
# Executing the model selection using 5 parallel jobs and at most 35 random
# but meaningful parameter combinations with the oversamplers. The call
# returns the oversampler and classifier objects of the best-scoring
# combination.

candidate_samplers = sv.get_n_quickest_oversamplers(5)
candidate_classifiers = [knn_classifier, dt_classifier]

samp_obj, cl_obj = sv.model_selection(
    dataset=dataset,
    samplers=candidate_samplers,
    classifiers=candidate_classifiers,
    cache_path=cache_path,
    n_jobs=5,
    max_samp_par_comb=35,
)

2019-06-11 18:18:41,886:INFO:dataset: breast_cancer, samplings_available: True, evaluations_available: True
2019-06-11 18:18:41,888:INFO:doing the folding
2019-06-11 18:18:41,993:INFO:Folding reading from file folding_breast_cancer.pickle
2019-06-11 18:18:42,598:INFO:do the samplings
2019-06-11 18:18:42,599:INFO:create sampling objects
2019-06-11 18:18:42,601:INFO:executing 72 sampling in parallel
2019-06-11 18:19:15,521:INFO:do the evaluations
2019-06-11 18:19:15,521:INFO:create classifier jobs
2019-06-11 18:19:15,549:INFO:executing 72 evaluation jobs in parallel
2019-06-11 18:19:16,119:INFO:concatenating the results
2019-06-11 18:19:16,381:INFO:aggregating the results


In [6]:
# Oversample the full dataset with the selected oversampler, then train the
# selected classifier on the oversampled data. The fit call is kept as the
# last expression so that the fitted classifier is displayed by the cell.

features, labels = dataset['data'], dataset['target']
X_samp, y_samp = samp_obj.sample(features, labels)
cl_obj.fit(X_samp, y_samp)

2019-06-11 18:19:16,487:INFO:OUPS: Running sampling via ('OUPS', "{'proportion': 0.1, 'n_jobs': 1}")


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')