 # Evaluation Exercise

In [1]:
from si.io.data_file import read_data_file
from sklearn.preprocessing import StandardScaler
from si.model_selection.cross_validate import cross_validate
from si.model_selection.grid_search import grid_search_cv
from si.model_selection.randomize_search_cv import randomized_search_cv
from si.linear_module.logistic_regression import LogisticRegression
import numpy as np

## Cross Validate Test

In [2]:
breast_bin_dataset = r"C:\Users\anaca\Documents\GitHub\SIB-ML-Portfolio\datasets\breast-bin.data"
dataset = read_data_file(breast_bin_dataset, sep=",", label=True)

In [3]:
dataset.x = StandardScaler().fit_transform(dataset.x)

In [4]:
lg = LogisticRegression()
scores = cross_validate(lg, dataset, cv=5)
scores

{'seed': [647, 725, 935, 497, 304],
 'train': [0.9714285714285714,
  0.9653061224489796,
  0.9673469387755103,
  0.9734693877551021,
  0.9673469387755103],
 'test': [0.9569377990430622,
  0.9712918660287081,
  0.9665071770334929,
  0.9521531100478469,
  0.9712918660287081],
 'parameters': []}

## Grid Search Test

In [5]:
lg2 = LogisticRegression()

lg2_param = {'l2_penalty': [1, 10],
             'alpha': [0.001, 0.0001],
            'max_iter': [1000, 2000]}

scores = grid_search_cv(lg2, dataset, lg2_param, cv=3)
scores

[{'seed': [966, 858, 57],
  'train': [0.9612244897959183, 0.9693877551020408, 0.9775510204081632],
  'test': [0.9808612440191388, 0.9617224880382775, 0.9425837320574163],
  'parameters': [{'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 1000}]},
 {'seed': [568, 561, 27],
  'train': [0.9714285714285714, 0.9612244897959183, 0.9653061224489796],
  'test': [0.9569377990430622, 0.9808612440191388, 0.9712918660287081],
  'parameters': [{'l2_penalty': 1, 'alpha': 0.001, 'max_iter': 2000}]},
 {'seed': [779, 898, 672],
  'train': [0.9693877551020408, 0.963265306122449, 0.9673469387755103],
  'test': [0.9617224880382775, 0.9760765550239234, 0.9665071770334929],
  'parameters': [{'l2_penalty': 1, 'alpha': 0.0001, 'max_iter': 1000}]},
 {'seed': [232, 480, 731],
  'train': [0.9673469387755103, 0.963265306122449, 0.9714285714285714],
  'test': [0.9665071770334929, 0.9760765550239234, 0.9569377990430622],
  'parameters': [{'l2_penalty': 1, 'alpha': 0.0001, 'max_iter': 2000}]},
 {'seed': [213, 491, 538],

## Randomize Search

In [4]:
lg3 = LogisticRegression()

lg3_param = {'l2_penalty': np.linspace(1, 10, 10),
             'alpha': np.linspace(0.001, 0.0001, 100),
            'max_iter': np.linspace(1000, 2000, 200)}

scores = randomized_search_cv(lg3, dataset, lg3_param, cv=3)
scores

{'parameters': [{'l2_penalty': 10.0,
   'alpha': 0.0007545454545454546,
   'max_iter': 1306.532663316583},
  {'l2_penalty': 10.0,
   'alpha': 0.000809090909090909,
   'max_iter': 1718.5929648241208},
  {'l2_penalty': 5.0, 'alpha': 0.0009, 'max_iter': 1070.3517587939698},
  {'l2_penalty': 2.0,
   'alpha': 0.00039090909090909096,
   'max_iter': 1025.1256281407036},
  {'l2_penalty': 6.0,
   'alpha': 0.0009363636363636364,
   'max_iter': 1819.0954773869348},
  {'l2_penalty': 3.0,
   'alpha': 0.0006363636363636364,
   'max_iter': 1507.5376884422112},
  {'l2_penalty': 6.0,
   'alpha': 0.00023636363636363633,
   'max_iter': 1125.6281407035176},
  {'l2_penalty': 6.0,
   'alpha': 0.00030909090909090914,
   'max_iter': 1376.8844221105528},
  {'l2_penalty': 1.0,
   'alpha': 0.00011818181818181817,
   'max_iter': 1005.0251256281407},
  {'l2_penalty': 4.0,
   'alpha': 0.00032727272727272726,
   'max_iter': 1718.5929648241208}],
 'seed': [783, 762, 673, 495, 930, 245, 35, 844, 333, 520],
 'train': [