In [1]:
import pandas as pd
import numpy as np

print(pd.__version__)

import sklearn.feature_selection as fs

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

from pprint import pprint
import itertools as it
import random as rnd
import statistics as stat
from dataclasses import dataclass, field

1.0.1


In [2]:
#load sklearn cacer dataset, make a df
cancer = load_breast_cancer()
df = pd.DataFrame(np.c_[cancer['data'], cancer['target']], columns= np.append(cancer['feature_names'], ['target']))

In [3]:
#how it works on a single feature, there are some numpy conversations
#it return 1D np array
fs.mutual_info_classif(df['mean radius'].values.reshape(-1,1), df['target'].values.ravel())

array([0.36888507])

In [4]:
#iterate over all features
feature_mi_pairs = []
for feature_name in list(cancer.feature_names):
    mi = fs.mutual_info_classif(df[feature_name].values.reshape(-1,1), df['target'].values.ravel())
    feature_mi_pairs.append((feature_name, mi[0]))

    
feature_mi_pairs = sorted(feature_mi_pairs, key=lambda tup: tup[1], reverse=True)
pprint(feature_mi_pairs)

[('worst perimeter', 0.47256692767995734),
 ('worst area', 0.4644562953791862),
 ('worst radius', 0.4543093981621742),
 ('mean concave points', 0.439200133497031),
 ('worst concave points', 0.4356648648463073),
 ('mean perimeter', 0.4024833415714808),
 ('mean concavity', 0.3752870982554244),
 ('mean radius', 0.36638900937846497),
 ('mean area', 0.3581690929247303),
 ('area error', 0.3415176431980822),
 ('worst concavity', 0.3159224398631988),
 ('perimeter error', 0.27201882064483973),
 ('radius error', 0.24957131855689574),
 ('worst compactness', 0.22482856238955296),
 ('mean compactness', 0.21146164637077702),
 ('concave points error', 0.12733872153642634),
 ('worst texture', 0.11951278683081101),
 ('concavity error', 0.11694579405190053),
 ('worst smoothness', 0.10689698623463717),
 ('mean texture', 0.0977989039852094),
 ('worst symmetry', 0.08956724397585569),
 ('mean smoothness', 0.07774313790374876),
 ('compactness error', 0.07589158717970124),
 ('mean symmetry', 0.072517978125056

In [5]:
selected_features = [tpl[0] for tpl in feature_mi_pairs[0:10]]
pprint(selected_features)

['worst perimeter',
 'worst area',
 'worst radius',
 'mean concave points',
 'worst concave points',
 'mean perimeter',
 'mean concavity',
 'mean radius',
 'mean area',
 'area error']


In [6]:
#selected_features = cancer['feature_names']

In [7]:
len(selected_features)

10

In [8]:
#split test train - forget the test data
#X = df[cancer['feature_names']]
X = df[selected_features]
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25,random_state=42)

In [9]:
X.head()

Unnamed: 0,worst perimeter,worst area,worst radius,mean concave points,worst concave points,mean perimeter,mean concavity,mean radius,mean area,area error
0,184.6,2019.0,25.38,0.1471,0.2654,122.8,0.3001,17.99,1001.0,153.4
1,158.8,1956.0,24.99,0.07017,0.186,132.9,0.0869,20.57,1326.0,74.08
2,152.5,1709.0,23.57,0.1279,0.243,130.0,0.1974,19.69,1203.0,94.03
3,98.87,567.7,14.91,0.1052,0.2575,77.58,0.2414,11.42,386.1,27.23
4,152.2,1575.0,22.54,0.1043,0.1625,135.1,0.198,20.29,1297.0,94.44


In [10]:
def measure_auc(X, y, estimator, estimator_params):
    """
    Given training data X, y and estimator with candidate params find ROC-AUC
    """
 
    estimator = estimator.set_params(**estimator_params)
    
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    scores = []
    for train_index, test_index in skf.split(X, y):
        X_train_piece, X_test_piece = X.iloc[train_index], X.iloc[test_index]
        y_train_piece, y_test_piece = y.iloc[train_index], y.iloc[test_index]
            
        estimator.fit(X_train_piece, y_train_piece)
        
        score = roc_auc_score(y_test_piece, estimator.predict(X_test_piece))
        
        scores.append(score)
            
    #print(scores)
    mean = stat.mean(scores)
    return mean

In [11]:
#An example to set params, given a dictionary of params
estimator = RandomForestClassifier(random_state=42)
params = {'max_depth': 2, 'n_estimators': 40}
estimator = estimator.set_params(**params)

#measure performance
measure_auc(X_train,y_train, estimator, params)

0.9307328650165686

In [12]:
estimator

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=40,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [13]:
#example below
def generate_param_space(space, num_trials):
    """
    generates all possible combinations in parameter space
    num_trials: sample size to select from all combinations
    """
    
    #Create proper formatted candidates
    bag = []
    
    #key=paramater
    for key in sorted(space.keys()):
        values = space[key]
        key_val_pairs = []
        for val in values:
            pair = (key,val)
            key_val_pairs.append(pair)
        bag.append(key_val_pairs)

    #all candidates but list of tuples
    all_candidate_solutions = list(it.product(*bag))

    #convert to list of dictionaries -> better representation
    all_candid_sol_as_dict = []
    for candid in all_candidate_solutions:
        candid_solution = {}
        for pair in candid:
            key=pair[0]
            value=pair[1]
            candid_solution[key] = value
        all_candid_sol_as_dict.append(candid_solution)

    total_combinations = len(all_candid_sol_as_dict)
    print("Total number of possible parameter combinations:", total_combinations)

    if num_trials > total_combinations:
        num_trials = total_combinations
        print("num_trials > total_combinations, selecting all:", num_trials)
  
    random_candidates = rnd.sample(all_candid_sol_as_dict, num_trials)
    return random_candidates
#####################################################################################

test_space = {"learning_rate" : [0.05, 0.10, 0.15 ],
              "max_depth" : [ 3, 4],
              "min_child_weight" : [ 1, 3, 5, 7 ] }   

# all possible parameter combinations 3x3x4
generate_param_space(test_space, 3) # randomly select 3 of them. becomes a random search

Total number of possible parameter cobinations: 24


[{'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1},
 {'learning_rate': 0.05, 'max_depth': 4, 'min_child_weight': 5},
 {'learning_rate': 0.1, 'max_depth': 4, 'min_child_weight': 1}]

In [19]:
test_space = {"max_depth" : [2, 3, 4, 5] , 
              "n_estimators" : [50, 60, 70, 80]}   

param_space = generate_param_space(test_space, num_trials=20)

param_auc_list = []

for params in param_space:
    #print('params:', params)
    
    estimator = RandomForestClassifier(random_state=42).set_params(**params) 
    
    auc = measure_auc(X_train, y_train, estimator, params)
    
    param_auc_list.append((estimator, params, auc))

param_auc_list = sorted(param_auc_list, key=lambda tup: tup[2], reverse=True)
#pprint(param_auc_list)

#the best
print('Best Params:')
pprint(param_auc_list[0])

Total number of possible parameter cobinations: 16
num_trials > total_combinations, selecting all: 16
Best Params:
(RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False),
 {'max_depth': 4, 'n_estimators': 80},
 0.9416834325195554)


In [15]:
#Re-fit all training data
best_params = param_auc_list[0][1]
print(best_params)

best_estimator = RandomForestClassifier(random_state=42).set_params(**best_params)

best_estimator.fit(X_train, y_train)

{'max_depth': 4, 'n_estimators': 80}


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=4, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=80,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [16]:
#performance on training data
roc_auc_score(y_train, best_estimator.predict(X_train))

0.9811320754716981

In [17]:
#performance on test data
roc_auc_score(y_test, best_estimator.predict(X_test))

0.9417190775681342

In [18]:
#all - train 0.990566 vs test 0.951153