In [None]:
# --- Input data and output location ---------------------------------------
DATASET_PATH = dict(
    train='breast-train-0-s1.csv',
    test='breast-test-0-s1.csv',
)
SAVE_RESULTS_PATH = "workdir/results/res1-new.results"

# --- Forest / tree hyper-parameters ----------------------------------------
N_ESTIMATORS = 5
MIN_SAMPLES_SPLIT = 2
MAX_DEPTH = 5
RF_TYPE = 'randomForest'  # ensemble flavour: 'randomForest' or 'extraTrees'

# --- Subspace search -------------------------------------------------------
SUBSPACES = 5  # upper bound on the number of rules in a candidate clique
N_JOBS = 1
PARALLELISM = "loky"

# --- Cross-validated scoring -----------------------------------------------
CV = 5  # number of shuffle splits used when scoring a subspace
CV_REPEATS = 10
# Each entry is an expression over the aggregated score names (for example
# 'accuracy/accuracy_stddev'); it is parsed with sympy and evaluated per subspace.
SELECTION_METHODS = [
    'balanced_accuracy',
    'accuracy',
    'rf_accuracy',
    'rf_balanced_accuracy',
    'accuracy/accuracy_stddev',
]

In [None]:

import time
import warnings
from collections import defaultdict

import joblib
import networkx as nx
import numpy as np
import pandas as pd
from imblearn.metrics import geometric_mean_score
from joblib import Parallel, delayed
from networkx.algorithms.clique import find_cliques
from rules.api import AdjacentOrNot
from rules.classification.rule_measures import BayesianRuleMeasures, covered_by_statements
from rules.classification.subspace_rules_classifier import SubspaceRulesClassifier
from rules.note.extract_rules import extract_rules
from rules.note.overlapping.measure_adjacencies import measure_rules
from rules.utils.utils import join_consecutive_statements
from scipy.stats import entropy
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score, make_scorer, confusion_matrix, f1_score, precision_score, balanced_accuracy_score, cohen_kappa_score
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeClassifier
from sympy import parse_expr
from toolz.curried import pipe, filter, map, reduce


## LOAD

In [None]:
# Read both CSV splits named in DATASET_PATH into DataFrames.
train_data, test_data = (
    pd.read_csv(DATASET_PATH[split_name]) for split_name in ('train', 'test')
)

In [None]:
# Separate the 'TARGET' label column from the remaining (feature) columns
# and switch from DataFrames to plain arrays.
x_train, y_train = train_data.drop(columns='TARGET').values, train_data['TARGET'].values
x_test, y_test = test_data.drop(columns='TARGET').values, test_data['TARGET'].values

In [None]:
# Run metadata starts out as a copy of the dataset paths; later cells extend it.
meta = dict(DATASET_PATH)

In [None]:
# Supported RF_TYPE values and the scikit-learn ensemble class each one selects.
_FOREST_CLASSES = {
    'randomForest': RandomForestClassifier,
    'extraTrees': ExtraTreesClassifier,
}

if RF_TYPE not in _FOREST_CLASSES:
    # Fail fast: without this check an unknown RF_TYPE would leave `clf_rf`
    # undefined on a fresh kernel, or silently reuse a model left over from an
    # earlier run of this cell.
    raise ValueError(
        f"Unsupported RF_TYPE {RF_TYPE!r}; expected one of {sorted(_FOREST_CLASSES)}"
    )

clf_rf = _FOREST_CLASSES[RF_TYPE](
    max_depth=MAX_DEPTH,
    n_estimators=N_ESTIMATORS,
    min_samples_split=MIN_SAMPLES_SPLIT,
    random_state=42,
)
clf_rf.fit(x_train, y_train)

In [None]:
# Single decision tree trained with the same depth / split limits as the forest.
clf_dt = DecisionTreeClassifier(
    max_depth=MAX_DEPTH,
    min_samples_split=MIN_SAMPLES_SPLIT,
    random_state=42,
)
clf_dt.fit(x_train, y_train)

In [None]:
# Test-set predictions of the forest and of the single tree (in that order).
rf_test_predict, dt_test_predict = (
    fitted_model.predict(x_test) for fitted_model in (clf_rf, clf_dt)
)

In [None]:
# Test-set metrics of the two reference models plus the settings of this run.
reference_report = {
    "DT_test_acc": accuracy_score(y_test, dt_test_predict),
    "RF_test_acc": accuracy_score(y_test, rf_test_predict),
    "DT_test_confusion_matrix": confusion_matrix(y_test, dt_test_predict),
    "RF_test_confusion_matrix": confusion_matrix(y_test, rf_test_predict),
    "RF_test_f1": f1_score(y_test, rf_test_predict, average='weighted'),
    "DT_test_f1": f1_score(y_test, dt_test_predict, average='weighted'),
    "DT_test_gmean": geometric_mean_score(y_test, dt_test_predict, average='weighted'),
    "RF_test_gmean": geometric_mean_score(y_test, rf_test_predict, average='weighted'),
    "DT_test_balanced_accuracy": balanced_accuracy_score(y_test, dt_test_predict),
    "RF_test_balanced_accuracy": balanced_accuracy_score(y_test, rf_test_predict),
    "x_test": x_test,
    "RF_test_predictions": rf_test_predict,
    "DT_test_predictions": dt_test_predict,
    "SELECTION_METHODS": SELECTION_METHODS,
    "CV": CV,
    "RF_TYPE": RF_TYPE,
    "CV_REPEATS": CV_REPEATS,
}

# Build a new dict rather than mutating `meta`, so re-running the cell is harmless.
meta = {**meta, **reference_report}

In [None]:
%%time

# One rule set per tree of the fitted forest, folded left-to-right into a single set.
rule_sets_per_tree = (extract_rules(tree) for tree in clf_rf.estimators_)
unique_forest_rules = reduce(set.union, rule_sets_per_tree)

# Merge consecutive statements inside every rule; list positions of `all_rules`
# are the rule indices used by the following cells.
# (A `bound_rule(r, x_train)` step used to sit here and is currently disabled.)
all_rules = [join_consecutive_statements(rule) for rule in unique_forest_rules]


In [None]:
# Flat list of the statements of every distinct rule.
# A single comprehension replaces the former `reduce(list.__add__)` fold, which
# re-copied the growing list on every step (quadratic) and raised on an empty
# rule list; iteration order over the rules and their statements is unchanged.
all_statements = [
    statement
    for rule in set(all_rules)
    for statement in rule.statements
]

# Find all non-overlapping rule cliques

In [None]:
# Undirected graph filled in by `add_to_graph`: nodes are indices into `all_rules`.
g = nx.Graph()

In [None]:
%%time

# Pairwise rule measurements; the result is consumed below as a mapping
# from a rule pair to its measurement.
with joblib.parallel_backend('threading'):
    all_rule_measurements = measure_rules(
        all_rules,
        n_jobs=N_JOBS,
    )

In [None]:
def add_to_graph(measurements_tuple, measurement):
    """Connect two rules in the module-level graph ``g`` if they are not adjacent.

    Parameters
    ----------
    measurements_tuple : pair of rules; both must be present in ``all_rules``.
    measurement : the measurement recorded for that pair. Anything other than
        ``AdjacentOrNot.NOT_ADJACENT`` leaves the graph untouched.

    Nodes are the positions of the rules inside ``all_rules``.
    """
    should_link = measurement == AdjacentOrNot.NOT_ADJACENT
    if not should_link:
        return

    first_rule, second_rule = measurements_tuple
    first_idx = all_rules.index(first_rule)
    second_idx = all_rules.index(second_rule)

    for node in (first_idx, second_idx):
        g.add_node(node)
    g.add_edge(first_idx, second_idx)

In [None]:
%%time
# Feed every measured rule pair to `add_to_graph`, which adds an edge for the
# pairs measured as NOT_ADJACENT.
for rule_pair, pair_measurement in all_rule_measurements.items():
    add_to_graph(rule_pair, pair_measurement)

In [None]:
%%time
# Cliques of the rule graph (each a list of rule indices) holding at most
# SUBSPACES rules.
all_subspaces = [clique for clique in find_cliques(g) if len(clique) <= SUBSPACES]

# Default when no size bucket below matches: keep every admissible clique.
subspaces_to_check = all_subspaces

# Walk the clique sizes from SUBSPACES down to 1 and keep only the cliques of the
# largest size that actually occurs.
# Fix: the filter previously compared against SUBSPACES on every iteration, so
# the smaller sizes were never examined.
for clique_size in reversed(range(1, SUBSPACES + 1)):
    subspaces_by_size_of_param = [
        clique for clique in all_subspaces if len(clique) == clique_size
    ]
    print(f"clique = {clique_size}")
    if subspaces_by_size_of_param:
        subspaces_to_check = subspaces_by_size_of_param
        break

In [None]:
# No clique was found at all: fall back to one single-rule subspace per rule.
if not subspaces_to_check:
    subspaces_to_check = [[rule_index] for rule_index in range(len(all_rules))]

In [None]:
# Display how many candidate subspaces will be scored.
len(subspaces_to_check)

In [None]:
# Record the size of the search space (new dict; the previous `meta` is not mutated).
meta = dict(
    meta,
    total_rules=len(all_rules),
    total_subspaces_to_check=len(subspaces_to_check),
)

In [None]:
# Display the run metadata collected so far.
meta

In [None]:
# Number of distinct labels in the training targets; used as the entropy log base.
classes_count = np.unique(y_train).size

def calculate_entropy(y, classes_count):
    """Entropy of the label distribution in ``y``, using ``classes_count`` as log base.

    Parameters
    ----------
    y : array-like of labels.
    classes_count : logarithm base passed to ``scipy.stats.entropy``.

    Returns
    -------
    The entropy of the per-label occurrence counts of ``y``.
    """
    _, label_counts = np.unique(y, return_counts=True)
    return entropy(label_counts, base=classes_count)

In [None]:
def accuracy_with_rf(estimator, X, y):
    """Scorer: accuracy of ``estimator`` measured against the forest's predictions.

    ``y`` is part of the scorer signature but is not used; the reference labels
    are whatever the module-level ``clf_rf`` predicts for ``X``.
    """
    forest_labels = clf_rf.predict(X)
    return accuracy_score(forest_labels, estimator.predict(X))

In [None]:
def bal_accuracy_with_rf(estimator, X, y):
    """Scorer: balanced accuracy of ``estimator`` against the forest's predictions.

    ``y`` is part of the scorer signature but is not used; the reference labels
    are whatever the module-level ``clf_rf`` predicts for ``X``.
    """
    forest_labels = clf_rf.predict(X)
    return balanced_accuracy_score(forest_labels, estimator.predict(X))

In [None]:
def rf_kohen_cappa(estimator, X, y):
    """Scorer: Cohen's kappa between the forest's and ``estimator``'s predictions.

    ``y`` is part of the scorer signature but is not used; the reference labels
    are whatever the module-level ``clf_rf`` predicts for ``X``.
    """
    forest_labels = clf_rf.predict(X)
    return cohen_kappa_score(forest_labels, estimator.predict(X))

In [None]:
def get_val(clique, x_train, y_train, x_test, y_test):
    """Score one candidate subspace (a group of rules) and return all its metrics.

    Parameters
    ----------
    clique : sequence of indices into the module-level ``all_rules``.
    x_train, y_train : training features / labels; used for the shuffle-split
        cross-validation, the per-rule statistics and the final fit.
    x_test, y_test : held-out features / labels used only for the final model.

    Returns
    -------
    dict with, in this order:
      * the mean of every cross-validation score and per-rule statistic
        (a leading ``test_`` is stripped from the score names),
      * the matching ``<name>_stddev`` entries,
      * one ``'score <method>'`` entry per expression in ``SELECTION_METHODS``,
      * ``final_model_*`` entries for a classifier fitted on the whole training
        set and evaluated on the test set.
    """
    rules = np.array(all_rules)[clique]

    # ---- cross-validated metrics of the rule-based classifier -----------------
    scorers = {
        'balanced_accuracy': 'balanced_accuracy',
        'f1': 'f1_weighted',
        'accuracy': 'accuracy',
        'g_mean': make_scorer(geometric_mean_score, average='weighted'),
        'recall': 'recall_weighted',
        'precision': 'precision_weighted',
        'rf_accuracy': accuracy_with_rf,
        'rf_balanced_accuracy': bal_accuracy_with_rf,
        'rf_cohen_kappa': rf_kohen_cappa
    }
    cv_model = SubspaceRulesClassifier(rules=rules, max_depth=MAX_DEPTH, random_state=42)
    cv_splitter = ShuffleSplit(n_splits=CV, test_size=0.5, random_state=42)
    with joblib.parallel_backend('threading'):
        cv_scores = cross_validate(
            cv_model, x_train, y_train, n_jobs=1, scoring=scorers, cv=cv_splitter
        )

    # ---- per-rule statistics, averaged over the rules within every split ------
    # (result key, BayesianRuleMeasures method) pairs, in the order they are recorded.
    measure_methods = (
        ('a', 'a'),
        ('b', 'b'),
        ('c', 'c'),
        ('d', 'd'),
        ('s', 's_measure'),
        ('n', 'n_measure'),
    )
    rule_stats_by_split = defaultdict(list)
    # Same parameters and seed as the splitter handed to cross_validate above.
    stats_splitter = ShuffleSplit(n_splits=CV, test_size=0.5, random_state=42)
    for fit_rows, _held_out_rows in stats_splitter.split(x_train, y_train):
        x_fold = x_train[fit_rows]
        y_fold = y_train[fit_rows]

        stats_of_rules = defaultdict(list)
        for rule in rules:
            covered_rows = list(covered_by_statements(rule, x_fold))
            measures = BayesianRuleMeasures.create(rule, x_fold, y_fold)
            for stat_key, method_name in measure_methods:
                stats_of_rules[stat_key].append(getattr(measures, method_name)())
            stats_of_rules['entropy'].append(
                calculate_entropy(y_fold[covered_rows], classes_count)
            )

        for stat_key, per_rule_values in stats_of_rules.items():
            rule_stats_by_split[stat_key].append(np.mean(per_rule_values))

    # ---- final model: fit on all training data, evaluate on the test set ------
    final_clf = SubspaceRulesClassifier(rules=rules, max_depth=MAX_DEPTH, random_state=42)
    final_clf.fit(x_train, y_train)
    final_clf_y_pred = final_clf.predict(x_test)
    final_model_test_accuracy = accuracy_score(y_test, final_clf_y_pred)
    final_model_confusion_matrix = confusion_matrix(y_test, final_clf_y_pred)

    # ---- aggregate every per-split series into mean and standard deviation ----
    per_split_series = {**cv_scores, **rule_stats_by_split}
    means = {}
    spreads = {}
    for raw_name, series in per_split_series.items():
        name = raw_name[5:] if raw_name.startswith('test_') else raw_name
        means[name] = np.mean(series)
        spreads[f"{name}_stddev"] = np.std(series)
    scores_without_test_preffix = {**means, **spreads}

    # ---- evaluate every selection expression on the aggregated values ---------
    score_by_selection_method = {}
    for method in SELECTION_METHODS:
        expression = parse_expr(method)
        score_by_selection_method['score ' + method] = float(
            expression.evalf(subs=scores_without_test_preffix)
        )

    return {
        **scores_without_test_preffix,
        **score_by_selection_method,
        'final_model_test_accuracy': final_model_test_accuracy,
        'final_model_test_predictions': final_clf_y_pred,
        'final_model_confusion_matrix': final_model_confusion_matrix,
        'final_model_used_trees': len(final_clf._clf_by_rule),
        'final_model_test_f1_score': f1_score(y_test, final_clf_y_pred, average='weighted'),
        'final_model_test_g_mean': geometric_mean_score(y_test, final_clf_y_pred, average='weighted'),
        'final_model_test_recall_score': recall_score(y_test, final_clf_y_pred, average='weighted'),
        'final_model_test_precision_score': precision_score(y_test, final_clf_y_pred, average='weighted'),
        'final_model_test_balanced_accuracy_score': balanced_accuracy_score(y_test, final_clf_y_pred),
        'final_model_original_rf_fidelity': accuracy_score(rf_test_predict, final_clf_y_pred)
    }

In [None]:
# Install a blanket 'ignore' filter: warnings raised by the cells below are suppressed.
warnings.filterwarnings('ignore')

In [None]:
# Wall-clock timestamp (seconds) taken right before the subspaces are scored.
scoring_start = time.time()

In [None]:
%%time
# Score every candidate subspace with `get_val`; results come back in input order
# and are keyed by the subspace's rule indices as a tuple.
with joblib.parallel_backend('threading'):
    subspace_results = Parallel(n_jobs=N_JOBS)(
        delayed(get_val)(subspace, x_train, y_train, x_test, y_test)
        for subspace in subspaces_to_check
    )
    score_by_subspace = {
        tuple(subspace): subspace_result
        for subspace, subspace_result in zip(subspaces_to_check, subspace_results)
    }


In [None]:
# Wall-clock timestamp after scoring; the difference to `scoring_start` is the
# elapsed scoring time in seconds.
scoring_end = time.time()
scoring_time = scoring_end - scoring_start

In [None]:
# Subspace whose final model reaches the highest accuracy on the TEST set.
# NOTE: because the test labels drive this choice, `top_*` values describe the best
# achievable subspace, not the outcome of a selection made on training data.
top_rules, _top_scoring = max(
    score_by_subspace.items(),
    key=lambda entry: entry[1]['final_model_test_accuracy'],
)

In [None]:
# Full score record of the subspace selected as `top_rules`.
top = score_by_subspace[top_rules]

In [None]:
# For every selection method: find the subspaces that reach the maximal selection
# score and report, among those ties, the one with the lowest ("worst") and the
# one with the highest ("best") cross-validated accuracy.
scores_by_selection_method = {}
for selection_method in SELECTION_METHODS:
    score_key = f'score {selection_method}'

    best_score = max(scoring[score_key] for scoring in score_by_subspace.values())
    best_score_rules = [
        rules for rules, val in score_by_subspace.items() if val[score_key] == best_score
    ]
    best_score_rules_with_scoring = {
        rules: score_by_subspace[rules] for rules in best_score_rules
    }

    tied_scorings = best_score_rules_with_scoring.values()
    worst = min(tied_scorings, key=lambda scoring: scoring['accuracy'])
    best = max(tied_scorings, key=lambda scoring: scoring['accuracy'])

    scores_by_selection_method[selection_method] = {
        **{f'worst_{k}': v for k, v in worst.items()},
        # Fix: these entries were previously built from `worst`, which made every
        # `best_*` value a copy of the corresponding `worst_*` value.
        **{f'best_{k}': v for k, v in best.items()},
        'found': len(best_score_rules)
    }
    

In [None]:
# Assemble the final result record; insertion order fixes the key order.
top_prefixed = {f'top_{name}': value for name, value in top.items()}

results = dict(meta)
results['scoring_time'] = scoring_time
results.update(scores_by_selection_method)
results.update(top_prefixed)
# `best_score_rules_with_scoring` is the value left over from the LAST iteration
# of the selection-method loop, so this counts the ties of the last method only.
results['found_rules'] = len(best_score_rules_with_scoring)
results['all_subspaces_with_score'] = score_by_subspace

In [None]:
# Display the score record of the subspace with the highest test accuracy.
top