# Lazy FCA Binary Classification Problem


### Dataset: <a href="https://archive.ics.uci.edu/ml/datasets/Breast+Cancer">Breast Cancer Data Set</a>

<b>Attribute Information</b>:
   1. <b>Class</b>: no-recurrence-events, recurrence-events
   2. <b>age</b>: 10-19, 20-29, 30-39, 40-49, 50-59, 60-69, 70-79, 80-89, 90-99.
   3. <b>menopause</b>: lt40, ge40, premeno.
   4. <b>tumornode-caps-size</b>: 0-4, 5-9, 10-14, 15-19, 20-24, 25-29, 30-34, 35-39, 40-44, 45-49, 50-54, 55-59.
   5. <b>inv-nodes</b>: 0-2, 3-5, 6-8, 9-11, 12-14, 15-17, 18-20, 21-23, 24-26, 27-29, 30-32, 33-35, 36-39.
   6. <b>node-caps</b>: yes, no, ?
   7. <b>deg-malig</b>: 1, 2, 3.
   8. <b>breast</b>: left, right.
   9. <b>breast-quad</b>: left-up, left-low, right-up,	right-low, central.
  10. <b>irradiat</b>: yes, no.

In [1]:
from pathlib import Path
import pandas as pd

RANDOM_SEED = 4444
DATA_PATH = Path('../data/')

## Read the data

In [2]:
data = pd.read_csv(DATA_PATH / 'breast-cancer.data', header = None)
data.columns = ['class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast', 'breast-quad', 'irradiat']
data.head(2)

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no


In [3]:
print('Total size of the data:', data.shape[0])

Total size of the data: 286


In [4]:
# just in case
data.isna().sum()

class          0
age            0
menopause      0
tumor-size     0
inv-nodes      0
node-caps      0
deg-malig      0
breast         0
breast-quad    0
irradiat       0
dtype: int64

## Preprocess For FCA

Our data has categorical & binary features, categorical features do not allow us to use FCA directly, so we need to encode this data somehow. <br>
Here I will use One Hot Encoding to address this issue.

In [5]:
# firstly let's binarize the columns that have only 2 unique values

data['class'] = data['class'].map({'no-recurrence-events': 0, 'recurrence-events': 1})
data['breast'] = data['breast'].map({'left': 0, 'right': 1})
data['irradiat'] = data['irradiat'].map({'no': 0, 'yes': 1})

assert(data.isna().sum().sum() == 0) # check that we didn't spoil the data

In [6]:
# now let's deal with textual columns that have more than 2 unique values

for col in ['age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps', 'deg-malig', 'breast-quad']:
    data = pd.concat([data, pd.get_dummies(data[col], prefix=f'{col}')], axis = 1).drop(columns = [col])
    
# check that we didn't spoil the data 

assert(data.isna().sum().sum() == 0) # no missing values
assert((data.nunique().unique()[0] == 2) & (data.nunique().unique().shape[0] == 1)) # all columns have only 2 unique values

## Predict with Lazy Learning Algorithm

In [7]:
from lazy_learning_framework import LazyLearningAlgo
algo = LazyLearningAlgo(min_support = 0.2, allow_online_learning = False, fca_algorithm = 'apriori', prediction_algorithm = 'itemsets_intersection_sums')

In [13]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
def print_oof_metrics(Y, Y_pred):
    print('\n\n\n')
    tn, fp, fn, tp = confusion_matrix(Y, Y_pred).ravel()
    print('OOF accuracy', accuracy_score(Y, Y_pred))
    print(f'TN {tn} FP {fp} FN {fn} TP {tp}')
    print('OOF precision', precision_score(Y, Y_pred))
    print('OOF recall', recall_score(Y, oof_preds))
    
    return accuracy_score(Y, Y_pred)

In [9]:
# KFold crossvalidation
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import numpy as np

X, Y = data.drop(columns = ['class']), data['class']
kf = KFold(n_splits = 5, shuffle = True, random_state = RANDOM_SEED)
oof_preds = np.zeros(Y.shape[0])

for fold_idx, (tr_idx, val_idx) in enumerate(kf.split(X, Y)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    Y_tr, Y_val = Y.iloc[tr_idx], Y.iloc[val_idx]
    
    Y_pred = algo.run_prediction(X_tr, Y_tr, X_val)
    oof_preds[val_idx] = Y_pred
    print(f'Fold {fold_idx} Accuracy', accuracy_score(Y_val, Y_pred))

oof_acc = print_oof_metrics(Y, oof_preds)

Fold 0 Accuracy 0.7413793103448276
Fold 1 Accuracy 0.5964912280701754
Fold 2 Accuracy 0.7368421052631579
Fold 3 Accuracy 0.7719298245614035
Fold 4 Accuracy 0.8245614035087719




OOF accuracy 0.7342657342657343
TN 187 FP 14 FN 62 TP 23
OOF precision 0.6216216216216216
OOF recall 0.27058823529411763


## Run GridSearch

In [10]:
# example params
params = {
    'min_support': 0.2, 
    'allow_online_learning': False, 
    'fca_algorithm': 'apriori', 
    'prediction_algorithm': 'itemsets_intersection_sums'
}

In [11]:
from lazy_learning_framework import LazyLearningAlgo
def validate_algo(data, params):
    algo = LazyLearningAlgo(**params)
    
    X, Y = data.drop(columns = ['class']), data['class']
    kf = KFold(n_splits = 5, shuffle = True, random_state = RANDOM_SEED)
    oof_preds = np.zeros(Y.shape[0])

    for fold_idx, (tr_idx, val_idx) in enumerate(kf.split(X, Y)):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        Y_tr, Y_val = Y.iloc[tr_idx], Y.iloc[val_idx]

        Y_pred = algo.run_prediction(X_tr, Y_tr, X_val)
        oof_preds[val_idx] = Y_pred
        print(f'Fold {fold_idx} Accuracy', accuracy_score(Y_val, Y_pred))

    oof_acc = print_oof_metrics(Y, oof_preds)
    return oof_acc

In [14]:
# grid search loop
# I will limit the search space a bit to save time

grid_search_params = {
    'min_support': [0.2, 0.3, 0.4],
    'allow_online_learning': [False, True],
    'fca_algorithm': ['fpgrowth', 'apriori'],
    'prediction_algorithm': ['most_itemsets', 'itemsets_intersection_sums']
}

best_acc = 0
best_params = {}
iteration_idx = 0
for min_support in grid_search_params['min_support']:
    for allow_online_learning in grid_search_params['allow_online_learning']:
        for fca_algorithm in grid_search_params['fca_algorithm']:
            for prediction_algorithm in grid_search_params['prediction_algorithm']:
                
                print(f'\n\n\n----------------------------------Running GridSearch; Iteration {iteration_idx}----------------------------------\n')
                iteration_idx+=1
                
                params = {
                    'min_support': min_support, 
                    'allow_online_learning': allow_online_learning, 
                    'fca_algorithm': fca_algorithm, 
                    'prediction_algorithm': prediction_algorithm
                }
                
                acc = validate_algo(data, params)
                if acc > best_acc:
                    best_acc = acc
                    best_params = params




----------------------------------Running GridSearch; Iteration 0----------------------------------

Fold 0 Accuracy 0.8275862068965517
Fold 1 Accuracy 0.631578947368421
Fold 2 Accuracy 0.6666666666666666
Fold 3 Accuracy 0.7368421052631579
Fold 4 Accuracy 0.631578947368421




OOF accuracy 0.6993006993006993
TN 155 FP 46 FN 40 TP 45
OOF precision 0.4945054945054945
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 1----------------------------------

Fold 0 Accuracy 0.7413793103448276
Fold 1 Accuracy 0.5964912280701754
Fold 2 Accuracy 0.7368421052631579
Fold 3 Accuracy 0.7719298245614035
Fold 4 Accuracy 0.8245614035087719




OOF accuracy 0.7342657342657343
TN 187 FP 14 FN 62 TP 23
OOF precision 0.6216216216216216
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 2----------------------------------

Fold 0 Accuracy 0.8275862068965517
Fold 1 Accuracy 0.631578947368421
Fold 2 Accuracy 0.666

  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.8103448275862069


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.6491228070175439


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.6666666666666666


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7543859649122807


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.6666666666666666




OOF accuracy 0.7097902097902098
TN 154 FP 47 FN 36 TP 49
OOF precision 0.5104166666666666
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 5----------------------------------



  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.7586206896551724


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.5964912280701754


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.7368421052631579


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7543859649122807


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.8245614035087719




OOF accuracy 0.7342657342657343
TN 188 FP 13 FN 63 TP 22
OOF precision 0.6285714285714286
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 6----------------------------------



  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.8103448275862069


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.6491228070175439


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.6666666666666666


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7543859649122807


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.6666666666666666




OOF accuracy 0.7097902097902098
TN 154 FP 47 FN 36 TP 49
OOF precision 0.5104166666666666
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 7----------------------------------



  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.7586206896551724


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.6140350877192983


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.7368421052631579


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7543859649122807


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.8245614035087719




OOF accuracy 0.7377622377622378
TN 188 FP 13 FN 62 TP 23
OOF precision 0.6388888888888888
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 8----------------------------------

Fold 0 Accuracy 0.7413793103448276
Fold 1 Accuracy 0.5964912280701754
Fold 2 Accuracy 0.6842105263157895
Fold 3 Accuracy 0.7894736842105263
Fold 4 Accuracy 0.7192982456140351




OOF accuracy 0.7062937062937062
TN 168 FP 33 FN 51 TP 34
OOF precision 0.5074626865671642
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 9----------------------------------

Fold 0 Accuracy 0.7586206896551724
Fold 1 Accuracy 0.6140350877192983
Fold 2 Accuracy 0.7543859649122807
Fold 3 Accuracy 0.7543859649122807
Fold 4 Accuracy 0.7894736842105263




OOF accuracy 0.7342657342657343
TN 198 FP 3 FN 73 TP 12
OOF precision 0.8
OOF recall 0.27058823529411763



----------------------------------Running Gri

  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.7413793103448276


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.5964912280701754


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.6842105263157895


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7894736842105263


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.7543859649122807




OOF accuracy 0.7132867132867133
TN 170 FP 31 FN 51 TP 34
OOF precision 0.5230769230769231
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 13----------------------------------



  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.7758620689655172


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.6140350877192983


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.7543859649122807


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7894736842105263


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.7894736842105263




OOF accuracy 0.7447552447552448
TN 199 FP 2 FN 71 TP 14
OOF precision 0.875
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 14----------------------------------



  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.7413793103448276


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.5964912280701754


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.6842105263157895


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7894736842105263


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.7543859649122807




OOF accuracy 0.7132867132867133
TN 170 FP 31 FN 51 TP 34
OOF precision 0.5230769230769231
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 15----------------------------------



  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.7758620689655172


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.6140350877192983


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.7543859649122807


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7894736842105263


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.7894736842105263




OOF accuracy 0.7447552447552448
TN 199 FP 2 FN 71 TP 14
OOF precision 0.875
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 16----------------------------------

Fold 0 Accuracy 0.43103448275862066
Fold 1 Accuracy 0.5263157894736842
Fold 2 Accuracy 0.7368421052631579
Fold 3 Accuracy 0.5789473684210527
Fold 4 Accuracy 0.8245614035087719




OOF accuracy 0.6188811188811189
TN 136 FP 65 FN 44 TP 41
OOF precision 0.3867924528301887
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 17----------------------------------

Fold 0 Accuracy 0.603448275862069
Fold 1 Accuracy 0.5789473684210527
Fold 2 Accuracy 0.7543859649122807
Fold 3 Accuracy 0.7719298245614035
Fold 4 Accuracy 0.7894736842105263




OOF accuracy 0.6993006993006993
TN 181 FP 20 FN 66 TP 19
OOF precision 0.48717948717948717
OOF recall 0.27058823529411763



----------------------------------Runnin

  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.43103448275862066


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.5263157894736842


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.7192982456140351


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.543859649122807


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.8070175438596491




OOF accuracy 0.6048951048951049
TN 131 FP 70 FN 43 TP 42
OOF precision 0.375
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 21----------------------------------



  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.6379310344827587


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.5789473684210527


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.7368421052631579


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7719298245614035


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.7894736842105263




OOF accuracy 0.7027972027972028
TN 181 FP 20 FN 65 TP 20
OOF precision 0.5
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 22----------------------------------



  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.43103448275862066


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.5263157894736842


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.7192982456140351


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.543859649122807


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.8070175438596491




OOF accuracy 0.6048951048951049
TN 131 FP 70 FN 43 TP 42
OOF precision 0.375
OOF recall 0.27058823529411763



----------------------------------Running GridSearch; Iteration 23----------------------------------



  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 0 Accuracy 0.6379310344827587


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 1 Accuracy 0.5789473684210527


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 2 Accuracy 0.7368421052631579


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 3 Accuracy 0.7719298245614035


  frequent_zero_class = self._find_frequent_itemsets(X_train[Y_train == 0])
  frequent_one_class = self._find_frequent_itemsets(X_train[Y_train == 1])


Fold 4 Accuracy 0.7894736842105263




OOF accuracy 0.7027972027972028
TN 181 FP 20 FN 65 TP 20
OOF precision 0.5
OOF recall 0.27058823529411763


In [18]:
print('Best Accuracy', best_acc)
print('Best Params', best_params)

Best Accuracy 0.7447552447552448
Best Params {'min_support': 0.3, 'allow_online_learning': True, 'fca_algorithm': 'fpgrowth', 'prediction_algorithm': 'itemsets_intersection_sums'}


## Compare to SKLearn models

### LogisticRegression

In [44]:
# KFold crossvalidation
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

X, Y = data.drop(columns = ['class']), data['class']
kf = KFold(n_splits = 5, shuffle = True, random_state = RANDOM_SEED)
oof_preds = np.zeros(Y.shape[0])

for fold_idx, (tr_idx, val_idx) in enumerate(kf.split(X, Y)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    Y_tr, Y_val = Y.iloc[tr_idx], Y.iloc[val_idx]
    
    model = LogisticRegression()
    model.fit(X_tr, Y_tr)
    Y_pred = model.predict(X_val)
    oof_preds[val_idx] = Y_pred
    print(f'Fold {fold_idx} Accuracy', accuracy_score(Y_val, Y_pred))

print_oof_metrics(Y, oof_preds)

Fold 0 Accuracy 0.7241379310344828
Fold 1 Accuracy 0.5964912280701754
Fold 2 Accuracy 0.7368421052631579
Fold 3 Accuracy 0.7894736842105263
Fold 4 Accuracy 0.7192982456140351




OOF accuracy 0.7132867132867133
TN 177 FP 24 FN 58 TP 27
OOF precision 0.5294117647058824
OOF recall 0.3176470588235294


### KNN

In [43]:
# KFold crossvalidation
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

X, Y = data.drop(columns = ['class']), data['class']
kf = KFold(n_splits = 5, shuffle = True, random_state = RANDOM_SEED)
oof_preds = np.zeros(Y.shape[0])

for fold_idx, (tr_idx, val_idx) in enumerate(kf.split(X, Y)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    Y_tr, Y_val = Y.iloc[tr_idx], Y.iloc[val_idx]
    
    model = KNeighborsClassifier()
    model.fit(X_tr, Y_tr)
    Y_pred = model.predict(X_val)
    oof_preds[val_idx] = Y_pred
    print(f'Fold {fold_idx} Accuracy', accuracy_score(Y_val, Y_pred))

print_oof_metrics(Y, oof_preds)

Fold 0 Accuracy 0.7413793103448276
Fold 1 Accuracy 0.5789473684210527
Fold 2 Accuracy 0.7192982456140351
Fold 3 Accuracy 0.7543859649122807
Fold 4 Accuracy 0.7017543859649122




OOF accuracy 0.6993006993006993
TN 175 FP 26 FN 60 TP 25
OOF precision 0.49019607843137253
OOF recall 0.29411764705882354


### DecisionTreeClassifier

In [45]:
# KFold crossvalidation
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

X, Y = data.drop(columns = ['class']), data['class']
kf = KFold(n_splits = 5, shuffle = True, random_state = RANDOM_SEED)
oof_preds = np.zeros(Y.shape[0])

for fold_idx, (tr_idx, val_idx) in enumerate(kf.split(X, Y)):
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    Y_tr, Y_val = Y.iloc[tr_idx], Y.iloc[val_idx]
    
    model = DecisionTreeClassifier()
    model.fit(X_tr, Y_tr)
    Y_pred = model.predict(X_val)
    oof_preds[val_idx] = Y_pred
    print(f'Fold {fold_idx} Accuracy', accuracy_score(Y_val, Y_pred))

print_oof_metrics(Y, oof_preds)

Fold 0 Accuracy 0.6379310344827587
Fold 1 Accuracy 0.6140350877192983
Fold 2 Accuracy 0.6842105263157895
Fold 3 Accuracy 0.5964912280701754
Fold 4 Accuracy 0.7017543859649122




OOF accuracy 0.6468531468531469
TN 154 FP 47 FN 54 TP 31
OOF precision 0.3974358974358974
OOF recall 0.36470588235294116
