In [1]:
%load_ext autoreload
%autoreload 1

In [2]:
# Numerical / data-frame libraries.
import numpy as np
import pandas as pd
# NOTE(review): `imp` is deprecated and was removed from the standard library in
# Python 3.12, so this line raises ImportError on newer interpreters. None of the
# cells visible here use it; confirm against the rest of the notebook before dropping it.
import datetime, copy, imp
import pickle
import time
import os
import re
from sklearn.model_selection import StratifiedKFold
from importlib import reload


# Progress bars. The second import rebinds `tqdm` to the notebook-widget
# implementation, replacing the `tqdm.auto` binding from the line before
# (`trange` still comes from `tqdm.auto`).
from tqdm.auto import tqdm, trange
from tqdm.notebook import tqdm
# Hook tqdm into pandas (per the tqdm docs this enables the `progress_apply` methods).
tqdm.pandas()

# Put the sibling util directory at the front of the module search path so the
# local `rules` module can be imported (presumably ../util/rules.py — verify).
import sys
sys.path.insert(0, '../util/')

# Project-local module providing the Examples / rule-quality / rule-search classes used below.
import rules as rs

In [3]:
# Load the coverage table that the rule set search will run on

In [4]:
# Relative path of the pickled MLB coverage table.
covFileStr = '../data/MLB-CoverageTable.pkl'

# Deserialise the table; the handle is closed as soon as the load completes.
# NOTE(review): unpickling can execute arbitrary code, so only load files that
# come from a trusted source.
with open(covFileStr, mode='rb') as covHandle:
    mlbData = pickle.load(covHandle)

# Wrap the raw table in the project's Examples container, which is what the
# rule searches in the following cells take as input.
exsMlb = rs.Examples(mlbData)

In [5]:
# Search for a single rule with F-score as the optimisation metric

In [6]:
# F-score quality metric built over the example feature strings (betaSq=0.1).
qual = rs.RuleQualFScore(exsMlb.FtrStrs, betaSq=0.1)

# OPUS searcher capped at a rule length of 3; the debug flag makes it report
# progress after each search depth.
rSrch = rs.OpusRuleSearch(
    ruleQuality=qual,
    maxRuleLen=3,
    debug=rs.OPUS_DEBUG_RULE_DEPTH,
)
rBest = rSrch.find_rule(exsMlb)

# Summarise the winning rule (precision, recall, confusion counts).
print('\n-- Best Rule --')
qual.print_summary([rBest], exsMlb.CovTbl, exsMlb.Labels)

Finished Depth: 1
 MaxPotQuality: 0.92 (evaluated), 0.88 (todo)
 Best Rule: [112, 55]:(['A2 >= 56.5' 'A0 < 544']), Quality: 0.7437722419928826
Finished Depth: 2
 MaxPotQuality: 0.89 (evaluated), 0.87 (todo)
 Best Rule: [112, 55, 28]:(['A2 >= 56.5' 'A0 < 544' 'A0 >= 260']), Quality: 0.7712177121771219
Finished Depth: 3
 MaxPotQuality: 0.88 (evaluated), nan (todo)
 Best Rule: [112, 55, 28]:(['A2 >= 56.5' 'A0 < 544' 'A0 >= 260']), Quality: 0.7712177121771219

-- Best Rule --
Rule: ['A2 >= 56.5' 'A0 < 544' 'A0 >= 260']

 Precision: 0.9048
 Recall: 0.3115
 Confusion:
  True Pos: 19/61
  False Neg: 42/61
  False Pos: 2/854
  True Neg: 852/854



In [7]:
# Search for best ruleset with F-score as the optimisation metric

In [8]:
# Fresh metric and searcher for the rule-set run, configured identically to the
# single-rule search (F-score with betaSq=0.1, rule length capped at 3).
qual = rs.RuleQualFScore(exsMlb.FtrStrs, betaSq=0.1)
rSrch = rs.OpusRuleSearch(
    ruleQuality=qual,
    maxRuleLen=3,
    debug=rs.OPUS_DEBUG_RULE_DEPTH,
)

# Search for a rule set limited to maxSetSize=2, with verbose progress output.
ruleSet = rs.rule_set_search(
    qual,
    rSrch,
    exsMlb,
    maxSetSize=2,
    debug=True,
)
print(f'\nFinal Rule Set: {qual.ruleset_str(ruleSet)}')

Searching for individual rule...
Finished Depth: 1
 MaxPotQuality: 0.92 (evaluated), 0.88 (todo)
 Best Rule: [112, 55]:(['A2 >= 56.5' 'A0 < 544']), Quality: 0.7437722419928826
Finished Depth: 2
 MaxPotQuality: 0.89 (evaluated), 0.87 (todo)
 Best Rule: [112, 55, 28]:(['A2 >= 56.5' 'A0 < 544' 'A0 >= 260']), Quality: 0.7712177121771219
Finished Depth: 3
 MaxPotQuality: 0.88 (evaluated), nan (todo)
 Best Rule: [112, 55, 28]:(['A2 >= 56.5' 'A0 < 544' 'A0 >= 260']), Quality: 0.7712177121771219

-- New Rule --
Rule: ['A2 >= 56.5' 'A0 < 544' 'A0 >= 260']

 Precision: 0.9048
 Recall: 0.3115
 Confusion:
  True Pos: 19/61
  False Neg: 42/61
  False Pos: 2/854
  True Neg: 852/854

Total Label Weight: 61.0 (before), 42.0 (after)

Searching for individual rule...
Finished Depth: 1
 MaxPotQuality: 0.91 (evaluated), 0.85 (todo)
 Best Rule: [104, 109]:(['A2 >= 42.5' 'A2 < 52.5']), Quality: 0.5755813953488372
Finished Depth: 2
 MaxPotQuality: 0.86 (evaluated), 0.82 (todo)
 Best Rule: [104, 109, 22]:(['A