# Expert Rules

This notebook presents an example of user-guided rule induction, which follows the scheme introduced by the [GuideR](https://www.sciencedirect.com/science/article/abs/pii/S0950705119300802?dgcid=coauthor) algorithm (Sikora et al., 2019).  
For each problem (classification, regression, survival) there is an expert class in addition to the basic class, e.g. `RuleClassifier` and `ExpertRuleClassifier` for classification. Expert classes allow you to define a set of initial rules, preferred conditions and forbidden conditions.  
This tutorial will show you how to define such rules and conditions.


## Import RuleKit

In [1]:
from rulekit import RuleKit
from rulekit.classification import RuleClassifier
from rulekit.params import Measures

## Classification

### Prepare dataset

In [2]:
from scipy.io import arff
import pandas as pd


# Load the ARFF records (first element of the loadarff result) into a frame
# and cast the 'class' label column to integers in the same expression.
data_df = pd.DataFrame(
    arff.loadarff("seismic-bumps.arff")[0]
).astype({'class': int})

# The label column is the target; every remaining column is a feature.
y = data_df['class']
X = data_df.drop(columns=['class'])

### Define rules and conditions

In [3]:
# Expert knowledge for the classification example.
# Every entry is a (name, rule-string) pair.

# Initial rules supplied by the expert.
expert_rules = [
    ('rule-0', 'IF [[gimpuls = <-inf, 750)]] THEN class = {0}'),
    ('rule-1', 'IF [[gimpuls = <750, inf)]] THEN class = {1}'),
]

# Preferred conditions / attributes.
# NOTE(review): the leading '1:' / 'inf:' prefix presumably limits how many
# times the entry may be used - confirm against the RuleKit documentation.
expert_preferred_conditions = [
    ('preferred-condition-0', '1: IF [[seismic = {a}]] THEN class = {0}'),
    ('preferred-attribute-0', '1: IF [[gimpuls = Any]] THEN class = {1}'),
]

# Forbidden conditions / attributes.
expert_forbidden_conditions = [
    ('forb-attribute-0', '1: IF [[seismoacoustic  = Any]] THEN class = {0}'),
    ('forb-attribute-1', 'inf: IF [[ghazard  = Any]] THEN class = {1}'),
]

### Rule induction

In [4]:
from rulekit.classification import ExpertRuleClassifier

# Build the expert-guided classifier; all four extend_*/induce_* switches
# are enabled in this example.
clf = ExpertRuleClassifier(
    minsupp_new=8, max_growing=0,
    extend_using_preferred=True, extend_using_automatic=True,
    induce_using_preferred=True, induce_using_automatic=True,
)

# Fit on the seismic data, handing over the expert rules and conditions.
clf.fit(
    X,
    y,
    expert_rules=expert_rules,
    expert_preferred_conditions=expert_preferred_conditions,
    expert_forbidden_conditions=expert_forbidden_conditions,
)

# Keep the induced model around for inspection in the next cell.
ruleset = clf.model

In [5]:
# Print each rule of the induced classification rule set on its own line.
for induced_rule in ruleset.rules:
    print(induced_rule)

IF [[gimpuls = <-inf, 750)]] AND [seismic = {a}] AND nbumps4 = (-inf, 0.50) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1252.50) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1342.50) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1427.50) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1653.50) AND genergy = (-inf, 1006585) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1752) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 2733) AND goimpuls = (-inf, 312) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = <2965, inf) AND genergy = <634250, inf) AND nbumps = (-inf, 1.50) THEN class = {0}
IF gimpuls = (-inf, 1331) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1655.50) AND genergy = (-inf, 386010) AND nbumps = (-inf, 2.50) THEN class = {0}
IF gimpuls = (-inf, 1686) AND goimpuls = (-inf, 312) AND nbumps5 = (-inf, 0.50

## Regression

### Prepare dataset

In [6]:
from scipy.io import arff
import pandas as pd

# Load the ARFF records (first element of the loadarff result) into a frame.
data_df = pd.DataFrame(data=arff.loadarff("methane-train.arff")[0])

# 'MM116_pred' is the regression target; every other column is a feature.
y = data_df['MM116_pred']
X = data_df.drop(columns=['MM116_pred'])

In [7]:
# Display the regression feature matrix (data_df without the 'MM116_pred' target).
X

Unnamed: 0,MM31,MM116,AS038,PG072,PD,BA13,DMM116
0,0.46,1.3,2.4,2.0,1.0,1076.0,0.0
1,0.46,1.3,2.2,1.9,1.0,1076.0,0.0
2,0.49,1.3,2.2,1.9,1.0,1076.0,0.0
3,0.50,1.3,2.3,1.9,1.0,1076.0,0.0
4,0.54,1.3,2.3,1.9,1.0,1076.0,0.0
...,...,...,...,...,...,...,...
13363,0.64,1.2,2.4,1.8,1.0,1077.0,0.0
13364,0.59,1.2,2.4,1.8,1.0,1077.0,0.0
13365,0.60,1.1,2.2,1.8,1.0,1077.0,-0.1
13366,0.64,1.1,2.2,1.8,1.0,1077.0,0.0


### Define rules and conditions

In [8]:
# Expert knowledge for the regression example.
# Every entry is a (name, rule-string) pair.

# No initial rules are supplied for this problem.
expert_rules = None

# Preferred conditions on PD, alone and combined with MM116.
expert_preferred_conditions = [
    ('preferred-condition-0',
     '3: IF PD = <0.5, inf) THEN MM116_pred = {NaN}'),
    ('preferred-condition-1',
     '5: IF PD = <0.5, inf) AND MM116 = (-inf, 1.0) THEN MM116_pred = {NaN}'),
]

# Forbidden attribute: any condition on DMM116.
expert_forbidden_conditions = [
    ('forb-attribute-0', 'inf: IF DMM116 = Any THEN MM116_pred = {NaN}'),
]

### Rule induction

In [9]:
from rulekit.regression import ExpertRuleRegressor

# Build the expert-guided regressor; extend_using_automatic is switched off
# here, the other extend_*/induce_* switches stay on.
reg = ExpertRuleRegressor(
    minsupp_new=5, max_growing=0,
    mean_based_regression=True,
    extend_using_preferred=True, extend_using_automatic=False,
    induce_using_preferred=True, induce_using_automatic=True,
)

# Fit on the methane data, handing over the expert rules and conditions.
reg.fit(
    X,
    y,
    expert_rules=expert_rules,
    expert_preferred_conditions=expert_preferred_conditions,
    expert_forbidden_conditions=expert_forbidden_conditions,
)

# Keep the induced model around for inspection in the next cell.
ruleset = reg.model

In [10]:
# Print each rule of the induced regression rule set on its own line.
for induced_rule in ruleset.rules:
    print(induced_rule)

IF [PD = <0.50, inf)] AND PG072 = (-inf, 2.05) THEN MM116_pred = {1.01} [0.77,1.25]
IF [PD = <0.50, inf)] THEN MM116_pred = {1.01} [0.77,1.25]
IF MM31 = (-inf, 0.23) THEN MM116_pred = {0.40} [0.39,0.41]
IF MM116 = (-inf, 0.45) AND MM31 = <0.18, 0.24) THEN MM116_pred = {0.40} [0.38,0.42]
IF MM31 = (-inf, 0.25) THEN MM116_pred = {0.44} [0.37,0.51]
IF PD = (-inf, 0.50) AND MM116 = <0.25, inf) AND AS038 = <2, 2.45) AND MM31 = <0.23, inf) AND PG072 = (-inf, 1.95) AND BA13 = (-inf, 1075.50) THEN MM116_pred = {0.71} [0.50,0.93]
IF PD = (-inf, 0.50) AND MM116 = (-inf, 0.25) AND AS038 = <2.35, 2.45) AND MM31 = <0.19, inf) AND PG072 = <1.75, 1.95) AND BA13 = (-inf, 1075.50) THEN MM116_pred = {0.25} [0.20,0.30]
IF MM116 = (-inf, 0.45) AND MM31 = <0.18, inf) AND BA13 = (-inf, 1077.50) THEN MM116_pred = {0.40} [0.37,0.43]
IF MM116 = (-inf, 0.55) AND MM31 = (-inf, 0.32) THEN MM116_pred = {0.45} [0.39,0.51]
IF MM116 = <0.45, 0.65) THEN MM116_pred = {0.55} [0.49,0.61]
IF MM116 = (-inf, 0.75) AND MM31 

## Survival

### Prepare dataset

In [11]:
from scipy.io import arff
import pandas as pd

# Read the ARFF file through an explicitly cp1252-decoded text handle.
# The context manager closes the handle once loadarff has returned;
# previously the handle passed to loadarff was never closed.
with open('bmt.arff', 'r', encoding="cp1252") as arff_file:
    data_df = pd.DataFrame(arff.loadarff(arff_file)[0])

# code to fix the problem with encoding of the file:
# decode every object-typed column with cp1252 and write it back
tmp_df = data_df.select_dtypes([object])
tmp_df = tmp_df.stack().str.decode("cp1252").unstack()
for col in tmp_df:
    data_df[col] = tmp_df[col]

# '?' entries are replaced with None
data_df = data_df.replace({'?': None})

# 'survival_status' is the target; every other column stays in X
X = data_df.drop(['survival_status'], axis=1)
y = data_df['survival_status']

### Define rules and conditions

In [12]:
# Expert knowledge for the survival example.
# Every entry is a (name, rule-string) pair.

# Single initial rule combining two conditions.
expert_rules = [
    ('rule-0',
     'IF [[CD34kgx10d6 = (-inf, 10.0)]] AND [[extcGvHD = {0}]] THEN survival_status = {NaN}'),
]

# Preferred attribute: CD34kgx10d6.
expert_preferred_conditions = [
    ('attr-preferred-0',
     'inf: IF [CD34kgx10d6 = Any] THEN survival_status = {NaN}'),
]

# Forbidden attribute: ANCrecovery (this entry carries no count prefix).
expert_forbidden_conditions = [
    ('attr-forbidden-0',
     'IF [ANCrecovery = Any] THEN survival_status = {NaN}'),
]

### Rule induction

In [13]:
from rulekit.survival import ExpertSurvivalRules

# Build the expert-guided survival model; both extend_* switches are off
# and both induce_* switches are on in this example.
srv = ExpertSurvivalRules(
    survival_time_attr='survival_time',
    minsupp_new=5, max_growing=0,
    extend_using_preferred=False, extend_using_automatic=False,
    induce_using_preferred=True, induce_using_automatic=True,
)

# Fit on the bmt data, handing over the expert rules and conditions.
srv.fit(
    X,
    y,
    expert_rules=expert_rules,
    expert_preferred_conditions=expert_preferred_conditions,
    expert_forbidden_conditions=expert_forbidden_conditions,
)

# Keep the induced model around for inspection in the next cell.
ruleset = srv.model

In [14]:
# Print each rule of the induced survival rule set on its own line.
for induced_rule in ruleset.rules:
    print(induced_rule)

IF [[CD34kgx10d6 = (-inf, 10)]] AND [[extcGvHD = {0}]] THEN 
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <500142.50, inf) THEN 
IF [CD34kgx10d6 = (-inf, 11.86)] AND RecipientRh = {1} AND Recipientage = <17.85, inf) THEN 
IF [CD34kgx10d6 = (-inf, 11.86)] AND Relapse = {0} AND PLTrecovery = <26, inf) AND Recipientage = <14.30, inf) THEN 
IF [CD34kgx10d6 = (-inf, 11.86)] AND Donorage = (-inf, 40.64) AND Gendermatch = {0} AND PLTrecovery = <26, inf) AND Recipientage = <12, 18.85) THEN 
IF [CD34kgx10d6 = (-inf, 11.86)] AND Donorage = (-inf, 49.19) AND extcGvHD = {1} AND PLTrecovery = (-inf, 500142.50) AND Txpostrelapse = {0} AND CD3dCD34 = (-inf, 10.97) THEN 
IF [CD34kgx10d6 = <11.86, inf)] AND Relapse = {0} THEN 
IF [CD34kgx10d6 = (-inf, 11.86)] AND RecipientRh = {1} AND CD3dCD34 = <6.64, inf) THEN 
IF [CD34kgx10d6 = (-inf, 11.86)] AND Donorage = <36.03, inf) AND Recipientageint = {2} AND CD3dCD34 = <0.94, inf) THEN 
IF [CD34kgx10d6 = (-inf, 11.86)] AND PLTrecovery = <22.50, inf) TH