# Regression

This notebook presents example usage of package for solving regression problem on `methane` dataset. You can download training dataset [here](https://raw.githubusercontent.com/adaa-polsl/RuleKit/master/data/methane/methane-train.arff) and test dataset [here](https://raw.githubusercontent.com/adaa-polsl/RuleKit/master/data/methane/methane-test.arff)

This tutorial will cover topics such as:    
- training model   
- changing model hyperparameters   
- hyperparameters tuning   
- calculating metrics for model    
- getting RuleKit inbuilt   

## Summary of the dataset

In [1]:
from scipy.io import arff
import pandas as pd

train_file_name = "methane-train.arff"
test_file_name = "methane-test.arff"

train_df = pd.DataFrame(arff.loadarff(train_file_name)[0])
test_df = pd.DataFrame(arff.loadarff(test_file_name)[0])

### Train file

In [2]:
print("Train file overview:")
print(f"Name: {train_file_name}")
print(f"Objects number: {train_df.shape[0]}; Attributes number: {train_df.shape[1]}")
print("Basic attribute statistics:")
train_df.describe()

Train file overview:
Name: methane-train.arff
Objects number: 13368; Attributes number: 8
Basic attribute statistics:


Unnamed: 0,MM31,MM116,AS038,PG072,PD,BA13,DMM116,MM116_pred
count,13368.0,13368.0,13368.0,13368.0,13368.0,13368.0,13368.0,13368.0
mean,0.36396,0.775007,2.294734,1.8356,0.308573,1073.443372,-7e-06,0.79825
std,0.117105,0.269366,0.142504,0.106681,0.461922,3.162811,0.043566,0.28649
min,0.17,0.2,1.4,1.1,0.0,1067.0,-1.8,0.2
25%,0.26,0.5,2.3,1.8,0.0,1070.0,0.0,0.5
50%,0.36,0.8,2.3,1.8,0.0,1075.0,0.0,0.8
75%,0.45,1.0,2.4,1.9,1.0,1076.0,0.0,1.0
max,0.82,2.2,2.7,2.6,1.0,1078.0,0.8,2.2


### Test file

In [3]:
# test file
print("\nTest file overview:")
print(f"Name: {test_file_name}")
print(f"Objects number: {test_df.shape[0]}; Attributes number: {test_df.shape[1]}")
print("Basic attribute statistics:")
test_df.describe()


Test file overview:
Name: methane-test.arff
Objects number: 5728; Attributes number: 8
Basic attribute statistics:


Unnamed: 0,MM31,MM116,AS038,PG072,PD,BA13,DMM116,MM116_pred
count,5728.0,5728.0,5728.0,5728.0,5728.0,5728.0,5728.0,5728.0
mean,0.556652,1.006913,2.236627,1.819239,0.538408,1072.69169,-1.7e-05,1.042458
std,0.114682,0.167983,0.104913,0.078865,0.498566,2.799559,0.046849,0.171393
min,0.35,0.5,1.8,1.6,0.0,1067.0,-0.4,0.6
25%,0.46,0.9,2.2,1.8,0.0,1071.0,0.0,0.9
50%,0.55,1.0,2.2,1.8,1.0,1073.0,0.0,1.0
75%,0.64,1.1,2.3,1.9,1.0,1075.0,0.0,1.2
max,0.98,1.6,2.7,2.1,1.0,1078.0,0.3,1.6


## Import RuleKit

In [4]:
from rulekit import RuleKit
from rulekit.regression import RuleRegressor
from rulekit.params import Measures

## Helper function for calculating metrics

In [5]:
import sklearn.tree as scikit
import math
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import metrics
import pandas as pd
import numpy as np
from typing import Tuple
from math import sqrt


def get_regression_metrics(measure: str, y_pred, y_true) -> pd.DataFrame:
    relative_error = 0
    squared_relative_error = 0
    relative_error_lenient = 0
    relative_error_strict = 0
    nae_denominator = 0
    avg = sum(y_true) / len(y_pred)

    for i in range(0, len(y_pred)):
        true = y_true[i]
        predicted = y_pred[i]

        relative_error += abs((true - predicted) / true)
        squared_relative_error += abs((true - predicted) / true) * abs((true - predicted) / true)
        relative_error_lenient += abs((true - predicted) / max(true, predicted))
        relative_error_strict += abs((true - predicted) / min(true, predicted))
        nae_denominator += abs(avg - true)
    relative_error /= len(y_pred)
    squared_relative_error /= len(y_pred)
    relative_error_lenient /= len(y_pred)
    relative_error_strict /= len(y_pred)
    nae_denominator /= len(y_pred)
    correlation = np.mean(np.corrcoef(y_true, y_pred))

    dictionary = {
        'Measure': measure,
        'absolute_error': metrics.mean_absolute_error(y_true, y_pred),
        'relative_error': relative_error,
        'relative_error_lenient': relative_error_lenient,
        'relative_error_strict': relative_error_strict,
        'normalized_absolute_error': metrics.mean_absolute_error(y_true, y_pred) / nae_denominator,
        'squared_error': metrics.mean_squared_error(y_true, y_pred),
        'root_mean_squared_error': metrics.mean_squared_error(y_true, y_pred, squared=False),
        'root_relative_squared_error': sqrt(squared_relative_error),
        'correlation': correlation,
        'squared_correlation': np.power(correlation, 2),
    }
    return pd.DataFrame.from_records([dictionary], index='Measure')

def get_ruleset_stats(measure: str, model) -> pd.DataFrame:
    tmp = model.parameters.__dict__
    del tmp['_java_object']
    return pd.DataFrame.from_records([{'Measure': measure, **tmp, **model.stats.__dict__}], index='Measure')

## Rule induction on training dataset

In [14]:
X_train = train_df.drop(['MM116_pred'], axis=1)
y_train = train_df['MM116_pred']

In [8]:
# C2
c2_reg = RuleRegressor(
    induction_measure=Measures.C2,
    pruning_measure=Measures.C2,
    voting_measure=Measures.C2,
)
c2_reg.fit(X_train, y_train)
c2_ruleset = c2_reg.model
predictions = c2_reg.predict(X_train)

regression_metrics = get_regression_metrics('C2', predictions, y_train)
ruleset_stats = get_ruleset_stats('C2', c2_ruleset)


# Correlation
corr_reg = RuleRegressor(
    induction_measure=Measures.Correlation,
    pruning_measure=Measures.Correlation,
    voting_measure=Measures.Correlation,
    mean_based_regression=True
)
corr_reg.fit(X_train, y_train)
corr_ruleset = corr_reg.model
predictions = corr_reg.predict(X_train)

tmp = get_regression_metrics('Correlation', predictions, y_train)
regression_metrics = pd.concat([regression_metrics, tmp])
ruleset_stats = pd.concat([ruleset_stats, get_ruleset_stats('Correlation', corr_ruleset)])


# RSS
rss_reg = RuleRegressor(
    induction_measure=Measures.RSS,
    pruning_measure=Measures.RSS,
    voting_measure=Measures.RSS,
    mean_based_regression=True
)
rss_reg.fit(X_train, y_train)
rss_ruleset = rss_reg.model
predictions = rss_reg.predict(X_train)

tmp = get_regression_metrics('RSS', predictions, y_train)
regression_metrics = pd.concat([regression_metrics, tmp])
ruleset_stats = pd.concat([ruleset_stats, get_ruleset_stats('RSS', rss_ruleset)])


display(ruleset_stats)
display(regression_metrics)

Unnamed: 0_level_0,minimum_covered,maximum_uncovered_fraction,ignore_missing,pruning_enabled,max_growing_condition,time_total_s,time_growing_s,time_pruning_s,rules_count,conditions_per_rule,induced_conditions_per_rule,avg_rule_coverage,avg_rule_precision,avg_rule_quality,pvalue,FDR_pvalue,FWER_pvalue,fraction_significant,fraction_FDR_significant,fraction_FWER_significant
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
C2,5.0,0.0,False,True,0.0,16.88121,2.869502,13.938604,28,3.928571,24.285714,0.169234,0.943278,0.808954,0.001581,0.001581,0.001581,1.0,1.0,1.0
Correlation,5.0,0.0,False,True,0.0,56.457539,3.133929,53.277314,20,3.45,35.25,0.218047,0.859963,,0.04986,0.04986,0.049861,0.95,0.95,0.95
RSS,5.0,0.0,False,True,0.0,3.716216,0.348308,3.352011,7,5.142857,22.571429,0.23521,0.724607,,4e-06,4e-06,4e-06,1.0,1.0,1.0


Unnamed: 0_level_0,absolute_error,relative_error,relative_error_lenient,relative_error_strict,normalized_absolute_error,squared_error,root_mean_squared_error,root_relative_squared_error,correlation,squared_correlation
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C2,0.085293,0.100046,0.093069,0.111329,0.362967,0.01606,0.12673,0.132221,0.949559,0.901663
Correlation,0.063712,0.074516,0.070187,0.082596,0.271129,0.011506,0.107265,0.118721,0.969484,0.939899
RSS,0.139782,0.173295,0.14561,0.199137,0.594849,0.040448,0.201118,0.249335,0.861232,0.741721


### C2 Measure generated rules

In [9]:
for rule in c2_ruleset.rules:
    print(rule)

IF MM31 = (-inf, 0.23) THEN MM116_pred = {0.40} [0.39,0.41]
IF MM116 = <0.35, 0.45) AND DMM116 = <-0.05, inf) AND MM31 = (-inf, 0.24) THEN MM116_pred = {0.40} [0.39,0.42]
IF MM116 = <0.35, 0.45) AND MM31 = (-inf, 0.24) THEN MM116_pred = {0.40} [0.38,0.42]
IF MM31 = <0.24, 0.25) AND BA13 = (-inf, 1076.50) THEN MM116_pred = {0.50} [0.50,0.50]
IF MM116 = (-inf, 0.45) AND DMM116 = <-0.05, inf) AND AS038 = (-inf, 2.45) AND MM31 = <0.19, 0.25) AND PG072 = (-inf, 2.05) THEN MM116_pred = {0.40} [0.38,0.42]
IF MM116 = (-inf, 0.45) THEN MM116_pred = {0.40} [0.37,0.44]
IF MM116 = (-inf, 0.55) AND MM31 = <0.19, 0.29) AND BA13 = <1072.50, inf) THEN MM116_pred = {0.45} [0.39,0.50]
IF PD = (-inf, 0.50) AND MM116 = <0.45, 0.55) AND MM31 = <0.23, inf) AND PG072 = <1.65, inf) AND BA13 = (-inf, 1077.50) THEN MM116_pred = {0.50} [0.48,0.53]
IF MM116 = <0.45, inf) AND DMM116 = <-0.05, inf) AND MM31 = <0.23, 0.30) AND BA13 = <1073.50, 1076.50) THEN MM116_pred = {0.50} [0.48,0.53]
IF MM116 = (-inf, 0.55) AND

### Correlation Measure generated rules

In [10]:
for rule in corr_ruleset.rules:
    print(rule)

IF MM31 = (-inf, 0.23) THEN MM116_pred = {0.40} [0.39,0.41]
IF MM116 = (-inf, 0.45) AND DMM116 = <-0.05, inf) AND MM31 = <0.18, 0.24) THEN MM116_pred = {0.40} [0.38,0.42]
IF MM116 = (-inf, 0.45) AND MM31 = <0.18, 0.24) THEN MM116_pred = {0.40} [0.38,0.42]
IF MM31 = (-inf, 0.25) THEN MM116_pred = {0.44} [0.37,0.51]
IF MM31 = (-inf, 0.26) THEN MM116_pred = {0.46} [0.36,0.55]
IF MM31 = (-inf, 0.28) THEN MM116_pred = {0.49} [0.37,0.61]
IF PD = (-inf, 0.50) AND MM116 = <0.25, inf) AND DMM116 = <-0.05, 0.05) AND AS038 = <2, 2.45) AND MM31 = <0.23, inf) AND BA13 = (-inf, 1075.50) THEN MM116_pred = {0.71} [0.50,0.92]
IF MM116 = <0.25, 0.45) AND MM31 = <0.18, inf) AND PG072 = (-inf, 2.05) THEN MM116_pred = {0.40} [0.38,0.43]
IF PD = (-inf, 0.50) AND MM116 = (-inf, 0.25) AND DMM116 = <-0.05, 0.05) AND AS038 = <2.35, 2.45) AND MM31 = <0.19, inf) AND PG072 = <1.75, 1.95) AND BA13 = (-inf, 1076.50) THEN MM116_pred = {0.25} [0.20,0.30]
IF MM116 = (-inf, 0.45) AND DMM116 = <-0.05, inf) AND MM31 = <0.

### RSS Measure generated rules

In [11]:
for rule in rss_ruleset.rules:
    print(rule)

IF MM31 = (-inf, 0.23) THEN MM116_pred = {0.40} [0.39,0.41]
IF MM116 = (-inf, 0.45) AND MM31 = <0.18, 0.25) AND PG072 = (-inf, 2.05) THEN MM116_pred = {0.40} [0.38,0.43]
IF MM31 = (-inf, 0.26) THEN MM116_pred = {0.46} [0.36,0.55]
IF MM116 = <0.35, inf) AND MM31 = <0.26, inf) THEN MM116_pred = {0.91} [0.67,1.14]
IF PD = (-inf, 0.50) AND MM116 = <0.25, inf) AND DMM116 = <-0.95, 0.05) AND AS038 = <2, 2.45) AND MM31 = <0.23, inf) AND PG072 = <1.65, 2.05) AND BA13 = (-inf, 1075.50) THEN MM116_pred = {0.71} [0.50,0.93]
IF PD = (-inf, 0.50) AND MM116 = (-inf, 0.25) AND DMM116 = <-0.05, 0.05) AND AS038 = <2.35, 2.45) AND MM31 = <0.19, inf) AND PG072 = <1.75, 1.95) AND BA13 = (-inf, 1077.50) THEN MM116_pred = {0.25} [0.20,0.30]
IF MM116 = (-inf, 0.25) THEN MM116_pred = {0.23} [0.19,0.28]


## Evaluation on a test set

In [7]:
X_test = test_df.drop(['MM116_pred'], axis=1)
y_test = test_df['MM116_pred']

In [13]:
# C2
c2_predictions = c2_reg.predict(X_test)
c2_regression_metrics = get_regression_metrics('C2', c2_predictions, y_test)

# Correlation
corr_predictions = corr_reg.predict(X_test)
corr_regression_metrics = get_regression_metrics('Correlation', corr_predictions, y_test)

# RSS
rss_predictions = rss_reg.predict(X_test)
rss_regression_metrics = get_regression_metrics('RSS', rss_predictions, y_test)


In [14]:
display(pd.concat([c2_regression_metrics, corr_regression_metrics, rss_regression_metrics]))

Unnamed: 0_level_0,absolute_error,relative_error,relative_error_lenient,relative_error_strict,normalized_absolute_error,squared_error,root_mean_squared_error,root_relative_squared_error,correlation,squared_correlation
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
C2,0.101333,0.096754,0.090187,0.105576,0.69869,0.016746,0.129405,0.121616,0.829647,0.688314
Correlation,0.093286,0.083513,0.082229,0.094968,0.643208,0.01505,0.122678,0.104504,0.9135,0.834482
RSS,0.186167,0.164586,0.16272,0.21287,1.283621,0.053397,0.231079,0.193394,0.627423,0.393659


## Hyperparameters tuning

This one gonna take a while...

In [8]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from rulekit.params import Measures


# define models and parameters
model = RuleRegressor(mean_based_regression=True)
minsupp_new = range(5, 7)
measures_choice = [Measures.C2, Measures.Correlation, Measures.RSS]

# define grid search
grid = {
    'minsupp_new': minsupp_new, 
    'induction_measure': measures_choice, 
    'pruning_measure': measures_choice, 
    'voting_measure': measures_choice
}
cv = KFold(n_splits=3)
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cv, scoring='neg_root_mean_squared_error', verbose=True)
grid_result = grid_search.fit(X_train, y_train)

# summarize results
print("Best RMSE: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best RMSE: -0.129690 using {'induction_measure': <Measures.RSS: 'RSS'>, 'minsupp_new': 6, 'pruning_measure': <Measures.C2: 'C2'>, 'voting_measure': <Measures.C2: 'C2'>}


## Prediction using the model selected from the tuning

In [9]:
reg = grid_result.best_estimator_

In [10]:
ruleset = reg.model
ruleset_stats = get_ruleset_stats('', ruleset)

Generated rules

In [11]:
for rule in ruleset.rules:
    print(rule)

IF MM31 = (-inf, 0.23) THEN MM116_pred = {0.40} [0.39,0.41]
IF MM116 = <0.35, 0.45) AND DMM116 = <-0.05, inf) AND MM31 = (-inf, 0.24) THEN MM116_pred = {0.40} [0.39,0.42]
IF MM116 = <0.35, 0.45) AND MM31 = (-inf, 0.24) THEN MM116_pred = {0.40} [0.38,0.42]
IF PD = (-inf, 0.50) AND DMM116 = <-0.05, inf) AND AS038 = (-inf, 2.45) AND MM31 = <0.24, 0.25) THEN MM116_pred = {0.50} [0.47,0.54]
IF PD = <0.50, inf) AND MM116 = (-inf, 0.45) AND AS038 = (-inf, 2.45) AND MM31 = <0.24, 0.25) AND PG072 = (-inf, 2.05) THEN MM116_pred = {0.41} [0.38,0.44]
IF PD = (-inf, 0.50) AND MM31 = <0.24, 0.25) THEN MM116_pred = {0.51} [0.47,0.54]
IF DMM116 = <-0.05, 0.05) AND MM31 = (-inf, 0.26) THEN MM116_pred = {0.46} [0.36,0.55]
IF MM116 = (-inf, 0.45) THEN MM116_pred = {0.40} [0.37,0.44]
IF MM116 = <0.45, inf) AND MM31 = <0.23, 0.24) AND BA13 = (-inf, 1075.50) THEN MM116_pred = {0.50} [0.48,0.52]
IF PD = (-inf, 0.50) AND MM116 = <0.45, 0.55) AND DMM116 = <-0.05, inf) AND MM31 = <0.23, inf) AND PG072 = <1.65, 

Ruleset evaluation

In [12]:
display(ruleset_stats)

Unnamed: 0_level_0,minimum_covered,maximum_uncovered_fraction,ignore_missing,pruning_enabled,max_growing_condition,time_total_s,time_growing_s,time_pruning_s,rules_count,conditions_per_rule,induced_conditions_per_rule,avg_rule_coverage,avg_rule_precision,avg_rule_quality,pvalue,FDR_pvalue,FWER_pvalue,fraction_significant,fraction_FDR_significant,fraction_FWER_significant
Measure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
,6.0,0.0,False,True,0.0,17.995523,2.274811,15.69378,21,6.52381,29.809524,0.116152,0.849723,,,,,0.952381,0.952381,0.952381


### Validate model on test dataset

In [15]:
predictions = reg.predict(X_test)
regression_metrics = get_regression_metrics('', predictions, y_test)
display(regression_metrics.iloc[0])

absolute_error                 0.111355
relative_error                 0.103524
relative_error_lenient         0.097884
relative_error_strict          0.114888
normalized_absolute_error      0.767792
squared_error                  0.019642
root_mean_squared_error        0.140148
root_relative_squared_error    0.125609
correlation                    0.801204
squared_correlation            0.641927
Name: , dtype: float64