In [1]:
import pandas as pd
import numpy as np
from cleverminer import cleverminer
from sklearn.preprocessing import KBinsDiscretizer
# Load the raw dataset from the CSV file next to the notebook.
data = pd.read_csv("data.csv")

# Show the per-column count of missing values, followed by the first rows.
display(data.isna().sum(), data.head())

Cleverminer version  1.0.6


gender                 0
age                    0
height(cm)             0
weight(kg)             0
waist(cm)              0
eyesight(left)         0
eyesight(right)        0
hearing(left)          0
hearing(right)         0
systolic               0
relaxation             0
fasting_blood_sugar    0
cholesterol            0
triglyceride           0
hdl                    0
ldl                    0
hemoglobin             0
urine_protein          0
serum_creatinine       0
ast                    0
alt                    0
gtp                    0
dental_caries          0
tartar                 0
smoking                0
bmi                    0
healthy_weight         0
eyesight_total         0
vision                 0
hearing_impairment     0
dtype: int64

Unnamed: 0,gender,age,height(cm),weight(kg),waist(cm),eyesight(left),eyesight(right),hearing(left),hearing(right),systolic,...,alt,gtp,dental_caries,tartar,smoking,bmi,healthy_weight,eyesight_total,vision,hearing_impairment
0,0,40,155,60,81.3,1.2,1.0,1.0,1.0,114.0,...,19.0,27.0,0,1,0,24.97,1.0,2.2,1,0
1,0,40,160,60,81.0,0.8,0.6,1.0,1.0,119.0,...,19.0,18.0,0,1,0,23.44,0.0,1.4,1,0
2,1,55,170,60,80.0,0.8,0.8,1.0,1.0,138.0,...,16.0,22.0,0,0,1,20.76,0.0,1.6,1,0
3,1,40,165,70,88.0,1.5,1.5,1.0,1.0,100.0,...,26.0,18.0,0,1,0,25.71,1.0,3.0,0,0
4,0,40,155,60,86.0,1.0,1.0,1.0,1.0,120.0,...,14.0,22.0,0,0,0,24.97,1.0,2.0,1,0


# Discretization of values

In [2]:
def show_bins(bins):
    """Print the value range covered by each bin, then a separator line.

    `bins` is an indexable sequence of bin edges. Each pair of consecutive
    edges is printed as one bin, so n edges produce n - 1 "Bin i" lines.
    """
    # Pair every edge with its successor; zip stops at the shorter sequence,
    # which gives exactly the len(bins) - 1 adjacent pairs.
    for index, (lower, upper) in enumerate(zip(bins, bins[1:])):
        print(f'Bin {index}: {str(lower)} -> {str(upper)}')
    print("------------")

In [3]:
def discretize(column: str, n_bins: int = 5, strategy: str = "kmeans"):
    """Replace ``data[column]`` with ordinal bin codes and return the fitted discretizer.

    Parameters
    ----------
    column : str
        Name of the column in the module-level ``data`` frame to discretize.
    n_bins : int, default 5
        Number of bins passed to ``KBinsDiscretizer``.
    strategy : str, default "kmeans"
        Binning strategy passed to ``KBinsDiscretizer``. The default keeps the
        original behaviour; another strategy can be chosen for skewed columns
        where kmeans leaves some bins almost empty.

    Returns
    -------
    KBinsDiscretizer
        The fitted discretizer; its ``bin_edges_[0]`` holds the edges for ``column``.

    Notes
    -----
    This overwrites ``data[column]`` in place, so it is not idempotent: calling
    it a second time on the same column bins the bin codes, not the raw values.
    Reload ``data`` before re-running.
    """
    kbins = KBinsDiscretizer(n_bins=n_bins, strategy=strategy, encode="ordinal")
    # The discretizer expects a 2-D (n_samples, n_features) input.
    values = data[column].to_numpy().reshape(-1, 1)
    # fit_transform returns an (n_samples, 1) array; flatten it so a plain 1-D
    # array is assigned to the single column.
    data[column] = np.asarray(kbins.fit_transform(values)).ravel()
    return kbins



In [4]:
# Discretize each continuous measurement in place and print the bin edges
# that were learned for it (bin_edges_[0] is the edge array of the single
# fitted feature).
for column in ("waist(cm)", "triglyceride", "hemoglobin", "gtp"):
    show_bins(discretize(column).bin_edges_[0])

Bin 0: 51.0 -> 73.35517898946664
Bin 1: 73.35517898946664 -> 81.5306262494403
Bin 2: 81.5306262494403 -> 89.11850049539169
Bin 3: 89.11850049539169 -> 98.0337495675102
Bin 4: 98.0337495675102 -> 129.0
------------
Bin 0: 8.0 -> 98.1158529751392
Bin 1: 98.1158529751392 -> 164.01416122082935
Bin 2: 164.01416122082935 -> 255.14372276503133
Bin 3: 255.14372276503133 -> 654.0995828988534
Bin 4: 654.0995828988534 -> 999.0
------------
Bin 0: 4.9 -> 11.1796358362651
Bin 1: 11.1796358362651 -> 13.19627551401679
Bin 2: 13.19627551401679 -> 14.570595105555157
Bin 3: 14.570595105555157 -> 15.923341005633791
Bin 4: 15.923341005633791 -> 21.1
------------
Bin 0: 1.0 -> 43.06023512349059
Bin 1: 43.06023512349059 -> 102.12770858809866
Bin 2: 102.12770858809866 -> 225.67557557697745
Bin 3: 225.67557557697745 -> 490.9169987656497
Bin 4: 490.9169987656497 -> 999.0
------------


In [5]:
# Print how many rows fall into each category of every attribute used as an
# antecedent in the rule mining below.
for column in ("gender", "waist(cm)", "triglyceride", "hemoglobin", "gtp"):
    print(data[column].value_counts())


1    35401
0    20291
Name: gender, dtype: int64
1.0    17090
2.0    16904
0.0    10112
3.0     9174
4.0     2412
Name: waist(cm), dtype: int64
0.0    24326
1.0    18263
2.0     9266
3.0     3836
4.0        1
Name: triglyceride, dtype: int64
3.0    19781
2.0    15188
4.0    11183
1.0     8349
0.0     1191
Name: hemoglobin, dtype: int64
0.0    41770
1.0    10612
2.0     2669
3.0      534
4.0      107
Name: gtp, dtype: int64


In [6]:
# Mine rules of the form "antecedent => smoking" with the 4ftMiner procedure.
# NOTE(review): 'conf' and 'Base' are presumably the minimum confidence and the
# minimum number of supporting rows a rule must reach - confirm against the
# cleverminer documentation.
clm = cleverminer(
    df=data,
    proc='4ftMiner',
    quantifiers={'conf': 0.7, 'Base': 1000},
    ante={
        # Every antecedent attribute contributes exactly one category
        # (subset of length 1); the order of the names is kept as listed.
        'attributes': [
            {'name': attribute, 'type': 'subset', 'minlen': 1, 'maxlen': 1}
            for attribute in ('gender', 'waist(cm)', 'hemoglobin', 'gtp', 'triglyceride')
        ],
        # NOTE(review): only five attributes are listed, so a length of 6
        # presumably can never be reached - verify whether 5 was intended.
        'minlen': 1,
        'maxlen': 6,
        'type': 'con',
    },
    succ={
        'attributes': [
            {'name': 'smoking', 'type': 'subset', 'minlen': 1, 'maxlen': 1},
        ],
        'minlen': 1,
        'maxlen': 1,
        'type': 'con',
    },
)

Cleverminer version 1.0.6.
Starting data preparation ...
Automatically reordering numeric categories ...
Automatically reordering numeric categories ...done
Encoding columns into bit-form...
Encoding columns into bit-form...done
Data preparation finished.
Will go for  4ftMiner
Starting to mine rules.
100%|####################################################|Elapsed Time: 0:00:01
Done. Total verifications : 400, rules 127, times: prep 2.04sec, processing 1.14sec


In [12]:
# Print the rules found by the mining run held in `clm`, one rule per line.
clm.print_rulelist()


List of rules:
RULEID BASE  CONF  AAD    Rule
     1 19432 0.958 +0.514 gender(0) => smoking(0) | ---
     2  7197 0.959 +0.515 gender(0) & waist(cm)(0) => smoking(0) | ---
     3  3120 0.966 +0.526 gender(0) & waist(cm)(0) & hemoglobin(1) => smoking(0) | ---
     4  3033 0.968 +0.531 gender(0) & waist(cm)(0) & hemoglobin(1) & gtp(0) => smoking(0) | ---
     5  2313 0.973 +0.538 gender(0) & waist(cm)(0) & hemoglobin(1) & gtp(0) & triglyceride(0) => smoking(0) | ---
     6  2363 0.972 +0.537 gender(0) & waist(cm)(0) & hemoglobin(1) & triglyceride(0) => smoking(0) | ---
     7  3100 0.950 +0.502 gender(0) & waist(cm)(0) & hemoglobin(2) => smoking(0) | ---
     8  2960 0.956 +0.511 gender(0) & waist(cm)(0) & hemoglobin(2) & gtp(0) => smoking(0) | ---
     9  2149 0.959 +0.515 gender(0) & waist(cm)(0) & hemoglobin(2) & gtp(0) & triglyceride(0) => smoking(0) | ---
    10  2218 0.958 +0.514 gender(0) & waist(cm)(0) & hemoglobin(2) & triglyceride(0) => smoking(0) | ---
    11  6934 0.963 +0.

In [22]:
# Print the details of two selected rules.
# NOTE(review): the ids presumably refer to the RULEID column of the rule list
# from this mining run and may shift if the data or the mining parameters
# change - confirm before reusing them.
for rule_id in (68, 124):
    clm.print_rule(rule_id)



Rule id : 68

Base :  1018  Relative base : 0.018  CONF : 0.746  AAD : +1.031  BAD : -1.031

Cedents:
  antecedent : gender(1) & hemoglobin(4) & triglyceride(3)
  succcedent : smoking(1)
  condition  : ---

Fourfold table
    |  S  |  ¬S |
----|-----|-----|
 A  | 1018|  347|
----|-----|-----|
¬A  |19437|34890|
----|-----|-----|



Rule id : 124

Base :  1018  Relative base : 0.018  CONF : 0.745  AAD : +1.028  BAD : -1.028

Cedents:
  antecedent : hemoglobin(4) & triglyceride(3)
  succcedent : smoking(1)
  condition  : ---

Fourfold table
    |  S  |  ¬S |
----|-----|-----|
 A  | 1018|  349|
----|-----|-----|
¬A  |19437|34888|
----|-----|-----|

