# tabarules

Meant to take tabular data and turn it into a format usable for `mlxtend` association rules mining.

In [1]:
!pip install mlxtend

Collecting mlxtend
  Downloading https://files.pythonhosted.org/packages/52/04/c362f34f666f0ddc7cf593805e64d64fa670ed96fd9302e68549dd48287d/mlxtend-0.17.0-py2.py3-none-any.whl (1.3MB)
Collecting joblib>=0.13.2 (from mlxtend)
  Downloading https://files.pythonhosted.org/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14.1-py2.py3-none-any.whl (294kB)
Installing collected packages: joblib, mlxtend
Successfully installed joblib-0.14.1 mlxtend-0.17.0


In [18]:
# imports
import os
import numpy as np
import pandas as pd
import mlxtend
from math import ceil

In [3]:
# load in data
df = pd.read_csv("./data/winequality-red-features.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,high_alc,acid_level,quality_2
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,low,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,med,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,med,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,med,bad
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,low,bad


In [6]:
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
high_alc                  int64
acid_level               object
quality_2                object
dtype: object

So the idea here is that we want to make each row of our table a list of values, and have each of these stored with an index. That's the format that `mlxtend` expects for apriori and association rules.

In [8]:
float_series = df['fixed acidity']
categorical_series = df['acid_level']
boolean_series = df['quality_2']
boolean_series2 = df['high_alc']

In [49]:
def float_processer(series, title, cutoffs = 4):
    # find splits in data based on ordering
    len_series = series.shape[0]
    series = series.sort_values().reindex()
    print(series)
    cutoff_val_ind = [ceil( (i+1) / cutoffs * len_series) for i in range(cutoffs-1)]
    cutoff_vals = series[cutoff_val_ind].values
    
    # label creation
    str_labels = [title + "_lessthan_" + str(val) for val in cutoff_vals]
    str_labels.append(title + "_morethan_" + str(cutoff_vals[cutoffs-2]))
    print(str_labels)
    
    output_list = []
    # likely can have better parallelization if needed
    # populate a list with the appropriate label for the data
    for i in range(len_series-1):
        for j in range(len(cutoff_vals)):
            if series[i] <= cutoff_vals[j]:
                output_list.append(str_labels[j])
                next
            elif j == len(cutoff_vals):
                output_list.append(str_labels[len(cutoff_vals)])
                next
    
    # can debate whether list or series is better later
    return pd.Series(output_list)
    
float_processer(float_series, "fixed_acidity", 6)


45       4.6
95       4.7
821      4.9
588      5.0
94       5.0
553      5.0
1270     5.0
1114     5.0
1321     5.0
1157     5.1
695      5.1
1228     5.1
802      5.1
1300     5.2
1377     5.2
34       5.2
230      5.2
144      5.2
142      5.2
444      5.3
1477     5.3
916      5.3
1475     5.3
198      5.4
1316     5.4
1537     5.4
1111     5.4
1591     5.4
1269     5.5
1178     5.6
        ... 
366     12.8
205     12.8
206     12.8
429     12.8
364     12.8
811     12.9
538     12.9
564     13.0
470     13.0
559     13.0
603     13.2
611     13.2
601     13.2
509     13.3
680     13.3
294     13.3
328     13.4
353     13.5
381     13.7
391     13.7
347     13.8
374     14.0
544     14.3
244     15.0
243     15.0
555     15.5
554     15.5
442     15.6
557     15.6
652     15.9
Name: fixed acidity, Length: 1599, dtype: float64
['fixed_acidity_lessthan_7.9', 'fixed_acidity_lessthan_10.3', 'fixed_acidity_lessthan_7.2', 'fixed_acidity_lessthan_6.6', 'fixed_acidity_lessthan_9.1', 'fixe

0        fixed_acidity_lessthan_7.9
1       fixed_acidity_lessthan_10.3
2        fixed_acidity_lessthan_9.1
3        fixed_acidity_lessthan_7.9
4       fixed_acidity_lessthan_10.3
5        fixed_acidity_lessthan_9.1
6        fixed_acidity_lessthan_7.9
7       fixed_acidity_lessthan_10.3
8        fixed_acidity_lessthan_9.1
9        fixed_acidity_lessthan_7.9
10      fixed_acidity_lessthan_10.3
11       fixed_acidity_lessthan_9.1
12       fixed_acidity_lessthan_7.9
13      fixed_acidity_lessthan_10.3
14       fixed_acidity_lessthan_9.1
15       fixed_acidity_lessthan_7.9
16      fixed_acidity_lessthan_10.3
17       fixed_acidity_lessthan_9.1
18       fixed_acidity_lessthan_7.9
19      fixed_acidity_lessthan_10.3
20       fixed_acidity_lessthan_9.1
21       fixed_acidity_lessthan_7.9
22      fixed_acidity_lessthan_10.3
23       fixed_acidity_lessthan_9.1
24       fixed_acidity_lessthan_7.9
25      fixed_acidity_lessthan_10.3
26       fixed_acidity_lessthan_9.1
27       fixed_acidity_lesst

In [26]:
float_series

0        7.4
1        7.8
2        7.8
3       11.2
4        7.4
5        7.4
6        7.9
7        7.3
8        7.8
9        7.5
10       6.7
11       7.5
12       5.6
13       7.8
14       8.9
15       8.9
16       8.5
17       8.1
18       7.4
19       7.9
20       8.9
21       7.6
22       7.9
23       8.5
24       6.9
25       6.3
26       7.6
27       7.9
28       7.1
29       7.8
        ... 
1569     6.2
1570     6.4
1571     6.4
1572     7.3
1573     6.0
1574     5.6
1575     7.5
1576     8.0
1577     6.2
1578     6.8
1579     6.2
1580     7.4
1581     6.2
1582     6.1
1583     6.2
1584     6.7
1585     7.2
1586     7.5
1587     5.8
1588     7.2
1589     6.6
1590     6.3
1591     5.4
1592     6.3
1593     6.8
1594     6.2
1595     5.9
1596     6.3
1597     5.9
1598     6.0
Name: fixed acidity, Length: 1599, dtype: float64