# tabarules

Meant to take tabular data and turn it into a format usable for `mlxtend` association rules mining.

In [1]:
#!pip install mlxtend

In [2]:
# imports
import os
import numpy as np
import pandas as pd
import mlxtend
from math import ceil

In [3]:
# load in data
df = pd.read_csv("./data/winequality-red-features.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,high_alc,acid_level,quality_2
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,low,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,med,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,med,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,med,bad
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,low,bad


In [4]:
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
high_alc                  int64
acid_level               object
quality_2                object
dtype: object

So the idea here is that we want to make each row of our table a list of values, and have each of these stored with an index. That's the format that `mlxtend` expects for apriori and association rules.

In [5]:
float_series = df['fixed acidity']
categorical_series = df['acid_level']
boolean_series = df['quality_2']
boolean_series2 = df['high_alc']

In [16]:
def float_processer(series, title, cutoffs = 4, print_labels = False):
    '''
    SUMMARY: Gives bin labeling to float data based off of splits in the data
    
    INPUTS:
    - series: A pandas series object of type float
    - title: A title used for the feature of type string
    - cutoffs: the number of bins to place data into. The more bins, the more granular.
    - print_labels: a boolean value to print out the label values before assignment. Useful for debugging.
    
    OUTPUTS:
    - a pandas series object of the bin-labelled data
    
    NOTES:
    - string labels can get long. opt for shorter ones if possible.
    - string labels will come out in the format "title_lessthan_cutoffval" or "title_morethan_cutoffval"
    - Cutoff values are calulated by sorting the data and taking the value at each index 1/cutoffs way through the data.
    
    '''
    
    
    # find splits in data based on ordering
    len_series = series.shape[0]
    series_sorted = series.sort_values().reset_index(drop = True)
    cutoff_val_ind = [ceil( (i+1) / cutoffs * len_series) for i in range(cutoffs-1)]
    cutoff_vals = series_sorted[cutoff_val_ind].values
    
    # label creation
    str_labels = [title + "_lessthan_" + str(val) for val in cutoff_vals]
    str_labels.append(title + "_morethan_" + str(cutoff_vals[cutoffs-2]))
    
    if print_labels:
        print(str_labels)
    
    output_list = []
    # likely can have better parallelization if needed
    # populate a list with the appropriate label for the data
    for i in range(len_series-1):
        for j in range(len(cutoff_vals)):
            if series[i] <= cutoff_vals[j]:
                output_list.append(str_labels[j])
                next
            elif j == len(cutoff_vals):
                output_list.append(str_labels[len(cutoff_vals)])
                next
    
    # can debate whether list or series is better later
    return pd.Series(output_list)


In [17]:
float_processer(float_series, "fixed_acidity", 7)

['fixed_acidity_lessthan_6.8', 'fixed_acidity_lessthan_7.2', 'fixed_acidity_lessthan_7.7', 'fixed_acidity_lessthan_8.2', 'fixed_acidity_lessthan_9.0', 'fixed_acidity_lessthan_10.2', 'fixed_acidity_morethan_10.2']


0        fixed_acidity_lessthan_7.7
1        fixed_acidity_lessthan_8.2
2        fixed_acidity_lessthan_9.0
3       fixed_acidity_lessthan_10.2
4        fixed_acidity_lessthan_8.2
5        fixed_acidity_lessthan_9.0
6       fixed_acidity_lessthan_10.2
7        fixed_acidity_lessthan_8.2
8        fixed_acidity_lessthan_9.0
9       fixed_acidity_lessthan_10.2
10       fixed_acidity_lessthan_7.7
11       fixed_acidity_lessthan_8.2
12       fixed_acidity_lessthan_9.0
13      fixed_acidity_lessthan_10.2
14       fixed_acidity_lessthan_7.7
15       fixed_acidity_lessthan_8.2
16       fixed_acidity_lessthan_9.0
17      fixed_acidity_lessthan_10.2
18       fixed_acidity_lessthan_8.2
19       fixed_acidity_lessthan_9.0
20      fixed_acidity_lessthan_10.2
21       fixed_acidity_lessthan_7.7
22       fixed_acidity_lessthan_8.2
23       fixed_acidity_lessthan_9.0
24      fixed_acidity_lessthan_10.2
25       fixed_acidity_lessthan_8.2
26       fixed_acidity_lessthan_9.0
27      fixed_acidity_lessth