# tabarules

Meant to take tabular data and turn it into a format usable for `mlxtend` association rules mining.

In [1]:
#!pip install mlxtend

### functions to write
* float_processor
* boolean_processor (needs to be robust to categorial)
* int_processor
* category_processor
* determine_dtype (takes series and identifies relevant fxn)
* featurize_df

__other associated todos__
* write tests to ensure that the data coming in is actually of the right type, throw error messages

In [1]:
# imports
import os
import numpy as np
import pandas as pd
import mlxtend
from math import ceil

In [40]:
# this is the nan functionality

sample = pd.Series([np.nan, 0, np.nan, 7])
np.where(np.isnan(sample))

(array([0, 2], dtype=int64),)

In [2]:
# load in data
df = pd.read_csv("./data/winequality-red-features.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,high_alc,acid_level,quality_2
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,low,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,med,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,med,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,med,bad
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,low,bad


In [3]:
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
high_alc                  int64
acid_level               object
quality_2                object
dtype: object

So the idea here is that we want to make each row of our table a list of values, and have each of these stored with an index. That's the format that `mlxtend` expects for apriori and association rules.

In [24]:
float_series = df['fixed acidity']
categorical_series = df['acid_level']
boolean_series = df['quality_2']
boolean_series2 = df['high_alc']
int_series = df['quality']

In [51]:
def float_processer(series, title, cutoffs = 4, na_action = ['label', 'return_na'][1], print_labels = False):
    '''
    SUMMARY: Gives bin labeling to float data based off of splits in the data
    
    INPUTS:
    - series: A pandas series object of type float
    - title: A title used for the feature of type string
    - cutoffs: the number of bins to place data into. The more bins, the more granular.
    - na_action: desired action for missing values. "label" will return a string with "title_is_na", "return_na" will return NA
    - print_labels: a boolean value to print out the label values before assignment. Useful for debugging.
    
    OUTPUTS:
    - a pandas series object of the bin-labelled data
    
    NOTES:
    - string labels can get long. opt for shorter ones if possible.
    - string labels will come out in the format "title_lessthan_cutoffval" or "title_morethan_cutoffval"
    - Cutoff values are calulated by sorting the data and taking the value at each index 1/cutoffs way through the data.
    
    '''
    
    
    # find splits in data based on ordering
    len_series = series.shape[0]
    series_sorted = series.sort_values().reset_index(drop = True)
    cutoff_val_ind = [ceil( (i+1) / cutoffs * len_series) for i in range(cutoffs-1)]
    cutoff_vals = series_sorted[cutoff_val_ind].values
    
    # label creation
    str_labels = [title + "_lessthan_" + str(val) for val in cutoff_vals]
    str_labels.append(title + "_morethan_" + str(cutoff_vals[cutoffs-2]))
    
    if print_labels:
        print(str_labels)
    
    output_list = []
    # likely can have better parallelization if needed
    # populate a list with the appropriate label for the data
    for i in range(len_series-1):
        if np.isnan(series[i]):
            if na_action == "label":
                output_list.append(title + "is_na")
            elif na_action == "return_na":
                output_list.append(np.nan)
        for j in range(len(cutoff_vals)):
            if series[i] <= cutoff_vals[j]:
                output_list.append(str_labels[j])
                next
            elif j == len(cutoff_vals):
                output_list.append(str_labels[len(cutoff_vals)])
                next
    
    # can debate whether list or series is better later
    return pd.Series(output_list)


In [52]:
# test for float_processor

float_processer(float_series, "fixed_acidity", 7, True).unique()

array(['fixed_acidity_lessthan_7.7', 'fixed_acidity_lessthan_8.2',
       'fixed_acidity_lessthan_9.0', 'fixed_acidity_lessthan_10.2',
       'fixed_acidity_lessthan_6.8', 'fixed_acidity_lessthan_7.2'],
      dtype=object)

In [16]:
def boolean_processer(series, title, which_yes = 1, print_labels = False):
    '''
    SUMMARY: Gives bin labeling to boolean data
    
    INPUTS:
    - series: A pandas series object of type float
    - title: A title used for the feature of type string
    - which_yes: which number in the data is used as the "yes" marking
    - print_labels: a boolean value to print out the label values before assignment. Useful for debugging.
    
    OUTPUTS:
    - a pandas series object of the bin-labelled data
    
    NOTES:
    - string labels can get long. opt for shorter ones if possible.
    - string labels will come out in the format "title_yes" or "title_no"
    - Cutoff values are calulated by sorting the data and taking the value at each index 1/cutoffs way through the data.
    
    '''
    len_series = series.shape[0]
    
    # label creation
    str_labels = [title + "_no", title + "_yes"]
    
    if print_labels:
        print(str_labels)
    
    output_list = []
    # likely can have better parallelization if needed
    # populate a list with the appropriate label for the data
    
    ### 
    for i in range(len_series-1):
        if series[i] == which_yes:
            output_list.append(str_labels[1])
        else:
            output_list.append(str_labels[0])
    
    # can debate whether list or series is better later
    return pd.Series(output_list)


In [22]:
# works
boolean_processer(boolean_series2, "high_alc")

0        high_alc_no
1        high_alc_no
2        high_alc_no
3        high_alc_no
4        high_alc_no
5        high_alc_no
6        high_alc_no
7        high_alc_no
8        high_alc_no
9       high_alc_yes
10       high_alc_no
11      high_alc_yes
12       high_alc_no
13       high_alc_no
14       high_alc_no
15       high_alc_no
16      high_alc_yes
17       high_alc_no
18       high_alc_no
19       high_alc_no
20       high_alc_no
21       high_alc_no
22       high_alc_no
23       high_alc_no
24       high_alc_no
25       high_alc_no
26       high_alc_no
27       high_alc_no
28       high_alc_no
29       high_alc_no
            ...     
1568     high_alc_no
1569    high_alc_yes
1570    high_alc_yes
1571    high_alc_yes
1572     high_alc_no
1573    high_alc_yes
1574    high_alc_yes
1575    high_alc_yes
1576    high_alc_yes
1577    high_alc_yes
1578    high_alc_yes
1579    high_alc_yes
1580    high_alc_yes
1581    high_alc_yes
1582    high_alc_yes
1583     high_alc_no
1584    high_

In [23]:
boolean_processer(boolean_series, "quality", which_yes = "good")

0        quality_no
1        quality_no
2        quality_no
3        quality_no
4        quality_no
5        quality_no
6        quality_no
7       quality_yes
8       quality_yes
9        quality_no
10       quality_no
11       quality_no
12       quality_no
13       quality_no
14       quality_no
15       quality_no
16      quality_yes
17       quality_no
18       quality_no
19       quality_no
20       quality_no
21       quality_no
22       quality_no
23       quality_no
24       quality_no
25       quality_no
26       quality_no
27       quality_no
28       quality_no
29       quality_no
           ...     
1568     quality_no
1569     quality_no
1570     quality_no
1571     quality_no
1572     quality_no
1573     quality_no
1574     quality_no
1575     quality_no
1576     quality_no
1577     quality_no
1578     quality_no
1579     quality_no
1580     quality_no
1581     quality_no
1582     quality_no
1583     quality_no
1584    quality_yes
1585     quality_no
1586     quality_no


In [32]:
float_processer(int_series, "quality", cutoffs = 7, print_labels = True).unique()

['quality_lessthan_5', 'quality_lessthan_5', 'quality_lessthan_5', 'quality_lessthan_6', 'quality_lessthan_6', 'quality_lessthan_6', 'quality_morethan_6']


array(['quality_lessthan_5', 'quality_lessthan_6'], dtype=object)

In [27]:
int_series.unique()

array([5, 6, 7, 4, 8, 3], dtype=int64)

In [42]:
def cat_processer(series, title, print_labels = False):
    '''
    SUMMARY: Gives bin labeling to categorical data based off of splits in the data
    
    INPUTS:
    - series: A pandas series object of type float
    - title: A title used for the feature of type string
    - print_labels: a boolean value to print out the label values before assignment. Useful for debugging.
    
    OUTPUTS:
    - a pandas series object of the bin-labelled data
    
    NOTES:
    - string labels can get long. opt for shorter ones if possible.
    - string labels will come out in the format "title_is_label"
    
    '''
    
    
    # find splits in data based on ordering
    len_series = series.shape[0]
    unique_vals = series.unique()
    
    # label creation
    str_labels = [title + "_is_" + val for val in unique_vals]
    
    if print_labels:
        print(str_labels)
    
    output_list = []
    # likely can have better parallelization if needed
    # populate a list with the appropriate label for the data
    for i in range(len_series-1):
        for j in range(unique_vals.shape[0]):
            if series[i] == unique_vals[j]:
                output_list.append(str_labels[j])
                next
    
    # can debate whether list or series is better later
    return pd.Series(output_list)


In [46]:
cat_processer(categorical_series, "acid_level", True)

['acid_level_is_low', 'acid_level_is_med', 'acid_level_is_high']


0       acid_level_is_low
1       acid_level_is_med
2       acid_level_is_med
3       acid_level_is_med
4       acid_level_is_low
5       acid_level_is_low
6       acid_level_is_med
7       acid_level_is_med
8       acid_level_is_med
9       acid_level_is_med
10      acid_level_is_med
11      acid_level_is_med
12      acid_level_is_low
13      acid_level_is_med
14      acid_level_is_med
15      acid_level_is_med
16      acid_level_is_med
17      acid_level_is_med
18      acid_level_is_med
19      acid_level_is_med
20      acid_level_is_med
21      acid_level_is_low
22      acid_level_is_med
23      acid_level_is_med
24      acid_level_is_med
25      acid_level_is_med
26      acid_level_is_med
27      acid_level_is_med
28      acid_level_is_med
29      acid_level_is_med
              ...        
1568    acid_level_is_med
1569    acid_level_is_med
1570    acid_level_is_med
1571    acid_level_is_med
1572    acid_level_is_med
1573    acid_level_is_low
1574    acid_level_is_med
1575    acid

In [49]:
cat_processer(boolean_series, "quality_2")

0        quality_2_is_bad
1        quality_2_is_bad
2        quality_2_is_bad
3        quality_2_is_bad
4        quality_2_is_bad
5        quality_2_is_bad
6        quality_2_is_bad
7       quality_2_is_good
8       quality_2_is_good
9        quality_2_is_bad
10       quality_2_is_bad
11       quality_2_is_bad
12       quality_2_is_bad
13       quality_2_is_bad
14       quality_2_is_bad
15       quality_2_is_bad
16      quality_2_is_good
17       quality_2_is_bad
18       quality_2_is_bad
19       quality_2_is_bad
20       quality_2_is_bad
21       quality_2_is_bad
22       quality_2_is_bad
23       quality_2_is_bad
24       quality_2_is_bad
25       quality_2_is_bad
26       quality_2_is_bad
27       quality_2_is_bad
28       quality_2_is_bad
29       quality_2_is_bad
              ...        
1568     quality_2_is_bad
1569     quality_2_is_bad
1570     quality_2_is_bad
1571     quality_2_is_bad
1572     quality_2_is_bad
1573     quality_2_is_bad
1574     quality_2_is_bad
1575     qua

In [50]:
nan_series = pd.Series([1, 4.2, np.nan, 5, 12, 1, 1.2, 7, np.nan])

In [56]:
float_processer(nan_series, "tester", cutoffs = 4, print_labels = True)

['tester_lessthan_4.2', 'tester_lessthan_7.0', 'tester_lessthan_nan', 'tester_morethan_nan']


0     tester_lessthan_4.2
1     tester_lessthan_7.0
2     tester_lessthan_4.2
3     tester_lessthan_7.0
4                     NaN
5     tester_lessthan_7.0
6     tester_lessthan_4.2
7     tester_lessthan_7.0
8     tester_lessthan_4.2
9     tester_lessthan_7.0
10    tester_lessthan_7.0
dtype: object