In [167]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import itertools

class ECLAT():
    """Frequent-itemset miner that keeps the data in ECLAT's vertical layout.

    Arguments
    ---------------------
    data
        The `data` is a pandas dataframe. Each row represents one transaction
        (e.g. the purchase of one person) and each cell holds one item; unused
        cells are NaN. Row and column labels are never used, only positions.
            Example of data format
                0       1       2       3
            0   milk    beer    bread   butter
            1   coffe   bread   butter  NaN
            2   coffe   bread   butter  NaN
            3   milk    coffe   bread   butter
            4   beer    NaN     NaN     NaN
            5   butter  NaN     NaN     NaN
            6   bread   NaN     NaN     NaN
            7   bean    NaN     NaN     NaN
            8   rice    bean    NaN     NaN
            9   rice    NaN     NaN     NaN

    Attributes
    ---------------------
    data
        The dataframe passed to the constructor (stored by reference).
    uniq_
        List of the distinct non-NaN items, in first-seen (column by column)
        order.
    dictOfTIDList
        Maps every item to the list of transaction ids that contain it. An id
        is the string ``"TID<n>"`` where ``n`` is the 1-based row position.
    """

    def __init__(self, data):
        self.data = data
        self.uniq_ = []
        self.dictOfTIDList = {}

        # Called through `self` so that subclasses can override either step.
        self._getUnique()
        self._transformToVertical()

    def _getUnique(self):
        """Fill `self.uniq_` with the distinct non-NaN items of `self.data`."""
        # A dict is used as an ordered set: a plain set would give a different
        # item order on every interpreter run because of string hash
        # randomisation.
        firstSeen = {}
        for column in range(len(self.data.columns)):
            for item in self.data.iloc[:, column].dropna().unique():
                firstSeen.setdefault(item, None)

        self.uniq_ = list(firstSeen)

    def _transformToVertical(self):
        """Rebuild `self.dictOfTIDList` (item -> list of "TID<n>") from `self.data`."""
        _, numCol = self.data.shape

        # Built from scratch so that calling this method again does not append
        # every transaction id a second time.
        vertical = {}
        alreadyRecorded = set()

        for column in range(numCol):
            # Positional access: label based access (`.at[row, column]`) only
            # works when the frame has a default RangeIndex and integer
            # column labels.
            cells = self.data.iloc[:, column].tolist()
            for row, item in enumerate(cells):
                if pd.isna(item):
                    continue
                # An item repeated inside one transaction still counts once,
                # otherwise its support could exceed 1.0.
                if (item, row) in alreadyRecorded:
                    continue
                alreadyRecorded.add((item, row))
                vertical.setdefault(item, []).append(f"TID{row + 1}")

        self.dictOfTIDList = vertical

    def support(self, min_support=None):
        """Return the support of every single item.

        Support is the number of transaction ids recorded for the item divided
        by the number of rows of `self.data`.

        Arguments
        ---------------------
        min_support
            `None` (default) returns every item. Otherwise the value is
            converted with `float()` and only items whose support is greater
            than or equal to it are returned.

        Returns
        ---------------------
        dict
            item -> support.
        """
        total = len(self.data)
        supportOfEachItem = {
            item: len(tidList) / total
            for item, tidList in self.dictOfTIDList.items()
        }

        if min_support is None:
            return supportOfEachItem

        min_support = float(min_support)
        return {
            item: value
            for item, value in supportOfEachItem.items()
            if value >= min_support
        }

    def fit_all(self, min_support=0.08, separator=' & ', min_combination=1):
        """Mine every frequent itemset by intersecting transaction-id sets.

        Itemsets are grown depth first: an itemset is only extended while the
        rows shared by all of its items still reach `min_support`. Itemsets
        that occur in no row at all are never reported, even when
        `min_support` is 0.

        Arguments
        ---------------------
        min_support
            Minimum fraction of rows an itemset must appear in. `None` is
            treated as 0.
        separator
            String placed between the item names to build the result keys.
        min_combination
            Smallest itemset size to report (1 reports single items too).

        Returns
        ---------------------
        tuple of two dicts `(dict_finally_index, dict_finally_support)`
            Both are keyed by `separator.join(items)`. The first maps to the
            list of "TID<n>" ids (ascending by row) of the rows containing the
            whole itemset, the second maps to the itemset's support.
        """
        min_support = 0.0 if min_support is None else float(min_support)
        min_combination = int(min_combination)
        total = len(self.data)

        # Only items that are frequent on their own can be part of a frequent
        # itemset. Ids are converted to their row number once so that results
        # can be sorted numerically ("TID10" after "TID2").
        candidates = [
            (item, frozenset(int(tid[3:]) for tid in self.dictOfTIDList[item]))
            for item in self.support(min_support=min_support)
        ]

        dict_finally_index = {}
        dict_finally_support = {}

        def extend(prefix, prefixRows, tail):
            # Try to append each remaining candidate to `prefix`; recursion
            # depth is bounded by the size of the largest frequent itemset.
            for position, (item, rows) in enumerate(tail):
                shared = rows if prefixRows is None else prefixRows & rows
                if not shared:
                    continue
                itemsetSupport = len(shared) / total
                if itemsetSupport < min_support:
                    continue

                itemset = prefix + (item,)
                if len(itemset) >= min_combination:
                    key = separator.join(str(member) for member in itemset)
                    dict_finally_index[key] = [f"TID{row}" for row in sorted(shared)]
                    dict_finally_support[key] = itemsetSupport

                # Only later candidates are appended, so each combination is
                # generated exactly once.
                extend(itemset, shared, tail[position + 1:])

        extend((), None, candidates)

        return dict_finally_index, dict_finally_support

In [168]:
# Demo run: read the example transactions (the file has no header row) and
# run the miner on them with the settings below.
dataframe = pd.read_csv('./data/example3.csv', header=None)

eclat_instance = ECLAT(dataframe)
fit_options = {'min_support': 0.08, 'separator': ' & ', 'min_combination': 1}
eclat_instance.fit_all(**fit_options)