In [1]:
import pandas as pd 
import numpy as np
from collections import Counter
import copy
import math
import json

In [2]:
item1 = ["pen","ink","diary","soap"]
item2 = ["pen","ink","diary"]
item3 = ["pen","diary"]
item4 = ["pen","ink","soap"]

example_df = pd.DataFrame([item1,item2,item3,item4])


In [3]:
example_df.head()

Unnamed: 0,0,1,2,3
0,pen,ink,diary,soap
1,pen,ink,diary,
2,pen,diary,,
3,pen,ink,soap,


In [4]:
def generateFirstPassCandidate(df, minsup):
    
    n = len(df)
    ctr = {}
    for i in range(n):
        row =  list(df.values[i])
        for entry in row:
            if entry != '' and entry!= None and entry!="nan" and not pd.isna(entry):
                ctr[tuple([entry])] = ctr.get(tuple([entry]),0) + 1
    
    candidateSet = []
    for key in ctr:
        if (ctr[key]/n) >= minsup:
            candidateSet.append(key)
            
    return sorted(candidateSet),ctr
        

In [5]:
generateFirstPassCandidate(example_df,0.7)

([('diary',), ('ink',), ('pen',)],
 {('pen',): 4, ('ink',): 3, ('diary',): 3, ('soap',): 2})

In [9]:
def generateCandidateSet(lprev):
    # lprev is a set of len k-1 
    candidateSet = []
    
    n = len(lprev)
    
    for i in range(n):
        for j in range(i+1,n):
            if lprev[i][:-1] == lprev[j][:-1] and lprev[i][-1] < lprev[j][-1]:
                candidateSet.append(lprev[i]+tuple([lprev[j][-1]]))
    
    candidateSet = pruneCandidateSet(candidateSet,lprev)
    
    return candidateSet
             
        
    

In [10]:
generateCandidateSet([('diary',), ('ink',), ('pen',)])

[('diary', 'ink'), ('diary', 'pen'), ('ink', 'pen')]

In [8]:
def pruneCandidateSet(candidateSets, lprev):
    # prunning set to remove such sets from candidate sets which posses a k-1 length subset that is not part of lprev
    prunnedCandidateSet = []
    
    for candidateSet in candidateSets:
        # generating all the subset os candidateSet with k-1 length
        allSubset = [] # all subsets of k-1 length
        n = len(candidateSet)
        
        for i in range(n):
            allSubset.append(candidateSet[:i]+candidateSet[i+1:])
        
        # if any one of the k-1 length subset is missing we don't included the candidateSet
        
        include = True
        for subset in allSubset:
            if subset not in lprev:
                include = False
                break
        
        if include:
            prunnedCandidateSet.append(candidateSet)
        
        
    
    return prunnedCandidateSet

In [11]:
pruneCandidateSet([('diary', 'ink'), ('diary', 'pen'), ('ink', 'pen')],[('diary',), ('ink',), ('pen',)])

[('diary', 'ink'), ('diary', 'pen'), ('ink', 'pen')]

In [76]:
# function to filter rows which match the values in the list
def match_all(row, match_list):
    return all(value in row.values for value in match_list)

def generateLargeItemSet(df, minSup):
    largeItemSet = []   
    
    dataSet = df.values
    
    n = len(dataSet)
    
    # Add an optimization to reduce the df size each time, no need to do a complete scan
    
    lprev,counter = generateFirstPassCandidate(df, minSup)
    largeItemSet = largeItemSet + lprev
    
    while lprev:
        candidateSet = generateCandidateSet(lprev)
        lnext = []
        cur_df=pd.DataFrame()
        for entry in candidateSet:
            print(entry)
#             tempDf = df.isin(list(entry))
#             print(df)
            tempDf = df[df.apply(lambda row: match_all(row, list(entry)), axis=1)]
            count=len(tempDf)
#             print(tempDf)
#             print(tempDf)
#             resPerRow = (tempDf.values+0).sum(1)
#             print(resPerRow)
#             n1 = len(entry)
#             count = sum(i >= n1 for i in resPerRow)
            
            counter[entry] = count
            
            if count/n >= minSup:
                lnext.append(entry)
                cur_df=pd.concat([cur_df, tempDf]).drop_duplicates();
                
        
        if lnext:
            largeItemSet = largeItemSet + lnext
            
        lprev = copy.copy(lnext)
        df=cur_df;
    
    return largeItemSet,counter
        
        
        

    

In [77]:
generateLargeItemSet(example_df, 0.7)

('diary', 'ink')
     0      1      2     3
0  pen    ink  diary  soap
1  pen    ink  diary  None
2  pen  diary   None  None
3  pen    ink   soap  None
     0    1      2     3
0  pen  ink  diary  soap
1  pen  ink  diary  None
('diary', 'pen')
     0      1      2     3
0  pen    ink  diary  soap
1  pen    ink  diary  None
2  pen  diary   None  None
3  pen    ink   soap  None
     0      1      2     3
0  pen    ink  diary  soap
1  pen    ink  diary  None
2  pen  diary   None  None
('ink', 'pen')
     0      1      2     3
0  pen    ink  diary  soap
1  pen    ink  diary  None
2  pen  diary   None  None
3  pen    ink   soap  None
     0    1      2     3
0  pen  ink  diary  soap
1  pen  ink  diary  None
3  pen  ink   soap  None


([('diary',), ('ink',), ('pen',), ('diary', 'pen'), ('ink', 'pen')],
 {('pen',): 4,
  ('ink',): 3,
  ('diary',): 3,
  ('soap',): 2,
  ('diary', 'ink'): 2,
  ('diary', 'pen'): 3,
  ('ink', 'pen'): 3})

In [51]:
def generateAssociationRules(largeItemSet,counter,minConf):
    # All the items in largeItemSet have sufficent support we need to find assoication rules from large itemsets such that they have good confidence
    associationRule = [] # tuples (LHS,RHS,confidence)
    for itemSet in largeItemSet:
        
        n = len(itemSet)
        
        for i in range(1,n):
            LHS = itemSet[:i]
            RHS = itemSet[i:]
            
            # For rule LHS -> RHS
            conf1 = counter[itemSet]/counter[LHS]
            if conf1 >= minConf:
                associationRule.append((LHS,RHS,conf1))
            
            # For rule RHS -> LHS
            conf2  = counter[itemSet]/counter[RHS]
                    
            if conf2 >= minConf:
                associationRule.append((RHS,LHS,conf2))
    
    return associationRule
            
                    

In [52]:
largeItemSet,counter = generateLargeItemSet(example_df, 0.7)
generateAssociationRules(largeItemSet,counter,0.8)

     0      1      2     3
0  pen    ink  diary  soap
1  pen    ink  diary  None
2  pen  diary   None  None
3  pen    ink   soap  None
2
     0    1      2     3
0  pen  ink  diary  soap
1  pen  ink  diary  None
     0      1      2     3
0  pen    ink  diary  soap
1  pen    ink  diary  None
2  pen  diary   None  None
3  pen    ink   soap  None
3
     0      1      2     3
0  pen    ink  diary  soap
1  pen    ink  diary  None
2  pen  diary   None  None
     0      1      2     3
0  pen    ink  diary  soap
1  pen    ink  diary  None
2  pen  diary   None  None
3  pen    ink   soap  None
3
     0    1      2     3
0  pen  ink  diary  soap
1  pen  ink  diary  None
3  pen  ink   soap  None


[(('diary',), ('pen',), 1.0), (('ink',), ('pen',), 1.0)]

In [16]:
def main(fileName,minSup,minConf,outputFile = "output.txt"):
    df = pd.read_csv(fileName,dtype=str)
    n = len(df)
    
    df = df.apply(lambda x: pd.Series(x,dtype=str).drop_duplicates(), axis=1) # Removing Duplicate values from the Row and replacing the duplicates with Nan

    largeItemSet,counter = generateLargeItemSet(df, minSup)
    
    f = open(outputFile, 'w')
    
    largeItemSet.sort(key = lambda x : counter[x],reverse = True)
    
    
    f.write("==Frequent itemsets (min_sup="+ str(minSup*100) + "%) \n")
    for itemSet in largeItemSet:
        json.dump(list(itemSet),f)
        f.write("," + str((counter[itemSet]/n)*100) + "% \n" )
    
    rules = generateAssociationRules(largeItemSet,counter,minConf)
    
    rules.sort(key = lambda x: x[2] , reverse = True)
    f.write("==High-confidence association rules (min_conf="+ str(minConf*100) + "%) \n")
    for rule in rules:
        json.dump(list(rule[0]),f)
        f.write(" => ")
        json.dump(list(rule[1]),f)
        f.write(" (Conf: " + str(rule[2]*100) + "%, Supp: " + str((counter[itemSet]/n)*100) + "%) \n")
    
    
        

        


        
    

In [47]:
df = pd.read_csv("example.csv")

In [48]:
main("example.csv",0.7,0.8)

In [129]:
main("./INTEGRATED-DATASET-SMALL.csv",0.01,0.5)