In [1]:
from itertools import combinations
from operator import itemgetter
import pandas as pd
from time import time
import itertools

In [2]:
#load dataset
datasetURL='https://raw.githubusercontent.com/abhashpanwar/BDA/master/DataMining/aprioriAlgo_dataset.csv'
table = pd.read_csv(datasetURL,error_bad_lines=False)
print(table)

     items
0    A,B,E
1      B,D
2      B,C
3    A,B,D
4      A,C
5      B,C
6      A,C
7  A,B,C,E
8    A,B,C
9        F


In [3]:
support=2 #min support
minimum_confidence=30 #30%

In [4]:
def aprioriAlgo(data, support_count):
    #\ is used of indent

    #generate list of single items 
    single_items = (data['items'].str.split(",", expand=True))\
        .apply(pd.value_counts).sum(axis=1).where(lambda value: value >= support_count).dropna()
    
    #create dataframe of items with support 
    apriori_data = pd.DataFrame(
        {'items': single_items.index.astype(str), 'support': single_items.values, 'Li': 1})

    data['Li'] = data['items'].str.count(",") + 1

    data['items'] = data['items'].apply(lambda row: set(map(str, row.split(","))))
    
    #first candidate itemset C1
    single_items_set = set(single_items.index.astype(str))

    #loop for frequent items
    for length in range(2, len(single_items_set) + 1):
        data = data[data['Li'] >= length]        
        d = data['items'] \
            .apply(lambda st: pd.Series(s if set(s).issubset(st) else None for s in combinations(single_items_set, length))) \
            .apply(lambda col: [col.dropna().unique()[0], col.count()] if col.count() >= support_count else None).dropna()
        if d.empty:
            break
        apriori_data = apriori_data.append(pd.DataFrame(
            {'items': list(map(itemgetter(0), d.values)), 'support': list(map(itemgetter(1), d.values)),
             'Li': length}), ignore_index=True)

    return apriori_data    

In [5]:
df=aprioriAlgo(data=table, support_count=support) #call function to find frequent items

for i in df.Li.unique():
  print('L{}:'.format(i))
  print(df[df['Li']==i][['items','support']].to_string(index=False))

L1:
items  support
    A      6.0
    B      7.0
    C      6.0
    D      2.0
    E      2.0
L2:
  items  support
 (E, A)      2.0
 (E, B)      2.0
 (D, B)      2.0
 (C, A)      4.0
 (C, B)      4.0
 (A, B)      4.0
L3:
     items  support
 (E, A, B)      2.0
 (C, A, B)      2.0


In [6]:
lastIndex=df.Li.unique()[-1]
print("Largest Frequent Items:")
for i in df[df['Li']==lastIndex][['items']].values:
  print(",".join(sorted(i.item())))  

Largest Frequent Items:
A,B,E
A,B,C


## **Find Association**

In [7]:
#no. items in association rules
totalConfidence=len(df[df['Li']==lastIndex][['items']].values.flatten()[0])*2*len(df[df['Li']==lastIndex][['items']].values.flatten())

In [8]:
confidenceTable=pd.DataFrame()
confidenceTable['item']=list(range(totalConfidence))

In [9]:
def get_pairs(left,right):
  lst=[]
  for i in range(len(left)):
    temp=",".join(left[i])+'->'+str(right[i])
    lst.append(temp)
  for i in range(len(left)):
    temp=str(right[i])+'->'+",".join(left[i])
    lst.append(temp)
  return lst

In [10]:
def generatePairs():
  allelements=[]
  for i in df[df['Li']==lastIndex][['items']].values:
    temp=list(sorted(i.item()))
    res = list(zip(temp, temp[1:] + temp[:1])) 
    left = [[(temp[i]), temp[(i + 1) % len(temp)]]  
            for i in range(len(temp))] 
    right = [temp[(i + 1) % len(temp)-2]  for i in range(len(temp))]    
    allelements.append(get_pairs(left,right))
  return allelements

In [11]:
#generate all possible subsets of frequent items
a=generatePairs()
finalList=list(itertools.chain.from_iterable(a))
print(finalList)

['A,B->E', 'B,E->A', 'E,A->B', 'E->A,B', 'A->B,E', 'B->E,A', 'A,B->C', 'B,C->A', 'C,A->B', 'C->A,B', 'A->B,C', 'B->C,A']


In [12]:
confidenceTable['item']=finalList
confidenceTable['confidence']=0

In [13]:
#confidenceTable

In [14]:
for k in range(len(confidenceTable)):
  l1=list(confidenceTable.iloc[k].values[0].split('->')[0].split(','))
  l2=list(confidenceTable.iloc[k].values[0].split('->')[1].split(','))
  l2=l1+l2

  #left->right i.e AB->E
  union_count=0
  left_count=0
  for i in table['items']:
    check1 =  all(item in i for item in l1) 
    check2 =  all(item in i for item in l2)
    if check1 is True:
      union_count+=1 #support of left 
    if check2 is True:
      left_count+=1 #support of left union right
  confi=round(left_count/union_count*100,2)
  #print(k,confi)
  confidenceTable.loc[k,'confidence']=confi

In [15]:
#print(confidenceTable)

In [16]:
print("Selected Association Rules with confidence {}%:".format(minimum_confidence))
print(confidenceTable[confidenceTable.confidence>=30])

Selected Association Rules with confidence 30%:
      item  confidence
0   A,B->E       50.00
1   B,E->A      100.00
2   E,A->B      100.00
3   E->A,B      100.00
4   A->B,E       33.33
6   A,B->C       50.00
7   B,C->A       50.00
8   C,A->B       50.00
9   C->A,B       33.33
10  A->B,C       33.33
