In [18]:
import pandas as pd
import numpy as np
from matplotlib import pyplot
import time
import itertools
from tabulate import tabulate
from termcolor import colored

In [19]:
names = ["age","workclass","fnlwgt","education","education_num","marital_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country",
         "income_bracket"]
dataset = pd.read_csv("adult.data",names = names,skipinitialspace=True,dtype=object)
minimum_sup = 0.20
len_dataset = len(dataset)
len_columns = len(names)
dataset.drop(columns=['capital_gain', 'capital_loss'],inplace=True,axis=1)

In [20]:
def findsubsets(s, n):
    return list(itertools.combinations(s, n)) 

In [21]:
start = time.time()
k = 1
c1 = {}
for i,val in dataset.iterrows():
    for j in val:
        if j in c1:
            c1[j] += 1
        else:
            c1[j] = 1
k = k+1
            

In [22]:
support_data = pd.DataFrame()
support_data["value"] = c1.keys()
support_data["freq"] = c1.values()

support_data["support%"] = support_data["freq"]/len_dataset
l1 = support_data[support_data["support%"] > minimum_sup]


In [23]:
l1.sort_values('value',inplace=True)
print(tabulate(l1, headers = 'keys', tablefmt = 'psql'))

+----+--------------------+--------+------------+
|    | value              |   freq |   support% |
|----+--------------------+--------+------------|
| 59 | 10                 |   7569 |   0.232456 |
| 10 | 40                 |  16011 |   0.491723 |
| 23 | 9                  |  10519 |   0.323055 |
| 12 | <=50K              |  24720 |   0.75919  |
| 52 | >50K               |   7841 |   0.24081  |
| 35 | Female             |  10771 |   0.330795 |
| 22 | HS-grad            |  10501 |   0.322502 |
| 18 | Husband            |  13193 |   0.405178 |
|  9 | Male               |  21790 |   0.669205 |
| 16 | Married-civ-spouse |  14976 |   0.459937 |
|  5 | Never-married      |  10683 |   0.328092 |
|  7 | Not-in-family      |   8305 |   0.25506  |
| 20 | Private            |  22696 |   0.69703  |
| 58 | Some-college       |   7291 |   0.223918 |
| 11 | United-States      |  29170 |   0.895857 |
|  8 | White              |  27816 |   0.854274 |
+----+--------------------+--------+------------+


Step-2: K=2
1. Generate candidate set C2 using L1 (this is called join step). 
2. Condition of joining Lk-1 and Lk-1 is that it should have (K-2) elements in common.
3. Check all subsets of an itemset are frequent or not and if not frequent remove that itemset.
4. Now find support count of these itemsets by searching in dataset.

In [24]:
c2 = {}
potential_c2 = []
pre_l2 = {}
value_list = l1["value"].to_list()
#creating the two item values
for i in range(0,len(value_list)):
    for j in range(i+1,len(value_list)):
        potential_c2.append((value_list[i],value_list[j]))
    
#checking if subset is a freqent iemset
for i in potential_c2:
    for j in i:
        if j in value_list:
            flag = 1
        else:
            potential_c2.remove(i)
            print(i)

#Find support count of each itemset in potential_c2 where they appear together
for x in potential_c2:
    for i,val in dataset.iterrows():
        val_sets = set(val)
        if x[0] in val_sets and x[1] in val_sets:
            if x in pre_l2:
                pre_l2[x] += 1
            else:
                pre_l2[x] = 1 
support_data_l2 = pd.DataFrame()
support_data_l2["value"] = pre_l2.keys()
support_data_l2["freq"] = pre_l2.values()               
support_data_l2["support%"] = support_data_l2["freq"]/len_dataset
l2 = support_data_l2[support_data_l2["support%"] >= minimum_sup]

k = k +1

In [25]:
l2.sort_values('value',inplace=True)
print(tabulate(l2, headers = 'keys', tablefmt = 'psql'))

+-----+-----------------------------------------+--------+------------+
|     | value                                   |   freq |   support% |
|-----+-----------------------------------------+--------+------------|
|  12 | ('10', 'Some-college')                  |   7291 |   0.223918 |
|  13 | ('10', 'United-States')                 |   6898 |   0.211849 |
|  16 | ('40', '<=50K')                         |  12221 |   0.375326 |
|  21 | ('40', 'Male')                          |  10554 |   0.32413  |
|  22 | ('40', 'Married-civ-spouse')            |   7311 |   0.224532 |
|  25 | ('40', 'Private')                       |  11374 |   0.349314 |
|  27 | ('40', 'United-States')                 |  13790 |   0.423513 |
|  28 | ('40', 'White')                         |  12893 |   0.395964 |
|  29 | ('9', '<=50K')                          |   8840 |   0.27149  |
|  32 | ('9', 'HS-grad')                        |  10501 |   0.322502 |
|  34 | ('9', 'Male')                           |   7117 |   0.2

In [26]:
pre_processed_ck = find_potential_tup(l2)
l3 = find_freq_itemset_k(pre_processed_ck)
print(tabulate(l3, headers = 'keys', tablefmt = 'psql'))

<class 'list'>
('United-States', '10', 'Some-college')
('Male', '40', '<=50K')
('Married-civ-spouse', '40', '<=50K')
('Private', '40', '<=50K')
('United-States', '40', '<=50K')
('White', '40', '<=50K')
('Male', 'Married-civ-spouse', '40')
('Male', 'Private', '40')
('Male', 'United-States', '40')
('Male', 'White', '40')
('Private', 'Married-civ-spouse', '40')
('United-States', 'Married-civ-spouse', '40')
('Married-civ-spouse', '40', 'White')
('United-States', 'Private', '40')
('Private', 'White', '40')
('United-States', 'White', '40')
('9', 'HS-grad', '<=50K')
('Male', '9', '<=50K')
('Private', '9', '<=50K')
('United-States', '9', '<=50K')
('White', '9', '<=50K')
('Male', '9', 'HS-grad')
('Private', '9', 'HS-grad')
('United-States', '9', 'HS-grad')
('White', '9', 'HS-grad')
('Male', 'Private', '9')
('Male', 'United-States', '9')
('Male', 'White', '9')
('United-States', 'Private', '9')
('Private', 'White', '9')
('United-States', 'White', '9')
('Female', 'Husband', '<=50K')
('Female', 'Ma

In [27]:
k = 4
pre_processed_ck = find_potential_tup(l3)
l4 = find_freq_itemset_k(pre_processed_ck)
print(tabulate(l4, headers = 'keys', tablefmt = 'psql'))

<class 'list'>
('Male', 'Married-civ-spouse', '40', '<=50K')
('Male', 'Private', '40', '9')
('Male', 'Private', '40', 'HS-grad')
('Male', 'Private', 'Married-civ-spouse', '40')
('Male', 'United-States', '40', '9')
('Male', 'United-States', '40', 'HS-grad')
('Male', 'United-States', 'Married-civ-spouse', '40')
('Male', 'United-States', 'White', '40')
('Male', 'White', '40', '<=50K')
('Male', 'Husband', 'White', '40')
('Private', 'Married-civ-spouse', '40', 'White')
('United-States', 'Married-civ-spouse', '40', 'White')
('United-States', 'Private', '40', '<=50K')
('United-States', 'Private', '40', 'Husband')
('United-States', 'Private', 'Never-married', '40')
('Private', 'White', '40', '9')
('Private', 'White', '40', 'HS-grad')
('United-States', 'White', '40', '9')
('United-States', 'White', '40', 'HS-grad')
('United-States', 'White', '40', 'Not-in-family')
('Male', 'Private', '9', 'HS-grad')
('Male', 'Private', 'Married-civ-spouse', '9')
('Male', 'United-States', '9', '<=50K')
('Male', 

In [28]:
k = 5
pre_processed_ck = find_potential_tup(l4)
l5 = find_freq_itemset_k(pre_processed_ck)
print(tabulate(l5, headers = 'keys', tablefmt = 'psql'))
stop = time.time()

print()
print("Frequent pattern mining using Apriori took {:.2f} minutes".format((stop-start)/60))

<class 'list'>
('40', 'Male', '9', 'Private', 'Married-civ-spouse')
('40', 'White', 'Male', 'Private', 'Married-civ-spouse')
('40', '<=50K', 'United-States', 'Male', 'Married-civ-spouse')
('40', 'White', 'United-States', 'Male', 'Married-civ-spouse')
('40', 'United-States', 'Male', 'HS-grad', 'White')
('40', '<=50K', 'United-States', 'Husband', 'Private')
('40', 'United-States', 'Never-married', 'HS-grad', 'Private')
('40', 'HS-grad', '9', 'Private', 'White')
('40', 'United-States', '9', 'Not-in-family', 'White')
('<=50K', 'Male', '9', 'Private', 'Married-civ-spouse')
('United-States', 'Male', '9', 'Husband', 'Private')
('United-States', '9', 'Husband', 'Private', 'White')
('White', '<=50K', 'Male', 'Private', 'Married-civ-spouse')
('White', '<=50K', 'United-States', 'Male', 'Married-civ-spouse')
('<=50K', 'United-States', 'Never-married', 'HS-grad', 'Private')
('<=50K', 'HS-grad', 'Husband', 'Private', 'White')
('Male', 'HS-grad', 'Husband', 'Private', 'White')
('United-States', 'Neve

In [30]:
print(tabulate(l5, headers = 'keys', tablefmt = 'psql'))

+----+-------------------------------------------------------------------+--------+------------+
|    | value                                                             |   freq |   support% |
|----+-------------------------------------------------------------------+--------+------------|
|  1 | ('40', 'White', 'Male', 'Private', 'Married-civ-spouse')          |   9039 |   0.277602 |
|  2 | ('40', '<=50K', 'United-States', 'Male', 'Married-civ-spouse')    |   9715 |   0.298363 |
|  3 | ('40', 'White', 'United-States', 'Male', 'Married-civ-spouse')    |  10506 |   0.322656 |
|  4 | ('40', 'United-States', 'Male', 'HS-grad', 'White')               |   9836 |   0.302079 |
|  5 | ('40', '<=50K', 'United-States', 'Husband', 'Private')            |  10282 |   0.315777 |
|  8 | ('40', 'United-States', '9', 'Not-in-family', 'White')            |   6602 |   0.202758 |
|  9 | ('<=50K', 'Male', '9', 'Private', 'Married-civ-spouse')           |   7115 |   0.218513 |
| 10 | ('United-States', 'Male

In [1]:
def find_potential_tup(lk):
    value_lk = lk["value"].to_list()
    print(type(value_lk))
    potential_ck = []
    for i in range(0,len(value_lk)):
        for j in range(i+1,len(value_lk)):
            if(value_lk[i][:k-2] == value_lk[j][:k-2]):
                #print(value_lk[i][:k-2],value_lk[j][:k-2])
                potential_ck.append((value_lk[i],value_lk[j]))       

    potential_lk = []
    for i in potential_ck:
        num = len(i)
        xi = set(i[0] +  i[num-1])
        potential_lk.append(tuple(xi))
    trimmed_infreq_subs = has_infrequent_subsets(potential_lk,value_lk)
    return(potential_lk)

In [2]:
def has_infrequent_subsets(pk,vk):
    #checking if subset is a freqent iemset
    flag = 0
    for i in pk:
        pp = findsubsets(i,k-1)
        for xi in pp:
            #print(tuple(sorted(xi)),sorted(value_l2))
            if tuple(sorted(xi)) in sorted(vk):
                #print(sorted(xi))
                pass
            else:
                flag = 1
        if flag == 1:
            pk.remove(i)
            #print("** removed: ",i)
            flag = 0
    return(pk)


In [3]:
def find_freq_itemset_k(pp_ck):
    pre_l3 = {}
    pre_processed_ck_func = pp_ck
    for x in pre_processed_ck_func:
        print(x)
        for i,val in dataset.iterrows():
            val_sets = set(val)
            num = len(x)
            cnt = 1
            loop_var = 0
            while loop_var < num:
                if x[loop_var] in val_sets:
                    cnt = cnt + 1
                    loop_var += 1
                else:
                    loop_var += 1
            if cnt == num:
                if x in pre_l3:
                    pre_l3[x] += 1
                else:
                    pre_l3[x] = 1

    support_data_lk = pd.DataFrame()
    support_data_lk["value"] = pre_l3.keys()
    support_data_lk["freq"] = pre_l3.values()               
    support_data_lk["support%"] = support_data_lk["freq"]/len_dataset
    lk = support_data_lk[support_data_lk["support%"] >= minimum_sup]
    #print("Frequent ",k,"Itemsets:")
    return lk