In [1]:
import pandas as pd
import timeit
import numpy as np
import itertools
#Reading Dataset
df = pd.read_csv('adult.data', sep=",", header = None , na_values = "?")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
#Adding column names in the Dataset
df.columns = ['age', 'workclass','fnlwgt','education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race',
              'sex', 'capital_gain', 'capital_loss','hours_per_week','native_country', 'class']

In [3]:
#Preprocessing
#Converting continuous data into Categorical data
df['Age'] = pd.cut(x=df['age'], bins=[0, 18, 30, 50, 1000], labels=['Underage', 'Young', 'Adult','Elderly'])
#In this I have divided Age into 4 categories
#Similarly done for hours per week
df['Hours_per_Week'] = pd.cut(x=df['hours_per_week'], bins=[0, 20, 40, 1000], labels=['Part-Time', 'Full-Time', 'Overtime'])
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class,Age,Hours_per_Week
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,Adult,Full-Time
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,Adult,Part-Time
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,Adult,Full-Time
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,Elderly,Full-Time
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,Young,Full-Time


In [4]:
#Some columns like fnlwgt, education_num didn't give any perspective.
#Removed old age and hours_per_week columns
#Capital_Gain and Capital_Loss had many 0 values.
df = df.drop(['age','fnlwgt','education_num','hours_per_week','capital_loss', 'capital_gain'], axis = 1)
df.head() 

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,class,Age,Hours_per_Week
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K,Adult,Full-Time
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K,Adult,Part-Time
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K,Adult,Full-Time
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K,Elderly,Full-Time
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K,Young,Full-Time


In [5]:
#Removing space from the each item.
df = df.applymap(lambda space: space.strip() if type(space) is str else space)
#Here Preprocessing and Cleaning is completed

FP Growth

In [6]:
df = df.values.tolist()

In [7]:
#Here we start timing
start_time = timeit.default_timer()

#Defining class for Tree
class Tree(object):

    def __init__(self, value, count, parent):
        self.value = value
        self.count = count
        self.parent = parent
        self.link = None
        self.child = []
#function for getting child
    def getting_child(self, value): 
        for n in self.child:
            if n.value == value:
                return n
        return None
#function for adding child
    def adding_child(self, value):
        a_child = Tree(value, 1, self)
        self.child.append(a_child)
        return a_child

In [8]:
#Defining class for Building FP Growth Tree
class Build_FPGrowth_Tree(object):
    
    def __init__(self, trans, threshold, r_value, r_count):
        self.frequent = self.get_freq_items(trans, threshold)
        self.headers = self.gen_header(self.frequent)
        self.root = self.gen_fp_tree(trans, r_value, r_count, self.frequent, self.headers)
        
#Function for getting frequent items
    def get_freq_items(self, db_trans, sup_threshold):
        freq_item = {}
        for t in db_trans:
            for i in t:
                if i in freq_item:
                    freq_item[i] += 1
                else:
                    freq_item[i] = 1
        
        for key in list(freq_item.keys()):
            if freq_item[key] < sup_threshold:
                del freq_item[key]
        return freq_item
   
    def gen_header(self, freq):
        h_table = {}
        for key in freq.keys():
            h_table[key] = None
        return h_table

#Function for building FP Tree
    def gen_fp_tree(self, db_trans, r_value, r_count, freq, heads):
        root_node = Tree(r_value, r_count, None)
        for t in db_trans:
            sort_item = [i for i in t if i in freq]
            sort_item.sort(key=lambda i: freq[i], reverse=True)
            if len(sort_item) > 0: 
                #Checking if sorted items are more than 0 and if they are, we append.
                self.node_insert(sort_item, root_node, heads)
        return root_node

#Function for inserting tree
    def node_insert(self, items, node, head):
        f = items[0]
        new_child = node.getting_child(f) #Getting Child
        if new_child is not None: #Checking Child
            new_child.count += 1
        else:
            new_child = node.adding_child(f) #Adding Child
            if head[f] is None:
                head[f] = new_child
            else:
                head_list = head[f]
                while head_list.link is not None:
                    head_list = head_list.link
                head_list.link = new_child
                
        rem_items = items[1:] # Recurrsive calling
        if len(rem_items) > 0: 
            #Checking whether items are present and if they are, we append
            self.node_insert(rem_items, new_child, head)

    def tree_path(self, n):
        child_num = len(n.child)
        if child_num > 1:
            return False
        elif child_num == 0:
            return True
       
    #Pattern Mining    
    def pattern_mining(self, t_hold): 
        if self.tree_path(self.root):
            return self.pattern_generation()
        else:
            return self.zpattern(self.subtrees_mining(t_hold))
    #Conditional tree
    def zpattern(self, f_pattern):
        i = self.root.value
        if i is not None:
            new_pattern = {}
            for key in f_pattern.keys():
                new_pattern[tuple((list(key) + [i]))] = f_pattern[key]
            return new_pattern
        return f_pattern
    #Pattern Generation
    def pattern_generation(self): 
        fre_pattern = {}
        i = self.frequent.keys() #Merging Index and Values
        if self.root.value is None:
            s_value = []
        else:
            s_value = [self.root.value]
            fre_pattern[tuple(s_value)] = self.root.count
        for j in range(1, len(i)):
            for k in itertools.combinations(i, j):
                ptn = tuple((list(k) + s_value))
                fre_pattern[ptn] = min([self.frequent[f] for f in k])
        return fre_pattern

    def subtrees_mining(self, threshold):
        fre_pat = {}
        m_order = sorted(self.frequent.keys(),key=lambda l : self.frequent[l])
        for each_item in m_order:
            cond_tree = []
            head_node = self.headers[each_item]
            tree_suff = []
            while head_node is not None: # When node is not null we append 
                tree_suff.append(head_node)
                head_node = head_node.link
                
            for i in tree_suff:
                freq = i.count
                path_tree = []
                parent = i.parent
                while parent.parent is not None:
                    path_tree.append(parent.value)
                    parent = parent.parent
                for i in range(freq):
                    cond_tree.append(path_tree)
                    #Constructing subtree with frequent patterns
            stree = Build_FPGrowth_Tree(cond_tree, threshold,each_item, self.frequent[each_item])
            stree_pat = stree.pattern_mining(threshold)
            
            # Adding patterns generated in subtree to the main tree
            for freq_pa in stree_pat.keys():
                if freq_pa in fre_pat:
                    fre_pat[freq_pa] += stree_pat[freq_pa]
                else:
                    fre_pat[freq_pa] = stree_pat[freq_pa]
        return fre_pat

In [9]:
#Getting Frequent patterns
def fp_growth_freq_patterns(data, sup_threshold):
    tree = Build_FPGrowth_Tree(data, sup_threshold, None, None)
    return tree.pattern_mining(sup_threshold)
#Defining minimum support
min_sup = 0.12
x = min_sup*32561 
print("((Pattern) , Support Count) are:- ")
fp_freq_itemsets = fp_growth_freq_patterns(df, x)
fpgrowth_freq_itemsets = list(fp_freq_itemsets.items())
end_time = timeit.default_timer() 
fpgrowth_freq_itemsets

((Pattern) , Support Count) are:- 


[(('Exec-managerial',), 4066),
 (('Craft-repair',), 4099),
 (('Prof-specialty',), 4140),
 (('?',), 4262),
 (('<=50K', 'Divorced'), 3980),
 (('United-States', 'Divorced'), 4162),
 (('<=50K', 'United-States', 'White', 'Own-child'), 3966),
 (('<=50K', 'White', 'Own-child'), 4196),
 (('<=50K', 'United-States', 'Never-married', 'Own-child'), 4134),
 (('<=50K', 'Never-married', 'Own-child'), 4451),
 (('<=50K', 'United-States', 'Own-child'), 4632),
 (('<=50K', 'Own-child'), 5001),
 (('United-States', 'White', 'Bachelors'), 4380),
 (('United-States', 'Bachelors'), 4766),
 (('Married-civ-spouse', 'Elderly'), 4009),
 (('United-States', '<=50K', 'Elderly'), 3941),
 (('White', 'Male', 'Elderly'), 4114),
 (('United-States', 'Male', 'Elderly'), 4159),
 (('United-States', 'White', 'Elderly'), 5265),
 (('United-States', 'Elderly'), 5866),
 (('United-States', 'Full-Time', 'Some-college'), 4106),
 (('White', 'Male', 'Some-college'), 3938),
 (('United-States', 'Male', 'Some-college'), 4144),
 (('<=50K', 

In [10]:
total_time = end_time - start_time
total_time

7.597444199999998