In [1]:
import pandas as pd
import timeit
import numpy as np

#Reading Dataset
# The file has no header row (header=None), so the columns are numbered 0..14 until they are named.
# NOTE(review): a later cell strips whitespace from every string value, which suggests the raw fields
# carry leading spaces; if so, na_values="?" never matches (" ?" != "?") and no NaN is produced.
# Consider skipinitialspace=True - TODO confirm against the raw file.
df = pd.read_csv('adult.data', sep=",", header = None , na_values = "?")
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
#Adding column names in the Dataset
# 15 names, one per positional column of the header-less file; 'class' holds the <=50K / >50K label.
df.columns = ['age', 'workclass','fnlwgt','education', 'education_num', 'marital_status', 'occupation', 'relationship', 'race',
              'sex', 'capital_gain', 'capital_loss','hours_per_week','native_country', 'class']
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Display the renamed frame; pandas elides the middle rows, so this shows the first and last few rows.
df

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K


In [4]:
#Preprocessing
#Bucket the two continuous columns into labelled categories.
#pd.cut uses right-closed bins by default, e.g. (0, 18], (18, 30], ...
df = df.assign(
    #Age is divided into 4 categories
    Age=pd.cut(df['age'], bins=[0, 18, 30, 50, 1000], labels=['Underage', 'Young', 'Adult', 'Elderly']),
    #Hours per week is divided into 3 categories in the same way
    Hours_per_Week=pd.cut(df['hours_per_week'], bins=[0, 20, 40, 1000], labels=['Part-Time', 'Full-Time', 'Overtime']),
)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,class,Age,Hours_per_Week
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,Adult,Full-Time
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,Adult,Part-Time
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,Adult,Full-Time
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,Elderly,Full-Time
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,Young,Full-Time


In [5]:
#fnlwgt and education_num didn't give any perspective.
#The original age and hours_per_week columns are replaced by the categorical Age / Hours_per_Week columns.
#capital_gain and capital_loss had many 0 values.
columns_to_remove = ['age', 'fnlwgt', 'education_num', 'hours_per_week', 'capital_loss', 'capital_gain']
df = df.drop(columns=columns_to_remove)
df.head()

Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,class,Age,Hours_per_Week
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K,Adult,Full-Time
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K,Adult,Part-Time
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K,Adult,Full-Time
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K,Elderly,Full-Time
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K,Young,Full-Time


In [6]:
#Removing leading/trailing whitespace from every string item (non-string cells are left untouched).
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map; use whichever this
# pandas version provides so the cell runs without a FutureWarning and keeps working on older installs.
_elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, "map") else pd.DataFrame.applymap
df = _elementwise(df, lambda item: item.strip() if isinstance(item, str) else item)
#Here Preprocessing and Cleaning is completed

APRIORI ALGORITHM

In [7]:
#We will start the timer for Apriori Algorithm here
# The matching end reading is taken in a later cell, so the measured span covers every cell executed
# in between (function definitions, the apriori() run and the printing of its results).
start_time = timeit.default_timer()
# Earlier we cleaned the dataframe. Now we will be using dataframe.values.tolist() to load the dataset.
def load_df():
    """Return the module-level DataFrame ``df`` as a list of rows, each row a list of its cell values."""
    cell_values = df.values
    return cell_values.tolist()
#First step is generating Candidate1.
def gen_cand1(itemset):
    """Build the 1-item candidate sets (C1) from a collection of transactions.

    Parameters
    ----------
    itemset : iterable of iterables
        The transactions; every item must be hashable.

    Returns
    -------
    list of frozenset
        One single-element frozenset per distinct item, in order of first appearance.
    """
    # A dict keeps insertion order and gives O(1) membership checks, replacing the
    # linear `[j] in CANDIDATE1` list scan that was executed for every cell of the data.
    distinct_items = {}
    for transaction in itemset:
        for item in transaction:
            distinct_items.setdefault(item, None)
    return [frozenset([item]) for item in distinct_items]
#A set is unhashable, so we use frozenset (the immutable version of Python's set), which can be used as a dictionary key.

# Materialise the transactions once; `itemset` is what gets passed to apriori() in a later cell.
itemset = load_df()
# C1: one frozenset per distinct item (apriori() recomputes this internally from `itemset`).
CANDIDATE1 = gen_cand1(itemset)


In [8]:
# Scanning Database
def database_scan(Db, Ck, min_sup):
    """Count candidate itemsets over the transactions and keep the frequent ones.

    Parameters
    ----------
    Db : sized iterable of sets
        The transactions (``len(Db)`` is the support denominator).
    Ck : iterable of frozenset
        Candidate itemsets; a candidate listed more than once is counted once per listing.
    min_sup : float
        Minimum support (fraction of transactions) for a candidate to be kept.

    Returns
    -------
    (list, dict)
        The candidates whose support is >= ``min_sup`` (most recently discovered first) and a
        mapping candidate -> support for every candidate found in at least one transaction.
    """
    n_transactions = len(Db)

    # Tally, per candidate, the transactions that contain it; dict order = first-match order.
    occurrences = {}
    for transaction in Db:
        for candidate in Ck:
            if candidate.issubset(transaction):
                occurrences[candidate] = occurrences.get(candidate, 0) + 1

    sup_data = {candidate: hits / n_transactions for candidate, hits in occurrences.items()}

    # Frequent candidates are reported in reverse discovery order.
    frequent = [candidate for candidate, support in sup_data.items() if support >= min_sup]
    frequent.reverse()
    return frequent, sup_data

#In this step we count how many transactions contain each candidate and convert the count into a support value.
#If the support of a candidate is greater than or equal to min_sup, we keep it as a frequent itemset.

# Generating the candidate itemsets for the next level
def generate_apriori(L_k, k):
    """Join frequent (k-1)-itemsets into candidate k-itemsets.

    Two itemsets are merged when their first ``k-2`` items, taken in sorted order, are equal.

    Parameters
    ----------
    L_k : list of frozenset
        Frequent itemsets of the previous level; the items of one itemset must be mutually orderable.
    k : int
        Size of the candidates to generate.

    Returns
    -------
    list of frozenset
        The distinct candidate itemsets, in the order the joining pairs are encountered.
    """
    # Sort BEFORE slicing. Slicing list(frozenset) first takes whichever items the set happens to
    # iterate first, so two itemsets sharing the same sorted prefix could fail to match (missing
    # candidates) or one candidate could be produced by several pairs (duplicates, which would
    # then be counted several times during the database scan).
    prefixes = [sorted(items)[:k-2] for items in L_k]

    Ck = []
    already_emitted = set()
    for l in range(len(L_k)):
        for i in range(l + 1, len(L_k)):
            if prefixes[l] == prefixes[i]:
                candidate = L_k[l] | L_k[i]
                # frozenset() makes the key hashable even if plain sets were passed in.
                key = frozenset(candidate)
                if key not in already_emitted:
                    already_emitted.add(key)
                    Ck.append(candidate)
    return Ck

In [9]:
# Apriori function in which we specify min_sup and obtain frequent itemsets corresponding to it.
def apriori(itemset, min_sup = 0.12):
    """Run the level-wise Apriori search over the transactions in ``itemset``.

    Parameters
    ----------
    itemset : list of lists
        The transactions.
    min_sup : float
        Minimum support (fraction of transactions) for an itemset to be kept.

    Returns
    -------
    (list, dict)
        ``L`` - one list of frequent itemsets per level (index 0 holds the 1-itemsets; the last
        entry is always the empty list that stopped the search) - and the support values
        collected by every database scan.
    """
    candidates_1 = gen_cand1(itemset)
    transactions = [set(row) for row in itemset]
    frequent_1, sup_data = database_scan(transactions, candidates_1, min_sup)

    L = [frequent_1]
    k = 2
    # Keep extending while the most recent level produced at least one frequent itemset.
    while L[-1]:
        candidates_k = generate_apriori(L[-1], k)
        frequent_k, level_support = database_scan(transactions, candidates_k, min_sup)
        sup_data.update(level_support)
        L.append(frequent_k)
        k += 1
    return L, sup_data
# Mine the frequent itemsets with the default min_sup, then report them level by level
# (index 0 is the level of single-item patterns).
L, sdata = apriori(itemset)
new_list = []
for index, level in enumerate(L):
    print("\nL{} ".format(index))
    print("Number of patterns={} \n".format(len(level)))
    # Convert each frozenset to a plain list for a more readable printout.
    apriori_freq_pattern = list(map(list, level))
    print(apriori_freq_pattern)


L0 
Number of patterns=24 

[['Craft-repair'], ['Own-child'], ['Some-college'], ['Overtime'], ['>50K'], ['Young'], ['Female'], ['Prof-specialty'], ['Elderly'], ['Divorced'], ['HS-grad'], ['Private'], ['Husband'], ['Exec-managerial'], ['Married-civ-spouse'], ['Full-Time'], ['Adult'], ['<=50K'], ['United-States'], ['Male'], ['White'], ['Not-in-family'], ['Never-married'], ['Bachelors']]

L1 
Number of patterns=106 

[['White', 'Some-college'], ['Some-college', 'Full-Time'], ['<=50K', 'Some-college'], ['Overtime', '<=50K'], ['Never-married', 'Young'], ['White', 'Young'], ['Young', 'United-States'], ['Never-married', 'Own-child'], ['White', 'Own-child'], ['Own-child', 'United-States'], ['<=50K', 'Own-child'], ['Male', 'Young'], ['Male', 'Some-college'], ['Some-college', 'United-States'], ['Private', 'Some-college'], ['Private', 'Never-married'], ['Female', 'Never-married'], ['>50K', 'Adult'], ['>50K', 'Private'], ['Overtime', 'Adult'], ['Overtime', 'Private'], ['HS-grad', 'Married-civ-spo

In [10]:
end_time = timeit.default_timer() #ending timer
# Elapsed seconds since start_time was recorded; this spans all cells run in between
# (including any idle time between executing them), not only the apriori() call itself.
total_time = end_time - start_time
total_time

6.483096700000004