# Association Rule Mining

In [26]:
import sys
import csv
import pandas as pd

# When True, the script prints the input path, the attribute/instance counts
# and each column's dtype right after the data file is loaded.
DEBUGGING = False
# DATA_DIR and DATA_FILE are concatenated to form the path of the CSV to read.
DATA_DIR  = 'data/'
DATA_FILE = 'survey-data.csv'

# Item sets counted in fewer rows than this are deleted from the coverage tables.
MINIMUM_COVERAGE = 500
# NOTE(review): not referenced anywhere in the visible code -- every generated
# rule is printed regardless of its confidence. Confirm whether a filter on
# this threshold was intended.
MINIMUM_CONFIDENCE = 0.5

class Clause:
    """A single `column == value` condition used on either side of a rule."""

    def __init__(self,column,value):
        # column: label of the attribute being tested
        # value:  the value that attribute must equal
        self.column = column
        self.value = value

    def __str__(self):
        # format() converts both parts to text, so non-string values or column
        # labels (e.g. integers) no longer raise TypeError as '+' did.
        return '({}=={})'.format(self.column, self.value)

    def __repr__(self):
        return 'Clause({!r}, {!r})'.format(self.column, self.value)

    def getColumn(self):
        """Return the column label this clause tests."""
        return self.column

    def getValue(self):
        """Return the value the column is compared against."""
        return self.value
    
class AssocRule:
    """An association rule `antecedent -> consequent`, each side one Clause."""

    def __init__(self,ant_column,ant_value,con_column,con_value):
        # Build the two sides; the consequent first, the order is irrelevant.
        self.consequent = Clause(con_column, con_value)
        self.antecedent = Clause(ant_column, ant_value)

    def __str__(self):
        # Rendered as "<antecedent>-><consequent>" using each side's str().
        return '{}->{}'.format(self.antecedent, self.consequent)

    def getAntecedent( self ):
        """Return the left-hand side of the rule."""
        return self.antecedent

    def getConsequent(self):
        """Return the right-hand side of the rule."""
        return self.consequent

# Load the data file into a DataFrame. na_filter=False switches off pandas'
# missing-value detection for the file.
try:
    df = pd.read_csv(DATA_DIR+DATA_FILE,na_filter=False)
except IOError as iox:
    # Report on stderr and exit with a non-zero status: a bare sys.exit()
    # exits with status 0, which signals success to whatever ran the script.
    print('there was an I/O error trying to open the data file: ' + str( iox ), file=sys.stderr)
    sys.exit(1)
    
# Dataset dimensions: N attributes (columns) and M instances (rows).
M, N = df.shape

if DEBUGGING:
    # Summarise what was loaded: path, dimensions, then one line per column
    # giving its position, label and dtype.
    print('INPUT FILE = {}{}'.format(DATA_DIR, DATA_FILE))
    print('number of attributes = {}'.format(N))
    print('number of instances = {}'.format(M))
    for i, (c, t) in enumerate(zip(df.columns, df.dtypes)):
        print('{} - {} ({})'.format( i, c, t ))

# One-item sets: coverage1[column][value] is the number of rows in which
# `column` holds `value`. Only object-dtype columns are counted; every other
# column keeps an empty dict.
coverage1 = {}
for c in df.columns:
    counts = {}
    # The dtype test and the column extraction do not depend on the row, so
    # they are done once per column instead of once per cell.
    if df[c].dtypes == 'object':
        for att_value in df[c].values:
            counts[att_value] = counts.get(att_value, 0) + 1
    coverage1[c] = counts
# print('one-item sets:')
# for c in df.columns:
#     if ( len( coverage1[c] ) > 0 ):
#         for k in coverage1[c].keys():
#             print(c, '==', k, ':', coverage1[c][k])
            
# Drop one-item sets below MINIMUM_COVERAGE and count the ones that remain.
num_item1 = 0
for c in df.columns:
    counts = coverage1[c]
    # Collect the keys first so the dict is not modified while being iterated.
    too_rare = [value for value, n in counts.items() if n < MINIMUM_COVERAGE]
    for value in too_rare:
        del counts[value]
    num_item1 += len(counts)
# print('number of one-item sets above minimum coverage of ', MINIMUM_COVERAGE,'=', num_item1)
# print('one-item sets above minimum coverage of ', MINIMUM_COVERAGE,':')
# for c in df.columns:
#     if ( len( coverage1[c] ) > 0 ):
#         print(c, ':', end='')
#         for k in coverage1[c].keys():
#             print(k, '=', coverage1[c][k], end='')
#         print()

# Columns that still have at least one one-item set, in DataFrame column order.
columns1 = [c for c in df.columns if len(coverage1[c]) > 0]

# Two-item sets: coverage2[c1][v1][c2][v2] is the number of rows in which
# c1 == v1 and c2 == v2, for every pair of columns where c2 comes after c1
# in columns1.
coverage2 = {c1: {} for c1 in columns1}

# Extract each candidate column once; indexing the DataFrame for every cell
# inside the nested loops repeats the same lookup rows x columns^2 times.
column_values = [df[c].values for c in columns1]

# zip(*...) walks the columns in lockstep, yielding one tuple per row.
for row in zip(*column_values):
    for i1, (c1, att1_value) in enumerate(zip(columns1, row)):
        by_column = coverage2[c1].setdefault(att1_value, {})
        # Pair this column only with the ones after it, so each unordered
        # pair of columns is counted once.
        for c2, att2_value in zip(columns1[i1 + 1:], row[i1 + 1:]):
            pair_counts = by_column.setdefault(c2, {})
            pair_counts[att2_value] = pair_counts.get(att2_value, 0) + 1
                        
# print('two-item sets:')
# for c1 in columns1:
#     for k1 in coverage2[c1].keys():
#         for k2 in coverage2[c1][k1].keys():
#             for k3 in coverage2[c1][k1][k2].keys():
#                 print(c1, '==', k1, ' and ', k2, '==', k3, ':', coverage2[c1][k1][k2][k3])
    
# Remove two-item sets that do not meet the minimum coverage. Only the
# innermost dicts are modified; emptied parent entries are left in place.
for c1 in columns1:
    for by_column in coverage2[c1].values():
        for pair_counts in by_column.values():
            # Collect the keys first so the dict is not modified mid-iteration.
            too_rare = [k2 for k2, n in pair_counts.items() if n < MINIMUM_COVERAGE]
            for k2 in too_rare:
                del pair_counts[k2]
# print('two-item sets after deleting:')
# for c1 in columns1:
#     for k1 in coverage2[c1].keys():
#         for k2 in coverage2[c1][k1].keys():
#             for k3 in coverage2[c1][k1][k2].keys():
#                 print(c1, '==', k1, ' and ', k2, '==', k3, ':', coverage2[c1][k1][k2][k3])

# Every surviving two-item set {c1 == v1, c2 == v2} yields two candidate
# rules, one in each direction.
rules = []
for c1 in columns1:
    for v1, by_column in coverage2[c1].items():
        for c2, pair_counts in by_column.items():
            for v2 in pair_counts:
                forward = AssocRule(c1, v1, c2, v2)
                backward = AssocRule(c2, v2, c1, v1)
                rules.extend((forward, backward))
# for rule in rules:
#     print(str(rule))
# confidence[i] is the fraction of rows matching rules[i]'s antecedent that
# also match its consequent.
confidence = []
for rule in rules:
    ant = rule.getAntecedent()
    con = rule.getConsequent()
    # Boolean masks over all rows replace the per-row Python loop; summing a
    # mask counts its True entries.
    ant_mask = df[ant.getColumn()] == ant.getValue()
    con_mask = df[con.getColumn()] == con.getValue()
    ant_coverage = int(ant_mask.sum())
    rule_coverage = int((ant_mask & con_mask).sum())
    confidence.append(float( rule_coverage ) / float( ant_coverage ))
# Report how many rules were generated, then each rule with its confidence
# rounded to two decimal places.
print('number of association rules = ', len( rules ))
print('association rules:')
for r, rule_confidence in zip(rules, confidence):
    print(r, ', confidence =', round(rule_confidence, 2))

number of association rules =  6
association rules:
(internet==yes)->(maths==no) , confidence = 0.76
(maths==no)->(internet==yes) , confidence = 0.74
(internet==yes)->(politics==no) , confidence = 0.73
(politics==no)->(internet==yes) , confidence = 0.75
(maths==no)->(politics==no) , confidence = 0.76
(politics==no)->(maths==no) , confidence = 0.8
