In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Load the bike-buyers table from the working directory.
data = pd.read_csv('bike_buyers.csv')
# Show the number of distinct values in each column.
data.nunique()

Marital Status       2
Gender               2
Income              16
Children             6
Education            5
Occupation           5
Home Owner           2
Cars                 5
Commute Distance     5
Region               3
Age                 49
Purchased Bike       2
dtype: int64

In [3]:
# Collapse 'Cars' in place: every positive count becomes 1, other values are left as they are.
data.loc[data['Cars']>0, 'Cars']=1
# Summary statistics of the two numeric columns that are binned in the next cell.
data[['Income', 'Age']].describe()

Unnamed: 0,Income,Age
count,500.0,500.0
mean,56420.0,43.93
std,31840.40236,11.249097
min,10000.0,25.0
25%,30000.0,35.0
50%,60000.0,43.0
75%,70000.0,51.0
max,170000.0,80.0


In [4]:
# Recode 'Income' in place into four ordinal codes using the cut points 30000 / 60000 / 70000.
# The assignments run in ascending order; codes already written (1-3) are far below the
# remaining thresholds, so later conditions never match them again.
# NOTE(review): this cell is not idempotent - running it a second time on the already
# coded column matches `Income < 30000` for every row and collapses everything to 1.
data.loc[data['Income']<30000, 'Income']=1
data.loc[(data['Income']>=30000) & (data['Income']<60000), 'Income']=2
data.loc[(data['Income']>=60000) & (data['Income']<70000), 'Income']=3
data.loc[(data['Income']>=70000), 'Income']=4

# Same scheme for 'Age' with the cut points 35 / 43 / 52 (same re-run caveat applies).
data.loc[data['Age']<35, 'Age']=1
data.loc[(data['Age']>=35) & (data['Age']<43), 'Age']=2
data.loc[(data['Age']>=43) & (data['Age']<52), 'Age']=3
data.loc[(data['Age']>=52), 'Age']=4

# Rename the target values to the labels that separation() and the algorithms compare against.
data.loc[data['Purchased Bike']=='Yes', 'Purchased Bike']='positive'
data.loc[data['Purchased Bike']=='No', 'Purchased Bike']='negative'

In [5]:
# Positional attribute labels '0' .. '35' followed by 'class' for the target slot.
# NOTE(review): 36 presumably equals the number of feature columns produced by
# prepare_data for this dataset - confirm if the preprocessing changes.
attrib_names = list(map(str, range(36))) + ['class']

#### Functions for estimating prediction quality

In [6]:
def false_pos_rate(res):
    """Return FP / (FP + TN) from the outcome counts in `res`.

    Only the "FP" and "TN" keys are read. The denominator is clamped to at
    least 1, so an empty tally yields 0.0 instead of a division error.
    """
    false_pos = res["FP"]
    actual_neg = false_pos + res["TN"]
    return float(false_pos) / max(1, actual_neg)

def false_neg_rate(res):
    """Return FN / (FN + TP) from the outcome counts in `res`.

    Only the "FN" and "TP" keys are read. The denominator is clamped to at
    least 1, so an empty tally yields 0.0 instead of a division error.
    """
    false_neg = res["FN"]
    actual_pos = false_neg + res["TP"]
    return float(false_neg) / max(1, actual_pos)

def true_neg_rate(res):
    """Return TN / (TN + FP) from the outcome counts in `res`.

    Only the "TN" and "FP" keys are read. The denominator is clamped to at
    least 1, so an empty tally yields 0.0 instead of a division error.
    """
    true_neg = res["TN"]
    actual_neg = true_neg + res["FP"]
    return float(true_neg) / max(1, actual_neg)

def neg_pred_val(res):
    """Return TN / (TN + FN) from the outcome counts in `res`.

    Only the "TN" and "FN" keys are read. The denominator is clamped to at
    least 1, so an empty tally yields 0.0 instead of a division error.
    """
    true_neg = res["TN"]
    predicted_neg = true_neg + res["FN"]
    return float(true_neg) / max(1, predicted_neg)

def false_disc_rate(res):
    """Return FP / (TP + FP) from the outcome counts in `res`.

    Only the "FP" and "TP" keys are read. The denominator is clamped to at
    least 1, so an empty tally yields 0.0 instead of a division error.
    """
    false_pos = res["FP"]
    predicted_pos = res["TP"] + false_pos
    return float(false_pos) / max(1, predicted_pos)

def accuracy(res):
    """Return (TP + TN) divided by the number of all tallied examples.

    The total includes the "contradiction" count, so undecided examples lower
    the accuracy. The denominator is clamped to at least 1.
    """
    correct = res["TP"] + res["TN"]
    total = correct + res["FP"] + res["FN"] + res["contradiction"]
    return float(correct) / max(1, total)

def precision(res):
    """Return TP / (TP + FP); 0.0 when nothing was predicted positive."""
    return float(res["TP"]) / max(1, res["TP"] + res["FP"])

def recall(res):
    """Return TP / (TP + FN); 0.0 when there are no actual positives."""
    return float(res["TP"]) / max(1, res["TP"] + res["FN"])

def F1_score(res):
    """Return the harmonic mean of precision and recall for the counts in `res`.

    Precision and recall are fractions in [0, 1], so their sum must be used
    as the denominator as-is; clamping it to 1 (as is done for the integer
    count denominators elsewhere) under-reports F1 whenever the sum is below 1.
    Returns 0.0 when both precision and recall are zero.
    """
    prec = precision(res)
    rec = recall(res)
    denominator = prec + rec
    if denominator == 0:
        # Both terms are zero: define F1 as 0 rather than dividing by zero.
        return 0.0
    return 2 * prec * rec / denominator

def share_of_contradiction(res):
    """Return the fraction of tallied examples that ended as 'contradiction'.

    Returns 0.0 for an empty tally (all counts zero) instead of raising
    ZeroDivisionError, matching the guarded denominators of the other metrics.
    """
    total = res['TP'] + res['TN'] + res['FP'] + res['FN'] + res['contradiction']
    if total == 0:
        return 0.0
    return res['contradiction'] / total


def summary(res):
    """Collect every quality metric for the outcome counts in `res` into one dict.

    The keys (and their insertion order) are: accuracy, precision, recall, F1,
    Neg_pred_rate, FP_rate, FN_rate, F_disc_rate and
    'Relative number of contradictions'.
    """
    return {
        "accuracy": accuracy(res),
        "precision": precision(res),
        "recall": recall(res),
        "F1": F1_score(res),
        "Neg_pred_rate": neg_pred_val(res),
        "FP_rate": false_pos_rate(res),
        "FN_rate": false_neg_rate(res),
        "F_disc_rate": false_disc_rate(res),
        'Relative number of contradictions': share_of_contradiction(res),
    }

#### Supporting functions

In [7]:
# Data transformation
def prepare_data(train, target_name):
    """Turn a DataFrame into a list of rows with one-hot encoded features.

    Feature columns with at most two distinct values are kept unchanged and
    come first (in their original order). Every column with more than two
    distinct values is replaced by its `pd.get_dummies` columns (prefix
    '_' + column name), appended afterwards in original column order. The
    target column is placed last.

    The dummy columns depend on the values present in `train`, so two frames
    holding different sets of categories produce rows of different layout.

    Returns a list of lists, one per input row.
    """
    labels = train[target_name]
    features = train.drop(columns=target_name)
    distinct = {name: features[name].nunique() for name in features.columns}

    untouched = [name for name in features.columns if not distinct[name] > 2]
    pieces = [features[untouched]]
    for name in features.columns:
        if distinct[name] > 2:
            pieces.append(pd.get_dummies(features[name], prefix='_' + name))
    pieces.append(labels)

    return np.array(pd.concat(pieces, axis=1)).tolist()

In [8]:
#Divide dataset into two contexts - positive and negative.
def separation(train):
    """Split rows by their last element into ('positive' rows, 'negative' rows).

    Rows whose last element is neither label are dropped. Row order within
    each returned list follows the input order.
    """
    plus, minus = [], []
    for row in train:
        label = row[-1]
        if label == 'positive':
            plus.append(row)
        elif label == 'negative':
            minus.append(row)
    return plus, minus

In [9]:
def make_intent(obs, attrib_names=attrib_names):
    """Return the set of 'name:value' strings describing one observation.

    Names and values are paired positionally; the last pair (the class slot
    when `obs` is as long as `attrib_names`) is left out of the set.
    """
    pairs = list(zip(attrib_names, obs))
    return {name + ':' + str(value) for name, value in pairs[:-1]}

In [10]:
def run_argorithm(algorithm_name, plus, minus, test, params=None):
    """Classify every example in `test` and return the quality summary.

    Parameters
    ----------
    algorithm_name : callable
        Called as ``algorithm_name(plus, minus, example, *param_values)``; must
        return one of 'TP', 'TN', 'FP', 'FN' or 'contradiction'.
    plus, minus : list
        Positive and negative training rows.
    test : iterable
        Rows to classify.
    params : dict, optional
        Extra hyperparameters. Their values are passed positionally in the
        dict's insertion order, for any number of entries (the previous
        implementation silently ignored everything after the second one).
        ``None`` or an empty dict passes no extra arguments.

    Returns
    -------
    dict
        The metrics dict produced by ``summary`` for the tallied outcomes.
    """
    extra_args = tuple(params.values()) if params else ()
    res = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0, 'contradiction': 0}
    for example in test:
        key = algorithm_name(plus, minus, example, *extra_args)
        res[key] += 1
    return summary(res)

### Algorithms

In [11]:
#Algorithm 1
#Based on majority voting rule

def algorithm1(plus, minus, test_example, min_cardinality):
    """Classify `test_example` by majority vote and report the outcome.

    A training example votes for its own class when the attributes it shares
    with the test example number strictly more than `min_cardinality` and that
    shared set is not contained in the intent of any example of the other
    class. The class with more votes is predicted.

    Returns 'contradiction' on a tie, otherwise 'TP', 'FN', 'FP' or 'TN'
    according to the true label in ``test_example[-1]``. Returns None when
    that label is neither 'positive' nor 'negative'.
    """
    test_intent = make_intent(test_example)
    # Build each training intent once; the original rebuilt them for every pair.
    plus_intents = [make_intent(obs) for obs in plus]
    minus_intents = [make_intent(obs) for obs in minus]

    def count_votes(own_intents, other_intents):
        # Number of `own_intents` whose overlap with the test example is large
        # enough and is not covered by any intent of the opposite class.
        votes = 0
        for intent in own_intents:
            common = intent & test_intent
            if len(common) <= min_cardinality:
                continue  # cheap test first, skips the scan below
            if not any(other.issuperset(common) for other in other_intents):
                votes += 1
        return votes

    pos_votes = count_votes(plus_intents, minus_intents)
    neg_votes = count_votes(minus_intents, plus_intents)

    if pos_votes == neg_votes:
        return 'contradiction'
    predicted = 'positive' if pos_votes > neg_votes else 'negative'

    # (true label, predicted label) -> confusion-matrix cell
    outcomes = {
        ('positive', 'positive'): 'TP',
        ('positive', 'negative'): 'FN',
        ('negative', 'positive'): 'FP',
        ('negative', 'negative'): 'TN',
    }
    return outcomes.get((test_example[-1], predicted))
        

In [None]:
# Hyperparameter tuning: mean accuracy of algorithm1 for each min_cardinality,
# averaged over the cross-validation folds.

min_cardinality = [5,10,15,30]  # candidate thresholds, identical for every fold

kf = KFold(n_splits=3, shuffle=True, random_state=12)
kfold_results = []  # one list of accuracies (aligned with min_cardinality) per fold
for train_index, test_index in kf.split(data):
    train, test = data.iloc[train_index], data.iloc[test_index]
    # NOTE(review): train and test are encoded independently, so get_dummies can
    # yield different columns if a fold lacks one of the categories - confirm
    # that every category occurs in every fold.
    train = prepare_data(train, 'Purchased Bike')
    test = prepare_data(test, 'Purchased Bike')
    plus,minus = separation(train)
    acc = [
        run_argorithm(algorithm1, plus, minus, test, {"min_cardinality": i})['accuracy']
        for i in min_cardinality
    ]
    kfold_results.append(acc)

# Rows are folds, columns are thresholds; averaging down the rows works for any n_splits.
params_results = pd.DataFrame(kfold_results, columns=min_cardinality).mean(axis=0)
params_results

In [80]:
#Algorithm 2
#Based on average support 

def algorithm2(plus, minus, test_example):
    """Classify `test_example` by comparing average support in each class.

    For every training example of a class, the attributes it shares with the
    test example are taken, and the fraction of examples of that same class
    whose intent contains this shared set is computed. These fractions are
    averaged per class; the class with the larger average is predicted.

    Returns 'contradiction' when both averages are equal, otherwise 'TP',
    'FN', 'FP' or 'TN' according to the true label in ``test_example[-1]``.
    Returns None when that label is neither 'positive' nor 'negative'.
    Raises ZeroDivisionError when `plus` or `minus` is empty.
    """
    test_intent = make_intent(test_example)
    # Build each training intent once; the original rebuilt them for every pair.
    plus_intents = [make_intent(obs) for obs in plus]
    minus_intents = [make_intent(obs) for obs in minus]

    def average_support(intents):
        # Mean, over all intents of one class, of the share of that class
        # containing the intent's overlap with the test example.
        size = len(intents)
        total = 0
        for intent in intents:
            common = intent & test_intent
            covering = sum(1 for other in intents if other.issuperset(common))
            total += covering / size
        return total / size

    total_plus_support = average_support(plus_intents)
    total_minus_support = average_support(minus_intents)

    if total_plus_support == total_minus_support:
        return 'contradiction'
    predicted = 'positive' if total_plus_support > total_minus_support else 'negative'

    # (true label, predicted label) -> confusion-matrix cell
    outcomes = {
        ('positive', 'positive'): 'TP',
        ('positive', 'negative'): 'FN',
        ('negative', 'positive'): 'FP',
        ('negative', 'negative'): 'TN',
    }
    return outcomes.get((test_example[-1], predicted))


In [83]:
#Algorithm 3
#Based on the length of the intersection 

def algorithm3(plus, minus, test_example):
    """Classify `test_example` by its largest attribute overlap with each class.

    The class containing the training example that shares the most attributes
    with the test example is predicted.

    Returns 'contradiction' when the best overlaps are equal, otherwise 'TP',
    'FN', 'FP' or 'TN' according to the true label in ``test_example[-1]``.
    Returns None when that label is neither 'positive' nor 'negative'.
    Raises ValueError when `plus` or `minus` is empty.
    """
    test_intent = make_intent(test_example)
    overlaps_plus = [len(make_intent(obs) & test_intent) for obs in plus]
    overlaps_minus = [len(make_intent(obs) & test_intent) for obs in minus]

    best_plus = max(overlaps_plus)
    best_minus = max(overlaps_minus)

    if best_plus == best_minus:
        return 'contradiction'
    predicted = 'positive' if best_plus > best_minus else 'negative'

    actual = test_example[-1]
    if actual == "positive":
        return 'TP' if predicted == 'positive' else 'FN'
    if actual == "negative":
        return 'FP' if predicted == 'positive' else 'TN'


### Getting the results

In [90]:
# Evaluate the three algorithms on every fold.
# kfold_results is a flat list ordered [algo1, algo2, algo3] per fold, so
# algorithm k (0-based) of fold f sits at index 3 * f + k.
kf = KFold(n_splits=3, shuffle=True, random_state=12)
kfold_results = []
for train_index, test_index in kf.split(data):
    train, test = data.iloc[train_index], data.iloc[test_index]
    # NOTE(review): train and test are encoded independently, so get_dummies can
    # yield different columns if a fold lacks one of the categories - confirm
    # that every category occurs in every fold.
    train = prepare_data(train, 'Purchased Bike')
    test = prepare_data(test, 'Purchased Bike')
    plus,minus = separation(train)
    # Each result is appended right after it is computed; the previous version
    # appended results2/results3/results4 before they were assigned.
    results1 = run_argorithm(algorithm1, plus, minus, test, {'min_cardinality':15})
    kfold_results.append(results1)
    results2 = run_argorithm(algorithm2, plus, minus, test)
    kfold_results.append(results2)
    results3 = run_argorithm(algorithm3, plus, minus, test)
    kfold_results.append(results3)

In [91]:
# Mean metrics of algorithm 1 over the folds.
# kfold_results holds three entries per fold; every third entry starting at 0
# belongs to algorithm 1. Building the frame in one call replaces
# DataFrame.append, which was removed in pandas 2.0.
algo_1 = pd.DataFrame(kfold_results[0::3])
algo_1.mean()

accuracy                             0.613989
precision                            0.625132
recall                               0.640180
F1                                   0.629370
Neg_pred_rate                        0.642482
FP_rate                              0.374199
FN_rate                              0.359820
F_disc_rate                          0.374868
Relative number of contradictions    0.025996
dtype: float64

In [92]:
# Mean metrics of algorithm 2 over the folds.
# kfold_results holds three entries per fold; every third entry starting at 1
# belongs to algorithm 2. Building the frame in one call replaces
# DataFrame.append, which was removed in pandas 2.0.
algo_2 = pd.DataFrame(kfold_results[1::3])
algo_2.mean()

accuracy                             0.637965
precision                            0.635526
recall                               0.630807
F1                                   0.632554
Neg_pred_rate                        0.637087
FP_rate                              0.361166
FN_rate                              0.369193
F_disc_rate                          0.364474
Relative number of contradictions    0.000000
dtype: float64

In [93]:
# Mean metrics of algorithm 3 over the folds.
# kfold_results holds three entries per fold; every third entry starting at 2
# belongs to algorithm 3. Building the frame in one call replaces
# DataFrame.append, which was removed in pandas 2.0.
algo_3 = pd.DataFrame(kfold_results[2::3])
algo_3.mean()

accuracy                             0.537912
precision                            0.638490
recall                               0.655354
F1                                   0.646015
Neg_pred_rate                        0.659847
FP_rate                              0.356602
FN_rate                              0.344646
F_disc_rate                          0.361510
Relative number of contradictions    0.172041
dtype: float64