# Beam search implementation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import csv
from queue import Queue
import math

In [2]:
df_action = pd.read_csv('data/action_condition_meta.csv')
df_action.head()

Unnamed: 0,action,user_id,condition,geo_country,refr_source,browser_language,os_name,os_timezone,dvce_type
0,clic,379881d5-32d7-49f4-bf5b-81fefbc5fcce,1-Control,FI,Google,greek,Android,Europe,Mobile
1,clic,2a0f4218-4f62-479b-845c-109b2720e6e7,2-Buttony-Conversion-Buttons,AU,Google,english,iOS,Australia,Mobile
2,clic,a511b6dc-2dca-455b-b5e2-bf2d224a5505,2-Buttony-Conversion-Buttons,GB,Google,english,Android,Europe,Mobile
3,clic,9fb616a7-4e13-4307-ac92-0b075d7d376a,2-Buttony-Conversion-Buttons,FI,Google,english,iOS,Europe,Mobile
4,clic,64816772-688d-4460-a591-79aa49bba0d5,2-Buttony-Conversion-Buttons,BD,Google,english,Android,Asia,Mobile


<b>We created our own priorityqueue class, because the standard priorityqueue blocks (can't insert element) when max.size is reached. We need to have a priorityqueue where the item with lowest priority is discarded and replaced by the new item.
<i>priority_queue</i> represents min heap</b>

In [3]:
import heapq
from heapq import heappush, heappop

class priority_queue:
    def __init__(self, max_size):
        self.items = []
        self.max = max_size
   
    def push(self, item, priority):
        if len(self.items) < self.max:
            heapq.heappush(self.items, (priority, item))
        else:
            heapq.heappushpop(self.items, (priority, item))
    
    def get_items(self):
        result = []
        for i in self.items:
            result.append(i[1])
        return result

    def pop(self):
        return heapq.heappop(self.items)

    def get_max_item(self):
        return self.items[0]
    
    def empty(self):
        return not self.items
    
    def print_elements(self):
        result = []
        for i in self.items:
            result.append(i)
        return result
       
    def heap_sort(self):
        return [heapq.heappop(self.items) for _ in range(len(self.items))]

The following function <i>create_dataframe</i> is used to obtain a subset of the <i>df_action</i> dataframe, based on the the description language which is passed as argument. The dataframe which is returned is the subgroup of records (users) belonging to that particular description language. For example, a group of users who all were using iOS as operating system at the time of the experiment

In [4]:
def create_dataframe(Set):
    d =df_action.copy()
    count = 0
    for column, item in Set:
        if count==0:
            if isinstance(item, str): # single value
                df_new = pd.merge(d, df_action.loc[(df_action[column] == item)], on=list(df_action), how='inner')
            else: # list of values
                df_new = pd.merge(d, df_action.loc[df_action[column].isin(item)], on=list(df_action), how='inner')
            df_new.drop_duplicates(inplace = True)
        else:
            if isinstance(item, str): # single value
                df_new = pd.merge(df_new, df_action.loc[(df_action[column] == item)], on=list(df_action), how='inner')
            else: # list of values
                df_new = pd.merge(df_new, df_action.loc[df_action[column].isin(item)], on=list(df_action), how='inner')
        count +=1
    return df_new

The following function <i>constraints</i> checks wheter a certain subgroup, represented by a dataframe which is passed as argument, satisfies the constraints $C$. At the moment the only constraint of a subgroup is that it should be represented by at least $5$ records (users) in the original dataset.

In [5]:
def constraints(df_matches):
    return df_matches.shape[0] > 5

The refinement operator. The refinement operator gets the records and chooses to which type of attribute they belong.
There are 3 types of attributes:
    1. Numeric: Attribute with all number records
    2. Binary: Attribute with true or false records
    3. Nominal: Attribute with multiple different values in their records which are not numeric
    
1. For numeric values we sort all records which are in the description language D. After that we make equal-sized bins.
The amount of bins is dependend on a predefined value. For each split point we add a description based on whether the
numeric value is greater or equal or smaller or equal as the split point.
2. For binary records we add one description where the description is true and one description where the description is false.
3. For nominal values we add for each description an entry with the description and one without the description. In our case
we take the first description in the list which is not equal to that description.

In [6]:
def all_paterns_one_condition(): # level 1
    result = []
    columns = list(df_action.columns.values)
    not_relevant = ['action', 'condition', 'user_id']
    columns = [x for x in columns if (x not in not_relevant)]
    for column in columns:
        if df_action[column].unique().size == 2: # binary attribute
            result.append((column, True))
            result.append((column, False))
        else: # nominal attribute
            for value in df_action[column].unique():
                all_unique = list(df_action[column].unique())
                complement_value = [x for x in all_unique if x != value]
                result.append((column, value))
                result.append((column, complement_value))
    return result

In [7]:
'''
for level >= 2
desc_language = [] # the description language

def refinement(seed):
    unique_records = df_action[seed].unique()
    if unique_records.size == 0:
        return desc_language
    # no numeric attributes in dataset, so no check/implementation oft this type of attribute  
    if unique_records.size == 2:    #Attribute is binary
        desc_language.append((seed, True))
        desc_language.append((seed, False))
    else:                             #Attribute is nominal
        for item in unique_records:
            desc_language.append((seed, item))
            desc_language.append((seed, unique_records[next((i for i, v in enumerate(unique_records) if v != item), -1)]))
    return desc_language
'''

'\nfor level >= 2\ndesc_language = [] # the description language\n\ndef refinement(seed):\n    unique_records = df_action[seed].unique()\n    if unique_records.size == 0:\n        return desc_language\n    # no numeric attributes in dataset, so no check/implementation oft this type of attribute  \n    if unique_records.size == 2:    #Attribute is binary\n        desc_language.append((seed, True))\n        desc_language.append((seed, False))\n    else:                             #Attribute is nominal\n        for item in unique_records:\n            desc_language.append((seed, item))\n            desc_language.append((seed, unique_records[next((i for i, v in enumerate(unique_records) if v != item), -1)]))\n    return desc_language\n'

As alluded in the referenced paper, the StudyPortals (original) dataset ocmes natural equiped with $m=2$ nominal targets. The first nominal target attribute is $condition$, which represents a binary column that tells us to which version the particular user was exposed during the experiment (i.e. version A or B). The second nominal target attribute is $action$, which is the binary column representing whether the page visitor merely viewed or also clicked on the button in question during the experiment. Considering these pecularities of the StudyPortals dataset, the natural choice of EMM instance would be the association model class. So we strive to find subgroups for which the association between view/click and A/B is exceptional.

Now that we know what model class will be exploited, the next step is to define or exploit an appropriate quality measure. Since one can easily achieve huge deviations in target behaviour (assiociation between the differences in the two nominal target attributes), it makes sense to have a dimension in the quality measure which reflects the group size. In addition one also needs to have a target deviation dimension/component in the quality measure, of course. 

<b>The Target Deviation Component</b> ($\varphi_{Q}(S)$)

In [8]:
def target_deviation(df_matches):
    n_1 = df_matches.loc[(df_matches.action == 'view') & (df_matches.condition == '1-Control')].count()[0]
    n_2 = df_matches.loc[(df_matches.action == 'clic') & (df_matches.condition == '1-Control')].count()[0]
    n_3 = df_matches.loc[(df_matches.action == 'view') & (df_matches.condition == '2-Buttony-Conversion-Buttons')].count()[0]
    n_4 = df_matches.loc[(df_matches.action == 'clic') & (df_matches.condition == '2-Buttony-Conversion-Buttons')].count()[0]
    result = (n_1*n_4 - n_2*n_3)/(n_1*n_4 + n_2*n_3)
    return result

<b>The Subgroup Size Component</b><br>
To represent the subgroup size, we take the entropy function as described in section $3.1$ of the referenced paper. The function conceptually rewards $50/50$ splits between subgroup and complements, while punishing subgroups that are either (relatively) small or cover the vast majority of the dataset.

$\varphi_{ef}(D) = - \frac{n}{N}lg(\frac{n}{N}) - \frac{n^C}{N}lg(\frac{n^C}{N})$

In [9]:
def entropy_function(df_matches):
    zero = np.finfo(np.double).tiny # dealing with divisions/logarithms of 0
    n = df_matches.shape[0] # size subgroup in original dataset
    N = df_action.shape[0] # size original dataset
    n_c = N - n # complement of subgroup in orginal dataset
    if (n == 0):
        n = zero
    elif (n_c == 0):
        n_c = zero
    return ((-n/N) * math.log2(n/N)) - ((-n_c/N) * math.log2(n_c/N))

When combining these two components/dimensions, one obtains an association model class quality measure known as <i>Yule's Quality Measure</i>. $\varphi_{Yule}(S) = \varphi_{Q}(S) \cdot \varphi_{ef}(S) $, which boils down to a multiplication of the target deviation- and the subgroup size -component.

In [10]:
def phiYule(df_matches):
    return target_deviation(df_matches) * entropy_function(df_matches)

The multiplication of the two components ensures that subgroups are evaluated well (i.e. score well) on both components

The next step is the implementation of the $beam\_search$ algorithm. There are a few parameters which influence the outcome, and can be changed accordingly to what one sees fit for a certain experiment

In [11]:
d = 1 # search depth
w = 5 # search width (i.e. beam width)
q = 5 # size of the set of best subgroup results (i.e. top q subgroups)
quality_measure = phiYule # choose between phiYule (Yule's Quality Measure), ...

In [12]:
def beam_search(d, w, q, quality_measure):
    resultSet = priority_queue(q)
    beam = priority_queue(w)
    # first level
    all_paterns = all_paterns_one_condition()
    print("level: "+str(1))
    for desc in all_paterns:
        subset_desc = create_dataframe([desc])
        quality = quality_measure(subset_desc)
        if (constraints(subset_desc)):
            beam.push([desc], quality)
            resultSet.push([desc], quality)
    # level >= 2
    #for level in (2, d + 1):
     #   print("level: "+str(level))
        
        
    return resultSet

In [20]:
result = beam_search(d = d, w = w, q = q, quality_measure = quality_measure)
#priority_queue.heap_sort(result)
temp = result.get_items()
temp 

level: 1


  


[[('geo_country', 'BD')],
 [('geo_country', 'GB')],
 [('geo_country', 'US')],
 [('geo_country', 'IN')],
 [('os_timezone', 'America')]]

In [21]:
param = temp
result = []
for i in range(0, 5):
    for j in range(0, 5):
        if (i != j):
            result.append([param[i][0], param[j][0]])
result

[[('geo_country', 'BD'), ('geo_country', 'GB')],
 [('geo_country', 'BD'), ('geo_country', 'US')],
 [('geo_country', 'BD'), ('geo_country', 'IN')],
 [('geo_country', 'BD'), ('os_timezone', 'America')],
 [('geo_country', 'GB'), ('geo_country', 'BD')],
 [('geo_country', 'GB'), ('geo_country', 'US')],
 [('geo_country', 'GB'), ('geo_country', 'IN')],
 [('geo_country', 'GB'), ('os_timezone', 'America')],
 [('geo_country', 'US'), ('geo_country', 'BD')],
 [('geo_country', 'US'), ('geo_country', 'GB')],
 [('geo_country', 'US'), ('geo_country', 'IN')],
 [('geo_country', 'US'), ('os_timezone', 'America')],
 [('geo_country', 'IN'), ('geo_country', 'BD')],
 [('geo_country', 'IN'), ('geo_country', 'GB')],
 [('geo_country', 'IN'), ('geo_country', 'US')],
 [('geo_country', 'IN'), ('os_timezone', 'America')],
 [('os_timezone', 'America'), ('geo_country', 'BD')],
 [('os_timezone', 'America'), ('geo_country', 'GB')],
 [('os_timezone', 'America'), ('geo_country', 'US')],
 [('os_timezone', 'America'), ('ge

In [27]:
temp

[[('geo_country', 'BD')],
 [('geo_country', 'GB')],
 [('geo_country', 'US')],
 [('geo_country', 'IN')],
 [('os_timezone', 'America')]]

In [46]:
def refine(current_beam):
    w = len(current_beam)
    result= []
    for i in range(0, w):
        for j in range(0, w):
            if (i != j):
                len_i = len(current_beam[i])
                len_j = len(current_beam[j])
                min_len = min(len_i, len_j)
                new = []
                for t in (0, min_len - 1):
                    if(current_beam[i][t] not in new):
                        new.append(current_beam[i][t])
                    if(current_beam[j][t] not in new):
                        new.append(current_beam[j][t])
                if ((list(reversed(new)) not in result) & (new not in result)):
                    result.append(new)
    return result

In [25]:
len(result[0])

2

In [47]:
test = refine(temp)
test

[[('geo_country', 'BD'), ('geo_country', 'GB')],
 [('geo_country', 'BD'), ('geo_country', 'US')],
 [('geo_country', 'BD'), ('geo_country', 'IN')],
 [('geo_country', 'BD'), ('os_timezone', 'America')],
 [('geo_country', 'GB'), ('geo_country', 'US')],
 [('geo_country', 'GB'), ('geo_country', 'IN')],
 [('geo_country', 'GB'), ('os_timezone', 'America')],
 [('geo_country', 'US'), ('geo_country', 'IN')],
 [('geo_country', 'US'), ('os_timezone', 'America')],
 [('geo_country', 'IN'), ('os_timezone', 'America')]]

In [48]:
slice_test = test[0:5]
slice_test

[[('geo_country', 'BD'), ('geo_country', 'GB')],
 [('geo_country', 'BD'), ('geo_country', 'US')],
 [('geo_country', 'BD'), ('geo_country', 'IN')],
 [('geo_country', 'BD'), ('os_timezone', 'America')],
 [('geo_country', 'GB'), ('geo_country', 'US')]]

In [49]:
test3 = refine(slice_test)
test3

[[('geo_country', 'BD'), ('geo_country', 'GB'), ('geo_country', 'US')],
 [('geo_country', 'BD'), ('geo_country', 'GB'), ('geo_country', 'IN')],
 [('geo_country', 'BD'), ('geo_country', 'GB'), ('os_timezone', 'America')],
 [('geo_country', 'BD'), ('geo_country', 'US'), ('geo_country', 'GB')],
 [('geo_country', 'BD'), ('geo_country', 'US'), ('geo_country', 'IN')],
 [('geo_country', 'BD'), ('geo_country', 'US'), ('os_timezone', 'America')],
 [('geo_country', 'BD'), ('geo_country', 'IN'), ('geo_country', 'GB')],
 [('geo_country', 'BD'), ('geo_country', 'IN'), ('geo_country', 'US')],
 [('geo_country', 'BD'), ('geo_country', 'IN'), ('os_timezone', 'America')],
 [('geo_country', 'BD'),
  ('geo_country', 'GB'),
  ('geo_country', 'IN'),
  ('geo_country', 'US')],
 [('geo_country', 'BD'), ('os_timezone', 'America'), ('geo_country', 'GB')],
 [('geo_country', 'BD'), ('os_timezone', 'America'), ('geo_country', 'US')],
 [('geo_country', 'BD'), ('os_timezone', 'America'), ('geo_country', 'IN')],
 [('g

In [50]:
slice_test_2 = test3[-5:]
slice_test_2

[[('geo_country', 'BD'), ('os_timezone', 'America'), ('geo_country', 'IN')],
 [('geo_country', 'BD'),
  ('geo_country', 'GB'),
  ('os_timezone', 'America'),
  ('geo_country', 'US')],
 [('geo_country', 'GB'), ('geo_country', 'BD'), ('geo_country', 'US')],
 [('geo_country', 'GB'),
  ('geo_country', 'BD'),
  ('geo_country', 'US'),
  ('geo_country', 'IN')],
 [('geo_country', 'GB'),
  ('geo_country', 'BD'),
  ('geo_country', 'US'),
  ('os_timezone', 'America')]]

In [51]:
test4 = refine(slice_test_2)
test4

[[('geo_country', 'BD'), ('geo_country', 'IN'), ('os_timezone', 'America')],
 [('geo_country', 'BD'),
  ('geo_country', 'GB'),
  ('geo_country', 'IN'),
  ('geo_country', 'US')],
 [('geo_country', 'BD'), ('os_timezone', 'America'), ('geo_country', 'IN')],
 [('geo_country', 'BD'),
  ('geo_country', 'GB'),
  ('os_timezone', 'America'),
  ('geo_country', 'US')],
 [('geo_country', 'BD'),
  ('geo_country', 'GB'),
  ('geo_country', 'US'),
  ('geo_country', 'IN')],
 [('geo_country', 'BD'),
  ('geo_country', 'GB'),
  ('geo_country', 'US'),
  ('os_timezone', 'America')],
 [('geo_country', 'GB'),
  ('geo_country', 'BD'),
  ('geo_country', 'US'),
  ('geo_country', 'IN')],
 [('geo_country', 'GB'),
  ('geo_country', 'BD'),
  ('geo_country', 'US'),
  ('os_timezone', 'America')],
 [('geo_country', 'GB'), ('geo_country', 'US')],
 [('geo_country', 'GB'),
  ('geo_country', 'BD'),
  ('geo_country', 'IN'),
  ('geo_country', 'US')],
 [('geo_country', 'GB'), ('geo_country', 'IN'), ('os_timezone', 'America')]

In [None]:
'''    
    for level in range(2, d + 1): # if d = depth >= 2
        print("level: "+str(level))
        
        while (len(candidateQueue) > 0):
            seed = candidateQueue.pop(0)
            set_refined = list(set(refinement(seed)))
            for desc in set_refined:
                df_match_condition = create_dataframe([desc])
                quality = quality_measure(df_match_condition)
                if constraints(df_match_condition):
                    resultSet.push(desc, quality)
                    beam.push(desc, quality)
            while not beam.empty:
                candidateQueue.append(priority_queue.pop(beam))
    return resultSet
'''