# Beam search implementation

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import csv
from queue import Queue

In [2]:
df_action = pd.read_csv('data/action_condition_meta.csv')
df_action.head()

Unnamed: 0,action,user_id,condition,geo_country,refr_source,browser_language,os_name,os_timezone,dvce_type
0,clic,379881d5-32d7-49f4-bf5b-81fefbc5fcce,1-Control,FI,Google,greek,Android,Europe,Mobile
1,clic,2a0f4218-4f62-479b-845c-109b2720e6e7,2-Buttony-Conversion-Buttons,AU,Google,english,iOS,Australia,Mobile
2,clic,a511b6dc-2dca-455b-b5e2-bf2d224a5505,2-Buttony-Conversion-Buttons,GB,Google,english,Android,Europe,Mobile
3,clic,9fb616a7-4e13-4307-ac92-0b075d7d376a,2-Buttony-Conversion-Buttons,FI,Google,english,iOS,Europe,Mobile
4,clic,64816772-688d-4460-a591-79aa49bba0d5,2-Buttony-Conversion-Buttons,BD,Google,english,Android,Asia,Mobile


In [3]:
#created own priorityqueue class, because the standard priorityqueue blocks (can't insert element) when max.size is reached.
#we need to have a priorityqueue where the item with lowest priority is discarded and replaced by the new item.
#priorityqueue represents min heap 
import heapq
from heapq import heappush, heappop

class priority_queue:
    def __init__(self, max_size):
        self.items = []
        self.max = max_size
   
    def push(self, item, priority):
        if len(self.items) < self.max:
            heapq.heappush(self.items, (priority, item))
        else:
            heapq.heappushpop(self.items, (priority, item))
            

    def pop(self):
        return heapq.heappop(self.items)

    def get_max_item(self):
        return self.items[0]
    
    def empty(self):
        return not self.items
    
    def print_elements(self):
        result = []
        for i in self.items:
            result.append(i)
        return result
       
    def heap_sort(self):
        return [heapq.heappop(self.items) for _ in range(len(self.items))]

In [4]:
#test-cases for priority_queue class
#implementation (seems to) work
heap = priority_queue(2)
heap.push("browser_language = EN", 1)
heap.push("OS_Name = iOS", 2)
heap.push("browser_language = NL", 3)
heap.push("browser_language = GE", 5)
priority_queue.get_max_item(heap)

(3, 'browser_language = NL')

In [5]:
print(priority_queue.heap_sort(heap))

[(3, 'browser_language = NL'), (5, 'browser_language = GE')]


In [6]:
#drop user_id, because we want to build a description language (where we don't need user_id for)
df_action.drop('user_id', axis =1, inplace=True)

In [7]:
#def phiYule(Set):
#implementation of Yule's Quality measure (2b)
def phiYule(Set, column_name):
    n_1 = df_action.loc[(df_action[column_name] == Set) & (df_action.action == 'view') & (df_action.condition == '1-Control')].count()[0]
    n_2 = df_action.loc[(df_action[column_name] == Set) & (df_action.action == 'clic') & (df_action.condition == '1-Control')].count()[0]
    n_3 = df_action.loc[(df_action[column_name] == Set) & (df_action.action == 'view') & (df_action.condition == '2-Buttony-Conversion-Buttons')].count()[0]
    n_4 = df_action.loc[(df_action[column_name] == Set) & (df_action.action == 'clic') & (df_action.condition == '2-Buttony-Conversion-Buttons')].count()[0]
    yuleQ = (n_1*n_4 - n_2*n_3)/(n_1*n_4-n_2*n_3)
    return yuleQ

In [8]:
#implementation of refinement operator
#returns the unique values in the column, so that we can use these values for building a description language
def refinement(seed):
    descriptions = df_action[seed].unique()
    return descriptions

In [9]:
descList = []

In [10]:
def refinement_2(seed):
    descriptions = df_action[seed].unique()
    for item in descriptions:
        descList.append((seed, item))
    return descList

In [11]:
candidateQueue = list(df_action.columns.values) 
candidateQueue.remove('action') 
candidateQueue.remove('condition')
descriptor = candidateQueue.pop(0)
seed = descriptor
set_refined = refinement_2(seed)
set_refined

[('geo_country', 'FI'),
 ('geo_country', 'AU'),
 ('geo_country', 'GB'),
 ('geo_country', 'BD'),
 ('geo_country', 'NG'),
 ('geo_country', 'EG'),
 ('geo_country', 'DE'),
 ('geo_country', 'HK'),
 ('geo_country', 'PG'),
 ('geo_country', 'SD'),
 ('geo_country', 'BR'),
 ('geo_country', 'PK'),
 ('geo_country', 'CM'),
 ('geo_country', 'GH'),
 ('geo_country', 'US'),
 ('geo_country', 'ML'),
 ('geo_country', 'RU'),
 ('geo_country', 'JP'),
 ('geo_country', 'LB'),
 ('geo_country', 'MY'),
 ('geo_country', 'IN'),
 ('geo_country', 'CA'),
 ('geo_country', 'BM'),
 ('geo_country', 'PE'),
 ('geo_country', 'ID'),
 ('geo_country', 'TZ'),
 ('geo_country', 'ZM'),
 ('geo_country', 'MM'),
 ('geo_country', 'AT'),
 ('geo_country', 'KW'),
 ('geo_country', 'PH'),
 ('geo_country', 'KZ'),
 ('geo_country', 'PY'),
 ('geo_country', 'AZ'),
 ('geo_country', 'NP'),
 ('geo_country', 'ET'),
 ('geo_country', 'SA'),
 ('geo_country', 'IL'),
 ('geo_country', 'MX'),
 ('geo_country', 'TN'),
 ('geo_country', 'MG'),
 ('geo_country',

In [12]:
descriptor = candidateQueue.pop(0)
seed = descriptor
set_refined = refinement_2(seed)
set_refined

[('geo_country', 'FI'),
 ('geo_country', 'AU'),
 ('geo_country', 'GB'),
 ('geo_country', 'BD'),
 ('geo_country', 'NG'),
 ('geo_country', 'EG'),
 ('geo_country', 'DE'),
 ('geo_country', 'HK'),
 ('geo_country', 'PG'),
 ('geo_country', 'SD'),
 ('geo_country', 'BR'),
 ('geo_country', 'PK'),
 ('geo_country', 'CM'),
 ('geo_country', 'GH'),
 ('geo_country', 'US'),
 ('geo_country', 'ML'),
 ('geo_country', 'RU'),
 ('geo_country', 'JP'),
 ('geo_country', 'LB'),
 ('geo_country', 'MY'),
 ('geo_country', 'IN'),
 ('geo_country', 'CA'),
 ('geo_country', 'BM'),
 ('geo_country', 'PE'),
 ('geo_country', 'ID'),
 ('geo_country', 'TZ'),
 ('geo_country', 'ZM'),
 ('geo_country', 'MM'),
 ('geo_country', 'AT'),
 ('geo_country', 'KW'),
 ('geo_country', 'PH'),
 ('geo_country', 'KZ'),
 ('geo_country', 'PY'),
 ('geo_country', 'AZ'),
 ('geo_country', 'NP'),
 ('geo_country', 'ET'),
 ('geo_country', 'SA'),
 ('geo_country', 'IL'),
 ('geo_country', 'MX'),
 ('geo_country', 'TN'),
 ('geo_country', 'MG'),
 ('geo_country',

In [13]:
df_new = pd.DataFrame(columns = df_action.columns.values)
df_new.head()

Unnamed: 0,action,condition,geo_country,refr_source,browser_language,os_name,os_timezone,dvce_type


In [14]:
#constraints for the subgroups
#let's say that the subgroups must be larger than 5, before we put it into the queue.
#what i want to reach: get pandas dataframe that matches ALL the conditions in the descriptions


def constraints_2(Set):
    d = {}
    appended_data = []
    total = 0
    for column, item in Set:
        for column2, item2 in Set:
        #total += df_action.loc[(df_action[column] == item)].count()[0]
        #d.update({column: df_action.loc[(df_action[column] == item)]})
            d = df_action.loc[(df_action[column] == item) & (df_action[column2] == item2)]
    return d

In [15]:
desc = [('refr_source', 'StudyPortal'), ('geo_country', 'GB')]
constraints_2(desc)

Unnamed: 0,action,condition,geo_country,refr_source,browser_language,os_name,os_timezone,dvce_type
2,clic,2-Buttony-Conversion-Buttons,GB,Google,english,Android,Europe,Mobile
5,clic,2-Buttony-Conversion-Buttons,GB,Google,english,iOS,Europe,Mobile
12,clic,1-Control,GB,Google,english,Android,Europe,Mobile
22,clic,2-Buttony-Conversion-Buttons,GB,Google,english,iOS,Europe,Mobile
23,clic,2-Buttony-Conversion-Buttons,GB,Google,english,iOS,Europe,Mobile
24,view,2-Buttony-Conversion-Buttons,GB,Google,english,iOS,Europe,Mobile
25,view,2-Buttony-Conversion-Buttons,GB,Google,english,iOS,Europe,Mobile
26,view,2-Buttony-Conversion-Buttons,GB,Google,english,iOS,Europe,Mobile
27,clic,1-Control,GB,Google,english,Android,Europe,Mobile
28,clic,1-Control,GB,Google,english,Android,Europe,Mobile


In [16]:
def phiYule_2(Set):
    n_1 = 0
    n_2 = 0
    n_3 = 0
    n_4 = 0
    for column, item in Set:
        n_1 += df_action.loc[(df_action[column] == item) & (df_action.action == 'view') & (df_action.condition == '1-Control')].count()[0]
        n_2 += df_action.loc[(df_action[column] == item) & (df_action.action == 'clic') & (df_action.condition == '1-Control')].count()[0]
        n_3 += df_action.loc[(df_action[column] == item) & (df_action.action == 'view') & (df_action.condition == '2-Buttony-Conversion-Buttons')].count()[0]
        n_4 += df_action.loc[(df_action[column] == item) & (df_action.action == 'clic') & (df_action.condition == '2-Buttony-Conversion-Buttons')].count()[0]
        yuleQ = (n_1*n_4 - n_2*n_3)/(n_1*n_4-n_2*n_3)
    return yuleQ

In [17]:
def constraints(Set):
    total = 0
    for column, item in Set:
        total += df_action.loc[(df_action[column] == item)].count()[0]
    return total

In [18]:
#def phiEntropy(Set): --> to be implemented!
       
def beam_search(d, w, q):
    candidateQueue = list(df_action.columns.values) #instead of queue that is used in the paper, intialize the queue immediately
    candidateQueue.remove('action') #with the headers of the dataframe
    candidateQueue.remove('condition') #remove action and condition (targets) from this, because we only need the descriptors 
    
    #pseudo-code below is in line with the paper
    resultSet = priority_queue(q)
    for level in range(0, d):
        print("level: "+str(level))
        beam = priority_queue(w)
        while (len(candidateQueue) > 0):
            descriptor = candidateQueue.pop(0)
            seed = descriptor
            set_refined = refinement_2(seed)
            for desc in set_refined:
                #quality = phiYule(desc, descriptor)
                quality = phiYule_2([desc])
                if constraints([desc]):
                    resultSet.push(desc, quality)
                    beam.push(desc, quality)
            while not beam.empty:
                candidateQueue.append(priority_queue.pop(beam))
    return resultSet

result = beam_search(d=2, w=5, q=5)
priority_queue.heap_sort(result)

level: 0


  # This is added back by InteractiveShellApp.init_path()


level: 1


[(1.0, ('refr_source', 'refr_source not available')),
 (1.0, ('refr_source', 'refr_source not available')),
 (1.0, ('refr_source', 'refr_source not available')),
 (1.0, ('refr_source', 'refr_source not available')),
 (1.0, ('refr_source', 'refr_source not available'))]