# Beam search implementation

In [33]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import csv
from queue import Queue

In [113]:
df_action = pd.read_csv('data/action_condition_meta.csv')
df_action = df_action.drop('user_id', 1)

In [35]:
#created own priorityqueue class, because the standard priorityqueue blocks (can't insert element) when max.size is reached.
#we need to have a priorityqueue where the item with lowest priority is discarded and replaced by the new item.
#priorityqueue represents min heap 
import heapq
from heapq import heappush, heappop

class priority_queue:
    def __init__(self, max_size):
        self.items = []
        self.max = max_size
   
    def push(self, item, priority):
        if len(self.items) < self.max:
            heapq.heappush(self.items, (priority, item))
        else:
            heapq.heappushpop(self.items, (priority, item))
            

    def pop(self):
        return heapq.heappop(self.items)

    def get_max_item(self):
        return self.items[0]
    
    def empty(self):
        return not self.items
    
    def print_elements(self):
        result = []
        for i in self.items:
            result.append(i)
        return result
       
    def heap_sort(self):
        return [heapq.heappop(self.items) for _ in range(len(self.items))]

In [36]:
df_action.shape

(899, 9)

In [37]:
#drop user_id, because we want to build a description language (where we don't need user_id for)
df_action.drop_duplicates(subset = 'user_id', inplace=True)
df_action.shape

(761, 9)

In [38]:
#desc = [('refr_source', 'Google'), ('geo_country', 'GB')]
#df_match_condition = create_dataframe(desc)
#constraints_2(df_match_condition)
#df_match_condition.shape

In [39]:
#desc = [('refr_source', 'Google'), ('geo_country', 'AZ')]
#phiYule(df_match_condition)

In [109]:
'''
bins = 3
language = []
The refinement operator. The refinement operator gets the records and chooses to which type of attribute they belong.
There are 3 types of attributes:
    1. Numeric: Attribute with all number records
    2. Binary: Attribute with true or false records
    3. Nominal: Attribute with multiple different values in their records which are not numeric
    
1. For numeric values we sort all records which are in the description language D. After that we make equal-sized bins.
The amount of bins is dependend on a predefined value. For each split point we add a description based on whether the
numeric value is greater or equal or smaller or equal as the split point.
2. For binary records we add one description where the description is true and one description where the description is false.
3. For nominal values we add for each description an entry with the description and one without the description. In our case
we take the first description in the list which is not equal to that description.
def refinement(seed):
    dataframe = create_dataframe(language)
    unique_records = dataframe[seed].unique()
    if unique_records.size == 0:
        return language
    print("uniques: ")
    print(unique_records)
    if isNumeric(unique_records):     #Attribute is numeric
        language.append(-1) #TODO       
    elif unique_records.size == 2:    #Attribute is binary
        language.append((seed, True))
        language.append((seed, False))
    else:                             #Attribute is nominal
        for item in unique_records:
            language.append((seed, item))
            language.append((seed, unique_records[next((i for i, v in enumerate(unique_records) if v != item), -1)]))
    return language

Checks whether the records are all numeric.

def isNumeric(unique_records):
    for item in unique_records:
        if not item.isdigit():
            return False
    return True

candidateQueue = list(df.columns.values) #instead of queue that is used in the paper, intialize the queue immediately
candidateQueue.remove('action') #with the headers of the dataframe
candidateQueue.remove('condition') #remove action and condition (targets) from this, because we only need the descriptors 

seed = candidateQueue.pop()
print("seed: " + seed)
print("lang:")
print(language)
print(refinement(seed))
print("ref result:")

seed = candidateQueue.pop()
print("seed: " + seed)
print("lang:")
print(language)
print(refinement(seed))
print("ref result:")
'''

SyntaxError: invalid syntax (<ipython-input-109-c723c39aff41>, line 35)

In [115]:
#def phiEntropy(Set): --> to be implemented!
descLang = []
numberOfBins = 3 #Should be used for numeric value refinement
'''
The refinement operator. The refinement operator gets the records and chooses to which type of attribute they belong.
There are 3 types of attributes:
    1. Numeric: Attribute with all number records
    2. Binary: Attribute with true or false records
    3. Nominal: Attribute with multiple different values in their records which are not numeric
    
1. For numeric values we sort all records which are in the description language D. After that we make equal-sized bins.
The amount of bins is dependend on a predefined value. For each split point we add a description based on whether the
numeric value is greater or equal or smaller or equal as the split point.
2. For binary records we add one description where the description is true and one description where the description is false.
3. For nominal values we add for each description an entry with the description and one without the description. In our case
we take the first description in the list which is not equal to that description.
'''
def refinement(seed):
    dataframe = create_dataframe(descLang)
    unique_records = dataframe[seed].unique()
    if unique_records.size == 0:
        return language
    if isNumeric(unique_records):     #Attribute is numeric
        descLang.append(-1) # TODO       
    elif unique_records.size == 2:    #Attribute is binary
        descLang.append((seed, True))
        descLang.append((seed, False))
    else:                             #Attribute is nominal
        for item in unique_records:
            descLang.append((seed, item))
            descLang.append((seed, unique_records[next((i for i, v in enumerate(unique_records) if v != item), -1)]))
    return descLang

'''
Checks whether a list of values is all numeric
'''
def isNumeric(values):
    for item in values:
        if not item.isdigit():
            return False
    return True

'''
Creates a dataframe filtered by descriptors
'''
def create_dataframe(decriptors):
    if not decriptors:
        return df_action
    d = df_action.copy()
    for column, item in decriptors:
        df_new = pd.merge(d, df_action.loc[(df_action[column] == item)], on=list(df_action), how='inner')
    return df_new

def constraints(df_matches):
    return df_matches.shape[0] > 5

def phiYule(df_matches):
    #d = df_action.copy()
    #df_match_condition = pd.merge(d, df_action.loc[(df_action[column] == item)], on=list(df_action), how='inner')
    #for column, item in Set:
    n_1 = df_matches.loc[(df_matches.action == 'view') & (df_matches.condition == '1-Control')].count()[0]
    n_2 = df_matches.loc[(df_matches.action == 'clic') & (df_matches.condition == '1-Control')].count()[0]
    n_3 = df_matches.loc[(df_matches.action == 'view') & (df_matches.condition == '2-Buttony-Conversion-Buttons')].count()[0]
    n_4 = df_matches.loc[(df_matches.action == 'clic') & (df_matches.condition == '2-Buttony-Conversion-Buttons')].count()[0]
    yuleQ = (n_1*n_4 - n_2*n_3)/(n_1*n_4 + n_2*n_3)
    print(yuleQ)
    return yuleQ

def beam_search(d, w, q):
    candidateQueue = list(df_action.columns.values) #instead of queue that is used in the paper, intialize the queue immediately
    candidateQueue.remove('action') #with the headers of the dataframe
    candidateQueue.remove('condition') #remove action and condition (targets) from this, because we only need the descriptors 
    
    #pseudo-code below is in line with the paper
    resultSet = priority_queue(q)
    for level in range(0, d):
        print("level: "+str(level))
        beam = priority_queue(w)
        while (len(candidateQueue) > 0):
            seed = candidateQueue.pop(0)
            set_refined = refinement(seed)
            for desc in set_refined:
                df_match_condition = create_dataframe([desc])
                quality = phiYule(df_match_condition)
                if constraints(df_match_condition):
                    resultSet.push(desc, quality)
                    beam.push(desc, quality)
            while not beam.empty:
                candidateQueue.append(priority_queue.pop(beam))
    return resultSet

result = beam_search(d=2, w=5, q=5)
priority_queue.heap_sort(result)

level: 0
0.180722891566
1.0
1.0
0.180722891566
0.622222222222
0.180722891566
1.0
0.180722891566
-0.547619047619
0.180722891566
1.0
0.180722891566
-0.330451488953
0.180722891566
1.0
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
-1.0




0.180722891566
1.0
0.180722891566
0.754442649435
0.180722891566
1.0
0.180722891566
nan
0.180722891566
-1.0
0.180722891566
1.0
0.180722891566
-1.0
0.180722891566
nan
0.180722891566
1.0
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
-1.0
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
1.0
0.180722891566
nan
0.180722891566
-1.0
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
1.0
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
0.850746268657
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
-1.0
0.180722891566
-1.0
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.180722891566
nan
0.

[(1.0, ('geo_country', 'US')),
 (1.0, ('os_timezone', 'Australia')),
 (1.0, ('os_timezone', 'Australia')),
 (1.0, ('os_timezone', 'Australia')),
 (1.0, ('os_timezone', 'Australia'))]