# 2 A&B Testing

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import csv
from queue import Queue
import math
import itertools

import warnings 
warnings.filterwarnings('ignore') #ignore warning messages from output beam_search

In [2]:
# Read the action/condition/meta table from disk and preview its first rows.
action_csv_path = 'data/action_condition_meta.csv'
df_action = pd.read_csv(action_csv_path)
df_action.head()

Unnamed: 0,action,user_id,condition,geo_country,refr_source,browser_language,os_name,os_timezone,dvce_type
0,clic,379881d5-32d7-49f4-bf5b-81fefbc5fcce,1-Control,FI,Google,greek,Android,Europe,Mobile
1,clic,2a0f4218-4f62-479b-845c-109b2720e6e7,2-Buttony-Conversion-Buttons,AU,Google,english,iOS,Australia,Mobile
2,clic,a511b6dc-2dca-455b-b5e2-bf2d224a5505,2-Buttony-Conversion-Buttons,GB,Google,english,Android,Europe,Mobile
3,clic,9fb616a7-4e13-4307-ac92-0b075d7d376a,2-Buttony-Conversion-Buttons,FI,Google,english,iOS,Europe,Mobile
4,clic,64816772-688d-4460-a591-79aa49bba0d5,2-Buttony-Conversion-Buttons,BD,Google,english,Android,Asia,Mobile


### 2a - Beam Search implementation

<b>We created our own priority-queue class, because the standard priority queue blocks (i.e. cannot insert an element) once its maximum size is reached. We need a priority queue in which the item with the lowest priority is discarded and replaced by the new item.
<i>priority_queue</i> represents a min-heap.</b>

In [3]:
def alreadyInList(list1, list2):
    """Return True if ``list2`` contains a list that is a reordering of ``list1``.

    Two patterns count as the same when they hold the same elements with the
    same multiplicities, regardless of order. Elements are compared with
    ``==`` only, so unhashable elements (e.g. tuples that contain lists) work.

    The comparison is done directly instead of enumerating every permutation
    of ``list1``, which would take factorial time and memory.

    Parameters
    ----------
    list1 : sequence
        The pattern to look for.
    list2 : iterable
        The collection of patterns to search. Only entries that are lists can
        match.

    Returns
    -------
    bool
    """
    wanted = list(list1)
    for candidate in list2:
        # Only lists of the same length can be a reordering of ``list1``.
        if not isinstance(candidate, list) or len(candidate) != len(wanted):
            continue
        unmatched = list(candidate)
        for element in wanted:
            try:
                # Consume one equal element so multiplicities are respected.
                unmatched.remove(element)
            except ValueError:
                break
        else:
            return True
    return False

In [4]:
import heapq
from heapq import heappush, heappop

class _PriorityEntry(tuple):
    """A ``(priority, item)`` pair that is ordered by its priority alone.

    Plain tuples fall back to comparing the items when two priorities tie.
    The items stored here are lists of ``(column, value)`` conditions whose
    values may be a string or a list, and comparing those raises ``TypeError``.
    ``heapq`` only uses ``<``, so overriding ``__lt__`` is sufficient.
    """

    __slots__ = ()

    def __lt__(self, other):
        return self[0] < other[0]


class priority_queue:
    """Bounded min-heap of ``(priority, item)`` pairs.

    At most ``max_size`` entries are kept. When the queue is full, pushing a
    new entry evicts the entry with the lowest priority (which may be the new
    entry itself). Items that are already present, in any element order, are
    ignored.
    """

    def __init__(self, max_size):
        self.items = []      # heap of (priority, item) entries; lowest priority at index 0
        self.max = max_size  # maximum number of entries to keep

    def push(self, item, priority):
        """Insert ``item`` with ``priority`` unless an equivalent item is already stored."""
        # Scan for duplicates once instead of once per branch.
        if alreadyInList(item, self.get_items()):
            return
        entry = _PriorityEntry((priority, item))
        if len(self.items) < self.max:
            heapq.heappush(self.items, entry)
        else:
            # Push, then drop the entry with the lowest priority.
            heapq.heappushpop(self.items, entry)

    def get_items(self):
        """Return the stored items (without priorities) in heap order."""
        return [entry[1] for entry in self.items]

    def pop(self):
        """Remove and return the ``(priority, item)`` pair with the lowest priority.

        Raises ``IndexError`` if the queue is empty.
        """
        return tuple(heapq.heappop(self.items))

    def get_max_item(self):
        """Return the heap root as a ``(priority, item)`` pair without removing it.

        NOTE: despite its name this is the entry with the *lowest* priority,
        because the underlying heap is a min-heap.
        Raises ``IndexError`` if the queue is empty.
        """
        return tuple(self.items[0])

    def empty(self):
        """Return True if the queue holds no entries."""
        return not self.items

    def print_elements(self):
        """Return a list of the stored ``(priority, item)`` pairs in heap order."""
        return [tuple(entry) for entry in self.items]

    def heap_sort(self):
        """Drain the queue and return its pairs sorted by ascending priority.

        The queue is empty afterwards.
        """
        return [tuple(heapq.heappop(self.items)) for _ in range(len(self.items))]

The following function <i>create_dataframe</i> is used to obtain a subset of the <i>df_action</i> dataframe, based on the description which is passed as argument. The dataframe which is returned is the subgroup of records (users) covered by that particular description. For example, a group of users who were all using iOS as operating system at the time of the experiment. 

<b>to-do (in preprocessing part?)</b>: The <i>create_dataframe</i> function drops duplicate rows after each iteration. In other words, users that visited the particular version multiple times, with the same cookie settings and the same action type, are discarded from the dataset. It could be that a user visits the website very frequently and performs the same action (view/clic) at every visit. This can bias the dataset, because it is not fair compared to users who visit the particular version less frequently. We don't remove all the duplicated user ids, because sometimes a user clicked on the button in a particular version, while at another time the same user did not click. 

In [5]:
def create_dataframe(Set, data=None):
    """Return the records that satisfy every condition of a description.

    Parameters
    ----------
    Set : iterable of (column, item)
        The conjunction of conditions. ``item`` is either a single value
        (rows where ``column == item``) or a list-like of allowed values
        (rows where ``column`` is one of them). An empty description selects
        every record.
    data : pandas.DataFrame, optional
        Dataset to filter. Defaults to the notebook-level ``df_action``.

    Returns
    -------
    pandas.DataFrame
        The matching rows with fully duplicated rows removed and a fresh
        0..n-1 index.
    """
    source = df_action if data is None else data
    # A positional boolean mask is used instead of repeated inner merges on
    # all columns: merges multiply duplicated rows, and only list-like values
    # may be passed to ``isin``.
    mask = np.ones(len(source), dtype=bool)
    for column, item in Set:
        if pd.api.types.is_list_like(item):  # list of values (strings are not list-like)
            condition = source[column].isin(item)
        else:  # single value, e.g. a string or a boolean
            condition = source[column] == item
        mask &= np.asarray(condition, dtype=bool)
    return source.loc[mask].drop_duplicates().reset_index(drop=True)

The following function <i>constraints</i> checks whether a certain subgroup, represented by a dataframe which is passed as argument, satisfies the constraints $C$. At the moment the only constraint is that a subgroup should be represented by more than a minimum share of the records (users) in the original dataset. Note that the code below uses a threshold of $30$%, rather than the $2$% that was originally stated here.

In [6]:
def constraints(df_matches, min_coverage=0.30, data=None):
    """Check whether a subgroup is large enough to be considered.

    Parameters
    ----------
    df_matches : pandas.DataFrame
        The records covered by the subgroup.
    min_coverage : float, optional
        Fraction of the dataset that the subgroup has to exceed (strictly).
        Defaults to 0.30, the value that was previously hard-coded.
        NOTE(review): the accompanying text mentions 2% - confirm which
        threshold is intended.
    data : pandas.DataFrame, optional
        The full dataset. Defaults to the notebook-level ``df_action``.

    Returns
    -------
    bool
        True if the subgroup has more than ``min_coverage`` times the number
        of records in the dataset.
    """
    reference = df_action if data is None else data
    return df_matches.shape[0] > reference.shape[0] * min_coverage

The refinement operator gets the records and chooses to which type of attribute they belong.
There are 3 types of attributes:
    1. Numeric: Attribute with all number records
    2. Binary: Attribute with true or false records
    3. Nominal: Attribute with multiple different values in their records which are not numeric
    
1. For numeric values we sort all records which are in the description language D. After that we make equal-sized bins.
The number of bins depends on a predefined value. For each split point we add a description based on whether the
numeric value is greater than or equal to, or smaller than or equal to, the split point.
2. For binary records we add one description where the description is true and one description where the description is false.
3. For nominal values we add for each description an entry with the description and one without the description. In our case
we take the first description in the list which is not equal to that description. 

For the first level, we generate all patterns consisting of <i>one</i> condition on <i>one</i> attribute. All patterns are evaluated with the quality measure $\varphi$ and the $w$ best are saved as the <i>beam</i>. The attributes `action`, `condition` and `user_id` are discarded from the list of attributes, because these are not relevant for defining a description language.

In [7]:
def all_paterns_one_condition(data=None): # level 1
    """Generate every level-1 pattern: one condition on one attribute.

    The target/identifier columns ``action``, ``condition`` and ``user_id``
    are skipped. For every other column:

    * an attribute with exactly two distinct values yields one condition per
      value (each value is the other's complement). Boolean attributes yield
      ``(column, True)`` and ``(column, False)``; other two-valued attributes
      yield their actual values, so the conditions can match records.
    * any other attribute yields, for each distinct value, the condition
      ``(column, value)`` followed by ``(column, [all other values])``.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Dataset to derive the patterns from. Defaults to the notebook-level
        ``df_action``.

    Returns
    -------
    list of (column, value-or-list-of-values) tuples
    """
    source = df_action if data is None else data
    not_relevant = ('action', 'condition', 'user_id')
    result = []
    for column in source.columns:
        if column in not_relevant:
            continue
        # Compute the distinct values once per column.
        values = list(source[column].unique())
        if len(values) == 2: # binary attribute
            if all(isinstance(v, (bool, np.bool_)) for v in values):
                result.append((column, True))
                result.append((column, False))
            else:
                result.extend((column, v) for v in values)
        else: # nominal attribute
            for value in values:
                complement_value = [other for other in values if other != value]
                result.append((column, value))
                result.append((column, complement_value))
    return result

As alluded to in the referenced paper, the StudyPortals (original) dataset comes naturally equipped with $m=2$ nominal targets. The first nominal target attribute is $condition$, which represents a binary column that tells us to which version the particular user was exposed during the experiment (i.e. version A or B). The second nominal target attribute is $action$, which is the binary column representing whether the page visitor merely viewed or also clicked on the button in question during the experiment. Considering these peculiarities of the StudyPortals dataset, the natural choice of EMM instance would be the association model class. So we strive to find subgroups for which the association between view/click and A/B is exceptional.

|      | View | Click |
|------|------|-------|
|   A  |$n_1$ | $n_2$ |
|   B  |$n_3$ | $n_4$ |

Now that we know which model class will be exploited, the next step is to define or select an appropriate quality measure. Since one can easily achieve huge deviations in target behaviour (the association between the two nominal target attributes) with very small subgroups, it makes sense to have a dimension in the quality measure which reflects the group size. In addition, one also needs to have a target deviation dimension/component in the quality measure, of course. 

<b>The Target Deviation Component</b> ($\varphi_{Q}(S)$)

The first quality measure that is implemented is Yule's Quality Measure, as described in section $4.3$ of the A&B Testing paper. For the quality measure, we use the cells of the target contingency table given above. Given a subgroup $S\subseteq \Omega$, we can assign each record in $S$ to the appropriate cell of this contingency table, which leads to count values for each of the $n_i$ such that $n_1 + n_2 + n_3 + n_4 = |S|$. Yule's Q is defined as $Q = \frac{n_1 \cdot n_4 - n_2 \cdot n_3}{n_1 \cdot n_4 + n_2 \cdot n_3}$. Higher numbers on the main diagonal imply a positive association between the two targets and higher numbers off the main diagonal imply a negative association between the two targets. The value for $Q$ instantiated by the subgroup $S$ is denoted by $Q_S$. We contrast Yule's Q instantiated by a subgroup $S$ with Yule's Q instantiated by that subgroup's complement $S^\mathsf{C}$: $\varphi_{Q}(S) = |Q_S - Q_{S^\mathsf{C}}|$. This component detects subgroups whose view/click-A/B association is different from the rest of the dataset.

In [8]:
def _yules_q(df):
    """Return Yule's Q of the view/clic x A/B contingency table of ``df``.

    Returns ``None`` when Q is undefined, i.e. when the denominator
    ``n_1*n_4 + n_2*n_3`` is zero.
    """
    view = df['action'] == 'view'
    clic = df['action'] == 'clic'
    control = df['condition'] == '1-Control'
    variant = df['condition'] == '2-Buttony-Conversion-Buttons'
    # Count rows per cell; ``int`` keeps the arithmetic in exact Python ints.
    n_1 = int((view & control).sum())
    n_2 = int((clic & control).sum())
    n_3 = int((view & variant).sum())
    n_4 = int((clic & variant).sum())
    denominator = n_1*n_4 + n_2*n_3
    if denominator == 0:
        return None
    return (n_1*n_4 - n_2*n_3)/denominator


def target_deviation(df_matches, data=None):
    """Target deviation component: ``|Q_S - Q_{S^C}|``.

    Yule's Q of the subgroup is contrasted with Yule's Q of its complement,
    i.e. the records of the dataset that do not occur in the subgroup.

    Parameters
    ----------
    df_matches : pandas.DataFrame
        The records covered by the subgroup. Needs the columns ``action``
        and ``condition`` and shares its columns with the dataset.
    data : pandas.DataFrame, optional
        The full dataset. Defaults to the notebook-level ``df_action``.

    Returns
    -------
    float
        Absolute difference between the two Q values. An undefined Q (empty
        contingency diagonal products) is replaced by the smallest positive
        normal double, i.e. it is treated as practically zero.
    """
    reference = df_action if data is None else data
    zero = np.finfo(np.double).tiny

    # Rows that only occur on the right-hand side of the outer merge are the
    # records of the dataset that are not part of the subgroup.
    merged = df_matches.merge(reference, indicator=True, how='outer')
    df_complement = merged[merged['_merge'] == 'right_only']

    Q_s = _yules_q(df_matches)
    Q_s_c = _yules_q(df_complement)
    if Q_s is None:
        Q_s = zero
    if Q_s_c is None:
        Q_s_c = zero

    return abs(Q_s - Q_s_c)

<b>The Subgroup Size Component</b><br>
To represent the subgroup size, we take the entropy function as described in section $3.1$ of the referenced paper. The function conceptually rewards $50/50$ splits between subgroup and complements, while punishing subgroups that are either (relatively) small or cover the vast majority of the dataset.

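$\varphi_{ef}(S) = - \frac{n}{N}\lg\left(\frac{n}{N}\right) - \frac{n^C}{N}\lg\left(\frac{n^C}{N}\right)$, where $n = |S|$, $n^C = N - n$ is the size of the complement, and $N = |\Omega|$.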

In [9]:
def entropy_function(df_matches, data=None):
    """Subgroup size component: entropy of the subgroup/complement split.

    Computes ``-(n/N) lg(n/N) - (n_c/N) lg(n_c/N)`` with ``n`` the subgroup
    size, ``N`` the dataset size and ``n_c = N - n``. The value is 1 for a
    50/50 split and 0 when the subgroup is empty or covers everything.

    Parameters
    ----------
    df_matches : pandas.DataFrame
        The records covered by the subgroup.
    data : pandas.DataFrame, optional
        The full dataset. Defaults to the notebook-level ``df_action``.

    Returns
    -------
    float
    """
    reference = df_action if data is None else data
    N = reference.shape[0] # size original dataset
    if N == 0:
        return 0.0
    # Clamp so that a subgroup reported as larger than the dataset cannot
    # produce a negative complement (and a math domain error in log2).
    n = min(df_matches.shape[0], N) # size subgroup in original dataset
    entropy = 0.0
    for count in (n, N - n): # subgroup and its complement
        if count > 0: # by convention 0 * lg(0) = 0
            fraction = count / N
            entropy -= fraction * math.log2(fraction)
    return entropy

When combining these two components/dimensions, one obtains an association model class quality measure known as <i>Yule's Quality Measure</i>: $\varphi_{Yule}(S) = \varphi_{Q}(S) \cdot \varphi_{ef}(S)$, which boils down to multiplying the target deviation component by the subgroup size component.

In [10]:
def phiYule(df_matches):
    """Yule's Quality Measure of a subgroup.

    The product of the target deviation component and the subgroup size
    (entropy) component, both evaluated on ``df_matches``.
    """
    deviation = target_deviation(df_matches)
    size_weight = entropy_function(df_matches)
    return deviation * size_weight

The multiplication of the two components ensures that subgroups are evaluated well (i.e. score well) on both components

The next function is responsible for generating new candidate patterns for level $n+1$ by refining patterns from the level-$n$ beam (denoted by <i>current_beam</i>). A pattern is refined into many new candidates by the conjunction of each possible single condition on a single attribute. The function returns a list with all the candidates. The beam search algorithm should evaluate all level-$(n+1)$ candidates with $\varphi$ and store the $w$ best as the new beam. In addition, it should update the list of $q$ best-performing subgroups if there are new candidates which surpass the current top-$q$ in terms of the particular quality measure $\varphi$.

In [11]:
def refine(current_beam, d):
    """Combine pairs of beam patterns into candidate patterns of ``d`` conditions.

    For every ordered pair of distinct beam entries, the conditions of both
    patterns are merged (interleaved position by position, duplicates
    skipped). A merged pattern becomes a candidate when it

    * consists of exactly ``d`` conditions,
    * is not already in the beam (in any condition order), and
    * is not a reordering of a candidate that was generated before.

    Parameters
    ----------
    current_beam : list of patterns
        Each pattern is a list of ``(column, value)`` conditions.
    d : int
        Required number of conditions of the new candidates.

    Returns
    -------
    list of patterns
    """
    result = []
    for i, first in enumerate(current_beam):
        for j, second in enumerate(current_beam):
            if i == j:
                continue
            new = []
            # Walk over *every* position of both patterns; the longer pattern
            # keeps contributing after the shorter one is exhausted.
            for t in range(max(len(first), len(second))):
                for pattern in (first, second):
                    if t < len(pattern) and pattern[t] not in new:
                        new.append(pattern[t])
            if len(new) != d:
                continue
            # (i, j) and (j, i) yield the same conjunction; keep only the
            # first so each candidate subgroup is evaluated once.
            if alreadyInList(new, current_beam) or alreadyInList(new, result):
                continue
            result.append(new)
    return result

The next step is the implementation of the $beam\_search$ algorithm. There are a few parameters which influence the outcome, and can be changed accordingly to what one sees fit for a certain experiment

In [12]:
# Beam-search settings; adjust these to configure an experiment.
quality_measure = phiYule  # quality measure to optimise: phiYule (Yule's Quality Measure), ...
d, w, q = 2, 5, 5  # d: search depth | w: beam width | q: number of best subgroups to report

In [13]:
def beam_search(d, w, q, quality_measure):
    """Search for the ``q`` best subgroups with a beam search.

    Parameters
    ----------
    d : int
        Search depth, i.e. the maximum number of conditions per pattern.
    w : int
        Beam width, i.e. the capacity of the beam queue.
    q : int
        Capacity of the result queue.
    quality_measure : callable
        Maps the dataframe of a subgroup to its quality.

    Returns
    -------
    priority_queue
        Queue of capacity ``q`` holding the best ``(quality, pattern)`` pairs
        that were found.
    """
    resultSet = priority_queue(q)
    beam = priority_queue(w)

    def evaluate(pattern):
        """Score ``pattern`` and offer it to the beam and the result set."""
        subset_desc = create_dataframe(pattern)
        # Check the constraints first: the quality measure is the expensive
        # part and its value is not needed for rejected subgroups.
        if constraints(subset_desc):
            quality = quality_measure(subset_desc)
            beam.push(pattern, quality)
            resultSet.push(pattern, quality)

    # first level
    all_paterns = all_paterns_one_condition()
    print("level: "+str(1))
    for desc in all_paterns:
        evaluate([desc])
    for level in range(2, d + 1):
        print("level: "+str(level))
        current_beam = beam.get_items()
        for c in refine(current_beam, level):
            evaluate(c)
    return resultSet

In [14]:
# Run the search, then drain the result queue into a list of
# (quality, pattern) pairs ordered by ascending quality.
result_queue = beam_search(d=d, w=w, q=q, quality_measure=quality_measure)
result = result_queue.heap_sort()
result

level: 1
level: 2


[(0.70885143594824818,
  [('refr_source',
    ['Google',
     'StudyPortal',
     'refr_source not available',
     'Yandex',
     'Bing',
     'DuckDuckGo',
     'Facebook',
     'Vkontakte',
     'Everyclick'])]),
 (0.7349766298497219,
  [('geo_country',
    ['FI',
     'AU',
     'GB',
     'BD',
     'NG',
     'EG',
     'DE',
     'HK',
     'PG',
     'SD',
     'BR',
     'PK',
     'CM',
     'GH',
     'US',
     'ML',
     'RU',
     'JP',
     'LB',
     'MY',
     'IN',
     'CA',
     'BM',
     'PE',
     'ID',
     'TZ',
     'ZM',
     'MM',
     'AT',
     'KW',
     'PH',
     'KZ',
     'PY',
     'AZ',
     'NP',
     'ET',
     'IL',
     'MX',
     'TN',
     'MG',
     'ZA',
     'IQ',
     'SG',
     'DO',
     'DZ',
     'PS',
     'CN',
     'AE',
     'TH',
     'KR',
     'HN',
     'GM',
     'TW',
     'EC',
     'MN',
     'VN',
     'HT',
     'BY',
     'TD',
     'UY',
     'NI',
     'KG',
     'BH',
     'KE',
     'CO',
     'HR',
     'GU',
     '

The implementation given above allows the end user to:
* manually set the beam width $w$ and search depth $d$
* <b>#To-Do</b>: manually choose the number of bins into which numeric descriptors are dynamically discretized (<b>if we want to deal with numeric values!</b>)
* <b>#To-Do</b>: easily swap out the association model class on these specific two targets for another model class (to be coded by the end user) with any number of targets of the user's choosing


### 2b - Found Subgroups

The <i>beam search</i> algorithm is executed with parameters $d=2$, $w=5$ and $q=5$. We choose $d=2$, i.e. a conjunction of at most 2 conditions on single descriptors, for the sake of interpretability. When $d>2$, the results become more complex and therefore give less information on which a domain expert can act. With $q=5$, the output of <i>beam search</i> can easily be compared to the top five subgroups found in the A&B Testing paper. After experimenting with the parameters, we found that the search width $w$ is hard to set manually and has a big influence on the resulting subgroups. We choose $w=5$, for the same reason that we choose $q=5$. The <i>beam search</i> is therefore executed with the same parameters as the referenced A/B Testing paper.

The dataset $\Omega$ that is preprocessed in Q1 has a total number of 899 records. Yule's Q has a value of $\varphi_Q(\Omega) = 0.27$. In other words, the result of the traditional A/B test tells us that variant B, the more buttony variation, generates more clicks than the less buttony control version. The new variation is therefore slightly better, although it can be argued whether the difference is significant. In A&B Testing, we can mine deeper into the data and find specific subgroups which prefer version A or version B.

* <b>#to-do</b>: describe top five subgroups in detail 
* <b>#to-do</b>: compare found subgroups to subgroups in the paper

In [15]:
import html
from IPython.display import HTML, display

# Render the (quality, pattern) pairs in `result` as an HTML table with one
# row per pair. Cell text comes from the dataset, so it is escaped before it
# is embedded in the markup.
table_rows = [
    '<td>{}</td>'.format(
        '</td><td>'.join(html.escape(str(cell), quote=False) for cell in row)
    )
    for row in result
]
display(HTML('<table><tr>{}</tr></table>'.format('</tr><tr>'.join(table_rows))))

0,1
0.708851435948,"[('refr_source', ['Google', 'StudyPortal', 'refr_source not available', 'Yandex', 'Bing', 'DuckDuckGo', 'Facebook', 'Vkontakte', 'Everyclick'])]"
0.73497662985,"[('geo_country', ['FI', 'AU', 'GB', 'BD', 'NG', 'EG', 'DE', 'HK', 'PG', 'SD', 'BR', 'PK', 'CM', 'GH', 'US', 'ML', 'RU', 'JP', 'LB', 'MY', 'IN', 'CA', 'BM', 'PE', 'ID', 'TZ', 'ZM', 'MM', 'AT', 'KW', 'PH', 'KZ', 'PY', 'AZ', 'NP', 'ET', 'IL', 'MX', 'TN', 'MG', 'ZA', 'IQ', 'SG', 'DO', 'DZ', 'PS', 'CN', 'AE', 'TH', 'KR', 'HN', 'GM', 'TW', 'EC', 'MN', 'VN', 'HT', 'BY', 'TD', 'UY', 'NI', 'KG', 'BH', 'KE', 'CO', 'HR', 'GU', 'RW', 'PA', 'UG'])]"
0.742529848939,"[('geo_country', ['FI', 'AU', 'GB', 'BD', 'NG', 'EG', 'DE', 'HK', 'PG', 'SD', 'BR', 'PK', 'CM', 'GH', 'US', 'ML', 'JP', 'LB', 'MY', 'IN', 'CA', 'BM', 'PE', 'ID', 'TZ', 'ZM', 'MM', 'AT', 'KW', 'PH', 'KZ', 'PY', 'AZ', 'NP', 'ET', 'SA', 'IL', 'MX', 'TN', 'MG', 'ZA', 'IQ', 'SG', 'DO', 'DZ', 'PS', 'CN', 'AE', 'TH', 'KR', 'HN', 'GM', 'TW', 'EC', 'MN', 'VN', 'HT', 'BY', 'TD', 'UY', 'NI', 'KG', 'BH', 'KE', 'CO', 'HR', 'GU', 'RW', 'PA', 'UG'])]"
0.753558571964,"[('geo_country', ['FI', 'AU', 'GB', 'BD', 'NG', 'EG', 'DE', 'HK', 'PG', 'SD', 'BR', 'PK', 'CM', 'GH', 'US', 'ML', 'RU', 'JP', 'MY', 'IN', 'CA', 'BM', 'PE', 'ID', 'TZ', 'ZM', 'MM', 'AT', 'KW', 'PH', 'KZ', 'PY', 'AZ', 'NP', 'ET', 'SA', 'IL', 'MX', 'TN', 'MG', 'ZA', 'IQ', 'SG', 'DO', 'DZ', 'PS', 'CN', 'AE', 'TH', 'KR', 'HN', 'GM', 'TW', 'EC', 'MN', 'VN', 'HT', 'BY', 'TD', 'UY', 'NI', 'KG', 'BH', 'KE', 'CO', 'HR', 'GU', 'RW', 'PA', 'UG'])]"
0.774981043522,"[('geo_country', ['FI', 'AU', 'GB', 'BD', 'NG', 'EG', 'DE', 'HK', 'PG', 'SD', 'BR', 'CM', 'GH', 'US', 'ML', 'RU', 'JP', 'LB', 'MY', 'IN', 'CA', 'BM', 'PE', 'ID', 'TZ', 'ZM', 'MM', 'AT', 'KW', 'PH', 'KZ', 'PY', 'AZ', 'NP', 'ET', 'SA', 'IL', 'MX', 'TN', 'MG', 'ZA', 'IQ', 'SG', 'DO', 'DZ', 'PS', 'CN', 'AE', 'TH', 'KR', 'HN', 'GM', 'TW', 'EC', 'MN', 'VN', 'HT', 'BY', 'TD', 'UY', 'NI', 'KG', 'BH', 'KE', 'CO', 'HR', 'GU', 'RW', 'PA', 'UG'])]"
