# Evaluating Display Advertising Campaigns

## Set-up:

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import scipy.stats as stats

## Question 1:

### Part 1:

First, let's load the raw clicks data for our five display ad campaigns.

In [6]:
# Load the clicks sheet from the Excel workbook. The workbook is opened in a
# `with` block so its file handle is closed as soon as the sheet is parsed.
# NOTE(review): the sheet is named "clicks.dataset.1" while the workbook is
# 'clicks.dataset.2.xlsx' -- confirm this is the intended sheet.
with pd.ExcelFile('clicks.dataset.2.xlsx') as xl_file:
    raw_clicks = xl_file.parse("clicks.dataset.1")

# Index the table by its 'ad' column (new frame rather than in-place mutation).
raw_clicks = raw_clicks.set_index('ad')
raw_clicks

Unnamed: 0_level_0,1,2,3,4,5
ad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
clicks,52,38,51,45,25
exposures,1000,1000,1000,1000,1000


Next, let's obtain the successes (clicks) and failures (exposures without a click) for each ad campaign, along with the corresponding alpha/beta parameters (successes + 1 and failures + 1).

In [31]:
# Successes: number of clicks recorded for ads 1-5.
s = np.array([52, 38, 51, 45, 25])
# Failures: the remainder of the 1000 exposures per ad (kept as floats).
f = np.full(5, 1000.0) - s
# Beta shape parameters: one more than the success / failure counts.
alpha, beta = s + 1, f + 1

Now, let's generate draws from the beta distribution to help us calculate the Bayesian posterior probability that each campaign's true CTR is the highest across all campaigns.

In [65]:
# Monte Carlo sample of each campaign's CTR from Beta(alpha[i], beta[i]).
# CTR has one row per campaign and one column per draw.
N_DRAWS = 100000
# Dedicated seeded generator so that Restart & Run All reproduces these draws.
# A single Generator instance is shared across campaigns so each campaign
# consumes a different part of the random stream.
ctr_rng = np.random.default_rng(20210)

CTR = np.vstack([
    stats.beta.rvs(a=a_i, b=b_i, size=N_DRAWS, random_state=ctr_rng)
    for a_i, b_i in zip(alpha, beta)
])

# Transpose so that each row is one simulated draw and each column a campaign.
df_CTR = pd.DataFrame(
    CTR, index=['draws%d' % k for k in range(1, len(CTR) + 1)]
).T
df_CTR

Unnamed: 0,draws1,draws2,draws3,draws4,draws5
0,0.034689,0.043646,0.061557,0.026082,0.017945
1,0.049225,0.038211,0.050139,0.050671,0.035358
2,0.049939,0.027565,0.039771,0.044653,0.018269
3,0.054112,0.051600,0.050721,0.043510,0.032769
4,0.056000,0.038125,0.039536,0.036883,0.019688
...,...,...,...,...,...
99995,0.052289,0.032795,0.054018,0.038478,0.023840
99996,0.060705,0.043256,0.054565,0.043910,0.020197
99997,0.060901,0.033402,0.047308,0.049912,0.028071
99998,0.048001,0.048818,0.059454,0.046135,0.023075


Next, let's compute the maximum CTR across the five campaigns within each draw (row) and add it to our dataframe.

In [74]:
# Row-wise maximum across the campaign columns, computed in one vectorised
# call instead of a Python loop over every row. An existing 'max' column is
# excluded so the cell can be re-run without feeding its own output back in.
row_max = df_CTR.drop(columns='max', errors='ignore').max(axis=1)

# Kept as a plain list because later cells use len(max_list).
max_list = row_max.tolist()

df_CTR['max'] = max_list
df_CTR

Unnamed: 0,draws1,draws2,draws3,draws4,draws5,max
0,0.034689,0.043646,0.061557,0.026082,0.017945,0.061557
1,0.049225,0.038211,0.050139,0.050671,0.035358,0.050671
2,0.049939,0.027565,0.039771,0.044653,0.018269,0.049939
3,0.054112,0.051600,0.050721,0.043510,0.032769,0.054112
4,0.056000,0.038125,0.039536,0.036883,0.019688,0.056000
...,...,...,...,...,...,...
99995,0.052289,0.032795,0.054018,0.038478,0.023840,0.054018
99996,0.060705,0.043256,0.054565,0.043910,0.020197,0.060705
99997,0.060901,0.033402,0.047308,0.049912,0.028071,0.060901
99998,0.048001,0.048818,0.059454,0.046135,0.023075,0.059454


Now, let's create dummy variables to represent whether or not a campaign has the highest value. Then, we add the dummy variables into our dataframe.

In [192]:
# Indicator matrix: is_max[j, i] is 1.0 when campaign i+1 holds the row
# maximum in draw j and 0.0 otherwise.
# Columns are selected by label; integer keys on a string-labelled Series
# (e.g. df.iloc[j][5]) rely on a deprecated positional fallback in pandas.
draw_cols = ['draws1', 'draws2', 'draws3', 'draws4', 'draws5']
draw_values = df_CTR[draw_cols].to_numpy()
row_max = df_CTR['max'].to_numpy()

# Exact equality is safe here: 'max' was copied from one of these columns.
# Broadcasting compares every campaign column against the row maximum at once.
is_max = (draw_values == row_max[:, None]).astype(float)

df_ismax = pd.DataFrame(is_max, columns=['ismax1','ismax2','ismax3','ismax4','ismax5'])
new_df = pd.concat([df_CTR, df_ismax], axis=1)
new_df

Unnamed: 0,draws1,draws2,draws3,draws4,draws5,max,ismax1,ismax2,ismax3,ismax4,ismax5
0,0.034689,0.043646,0.061557,0.026082,0.017945,0.061557,0.0,0.0,1.0,0.0,0.0
1,0.049225,0.038211,0.050139,0.050671,0.035358,0.050671,0.0,0.0,0.0,1.0,0.0
2,0.049939,0.027565,0.039771,0.044653,0.018269,0.049939,1.0,0.0,0.0,0.0,0.0
3,0.054112,0.051600,0.050721,0.043510,0.032769,0.054112,1.0,0.0,0.0,0.0,0.0
4,0.056000,0.038125,0.039536,0.036883,0.019688,0.056000,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
99995,0.052289,0.032795,0.054018,0.038478,0.023840,0.054018,0.0,0.0,1.0,0.0,0.0
99996,0.060705,0.043256,0.054565,0.043910,0.020197,0.060705,1.0,0.0,0.0,0.0,0.0
99997,0.060901,0.033402,0.047308,0.049912,0.028071,0.060901,1.0,0.0,0.0,0.0,0.0
99998,0.048001,0.048818,0.059454,0.046135,0.023075,0.059454,0.0,0.0,1.0,0.0,0.0


Next, let's compute the average of each "ismax" column to obtain the probability (i.e., the fraction of draws) that each campaign's CTR is the highest.

In [228]:
# Mean of each indicator column (positions 6-10 of new_df), i.e. the fraction
# of draws in which the corresponding campaign held the maximum.
CTR_highest = [new_df.iloc[:, col].mean() for col in range(6, 11)]

for ad_number, share in enumerate(CTR_highest, start=1):
    print('Highest CTR for ad %d: %s' % (ad_number, share))

Highest CTR for ad 1: 0.4694
Highest CTR for ad 2: 0.01672
Highest CTR for ad 3: 0.39416
Highest CTR for ad 4: 0.11971
Highest CTR for ad 5: 1e-05


### Part 2:

For our second task, let's load the data for our post-click transaction profit volumes.

In [127]:
# Load the volumes sheet from the Excel workbook. The workbook is opened in a
# `with` block so its file handle is closed as soon as the sheet is parsed.
with pd.ExcelFile('volumes.dataset.2.xlsx') as xl_file2:
    profit_volume = xl_file2.parse("volumes.dataset.2")

# Index the table by its 'cust' column (new frame rather than in-place mutation).
profit_volume = profit_volume.set_index('cust')
profit_volume

Unnamed: 0_level_0,ad,volume
cust,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,32
2,1,54
3,1,31
4,1,24
5,1,42
...,...,...
207,5,47
208,5,61
209,5,63
210,5,102


Next, let's compute some descriptive statistics such as sample mean, standard error of the mean, and bayes posterior degrees of freedom for each campaign.

In [187]:
def _campaign_volume_stats(volumes):
    """Return ``[mean, standard error of the mean, n + 1]`` for one campaign.

    Parameters
    ----------
    volumes : pandas.Series
        The 'volume' values belonging to a single campaign.

    Returns
    -------
    numpy.ndarray
        Three floats: the sample mean, the sample standard deviation
        (pandas default) divided by sqrt(n), and n + 1, which is reported
        downstream under the 'bayes DF' column.
    """
    n = len(volumes)
    # NOTE(review): the degrees of freedom are taken as n + 1; under the usual
    # non-informative prior the posterior t has n - 1 -- confirm the intended prior.
    return np.array([volumes.mean(), volumes.std() / math.sqrt(n), n + 1])


# One sub-frame per campaign (ads 1-5), kept under their individual names.
volume_ad1, volume_ad2, volume_ad3, volume_ad4, volume_ad5 = (
    profit_volume[profit_volume['ad'] == ad] for ad in range(1, 6)
)

# The same three statistics for every campaign, computed by a single helper
# instead of five copy-pasted stanzas.
stats_ad1, stats_ad2, stats_ad3, stats_ad4, stats_ad5 = (
    _campaign_volume_stats(frame['volume'])
    for frame in (volume_ad1, volume_ad2, volume_ad3, volume_ad4, volume_ad5)
)

desc_stats = np.vstack((stats_ad1,stats_ad2,stats_ad3,stats_ad4,stats_ad5))
df_desc = pd.DataFrame(desc_stats, columns=['mean','SE','bayes DF'], index=['ad1','ad2','ad3','ad4','ad5'])
df_desc

Unnamed: 0,mean,SE,bayes DF
ad1,42.076923,1.391324,53.0
ad2,57.289474,2.82211,39.0
ad3,41.803922,1.945134,52.0
ad4,47.888889,2.017655,46.0
ad5,90.64,6.137285,26.0


Now, let's generate draws from the t distribution to help us calculate the Bayesian posterior probability that each campaign's average profit volume is the highest across all campaigns.

In [186]:
# Monte Carlo sample of each campaign's mean volume: Student-t draws with the
# campaign's 'bayes DF' degrees of freedom, centred at its sample mean and
# scaled by its standard error (loc + scale * t).
N_DRAWS = 100000
# Dedicated seeded generator so that Restart & Run All reproduces these draws.
vol_rng = np.random.default_rng(20211)

# Statistics are read by column label; integer keys on a string-labelled
# Series (e.g. df_desc.iloc[i][2]) rely on a deprecated positional fallback.
postclick_vol = np.vstack([
    stats.t.rvs(
        df=ad_stats['bayes DF'],
        loc=ad_stats['mean'],
        scale=ad_stats['SE'],
        size=N_DRAWS,
        random_state=vol_rng,
    )
    for _, ad_stats in df_desc.iterrows()
])

# Transpose so that each row is one simulated draw and each column a campaign.
df_volume = pd.DataFrame(
    postclick_vol, index=['draws%d' % k for k in range(1, len(postclick_vol) + 1)]
).T
df_volume

Unnamed: 0,draws1,draws2,draws3,draws4,draws5
0,41.414638,57.268019,40.604129,48.233677,87.497236
1,40.872692,59.606526,45.451489,47.140667,87.586751
2,41.686182,61.654093,39.365090,48.771830,87.899729
3,43.863786,54.861237,42.569847,49.038843,77.115218
4,44.142264,54.478950,44.735735,46.308916,92.115540
...,...,...,...,...,...
99995,40.741011,58.676376,44.767033,46.256995,92.985481
99996,41.967436,57.525143,40.268815,47.389765,82.995030
99997,40.296850,57.681598,42.616639,49.049392,88.406830
99998,43.112979,66.294492,41.658700,46.162219,92.922698


Next, we compute the max of the draws for volume and add it to our dataframe.

In [189]:
# Row-wise maximum across the campaign columns, computed in one vectorised
# call instead of a Python loop over every row. An existing 'max' column is
# excluded so the cell can be re-run without feeding its own output back in.
row_max2 = df_volume.drop(columns='max', errors='ignore').max(axis=1)

# Kept as a plain list because later cells use len(max_list2).
max_list2 = row_max2.tolist()

df_volume['max'] = max_list2
df_volume

Unnamed: 0,draws1,draws2,draws3,draws4,draws5,max
0,41.414638,57.268019,40.604129,48.233677,87.497236,87.497236
1,40.872692,59.606526,45.451489,47.140667,87.586751,87.586751
2,41.686182,61.654093,39.365090,48.771830,87.899729,87.899729
3,43.863786,54.861237,42.569847,49.038843,77.115218,77.115218
4,44.142264,54.478950,44.735735,46.308916,92.115540,92.115540
...,...,...,...,...,...,...
99995,40.741011,58.676376,44.767033,46.256995,92.985481,92.985481
99996,41.967436,57.525143,40.268815,47.389765,82.995030,82.995030
99997,40.296850,57.681598,42.616639,49.049392,88.406830,88.406830
99998,43.112979,66.294492,41.658700,46.162219,92.922698,92.922698


Now, let's create dummy variables to represent whether or not a campaign has the highest value. Then, we add the dummy variables into our dataframe.

In [191]:
# Indicator matrix: is_max2[j, i] is 1.0 when campaign i+1 holds the row
# maximum in draw j and 0.0 otherwise.
# Columns are selected by label; integer keys on a string-labelled Series
# (e.g. df.iloc[j][5]) rely on a deprecated positional fallback in pandas.
draw_cols2 = ['draws1', 'draws2', 'draws3', 'draws4', 'draws5']
draw_values2 = df_volume[draw_cols2].to_numpy()
row_max2 = df_volume['max'].to_numpy()

# Exact equality is safe here: 'max' was copied from one of these columns.
# Broadcasting compares every campaign column against the row maximum at once.
is_max2 = (draw_values2 == row_max2[:, None]).astype(float)

df_ismax2 = pd.DataFrame(is_max2, columns=['ismax1','ismax2','ismax3','ismax4','ismax5'])
new_df2 = pd.concat([df_volume, df_ismax2], axis=1)
new_df2

Unnamed: 0,draws1,draws2,draws3,draws4,draws5,max,ismax1,ismax2,ismax3,ismax4,ismax5
0,41.414638,57.268019,40.604129,48.233677,87.497236,87.497236,0.0,0.0,0.0,0.0,1.0
1,40.872692,59.606526,45.451489,47.140667,87.586751,87.586751,0.0,0.0,0.0,0.0,1.0
2,41.686182,61.654093,39.365090,48.771830,87.899729,87.899729,0.0,0.0,0.0,0.0,1.0
3,43.863786,54.861237,42.569847,49.038843,77.115218,77.115218,0.0,0.0,0.0,0.0,1.0
4,44.142264,54.478950,44.735735,46.308916,92.115540,92.115540,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
99995,40.741011,58.676376,44.767033,46.256995,92.985481,92.985481,0.0,0.0,0.0,0.0,1.0
99996,41.967436,57.525143,40.268815,47.389765,82.995030,82.995030,0.0,0.0,0.0,0.0,1.0
99997,40.296850,57.681598,42.616639,49.049392,88.406830,88.406830,0.0,0.0,0.0,0.0,1.0
99998,43.112979,66.294492,41.658700,46.162219,92.922698,92.922698,0.0,0.0,0.0,0.0,1.0


Next, let's compute the average of each "ismax" column to obtain the probability (i.e., the fraction of draws) that each campaign's average profit volume is the highest.

In [229]:
# Mean of each indicator column (positions 6-10 of new_df2), i.e. the fraction
# of draws in which the corresponding campaign held the maximum.
vol_highest = [new_df2.iloc[:, col].mean() for col in range(6, 11)]

for ad_number, share in enumerate(vol_highest, start=1):
    print('Highest post-click volume for ad %d: %s' % (ad_number, share))

Highest post-click volume for ad 1: 0.0
Highest post-click volume for ad 2: 2e-05
Highest post-click volume for ad 3: 0.0
Highest post-click volume for ad 4: 0.0
Highest post-click volume for ad 5: 0.99998


### Part 3:

Now, we can compute the Bayesian posterior probability that each campaign's true expected volume per exposure is the highest.

First, we will multiply the CTR by the average profit volume "m".

In [213]:
# Campaign draws only: every column of each frame except its 'max' column.
CTRs = df_CTR.drop(columns='max', errors='ignore').to_numpy()
volumes = df_volume.drop(columns='max', errors='ignore').to_numpy()

# Element-wise product of the CTR draws and the volume draws.
CTR_m = np.multiply(CTRs, volumes)

product_labels = ['CTR_m1','CTR_m2','CTR_m3','CTR_m4','CTR_m5']
df_CTR_m = pd.DataFrame(CTR_m, columns=product_labels)
df_CTR_m

Unnamed: 0,CTR_m1,CTR_m2,CTR_m3,CTR_m4,CTR_m5
0,1.436623,2.499537,2.499480,1.258051,1.570102
1,2.011951,2.277637,2.278901,2.388683,3.096896
2,2.081761,1.699480,1.565606,2.177828,1.605845
3,2.373550,2.830825,2.159165,2.133686,2.526995
4,2.471986,2.077024,1.768682,1.708027,1.813528
...,...,...,...,...,...
99995,2.130298,1.924298,2.418221,1.779869,2.216768
99996,2.547627,2.488331,2.197255,2.080883,1.676219
99997,2.454121,1.926668,2.016089,2.448133,2.481625
99998,2.069486,3.236341,2.476782,2.129717,2.144203


Next, we compute the max of the CTR\*m and add it to our dataframe.

In [214]:
# Row-wise maximum across the campaign columns, computed in one vectorised
# call instead of a Python loop over every row. An existing 'max' column is
# excluded so the cell can be re-run without feeding its own output back in.
row_max3 = df_CTR_m.drop(columns='max', errors='ignore').max(axis=1)

# Kept as a plain list because later cells use len(max_list3).
max_list3 = row_max3.tolist()

df_CTR_m['max'] = max_list3
df_CTR_m

Unnamed: 0,CTR_m1,CTR_m2,CTR_m3,CTR_m4,CTR_m5,max
0,1.436623,2.499537,2.499480,1.258051,1.570102,2.499537
1,2.011951,2.277637,2.278901,2.388683,3.096896,3.096896
2,2.081761,1.699480,1.565606,2.177828,1.605845,2.177828
3,2.373550,2.830825,2.159165,2.133686,2.526995,2.830825
4,2.471986,2.077024,1.768682,1.708027,1.813528,2.471986
...,...,...,...,...,...,...
99995,2.130298,1.924298,2.418221,1.779869,2.216768,2.418221
99996,2.547627,2.488331,2.197255,2.080883,1.676219,2.547627
99997,2.454121,1.926668,2.016089,2.448133,2.481625,2.481625
99998,2.069486,3.236341,2.476782,2.129717,2.144203,3.236341


Now, let's create dummy variables to represent whether or not a campaign has the highest value. Then, we add the dummy variables into our dataframe.

In [215]:
# Indicator matrix: is_max3[j, i] is 1.0 when campaign i+1 holds the row
# maximum in draw j and 0.0 otherwise.
# Columns are selected by label; integer keys on a string-labelled Series
# (e.g. df.iloc[j][5]) rely on a deprecated positional fallback in pandas.
product_cols = ['CTR_m1', 'CTR_m2', 'CTR_m3', 'CTR_m4', 'CTR_m5']
product_values = df_CTR_m[product_cols].to_numpy()
row_max3 = df_CTR_m['max'].to_numpy()

# Exact equality is safe here: 'max' was copied from one of these columns.
# Broadcasting compares every campaign column against the row maximum at once.
is_max3 = (product_values == row_max3[:, None]).astype(float)

df_ismax3 = pd.DataFrame(is_max3, columns=['ismax1','ismax2','ismax3','ismax4','ismax5'])
new_df3 = pd.concat([df_CTR_m, df_ismax3], axis=1)
new_df3

Unnamed: 0,CTR_m1,CTR_m2,CTR_m3,CTR_m4,CTR_m5,max,ismax1,ismax2,ismax3,ismax4,ismax5
0,1.436623,2.499537,2.499480,1.258051,1.570102,2.499537,0.0,1.0,0.0,0.0,0.0
1,2.011951,2.277637,2.278901,2.388683,3.096896,3.096896,0.0,0.0,0.0,0.0,1.0
2,2.081761,1.699480,1.565606,2.177828,1.605845,2.177828,0.0,0.0,0.0,1.0,0.0
3,2.373550,2.830825,2.159165,2.133686,2.526995,2.830825,0.0,1.0,0.0,0.0,0.0
4,2.471986,2.077024,1.768682,1.708027,1.813528,2.471986,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
99995,2.130298,1.924298,2.418221,1.779869,2.216768,2.418221,0.0,0.0,1.0,0.0,0.0
99996,2.547627,2.488331,2.197255,2.080883,1.676219,2.547627,1.0,0.0,0.0,0.0,0.0
99997,2.454121,1.926668,2.016089,2.448133,2.481625,2.481625,0.0,0.0,0.0,0.0,1.0
99998,2.069486,3.236341,2.476782,2.129717,2.144203,3.236341,0.0,1.0,0.0,0.0,0.0


Next, let's compute the average of each "ismax" column to obtain the probability (i.e., the fraction of draws) that each campaign's "CTR\*m" is the highest.

In [230]:
# Mean of each indicator column (positions 6-10 of new_df3), i.e. the fraction
# of draws in which the corresponding campaign held the maximum.
CTRm_highest = [new_df3.iloc[:, col].mean() for col in range(6, 11)]

for ad_number, share in enumerate(CTRm_highest, start=1):
    print('Highest volume per exposure for ad %d: %s' % (ad_number, share))

Highest volume per exposure for ad 1: 0.16838
Highest volume per exposure for ad 2: 0.19963
Highest volume per exposure for ad 3: 0.13204
Highest volume per exposure for ad 4: 0.15933
Highest volume per exposure for ad 5: 0.34062
