## Step 1. Explore data set

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_columns', None)
from pandas.io import sql
import sqlite3
import scipy
from scipy import stats

In [2]:
test_data = pd.read_csv("case_study_dataset.csv")

In [3]:
print test_data.shape
test_data.head()

(500000, 11)


Unnamed: 0,vuid,param_value,browser_name,browser_capability,total_play,gross_secs_played,comment,like,follow,share,login
0,698785.0738,1,chrome,desktop,0,0.0,0,0,0,0,0
1,276157.069243,1,chrome,desktop,2,101.96,0,0,0,1,0
2,2465.57762,1,firefox,desktop,0,0.0,0,0,0,0,0
3,517481.191739,1,chrome,desktop,0,0.0,0,0,0,0,0
4,287185.465177,2,chrome,desktop,0,0.0,0,0,0,0,0


In [4]:
test_data.isnull().sum() # No null values

vuid                  0
param_value           0
browser_name          0
browser_capability    0
total_play            0
gross_secs_played     0
comment               0
like                  0
follow                0
share                 0
login                 0
dtype: int64

In [5]:
conn = sqlite3.connect('vimeo.db', detect_types=sqlite3.PARSE_DECLTYPES)
test_data.to_sql('test_data',
            con=conn,
            if_exists='replace',
            index=False)

In [6]:
# Create table of results
variant_binary_results=sql.read_sql("""
SELECT
param_value,
COUNT(param_value) AS num_visitors,
SUM(CASE WHEN total_play>=1 THEN 1 ELSE 0 END) AS num_viewed_video,
SUM(CASE WHEN comment>=1 THEN 1 ELSE 0 END) AS num_comments,
SUM(CASE WHEN like>=1 THEN 1 ELSE 0 END) AS num_likes,
SUM(CASE WHEN follow>=1 THEN 1 ELSE 0 END) AS num_follows,
SUM(CASE WHEN share>=1 THEN 1 ELSE 0 END) AS num_shares,
SUM(CASE WHEN login>=1 THEN 1 ELSE 0 END) AS num_logins,
AVG(total_play) AS average_num_videos,
AVG(gross_secs_played) AS average_time_watching

FROM
test_data

GROUP BY 1
""", con=conn)
variant_binary_results['percent_viewed_video'] = variant_binary_results['num_viewed_video']/variant_binary_results['num_visitors']*100
variant_binary_results['percent_comment'] = variant_binary_results['num_comments']/variant_binary_results['num_visitors']*100
variant_binary_results['percent_like'] = variant_binary_results['num_likes']/variant_binary_results['num_visitors']*100
variant_binary_results['percent_follow'] = variant_binary_results['num_follows']/variant_binary_results['num_visitors']*100
variant_binary_results['percent_share'] = variant_binary_results['num_shares']/variant_binary_results['num_visitors']*100
variant_binary_results['percent_login'] = variant_binary_results['num_logins']/variant_binary_results['num_visitors']*100
variant_binary_results

Unnamed: 0,param_value,num_visitors,num_viewed_video,num_comments,num_likes,num_follows,num_shares,num_logins,average_num_videos,average_time_watching,percent_viewed_video,percent_comment,percent_like,percent_follow,percent_share,percent_login
0,1,452357,76554,117,844,411,1625,1004,0.574438,112.059166,16.923359,0.025865,0.186578,0.090857,0.35923,0.221949
1,2,23886,3566,8,42,25,101,42,0.504773,94.942602,14.929247,0.033492,0.175835,0.104664,0.422842,0.175835
2,3,23757,3697,6,57,46,108,59,0.52132,97.543716,15.561729,0.025256,0.239929,0.193627,0.454603,0.248348


## Step 2. Compare values for binary outcomes and determine whether differences are statistically significant

In [7]:
visitors_var1 = variant_binary_results['num_visitors'][variant_binary_results['param_value']==1].values
visitors_var2 = variant_binary_results['num_visitors'][variant_binary_results['param_value']==2].values
print "Test p values for variation 2:"
for i in ['num_viewed_video','num_comments','num_likes','num_follows','num_shares','num_logins']:
    target_performers1 = variant_binary_results[i][variant_binary_results['param_value']==1].values
    target_performers2 = variant_binary_results[i][variant_binary_results['param_value']==2].values
    _,p_val,_,_ = scipy.stats.chi2_contingency([[visitors_var1-target_performers1,target_performers1],
                                                [visitors_var2-target_performers2,target_performers2]])
    print i,p_val

Test p values for variation 2:
num_viewed_video 1.05172578194e-15
num_comments 0.614008150604
num_likes 0.765333911254
num_follows 0.563359053523
num_shares 0.123743661044
num_logins 0.157712181227


In [8]:
visitors_var1 = variant_binary_results['num_visitors'][variant_binary_results['param_value']==1].values
visitors_var3 = variant_binary_results['num_visitors'][variant_binary_results['param_value']==3].values
print "Test p values for variation 3:"
for i in ['num_viewed_video','num_comments','num_likes','num_follows','num_shares','num_logins']:
    target_performers1 = variant_binary_results[i][variant_binary_results['param_value']==1].values
    target_performers3 = variant_binary_results[i][variant_binary_results['param_value']==3].values
    _,p_val,_,_ = scipy.stats.chi2_contingency([[visitors_var1-target_performers1,target_performers1],
                                                [visitors_var2-target_performers3,target_performers3]])
    print i,p_val

Test p values for variation 3:
num_viewed_video 6.30253290489e-09
num_comments 0.891247277683
num_likes 0.0839919255319
num_follows 1.28906419529e-06
num_shares 0.0232536543734
num_logins 0.465724698528


#### Based on these significance tests, variation 1 had a significantly higher percentage of visitors who watched a video. Variation 3 had a significantly higher percentage of users that 'followed' and 'shared'.

## Step 3. Compare values for continuous outcomes and determine whether differences are statistically significant

In [9]:
sec_var1 = test_data['gross_secs_played'][test_data.param_value==1].copy()
sec_var2 = test_data['gross_secs_played'][test_data.param_value==2].copy()
sec_var3 = test_data['gross_secs_played'][test_data.param_value==3].copy()

In [10]:
two_sample_1 = stats.ttest_ind(sec_var1.values, sec_var2.values,equal_var=False)
print "When comparing gross play time for variations 1 and 2, the t-statistic is %.3f and the p-value is %.3f." % two_sample_1

When comparing gross play time for variations 1 and 2, the t-statistic is 2.913 and the p-value is 0.004.


In [11]:
two_sample_2 = stats.ttest_ind(sec_var1.values, sec_var3.values, equal_var=False)
print "When comparing gross play time for variations 1 and 3, the t-statistic is %.3f and the p-value is %.3f." % two_sample_2

When comparing gross play time for variations 1 and 3, the t-statistic is 2.514 and the p-value is 0.012.


#### Based on these two sample t-tests, variation 1 had a statistically higher average play time compared to variations 2 and 3

In [12]:
vid_var1 = test_data['total_play'][test_data.param_value==1].copy()
vid_var2 = test_data['total_play'][test_data.param_value==2].copy()
vid_var3 = test_data['total_play'][test_data.param_value==3].copy()

In [13]:
two_sample_3 = stats.ttest_ind(vid_var1.values, vid_var2.values,equal_var=False)
print "When comparing number of plays for variations 1 and 2, the t-statistic is %.3f and the p-value is %.3f." % two_sample_3

When comparing number of plays for variations 1 and 2, the t-statistic is 3.078 and the p-value is 0.002.


In [14]:
two_sample_4 = stats.ttest_ind(vid_var1.values, vid_var3.values, equal_var=False)
print "When comparing number of plays for variations 1 and 3, the t-statistic is %.3f and the p-value is %.3f." % two_sample_4

When comparing number of plays for variations 1 and 3, the t-statistic is 2.188 and the p-value is 0.029.


#### Based on these two sample t-tests, variation 1 had a statistically higher average number of videos watched compared to variations 2 and 3