In [None]:
## General Structure for all Notebooks 
## 1) Describe the problem being tackled, source of data, output 
## 2) Import libraries/set up wd; general working setup
## 3) Examine the quality of the data 
## 4) Build the right dataset 
## 5) Apply the structure

# Problem 
### What are we solving for?

https://product-data-science.datamasked.com/courses/496549/lectures/9194606

* Control = 66% @ £39
* Variant = 33% @ £59
* Experiment is running for 'some time'; outstanding questions: 


1. Should the product be sold @ £39 or £59? 
2. The VP of Product is interested in having a holistic view into user behavior, especially focusing on actionable insights that might increase conversion rate. What are your main findings looking at the data?


3. The VP of Product feels that the test has been running for too long and they should have been able to get statistically significant results in a shorter time. Do you agree with this? After how many days you would have stopped the test? Please, explain why.

In [1]:
import pandas as pd 
import numpy as np
import os 
import datetime

import warnings 
warnings.filterwarnings('ignore')

from termcolor import colored 
import matplotlib as plt 
import seaborn as sns 

wd=os.getcwd()
print(wd)

/Users/annadudek/00_DataMasked


In [2]:
events_raw = pd.read_csv(wd + '/06_pricing_test_results.csv')
users_raw = pd.read_csv(wd + '/06_pricing_user_table.csv')

In [3]:
def describe_data(data):
    print(f"shape: {data.shape}")
    print('')
    print(data.dtypes)

    
    ## assume first id column in the dataset (in terms of left to right order)
    try:
        id_column=data.filter(like='id').columns.to_list()[0]
        print('')
        print(f"Column taken as id: {id_column}")
        print(f"If {len(data)} = {len(data[id_column].unique())}, then dataframe is at this level") 

    except:
        print(colored('No id column','red', attrs=['bold']))

    
    print('')
    return data.head(3)


In [4]:
describe_data(events_raw)

shape: (316800, 8)

user_id              int64
timestamp           object
source              object
device              object
operative_system    object
test                 int64
price                int64
converted            int64
dtype: object

Column taken as id: user_id
If 316800 = 316800, then dataframe is at this level



Unnamed: 0,user_id,timestamp,source,device,operative_system,test,price,converted
0,604839,2015-05-08 03:38:34,ads_facebook,mobile,iOS,0,39,0
1,624057,2015-05-10 21:08:46,seo-google,mobile,android,0,39,0
2,317970,2015-04-04 15:01:23,ads-bing,mobile,android,0,39,0


In [5]:
describe_data(users_raw)

shape: (275616, 5)

user_id      int64
city        object
country     object
lat        float64
long       float64
dtype: object

Column taken as id: user_id
If 275616 = 275616, then dataframe is at this level



Unnamed: 0,user_id,city,country,lat,long
0,510335,Peabody,USA,42.53,-70.97
1,89568,Reno,USA,39.54,-119.82
2,434134,Rialto,USA,34.11,-117.39


In [6]:
def set_dtypes(data): 
    
    """ Transforms columns with 'date' or 'time' into date types. 
        Transforms any two-level columns into categories.
        
        USAGE: new_df = set_dtypes(example_df)"""

    ### transforms dates 
    to_transform_dates=data.filter(like='date').columns.to_list()
    if len(to_transform_dates) == 0:
        print('no objects named date; trying time')
        to_transform_dates=data.filter(like='time').columns.to_list()
        for i in to_transform_dates:
#             data[i] = pd.to_datetime(data[i])
            data[['date','time']] = data[i].str.split(expand=True)
            data['datetime'] = (pd.to_datetime(data.pop('date'), format='%Y-%m-%d') + 
                          pd.to_timedelta(data.pop('time')))
    else: 
        for i in to_transform_dates:
#             data[i] = pd.to_datetime(data[i])
            data[['date','time']] = data[i].str.split(expand=True)
            data['datetime'] = (pd.to_datetime(data.pop('date'), format='%Y-%m-%d') + 
                          pd.to_timedelta(data.pop('time')))
    
    print('')
            
     ### transforms numerics to binary      
    to_transform_binary = data.select_dtypes('number').columns.tolist()
    print(to_transform_binary)
    binary_to_drop = input("Enter any columns that need omitting from binary transformation ")
    binary_to_drop = list(binary_to_drop.split(' '))  
    
    try: 
        for x in binary_to_drop: 
            to_transform_binary.remove(x)
    except: to_transform_binary
        
    print('')
    for col in to_transform_binary:
        if len(data[col].unique())>3:
            to_transform_binary.remove(col)
        else: 
            data[col] = data[col].astype('category')
            
            
    print(data.dtypes)    
    return data

In [7]:
events_raw = set_dtypes(events_raw)
# events_raw['test'] = events_raw['test'].replace({0: 'control', 1: 'variant'})


no objects named date; trying time

['user_id', 'test', 'price', 'converted']
Enter any columns that need omitting from binary transformation user_id price

user_id                      int64
timestamp                   object
source                      object
device                      object
operative_system            object
test                      category
price                        int64
converted                 category
datetime            datetime64[ns]
dtype: object


In [8]:
master_raw = pd.merge(events_raw, users_raw, how = 'left', on = 'user_id')

In [11]:

## transform to dates should always run first because all date-like objects will be considered an object 
## and will be made into dummies, which will unnecessarily explode the number of dummies 

def make_dummies(data): 
    
    """ Takes all object columns and transforms them to dummy variables. Asks for input whether any variables
        should be omitted. 
        
        USAGE: new_df = make_dummies(example_df)"""
    
    dummies_list = []
    variables=data.select_dtypes('object').columns.tolist()
    print("number of unique values within object variables")
    for v in variables: 
        print(f"{v}  {len(data[v].unique())}")
    
    
    print(variables)
    variables_to_drop = input("Enter any columns that need omitting from becoming dummies ")
    variables_to_drop = list(variables_to_drop.split(' '))  
    
    try: 
        for x in variables_to_drop: 
            variables.remove(x)
    except: variabes
    print('')
    
    
    for var in variables:   
        dummies_list = pd.get_dummies(data[var]).rename(columns=lambda x: str(var) + '_' + str(x))
        data=data.join(dummies_list)
#         del dummies_list
    
    return data    
    


def categorical_randomization(data, exp_group_col):
    
    """ Takes all the categorical variables in the dataframe and creates frequency tables by 
    Control v Variant in order to check randomization. """
    categorical_variables = data.select_dtypes('uint8').columns.tolist()   ## picks up on only dummied variables
#     print(categorical_variables)
#     columns_to_drop = input("Enter any columns that need omitting")
#     columns_to_drop = list(columns_to_drop.split(' '))  
    
#     try: 
#         for x in columns_to_drop: 
#             categorical_variables.remove(x)
#     except: categorical_variables 
#     print('')
              
    
    freq_table=pd.DataFrame()
    for i in categorical_variables: 
        var_table=pd.DataFrame(data[[exp_group_col,i]].pivot_table(index=exp_group_col, columns=i, 
                                aggfunc=len, fill_value=0)).reset_index()
#         print(var_table)

        levels_list = var_table.columns.tolist()
        levels_list.remove(exp_group_col)

        for l in levels_list: 
            var_table[str("pct_" + str(l))] = var_table[l]/var_table.sum(axis=1)
        
        freq_table = pd.concat([freq_table, var_table], axis =1)
    
        
    
    
    """Clean up freq_table & flag problematic randomization VARIANCE BETWEEN GROUPS > 5% """
    
    freq_table=freq_table.transpose()
    freq_table.reset_index(inplace=True)
    freq_table.columns = ['categorical_variable', 'Control', 'Variant']
    freq_table=freq_table[freq_table.categorical_variable != exp_group_col]
    
    freq_table['pct_variable']=freq_table['categorical_variable'].str.contains("pct")
    freq_table['group_variance'] = np.where(freq_table['pct_variable'] == True,
                                            abs((freq_table['Control'] - freq_table['Variant'])/freq_table['Control']) , 0)


    print(colored('Following categorical variables have greater than 5% variance between Control & Variant',
                  'red', attrs=['bold']))
    print("")
    print(freq_table[freq_table.group_variance >= 0.05])
    
    
    pd.options.display.max_rows = 1000                              ## increase the length you can see in notebooks
    return freq_table                                


In [10]:
master_raw= make_dummies(master_raw)

number of unique values within object variables
timestamp  140931
source  12
device  2
operative_system  6
city  924
country  2
['timestamp', 'source', 'device', 'operative_system', 'city', 'country']
Enter any columns that need omitting from becoming dummies timestamp city



In [12]:
randomization_results=categorical_randomization(data= master_raw, exp_group_col = 'test')


source_ads-bing test       0      1
0                  0  188118  14609
1                  1  105809   8264
[0, 1]
source_ads-google test       0      1
0                    0  164863  37864
1                    1   92558  21515
[0, 1]
source_ads-yahoo test       0     1
0                   0  197909  4818
1                   1  111308  2765
[0, 1]
source_ads_facebook test       0      1
0                      0  168628  34099
1                      1   94776  19297
[0, 1]
source_ads_other test       0      1
0                   0  183443  19284
1                   1  103481  10592
[0, 1]
source_direct_traffic test       0      1
0                        0  163968  38759
1                        1   92475  21598
[0, 1]
source_friend_referral test       0      1
0                         0  189592  13135
1                         1  106513   7560
[0, 1]
source_seo-bing test       0     1
0                  0  201393  1334
1                  1  113254   819
[0, 1]
source_seo-google test 

In [None]:
categorical_variables = master_raw.select_dtypes('uint8').columns.tolist()   ## picks up on only dummied variables
categorical_variables

In [18]:
var_table = pd.DataFrame(master_raw[['test','operative_system_other']].pivot_table(index='test', columns='operative_system_other', 
                                aggfunc=len, fill_value=0)).reset_index()

var_table[str("pct_" + str(0))] = var_table[0]/var_table.sum(axis=1)
var_table[str("pct_" + str(1))] = var_table[1]/var_table.sum(axis=1)
var_table        
# freq_table = pd.concat([freq_table, var_table], axis =1)

operative_system_other,test,0,1,pct_0,pct_1
0,0,192229,10498,0.948216,0.051784
1,1,108367,5706,0.949971,0.05002
