In [2]:
import numpy as np
import pandas as pd

In [4]:
# Load the experiment export and discard the identifier, volume-metric and
# outlier-indicator columns that the balance checks below do not use.
# NOTE: despite its name, __DATA_DIR__ holds the path of the CSV file itself.
__DATA_DIR__ = "./drive/MyDrive/data-playground/sample_data_final.csv"

df = (
    pd.read_csv(__DATA_DIR__)
      .drop(columns = [
          "user_id",
          "orders",
          "room_nights",
          "booking_gbv",
          "orders_outlier_ind",
          "room_nights_outlier_ind",
          "booking_gbv_outlier_ind",
      ])
      .reset_index(drop = True)
)

# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0.1,Unnamed: 0,experiment_variant_code,new_return_visitor,platform_type,posa_super_region,marketing_channel,conv_ind
0,0,VARIANT,RETURN,MOBILE,EMEA,SEM,0
1,1,CONTROL,NEW,MOBILE,APAC,SEM,0
2,3,VARIANT,RETURN,DESKTOP,APAC,META,0
3,4,CONTROL,NEW,MOBILE,EMEA,SEM,0
4,5,VARIANT,NEW,MOBILE,N_AM,DIRECT,0


### Util: Detect group label bias on covariates

In [15]:
def testAdversarial(A_group,
                    B_group,
                    covariates,
                    sample_frac = 1.0,
                    thr_max = 0.51,
                    thr_min = 0.49,
                    verbose = False
                    ):
  """
  Check if the group variants were stratified correctly using adverserial strategy

  Parameters:
  * A_group: Group A
  * B_group: Group B
  * covariates: List of covariates
  * sample_frac: Sample of fraction % for faster validation (default 1)
  * thr_max: Max threshold for ROC (default 0.51)
  * thr_min: Min threshold for ROC (default 0.49)
  * verbose: if True then print all step status

  Returns: True / False
  """ 
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.model_selection import cross_val_predict
  from sklearn.metrics import roc_auc_score

  if verbose == True:
    print("Adverserial validation started...")
  if len(covariates) < 1:
    return False
  
  A_deepcopy = A_group.copy(deep = True)
  B_deepcopy = B_group.copy(deep = True)

  # if sample fraction is specified, extract the subsets randomly
  if (sample_frac < 1.0) and (sample_frac > 0.0):
    if verbose == True:
      print("Sampling {} of both datasets".format(sample_frac))

    A_deepcopy = A_deepcopy.sample(frac = sample_frac)
    B_deepcopy = B_deepcopy.sample(frac = sample_frac)
    
  # Prepare a dataset by combining A & B and put labels of the groups - 2 classes
  if verbose == True:
    print("Combine A & B groups and create group label for each")
  X = A_deepcopy.append(B_deepcopy) 
  y = [0]*len(A_deepcopy) + [1]*len(B_deepcopy) # put a pseudolabel

  # convert all categorical variables to binary 
  X = pd.get_dummies(X,columns = covariates)

  if verbose == True:
    print("Run a classifier to distinguish between the 2 datasets")
  model = RandomForestClassifier() # use RandomForest here but could be any classifier
  
  # do cross-val and output prediction of pseudo-label
  cv_preds = cross_val_predict(model, 
                               X, 
                               y, 
                               cv=2, 
                               n_jobs = None,
                               method = "predict_proba",
                               verbose = verbose)
  
  roc_score = roc_auc_score(y_true = y, y_score = cv_preds[:,1])
  if verbose == True:
    print ("ROC Score = {}".format(roc_score))
    print ("Adverserial validation finished.")

  return thr_min <= roc_score <= thr_max

In [13]:
# Example run: split the frame into its two experiment arms (dropping the arm
# label itself) and run the adversarial balance check on a 10% sample.

features = ["new_return_visitor","platform_type","posa_super_region","marketing_channel"]

train = (
    df.loc[df["experiment_variant_code"].eq("CONTROL")]
      .drop(columns = "experiment_variant_code")
)
test = (
    df.loc[df["experiment_variant_code"].eq("VARIANT")]
      .drop(columns = "experiment_variant_code")
)

print(testAdversarial(A_group = train,
                      B_group = test,
                      covariates = features,
                      sample_frac = 0.1,
                      verbose = True))



Sampling 0.1 of both datasets
Combine A & B groups and create group label for each
Run a classifier to distinguish between the 2 datasets


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ROC Score = 0.4977313415754645
True


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   50.3s finished


### Util: Perform chi-square tests on covariates

In [21]:
def testChiSqr(A_group,
               B_group,
               covariates,
               alpha = 0.05,
               verbose = True
              ):
  """
  Check if the group variants are balanced based on chi square tests

  Parameters:
  * A_group: Group A
  * B_group: Group B
  * covariates: List of covariates
  * alpha: Confidence level (default 5%)
  * verbose: if True then print all step status

  Returns: True / False if any covariates is found unbalanced, together with the lsit
  """ 

  
  from scipy.stats import chi2_contingency

  if verbose == True:
    print("Covariate balancing check using Chisqr test...")

  if len(covariates) < 1:
    return False

  A_deepcopy = A_group.copy(deep = True)
  B_deepcopy = B_group.copy(deep = True)

  # label group
  A_deepcopy["group_"] = "A"
  B_deepcopy["group_"] = "B"


  # Prepare a dataset by combining A & B 
  if verbose == True:
    print("Combine A & B groups and create group label for each")
  AB_df = A_deepcopy.append(B_deepcopy) 

  output_list = []

  for cov in covariates:
      pct = pd.crosstab(AB_df[cov],AB_df["group_"],normalize = "columns")
      _, p, _, _ = chi2_contingency(pct) 

      if verbose == True:
        print("Covariate {}:".format(cov))
        print(pct)
        print("p_val = {}".format(p))
        print()
      
      if p <= alpha:
        output_list.append(cov)
  
  return len(output_list)>0, output_list  

In [20]:
# Example run: split the frame into its two experiment arms (dropping the arm
# label itself) and run the per-covariate chi-square balance check quietly.

features = ["new_return_visitor","platform_type","posa_super_region","marketing_channel"]

train = (
    df.loc[df["experiment_variant_code"].eq("CONTROL")]
      .drop(columns = "experiment_variant_code")
)
test = (
    df.loc[df["experiment_variant_code"].eq("VARIANT")]
      .drop(columns = "experiment_variant_code")
)

print(testChiSqr(A_group = train,
                 B_group = test,
                 covariates = features,
                 verbose = False))

(False, [])
