In [1]:
import pandas as pd
import numpy as np 
import os 

from datetime import datetime
from termcolor import colored


import warnings
# Silence every warning category for the rest of the session
# (this also hides deprecation notices from pandas/numpy).
warnings.filterwarnings('ignore')

# Show the directory the kernel is running from.
wd = os.getcwd()
print(wd)


/Users/annadudek/00_DataMasked


In [2]:
# Both raw tables are downloaded from Google Drive on every run (network access required).
# txns_raw: one row per user_id, per the describe_data output further down.
txns_raw=pd.read_csv("https://drive.google.com/uc?export=download&id=18RLruiMU8rM-IQPLdwL6wNEc8Kks2JZQ")
# ips_raw: lower/upper IP-address bounds with the country they belong to.
ips_raw=pd.read_csv("https://drive.google.com/uc?export=download&id=1wbKys6YI-IvE-b-C0_4xR4zz2YnpOL1d")


## Describing Data

In [3]:
def describe_data(data):

    """ Prints the name, shape and column dtypes of a dataframe and checks its granularity.

        The dataframe's name is recovered by looking for a global variable that refers to the
        very same object; when none exists (e.g. an expression such as df.head() was passed,
        or the function lives in an imported module) a placeholder name is printed instead
        of failing.

        The first column (left to right) whose name contains the substring 'id' is taken as
        the id column. Note this is a plain substring match, so a name such as 'paid' also
        qualifies. If an id column is found, the row count is printed next to the number of
        distinct ids so the reader can see whether the dataframe is at id level.

        data: pandas DataFrame to describe
        returns: the first 3 rows of the dataframe

        USAGE: describe_data(example_df)         """

    # Identity lookup: the first global name bound to this exact object wins.
    matching_names = [name for name, value in list(globals().items()) if value is data]
    df_name = matching_names[0] if matching_names else '<unnamed dataframe>'
    print(colored(f"Dataframe: {df_name}", 'blue', attrs=['bold']))
    print(f"shape: {data.shape}")
    print('')
    print(data.dtypes)

    ## assume first id column in the dataset (in terms of left to right order)
    id_columns = data.filter(like='id').columns.to_list()
    print('')
    if id_columns:
        id_column = id_columns[0]
        print(f"Column taken as id: {id_column}")
        print(f"If {len(data)} = {len(data[id_column].unique())}, then dataframe is at this level")
    else:
        # Explicit check instead of a bare except, so genuine errors are no longer
        # reported as a missing id column.
        print(colored('!!! No id column !!!','red', attrs=['bold']))

    print('')
    return data.head(3)

In [4]:
# Jupyter renders only the value of a cell's last expression, so the head(3) returned by the
# first call is discarded; only the ips_raw preview appears under the printed summaries.
describe_data(txns_raw)
describe_data(ips_raw)

[1m[34mDataframe: txns_raw[0m
shape: (151112, 11)

user_id             int64
signup_time        object
purchase_time      object
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
class               int64
dtype: object

Column taken as id: user_id
If 151112 = 151112, then dataframe is at this level

[1m[34mDataframe: ips_raw[0m
shape: (138846, 3)

lower_bound_ip_address    float64
upper_bound_ip_address      int64
country                    object
dtype: object

[1m[31m!!! No id column !!![0m



Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China


## Data Management & Cleaning

In [19]:
def set_dtypes(data, max_levels=3):

    """ Transforms columns with 'date' in their name into datetimes; if there are none, columns
        with 'time' in their name are converted instead.
        Transforms numeric columns with at most `max_levels` distinct values (NaN counts as a
        value) into categories. The default of 3 keeps the original cut-off.

        The user is prompted (stdin) for a space-separated list of numeric columns to leave
        untouched; names that are not candidates are ignored. Column names containing spaces
        cannot be entered this way.

        data: pandas DataFrame; it is copied, the caller's frame is not modified
        max_levels: largest number of distinct values still treated as categorical
        returns: a new DataFrame with the converted dtypes

        USAGE: new_df = set_dtypes(example_df)"""

    data = data.copy()

    ### transforms dates
    to_transform_dates = data.filter(like='date').columns.to_list()
    if len(to_transform_dates) == 0:
        print('no objects named date; trying time')
        to_transform_dates = data.filter(like='time').columns.to_list()
    for col in to_transform_dates:
        data[col] = pd.to_datetime(data[col])

    print('')

    ### transforms low-cardinality numerics to categories
    to_transform_binary = data.select_dtypes('number').columns.tolist()
    print(to_transform_binary)
    binary_to_drop = input("Enter any columns that need omitting from binary transformation ")
    # split() without an argument drops empty tokens, and membership filtering means one
    # unknown name can no longer stop the remaining names from being omitted.
    omitted = set(binary_to_drop.split())
    to_transform_binary = [col for col in to_transform_binary if col not in omitted]

    print('')
    # Every candidate is inspected; the list is never modified while it is being iterated,
    # so no column is skipped.
    for col in to_transform_binary:
        if len(data[col].unique()) <= max_levels:
            data[col] = data[col].astype('category')

    print(data.dtypes)
    return data

In [18]:
# NOTE(review): the recorded output below lists *_zscore and dummy columns and shows the
# outlier-management prompt text, so this cell appears to have been run out of order,
# after the later cells -- re-run top to bottom to confirm.
txns_raw = set_dtypes(txns_raw)

no objects named date; trying time

['user_id', 'purchase_value', 'age', 'ip_address', 'purchase_value_zscore', 'age_zscore', 'source_Ads', 'source_Direct', 'source_SEO', 'browser_Chrome', 'browser_FireFox', 'browser_IE', 'browser_Opera', 'browser_Safari', 'sex_F', 'sex_M']
Enter any columns that need omitting from outlier managment function 

user_id                           int64
signup_time              datetime64[ns]
purchase_time            datetime64[ns]
purchase_value                    int64
device_id                        object
source                           object
browser                          object
sex                              object
age                               int64
ip_address                      float64
class                          category
purchase_value_zscore           float64
age_zscore                      float64
source_Ads                     category
source_Direct                  category
source_SEO                     category
browser_Chrome

In [7]:
### Drop outliers or truncate them, depending on dataset size
def outlier_management (data, size_threshold=20000, z_threshold=3, max_levels=5):

    """ 1. Takes all numeric columns. Drops any whose name contains the substring 'id'.
        2. Drops any columns which have `max_levels` or fewer distinct values;
           assumption = they are categorical/boolean
        3. Asks (stdin) for a space-separated list of additional columns that shouldn't be
           considered; names that are not in the list are ignored
        4. Adds a '<column>_zscore' column (absolute z-score, population SD) per remaining column
        5. If the dataset has more than `size_threshold` records, rows with any z-score
           >= `z_threshold` are dropped (rows whose z-score is NaN are dropped as well, because
           NaN < threshold is False).
           Otherwise outliers are truncated to mean +/- `z_threshold` SD; a truncated column
           becomes float64 and its z-score column is capped at `z_threshold`.

        data: pandas DataFrame; it is copied, the caller's frame is not modified
        size_threshold: row count above which outliers are dropped instead of truncated
        z_threshold: absolute z-score from which a value counts as an outlier
        max_levels: columns with this many distinct values or fewer are skipped

        returns dataframe (including the *_zscore columns)

        USAGE: new_df = outlier_management(data = example_df)"""

    data = data.copy()

    numeric_columns = data.select_dtypes('number').columns.tolist()
    numeric_columns = [x for x in numeric_columns if 'id' not in x]

    # Build a new list instead of removing from the one being iterated, which used to skip
    # the column following every removed one.
    columns = []
    for col in numeric_columns:
        if len(data[col].unique()) > max_levels:
            print(f"{[col]} is kept")
            columns.append(col)
    print('')
    print(columns)
    columns_to_drop = input("Enter any columns that need omitting from outlier managment function ")
    omitted = set(columns_to_drop.split())
    columns = [col for col in columns if col not in omitted]
    print('')

    ### calculates z-scores for all retained numeric columns
    col_zscore_list = []
    col_stats = {}
    for col in columns:
        col_zscore = col + '_zscore'
        col_zscore_list.append(col_zscore)
        center = data[col].mean()
        spread = data[col].std(ddof=0)
        col_stats[col] = (center, spread)
        data[col_zscore] = abs((data[col] - center)/spread)

    ### drops or truncates depending on size
    df_length = len(data)

    if df_length > size_threshold:
        # All z-scores were computed on the full frame, so one combined mask is
        # equivalent to filtering column after column.
        keep = np.ones(df_length, dtype=bool)
        for col in col_zscore_list:
            keep &= (data[col] < z_threshold).to_numpy()
        data = data[keep]
        print('dataset large enough; outlier rows were dropped')
    else:
        for col, col_zscore in zip(columns, col_zscore_list):
            if (data[col_zscore] >= z_threshold).any():
                center, spread = col_stats[col]
                # The bounds are generally fractional, hence the cast to float before clipping.
                data[col] = data[col].astype('float64').clip(lower=center - z_threshold * spread,
                                                             upper=center + z_threshold * spread)
                data[col_zscore] = data[col_zscore].clip(upper=z_threshold)
        print('dataset too small to drop rows; outliers were truncated')

    print(f" Before outlier management :{df_length}; After outlier management {len(data)}")
    print('')
    # Guard against an empty input frame.
    retained = len(data)/df_length if df_length else float('nan')
    print(f" % of dataset retained :{retained}")
    return data


    """ Checks any of the transformed columns whether there is any zscore values >3 """


def check_outlier_management(data, columns):
    """ Prints the maximum of each listed column (normally the *_zscore columns added during
        outlier management) and then removes those columns from `data` in place.

        data: pandas DataFrame holding the listed columns; modified in place
        columns: names of the columns to report and then drop
        returns: None

        USAGE: check_outlier_management(example_df, ['age_zscore'])"""

    # Report phase: every maximum is printed before anything is removed.
    for name in columns:
        largest = data[name].max()
        print(f" {name} : {largest}")
        print('')

    # Clean-up phase: drop the helper columns one at a time.
    for name in columns:
        data.drop(columns=[name], inplace=True)
    

In [8]:
# Prompts on stdin for columns to skip (the recorded run entered ip_address) and
# overwrites txns_raw with the returned frame.
txns_raw = outlier_management(data = txns_raw)

['purchase_value'] is kept
['age'] is kept
['ip_address'] is kept

['purchase_value', 'age', 'ip_address']
Enter any columns that need omitting from outlier managment function ip_address

dataset too small; was truncated
 Before outlier management :151112; After outlier management 149701

 % of dataset retained :0.9906625549261475


In [9]:

## The date conversion (set_dtypes) should always run before make_dummies: until they are converted,
## date-like columns are still object dtype and would be turned into dummies, needlessly exploding the number of dummy columns

def make_dummies(data):

    """ Takes all object columns and transforms them to dummy variables. Asks (stdin) for a
        space-separated list of variables that should be omitted; names that are not object
        columns are ignored, and pressing Enter omits nothing.

        Dummy columns are named '<variable>_<level>' and are always uint8, whatever the
        installed pandas version defaults to.

        data: pandas DataFrame; not modified
        returns: a new DataFrame = the original columns followed by the dummy columns

        USAGE: new_df = make_dummies(example_df)"""

    variables = data.select_dtypes('object').columns.tolist()

    print(variables)
    variables_to_drop = input("Enter any columns that need omitting from becoming dummies ")
    # Membership filtering replaces the remove()/except pair, whose handler referenced a
    # misspelled name and therefore raised NameError whenever a removal failed.
    omitted = set(variables_to_drop.split())
    variables = [var for var in variables if var not in omitted]
    print('')

    for var in variables:
        dummies = pd.get_dummies(data[var], prefix=str(var), dtype='uint8')
        data = data.join(dummies)

    return data
    

In [10]:
# Overwrites txns_raw with the dummy-expanded frame (the recorded run omitted device_id),
# then displays it as the cell's last expression.
txns_raw=make_dummies(txns_raw)
txns_raw

['device_id', 'source', 'browser', 'sex']
Enter any columns that need omitting from becoming dummies device_id



Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,...,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_F,sex_M
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,7.327584e+08,...,0,0,1,1,0,0,0,0,0,1
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,3.503114e+08,...,1,0,0,1,0,0,0,0,1,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2.621474e+09,...,0,0,1,0,0,0,1,0,0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3.840542e+09,...,0,0,1,0,0,0,0,1,0,1
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,4.155831e+08,...,1,0,0,0,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,XPSKTWGPWINLR,SEO,Chrome,M,28,3.451155e+09,...,0,0,1,1,0,0,0,0,0,1
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,LYSFABUCPCGBA,SEO,Safari,M,32,2.439047e+09,...,0,0,1,0,0,0,0,1,0,1
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,MEQHCSJUBRBFE,SEO,IE,F,26,2.748471e+09,...,0,0,1,0,0,1,0,0,1,0
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,CMCXFGRHYSTVJ,SEO,Chrome,M,37,3.601175e+09,...,0,0,1,1,0,0,0,0,0,1


## Experiment Randomization

In [11]:
def categorical_randomization(data, exp_group_col):

    """ Builds frequency tables of every dummy variable by Control v Variant in order to check
        randomization, and flags levels whose share differs by 5% or more between the groups.

        Dummy variables are the uint8 columns (plus bool columns, which is what recent pandas
        versions produce by default for dummies); `exp_group_col` itself is never treated as one.
        `exp_group_col` must hold exactly two groups: after sorting, the first is reported
        as Control and the second as Variant.

        For each dummy variable and each of its levels two rows are produced:
            '<variable>=<level>'      - record counts per group
            'pct_<variable>=<level>'  - share of the group's records at that level

        data: pandas DataFrame
        exp_group_col: name of the column holding the experiment group
        returns: DataFrame with columns categorical_variable, Control, Variant, pct_variable
                 (True on share rows) and group_variance (|Control - Variant| / Control on
                 share rows, 0 on count rows)
        raises: ValueError if `exp_group_col` does not contain exactly two groups

        Side effect: raises pandas' display.max_rows option to 1000.

        USAGE: results = categorical_randomization(data=example_df, exp_group_col='group') """

    print(exp_group_col)
    categorical_variables = [col for col in data.select_dtypes(include=['uint8', 'bool']).columns
                             if col != exp_group_col]   ## picks up on only dummied variables

    groups = sorted(data[exp_group_col].dropna().unique())
    if len(groups) != 2:
        raise ValueError(f"{exp_group_col} must contain exactly two groups (Control and Variant); "
                         f"found {len(groups)}")

    # Collect plain records and build the frame once instead of concatenating in a loop.
    records = []
    for var in categorical_variables:
        # rows = levels of the dummy, columns = [Control, Variant]
        counts = pd.crosstab(data[var], data[exp_group_col]).reindex(columns=groups, fill_value=0)
        shares = counts / counts.sum(axis=0)          # share within each experiment group
        count_values = counts.to_numpy()
        share_values = shares.to_numpy()
        for position, level in enumerate(counts.index):
            records.append({'categorical_variable': f"{var}={level}",
                            'Control': count_values[position][0],
                            'Variant': count_values[position][1],
                            'pct_variable': False})
            records.append({'categorical_variable': f"pct_{var}={level}",
                            'Control': share_values[position][0],
                            'Variant': share_values[position][1],
                            'pct_variable': True})

    freq_table = pd.DataFrame(records, columns=['categorical_variable', 'Control', 'Variant', 'pct_variable'])
    freq_table = freq_table.astype({'Control': 'float64', 'Variant': 'float64', 'pct_variable': 'bool'})

    ## flag problematic randomization: relative difference between groups >= 5%
    relative_gap = ((freq_table['Control'] - freq_table['Variant'])/freq_table['Control']).abs()
    freq_table['group_variance'] = np.where(freq_table['pct_variable'], relative_gap, 0)

    print(colored('Following categorical variables have greater than 5% variance between Control & Variant',
                  'red', attrs=['bold']))
    print("")
    print(freq_table[freq_table.group_variance >= 0.05])

    pd.options.display.max_rows = 1000                              ## increase the length you can see in notebooks
    return freq_table


In [12]:
# NOTE(review): the recorded output below (an input prompt ending in KeyboardInterrupt) does not
# match the current function body, where the prompt is commented out -- it looks stale; re-run to refresh.
randomization_results=categorical_randomization(data= txns_raw, exp_group_col = 'class')


class
['device_id', 'source', 'browser', 'sex']
Enter any columns that need omitting



KeyboardInterrupt: 

## Model Building

In [None]:
## Build model 

# NOTE(review): RandomForestClassifier, X_train, y_train, X_test and test are not defined or
# imported anywhere in the visible notebook -- they must be created before this cell runs.
n = 6          # number of features tried per split; also read later as the top-n features to print
randoModel = RandomForestClassifier(n_estimators=100, max_features=n, oob_score=True)
randoModel.fit(X_train,y_train)

# Predict once and reuse the result instead of scoring X_test a second time.
y_pred = randoModel.predict(X_test)
test['y_hat']= y_pred

In [None]:
def random_forest_results(model):

    """ Takes an existing RandomForestClassifier model and prints its results: accuracy and
        out-of-bag score, feature importance, confusion matrices and precision/recall.

    Besides `model`, the function reads the following names from the global scope, so they
    must exist before it is called:
        metrics          - object providing accuracy_score (presumably sklearn.metrics -- confirm)
        score            - callable returning (precision, recall, fscore, support) per label
                           (presumably sklearn's precision_recall_fscore_support -- confirm)
        y_test, y_pred   - actual and predicted labels used for the accuracy score
        test, y_actual   - dataframe holding a 'y_hat' prediction column, and the name of its
                           actual-label column
        variables_to_use - feature names, in the order the model was trained on
        n                - number of top features to print

    model: fitted estimator exposing oob_score_ and feature_importances_
    returns: None (everything is printed)

    USAGE: random_forest_results(model=example_RFModel)

    """

    print(colored('############  Accuracy Metrics  ############','blue', attrs=['bold']))
    print('')
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print ("Out of Bag Score:", model.oob_score_)

    ## which features are important
    print('')
    print(colored('############ Feature Importance  ############','blue', attrs=['bold']))
    feature_imp = pd.Series(model.feature_importances_,
                        index=variables_to_use).sort_values(ascending=False)
    print('')
    print('Feature Importance sorted - top n features: ')
    print(feature_imp[0:n])

    ## confusion matrix
    print('')
    print(colored('############ Confusion Matrix  ############','blue', attrs=['bold']))
    # The crosstab below is the confusion matrix; a separate confusion_matrix(...) call whose
    # result was thrown away has been removed.
    df_confusion = pd.crosstab(test[y_actual], test['y_hat'],
                           rownames=['Actual'], colnames=['Predicted'], margins=True)
    print('')
    print('confusion matrix - absolute values')
    print(df_confusion)

    print('')
    print('confusion matrix - % values')
    # Divide every row by its own total. The 'All' margin is left out of the total, and
    # axis=0 aligns the totals with the rows; a plain `/` would align them with the columns.
    row_totals = df_confusion.drop(columns='All', errors='ignore').sum(axis=1)
    print(df_confusion.div(row_totals, axis=0))       ### represented as a share of each actual class

    ## Precision v Recall
    print('')
    print(colored('############ Precision v Recall ############','blue', attrs=['bold']))
    print('')
    precision, recall, fscore, support = score(test[y_actual], test['y_hat'],labels=[0,1])

    ## Precision: % of results that are TP of all the records identified positively. TP/(TP + FP)
    ## Recall: % of actual positives that were identified as positive. TP/(TP + FN)
    ## Accuracy: % of all positives and negatives identified correctly (TP + TN)/total records
    ## F1-Score: harmonic mean of precision and recall
    ## Support: number of actual records in each class

    classification_report= pd.DataFrame(columns = ('metric', 'not_converted', 'converted'))

    classification_report.loc[0] = ['precision', precision[0], precision[1]]
    classification_report.loc[1] = ['recall', recall[0], recall[1]]
    classification_report.loc[2] = ['fscore', fscore[0], fscore[1]]
    classification_report.loc[3] = ['support', support[0], support[1]]

    print(classification_report)


In [None]:
def package_description(data):
    """ Prints an overview of the helper functions defined in this notebook, grouped by
        section, followed by a to-do note.

        data: accepted but not used
        returns: None """

    def banner(text):
        # Section headers and function names share the same bold magenta styling.
        print(colored(text, 'magenta', attrs=['bold']))

    def entry(name, notes, blank_lines_after):
        # One function: styled name, blank line, its notes, then the trailing spacing.
        banner(name)
        print('')
        for note in notes:
            print(note)
        for _ in range(blank_lines_after):
            print('')

    banner('##############   Data Description   ##############')
    entry('describe_data',
          ['DESCRIPTION: general description of the dataframe and looks for an id column to check dataframe granularity'],
          1)
    entry('set_dtypes',
          ['DESCRIPTION: Takes all columns iLIKE date or time and transforms into datatime object & binary variables into cateogry variables'],
          2)

    banner('############## Data Management/Cleaning ##############')
    entry('outlier_management',
          ['DESCRIPTION: 1. Drops outliers or truncates in depending on dataset size',
           'WORK NEEDED: should only choose int/float variables. IF/ELSE else is not completed'],
          1)
    entry('check_outlier_management',
          ['DESCRIPTION: Prints the absolute max value of all the columns that were transformed in the outlier management section',
           'WORK NEEDED: Should only choose int/float variables.'],
          2)

    banner('############## Experiment Randomization ##############')
    # The original literal used a backslash continuation, which embeds the next line's
    # indentation in the printed text; the padding is spelled out here to keep it identical.
    randomization_note = ('DESCRIPTION: Takes all object columns from a dataframe and checks the frequencies of categorical variables '
                          + ' ' * 24
                          + 'across Control & Variant. Flags any variables which have more than a 5% variance between groups')
    entry('categorical_randomization', [randomization_note], 2)

    banner('##############   Modelling   ##############')
    entry('make_dummies',
          ['DESCRIPTION: Takes all object variables and turns them into dummy variables'],
          1)
    entry('random_forest_results',
          ['DESCRIPTION: Prints accuracy, feature importance, confusion matrix and precision/recall for a random forest model'],
          0)

    print('TO DO: KS TEST for continuous variables to test whether the distributions are the same. Test each of the functions without any packages loaded, will need to load in the py script')

In [None]:
# The argument is required by the signature but never read by package_description.
package_description(ips_raw)
