# In [119]:
import time
import os
import os.path as path
from os.path import isfile, join
from os import listdir
from __future__ import division, print_function
from matplotlib import pyplot as plt
import seaborn as sns
from statsmodels.graphics.mosaicplot import mosaic
import statsmodels.api as sm
from statsmodels.formula.api import ols
import pandas as pd
import numpy as np
import json
import scipy.stats as scs
from IPython.display import display, HTML
pd.set_option('display.notebook_repr_html', True)
pd.set_option('display.max_colwidth', -1)
pd.set_option('max_colwidth',200)
import datetime as DT

# In [125]:
def CustomParser(data):
    """
    Decode a JSON document held in a string and return the resulting object.

    Meant to be passed as a pandas ``converters`` entry so that a CSV column
    holding JSON text is turned into Python objects (e.g. dicts) while the
    file is being read.
    """
    return json.loads(data)

class AdhocCorrelationUtilities(object):
    """
    Helpers that output the analysis for the feature-class correlation notebook.

    Features are split into numerical and categorical ones so that each type
    can be used with the relevant analysis / plots:

    * numerical   -> box plots, overlaid histograms, one-way ANOVA
    * categorical -> contingency tables, stacked bar plots, chi-square test

    The constructor loads the dataset, initializes variables and converts the
    ``payload`` column to a dictionary type. It also undersamples the rows
    whose target is False when ``usample`` is True.

    ``get_df()`` flattens the dict-typed columns to prepare all features for
    analysis.

    dataset_csv = the CSV file containing the dataset for analysis, resolved
                  against the current working directory. It must contain the
                  columns ``user_id``, ``event_date`` and ``payload``.
    target = name of the target column. Columns whose name starts with it are
             never treated as features.
    usample = if True, keep every row whose target equals True and an equally
              sized random sample (without replacement) of the rows whose
              target equals False.
    value_counts_threshold = the number of distinct values a numeric column
                             needs to be considered numerical rather than
                             categorical. Default value is 10.
    """

    # Names assigned *by position* to the flattened frame in get_df().
    _FLAT_COLUMNS = ("user_id", "event_date", "country", "subscribed", "industry",
                     "time_in_product_mins", "campaign", "device",
                     "email_open_rate_percent", "referrer_channel")

    # Columns whose name starts with one of these are identifiers, not features.
    _ID_PREFIXES = ('user_id', 'event_date')

    # Bar colours for the target classes; reused cyclically when there are
    # more classes than colours.
    _CLASS_COLORS = ('#539caf', '#7663b0', '#e5ae38', '#6d904f')

    def __init__(self, dataset_csv, target, usample, value_counts_threshold=10):
        filepath = path.join(os.getcwd(), dataset_csv)
        frame = pd.read_csv(filepath, converters={'payload': CustomParser})
        # drop=False keeps user_id / event_date available as ordinary columns
        frame = frame.set_index(['user_id', 'event_date'], drop=False)

        self.value_counts_threshold = value_counts_threshold
        self.target = target
        self.num_cols = None
        self.cat_cols = None
        self.anovaDF = None
        self.chi2DF = None

        if usample:
            frame = self._undersample(frame, target)
        self.features_target_df = frame

    @staticmethod
    def _undersample(frame, target):
        """
        Return all rows where ``target`` is True plus an equally sized random
        sample (without replacement) of the rows where ``target`` is False.

        Raises ValueError (from ``DataFrame.sample``) when there are fewer
        False rows than True rows.
        """
        # element-wise comparisons on purpose: they build boolean masks
        positives = frame[frame[target] == True]  # noqa: E712
        negatives = frame[frame[target] == False]  # noqa: E712
        # Sampling rows directly (rather than index labels followed by .loc)
        # returns exactly len(positives) rows even when index labels repeat.
        sampled_negatives = negatives.sample(n=len(positives), replace=False)
        return pd.concat([positives, sampled_negatives])

    def _expand_dict_column(self, column):
        """Replace ``column`` with one column per key of the values it holds."""
        frame = self.features_target_df
        expanded = pd.DataFrame([pd.Series(cell) for cell in frame[column]],
                                index=frame.index)
        # Name the new columns after the expanded frame itself so labels and
        # values cannot get out of step.
        for key in expanded.columns:
            frame[key] = expanded[key]
        del frame[column]

    def get_df(self):
        """
        Flatten the ``payload`` column and then the ``analytics`` column into
        one column per key, rename all columns and return the frame.

        NOTE: the final rename is positional; the flattened frame must have
        exactly ten columns in the order of ``_FLAT_COLUMNS``, otherwise
        pandas raises ValueError. Calling the method again is a no-op because
        columns that were already expanded are skipped.
        """
        for column in ('payload', 'analytics'):
            # 'analytics' is looked up only after 'payload' has been expanded,
            # so it may come from the payload itself.
            if column in self.features_target_df.columns:
                self._expand_dict_column(column)
        self.features_target_df.columns = list(self._FLAT_COLUMNS)
        return self.features_target_df

    def _feature_names(self):
        """Return every column that is neither an identifier nor the target."""
        excluded = self._ID_PREFIXES + (self.target,)
        return [name for name in self.features_target_df.columns.tolist()
                if not name.startswith(excluded)]

    def get_num_cols_rules(self):
        """
        Return (and store in ``self.num_cols``) the numeric feature columns
        having at least ``value_counts_threshold`` distinct non-null values.
        """
        featuresDF = self.features_target_df[self._feature_names()]
        # NOTE: _get_numeric_data() is a private pandas API; it is kept because
        # select_dtypes() is not an exact drop-in replacement for it.
        numeric = list(featuresDF._get_numeric_data().columns)
        # disregard features having fewer distinct values than the threshold
        selected = [name for name in numeric
                    if featuresDF[name].nunique() >= self.value_counts_threshold]
        self.num_cols = selected
        return selected

    def get_cat_cols_rules(self):
        """
        Return (and store in ``self.cat_cols``) every feature column that
        ``get_num_cols_rules()`` did not classify as numerical.
        """
        numeric = self.get_num_cols_rules()
        selected = [name for name in self._feature_names() if name not in numeric]
        self.cat_cols = selected
        return selected

    @staticmethod
    def _rows_with_spread(data, feature):
        """
        Return the rows of ``data`` where ``feature`` is finite, or None when
        no such rows exist or when the feature is constant over them.
        """
        finite = data[np.isfinite(data[feature])]
        if len(finite) == 0:
            return None
        if finite[feature].max() - finite[feature].min() == 0.0:
            return None
        return finite

    @staticmethod
    def _binary_classes(data, target):
        """
        Return the sorted distinct values of ``target``; when there are more
        than two, the first (smallest) one is dropped.

        NOTE(review): dropping the smallest value presumably removes a
        placeholder class - confirm. More than two values can still remain.
        """
        classes = np.unique(data[target])
        if len(classes) > 2:
            classes = np.delete(classes, [0])
        return classes

    @staticmethod
    def _informative_columns(data, columns):
        """Return the ``columns`` of ``data`` holding more than one distinct non-null value."""
        # an empty frame has zero distinct values, so it is filtered out too
        return [name for name in columns if data[name].nunique() > 1]

    def boxplot(self, target, feature, data):
        """Draw a box plot of ``feature`` for each class of ``target`` in ``data``."""
        # sns.plt is no longer available in seaborn; use matplotlib's pyplot
        plt.figure(figsize=(18, 10))
        sns.set_style("whitegrid")
        pal = {c: "#539caf" if c == 0 else "#7663b0" for c in data[target].unique()}
        ax = sns.boxplot(x=target, y=feature, hue=target, palette=pal, data=data)
        ax.axes.set_title('Median ' + feature + ' score for each class in target ' + target, fontsize=16)
        ax.set_xlabel(target, fontsize=14)
        ax.set_ylabel(feature, fontsize=14)
        plt.show()

    def boxplots(self, target):
        """Draw a box plot against ``target`` for every numerical feature."""
        data = self.features_target_df
        for feature in self.get_num_cols_rules():
            df = self._rows_with_spread(data, feature)
            if df is None:
                # no finite values, or no difference between min and max
                continue
            self.boxplot(target, feature, df)

    def overlaid_histogram(self, data1, data1_name, data1_color, data2, data2_name, data2_color, x_label, y_label, title):
        """
        Draw two histograms on the same axes using common bins.

        The feature is skipped (with a printed message) when either sample is
        empty or when no positive bin width can be derived from the data.
        """
        # Set the bounds for the bins so that the two distributions are
        # fairly compared
        max_nbins = 20
        try:
            lower = min(np.min(data1), np.min(data2))
            upper = max(np.max(data1), np.max(data2))
        except ValueError as e:
            # raised for an empty sample
            print(e)
            print("skipping feature " + x_label)
            return
        binwidth = (upper - lower) / float(max_nbins)
        if not binwidth > 0:
            # zero (or NaN) range: np.arange cannot build bins from it
            print('skipping feature '+x_label+' ...')
            return
        try:
            bins = np.arange(lower, upper + binwidth, binwidth)
        except ValueError as e:
            print(e)
            print('skipping feature '+x_label+' ...')
            return

        # Create the plot
        _, ax = plt.subplots(figsize=(18, 10))
        ax.hist(data1, bins=bins, color=data1_color, alpha=1, label=data1_name)
        ax.hist(data2, bins=bins, color=data2_color, alpha=0.75, label=data2_name)
        ax.set_ylabel(y_label, fontsize=14)
        ax.set_xlabel(x_label, fontsize=14)
        ax.set_title(title, fontsize=16)
        ax.legend(loc='best')
        # shown explicitly, like the other plotting methods of this class
        plt.show()

    def overlaid_histograms(self, target):
        """
        Draw, for every numerical feature, overlaid histograms of its values
        in the first two classes returned by ``_binary_classes``.
        """
        num_cols = self.get_num_cols_rules()
        data = self.features_target_df
        classes = self._binary_classes(data, target)
        if len(classes) < 2:
            print('skipping overlaid histograms: target ' + str(target) + ' has fewer than two classes')
            return

        for feature in num_cols:
            df = self._rows_with_spread(data, feature)
            if df is None:
                continue
            first, second = [df[df[target] == c][feature].values for c in classes[:2]]
            self.overlaid_histogram(data1=first,
                                    data1_name='class_' + str(classes[0]),
                                    data1_color='#539caf',
                                    data2=second,
                                    data2_name='class_' + str(classes[1]),
                                    data2_color='#7663b0',
                                    x_label=feature,
                                    y_label='Frequency',
                                    title='Distribution of ' + feature + ' by ' + target)

    def compute_one_way_anova_lm(self, target):
        """
        Compute one-way ANOVA statistics of every numerical feature against
        the classes of ``target`` and store them in ``self.anovaDF``.

        Each feature is fitted with statsmodels ``ols`` ("feature ~ target")
        and passed to ``anova_lm(typ=2)``. ``esq_sm`` is the target's sum of
        squares divided by the total (target + residual) sum of squares.
        """
        anova_cols = ['feature', 'sum_sq', 'esq_sm', 'F', 'PR(>F)']
        num_cols = self.get_num_cols_rules()
        data = self.features_target_df

        # restrict to the retained classes; copy so the instance's frame is
        # never modified by the dtype change below
        classes = self._binary_classes(data, target)
        data = data[data[target].isin(classes.tolist())].copy()

        # make sure the target has an str dtype so that it is treated as categorical
        data[target] = data[target].apply(str)

        rows = []
        for feature in num_cols:
            df = self._rows_with_spread(data, feature)
            if df is None:
                continue
            mod = ols(feature + ' ~ ' + target, data=df).fit()
            aov_table = sm.stats.anova_lm(mod, typ=2)
            # positional access: row 0 is the target term, row 1 the residual
            ss_effect = aov_table['sum_sq'].iloc[0]
            ss_residual = aov_table['sum_sq'].iloc[1]
            rows.append({'feature': feature,
                         'sum_sq': round(ss_effect, 3),
                         'esq_sm': round(ss_effect / (ss_effect + ss_residual), 3),
                         'F': round(aov_table['F'].iloc[0], 3),
                         'PR(>F)': round(aov_table['PR(>F)'].iloc[0], 6)})

        anovaDF = pd.DataFrame(rows, columns=anova_cols)
        print ('one_way_anova_lm statistics computed for %d out of %d features' % (len(anovaDF), len(num_cols)) )
        self.anovaDF = anovaDF

    def show_one_way_anova_lm(self, target, order_by, ascending):
        """
        Call ``compute_one_way_anova_lm(target)`` and display the result.

        order_by is one of ['sum_sq', 'esq_sm', 'F', 'PR(>F)']
        ascending is one of [True, False]
        """
        self.compute_one_way_anova_lm(target)
        self.anovaDF = self.anovaDF.sort_values(by=order_by, ascending=ascending)
        self.anovaDF = self.anovaDF.reset_index(drop=True)
        display(self.anovaDF[['feature', 'sum_sq', 'esq_sm', 'F', 'PR(>F)']])

    def contingency_table(self, target, feature, prop):
        """
        Display the contingency table between ``target`` (rows) and a
        categorical ``feature`` (columns).

        If prop is True every column is divided by its own total, i.e. the
        table shows the share of each target class within a feature value;
        otherwise the raw counts are shown.
        """
        data = self.features_target_df
        counts = pd.crosstab(data[target], data[feature])

        if prop:
            print("Proportions of feature " + feature + " per each class of target " + target)
            display(counts / counts.sum(axis=0))
        else:
            print("Value counts of feature " + feature + " per each class of target " + target)
            display(counts)

    def contingency_tables(self, target, prop):
        """
        Display a contingency table against ``target`` for every categorical
        feature holding more than one distinct value.

        If prop is True, proportions are displayed instead of the actual counts.
        """
        data = self.features_target_df
        for feature in self._informative_columns(data, self.get_cat_cols_rules()):
            self.contingency_table(target, feature, prop)

    def stackedbarplot(self, x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
        """
        Draw a stacked bar plot: one bar per entry of ``x_data`` and one
        stacked segment per series in ``y_data_list``.
        """
        _, ax = plt.subplots(figsize=(18, 10))
        positions = np.arange(len(x_data))
        # Running top of the stack: each series starts where all the previous
        # ones ended, not just the one right before it.
        bottom = np.zeros(len(x_data))
        for i, heights in enumerate(y_data_list):
            heights = np.asarray(heights, dtype=float)
            ax.bar(positions, heights, color=colors[i], bottom=bottom, align='center',
                   label=y_data_names[i])
            bottom = bottom + heights
        # bars are centred on their positions, so the ticks go there as well
        ax.set_xticks(positions)
        ax.set_xticklabels(tuple(x_data))
        ax.set_ylabel(y_label, fontsize=14)
        ax.set_xlabel(x_label, fontsize=14)
        ax.set_title(title, fontsize=16)
        ax.legend(loc='upper right')
        plt.show()

    def stackedbarplots(self, target):
        """
        Draw, for every categorical feature holding more than one distinct
        value, a stacked bar plot of the proportion of each ``target`` class
        within each feature value.

        Classes are labelled ``class_0``, ``class_1``, ... following the
        sorted column order of the cross-tabulation.
        """
        data = self.features_target_df
        title = 'Proportion of target ' + str(target) + ' classes'
        for feature in self._informative_columns(data, self.get_cat_cols_rules()):
            counts = pd.crosstab(data[feature], data[target])
            # share of every class within each feature value (rows sum to 1)
            proportions = counts.div(counts.sum(axis=1), axis=0)
            n_classes = counts.shape[1]

            # Call the function to create plot
            self.stackedbarplot(x_data=counts.index.values,
                                y_data_list=[proportions.iloc[:, k].tolist() for k in range(n_classes)],
                                y_data_names=['class_%d' % k for k in range(n_classes)],
                                colors=[self._CLASS_COLORS[k % len(self._CLASS_COLORS)]
                                        for k in range(n_classes)],
                                x_label=feature,
                                y_label='Proportion',
                                title=title)

    def categories(self, series):
        """Return the integer range from ``int(series.min())`` to ``int(series.max())`` inclusive."""
        return range(int(series.min()), int(series.max()) + 1)

    def chi_square_of_df_cols(self, df, col1, col2):
        """
        Compute the chi-square test of independence of the ``col1`` and
        ``col2`` variables of dataframe ``df`` with
        ``scipy.stats.chi2_contingency``.

        Returns the tuple (chi2, p): the test statistic and the p-value.

        The contingency table only contains the values that actually occur,
        so gaps in integer codes do not add all-zero rows or columns, which
        chi2_contingency rejects.
        """
        # plain arrays: the cross-tabulation must pair values by position and
        # not depend on the frame's index
        table = pd.crosstab(np.asarray(df[col1]), np.asarray(df[col2]))
        result = scs.chi2_contingency(table.values)
        return (result[0], result[1])

    # TODO (carried over from the original code; the intended follow-up is not stated)
    def compute_chi_square(self, target):
        """
        Call ``chi_square_of_df_cols()`` for every categorical feature holding
        more than one distinct value against ``target`` and store the results
        in ``self.chi2DF``.

        ``self.features_target_df`` is left untouched.
        """
        chi2_cols = ['feature', 'chi2', 'Pr(>chi)']
        cat_cols = self.get_cat_cols_rules()
        data = self.features_target_df

        # restrict to the retained classes; boolean indexing returns a new frame
        classes = self._binary_classes(data, target)
        data = data[data[target].isin(classes.tolist())]

        rows = []
        for feature in self._informative_columns(data, cat_cols):
            # The feature is compared as text, so a missing value counts as a
            # category of its own ('nan').
            pair = pd.DataFrame({'feature': data[feature].apply(str).values,
                                 'target': data[target].values})
            (chi2, p) = self.chi_square_of_df_cols(pair, 'feature', 'target')
            rows.append({'feature': feature,
                         'chi2': round(chi2, 3),
                         'Pr(>chi)': round(p, 3)})

        chi2DF = pd.DataFrame(rows, columns=chi2_cols)
        print ('chi_square_of_df_cols computed for %d out of %d features' % (len(chi2DF), len(cat_cols)) )
        self.chi2DF = chi2DF

    def show_chi_square_scs(self, target, order_by, ascending):
        """
        Call ``compute_chi_square(target)`` and display the result.

        order_by is one of ['chi2', 'Pr(>chi)']
        ascending is one of [True, False]
        """
        self.compute_chi_square(target)
        self.chi2DF = self.chi2DF.sort_values(by=order_by, ascending=ascending)
        self.chi2DF = self.chi2DF.reset_index(drop=True)
        display(self.chi2DF[['feature', 'chi2', 'Pr(>chi)']])
        