In [None]:
# default_exp preprocessor

In [None]:
#hide
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Preprocessor

> Preprocesses a pandas DataFrame (treatment, outcome, covariates, and optional text) for causal inference.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#export
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import time


class DataframePreprocessor:
    """
    Preprocesses a pandas DataFrame for causal inference.

    The preprocessor is fitted by calling `preprocess(df, training=True)` and can
    then be re-applied to new data with `preprocess(df, training=False)`.  State
    learned during fitting (feature names, column types, categorical levels,
    treatment/outcome label encodings and the text vocabulary) is reused at
    transform time so that both calls produce the same columns.
    """
    def __init__(self, 
                 treatment_col='treatment', 
                 outcome_col='outcome', 
                 text_col=None,
                 include_cols=[],
                 ignore_cols=[],
                 verbose=1):
        """
        Instantiates the DataframePreprocessor instance.

        Args:
            treatment_col: name of the column holding the (binary) treatment.
            outcome_col: name of the column holding the outcome.
            text_col: optional name of a free-text column to vectorize with TF-IDF.
            include_cols: list of covariate columns to use; all others are ignored.
            ignore_cols: list of columns to exclude from the covariates.
            verbose: if truthy, progress information is printed.

        Raises:
            ValueError: if `include_cols`/`ignore_cols` are not lists or are both given.
        """
        self.treatment_col = treatment_col
        self.outcome_col = outcome_col
        self.text_col = text_col
        self.feature_names = None
        self.feature_names_one_hot = None
        self.cat_dict = {}
        self.v = verbose
        if not isinstance(ignore_cols, list):
            raise ValueError('ignore_cols must be a list.')
        if not isinstance(include_cols, list):
            raise ValueError('include_cols must be a list.')
        if ignore_cols and include_cols:
            raise  ValueError('ignore_cols and include_cols are mutually exclusive.  Please choose one.')
        # Copy so the shared default lists (and caller-owned lists) are never aliased.
        # When include_cols is given, ignore_cols is derived from the dataframe
        # during fitting -- no dataframe is available at construction time.
        self.ignore_cols = list(ignore_cols)
        self.include_cols = list(include_cols)
        # State learned by preprocess(..., training=True):
        self._col_types = {}      # feature name -> 'numeric' | 'string'
        self._value_maps = {}     # treatment/outcome column -> {original label: 0/1}
        self._vectorizer = None   # fitted TfidfVectorizer when text_col is set


    def preprocess(self, df, 
                   training=False,
                   min_df=0.05,
                   max_df=0.5,
                   ngram_range=(1,1),
                   stop_words='english',
                   na_cont_value=-1, na_cat_value='MISSING'):
        """
        Preprocess a dataframe for causal inference.

        Args:
            df: pandas DataFrame containing the treatment, outcome and covariates.
            training: if True, fit the preprocessor on `df`; if False, apply the
                      previously fitted state to `df`.
            min_df, max_df, ngram_range, stop_words: forwarded to TfidfVectorizer
                      when fitting on `text_col` (only used when training=True).
            na_cont_value: fill value for missing numeric covariates.
            na_cat_value: fill value for missing string covariates.

        Returns:
            tuple (df, X, Y, T): the cleaned dataframe, the covariate matrix,
            the outcome series and the treatment series.

        Raises:
            ValueError: if called with training=False before fitting, if `df` is
                        not a DataFrame, if columns seen during training are
                        missing, or if a column looks like free-form text.
        """
        if not training and self.feature_names is None:
            raise ValueError('Preprocessor must first be fitted by calling with training=True.')
            
        start_time = time.time()
        
        # step 1: check/clean dataframe
        if not isinstance(df, pd.DataFrame):
            raise ValueError('df must be a pandas DataFrame')
        strip = lambda x: x.strip() if isinstance(x, str) else x
        df = df.rename(columns=strip) # strip headers (non-string labels are left alone)
        # DataFrame.applymap was renamed to DataFrame.map in newer pandas releases
        df = df.map(strip) if hasattr(pd.DataFrame, 'map') else df.applymap(strip)  # strip data
        df, _ = self._preprocess_column(df, self.treatment_col, 
                                        is_treatment=True, training=training)
        df, self.is_classification = self._preprocess_column(df, self.outcome_col, 
                                                             is_treatment=False, training=training)
        if training:
            reserved = [self.treatment_col, self.outcome_col, self.text_col]
            if self.include_cols:
                keep = self.include_cols + reserved
                self.ignore_cols = [c for c in df.columns.values if c not in keep]
            self.feature_names = [c for c in df.columns.values \
                                  if c not in reserved + self.ignore_cols]
            self._col_types = {c: self._check_type(df, c)['dtype'] for c in self.feature_names}
            self.cat_dict = {}
        else:
            # reuse the fitted feature set so that train and test matrices line up
            missing = [c for c in self.feature_names if c not in df.columns]
            if missing:
                raise ValueError('df is missing columns seen during training: %s' % (missing))
        self.X = df[self.feature_names].copy()
        self.Y = df[self.outcome_col].copy()
        self.T = df[self.treatment_col].copy()
    

        # step 2: fill empty values on x
        for c in self.feature_names:
            fill_value = na_cat_value if self._col_types[c] == 'string' else na_cont_value
            self.X[c] = self.X[c].fillna(fill_value)
           

        # step 3: one-hot encode categorial features
        for c in self.feature_names:
            if self._col_types[c] != 'string': continue
            if training:
                if df.shape[0] and df[c].nunique()/df.shape[0] > 0.5:
                    if self.text_col is not None:
                        err_msg = f'Column "{c}" looks like it contains free-form text. ' +\
                        f'Since there is already a text_col specified ({self.text_col}), '+\
                        f'you should probably include this column in the "ignore_cols" list.'
                    else:
                        err_msg = f'Column "{c}" looks like it contains free-form text ' +\
                        f'or unique values. Please either set text_col="{c}" or add it to "ignore_cols" list.'
                    raise ValueError(err_msg)
                # levels come from the *filled* column so the NA level is remembered
                # and NaN never reaches sorted()
                self.cat_dict[c] = sorted(self.X[c].unique())
            #REF: https://stackoverflow.com/a/37451867/13550699
            catcol = self.X[c].astype(pd.CategoricalDtype(categories=self.cat_dict[c]))
            dummies = pd.get_dummies(catcol, prefix = c, drop_first=False)
            self.X = pd.concat([self.X.drop(columns=[c]), dummies], axis=1)
        self.feature_names_one_hot = self.X.columns
        
                        
        # step 4: for text-based confounder, use extracted vocabulary as features
        if self.text_col is not None:
            vocab_df = self._vectorize_text(df, training, min_df=min_df, max_df=max_df,
                                            ngram_range=ngram_range, stop_words=stop_words)
            self.X = pd.concat([self.X, vocab_df], axis=1)
        outcome_type = 'categorical' if self.is_classification else 'numerical'
        if self.v: print(f'outcome column ({outcome_type}): {self.outcome_col}')
        if self.v: print(f'treatment column: {self.treatment_col}')
        if self.v: print('numerical/categorical covariates: %s' % (self.feature_names))
        if self.v and self.text_col: print('text covariate: %s' % (self.text_col))
        if self.v: print("preprocess time: ", -start_time + time.time()," sec")
        return (df, self.X, self.Y, self.T)


    def _vectorize_text(self, df, training, **tfidf_kwargs):
        """
        Turn `text_col` into a dataframe of TF-IDF features aligned with `self.X`.

        The vectorizer is fitted only when training=True; otherwise the fitted
        vocabulary is reused so the text columns match those seen in training.
        """
        docs = df[self.text_col].fillna('')  # NaN documents are rejected by the vectorizer
        if training:
            from sklearn.feature_extraction.text import TfidfVectorizer
            self._vectorizer = TfidfVectorizer(**tfidf_kwargs)
            v_features = self._vectorizer.fit_transform(docs)
        else:
            if self._vectorizer is None:
                raise ValueError('Preprocessor must first be fitted by calling with training=True.')
            v_features = self._vectorizer.transform(docs)
        tv = self._vectorizer
        # get_feature_names was replaced by get_feature_names_out in newer scikit-learn
        vocab = tv.get_feature_names_out() if hasattr(tv, 'get_feature_names_out') \
                else tv.get_feature_names()
        # carry over the row index of X; a default RangeIndex would misalign
        # (or drop) rows whenever df does not have a 0..n-1 index
        return pd.DataFrame(v_features.toarray(), 
                            columns = ["v_%s" % (v) for v in vocab],
                            index=self.X.index)
        
        
    def _preprocess_column(self, df, col, is_treatment=True, training=True):
        """
        Preprocess treatment and outcome columns.

        Rows where `col` is null are dropped.  A two-valued string column is
        encoded as 0/1 (values sorted, first -> 0).  When training=False and an
        encoding was learned for `col` during fitting, that encoding is reused.

        Returns:
            tuple (df, is_binary): the filtered/encoded copy of `df` and whether
            `col` contains only 0/1 values.

        Raises:
            ValueError: if a treatment column is not binary, or a string outcome
                        does not have exactly two unique values.
        """
        # remove nulls (copy so the encoding below never writes into a view of the input)
        df = df[df[col].notnull()].copy()
        if training: self._value_maps.pop(col, None)

        # check if already binarized
        if self._check_binary(df, col): return df, True

        # inspect column
        d = self._check_type(df, col)
        typ = d['dtype']
        num = d['nunique']
        use_stored = (not training) and typ == 'string' and col in self._value_maps
        
        # process as treatment
        if is_treatment:
            if typ == 'numeric' or (num != 2 and not use_stored): 
                raise ValueError('Treatment column must contain only two unique values ' +\
                                 'indicating the treated and control groups.')
            df = self._encode_binary(df, col, training)
        # process as outcome
        else:
            if typ == 'string' and num != 2 and not use_stored:
                raise ValueError('If the outcome column is string/categorical, it must '+
                                'contain only two unique values.')
            if typ == 'string':
                df = self._encode_binary(df, col, training)
        return df, self._check_binary(df, col)


    def _encode_binary(self, df, col, training):
        """
        Replace the labels of `col` with 0/1 and return `df` (modified in place).
        """
        stored = self._value_maps.get(col)
        if not training and stored is not None:
            unseen = [v for v in df[col].unique() if v not in stored]
            if unseen:
                raise ValueError('Column "%s" contains values not seen during training: %s' % (col, unseen))
            mapping = stored
        else:
            values = sorted(df[col].unique())
            mapping = dict(zip(values, [0,1]))
            if training: self._value_maps[col] = mapping
        df[col] = df[col].map(mapping).astype(int)
        if self.v: print('replaced %s in column "%s" with %s' % (list(mapping), col, 
                                                                  list(mapping.values())))
        return df
        
        
    def _check_type(self, df, col):
        """
        Return {'dtype': 'numeric'|'string', 'nunique': int} for the non-null values of `col`.

        Raises:
            ValueError: if the column is neither numeric nor string.
        """
        from pandas.api.types import is_string_dtype
        from pandas.api.types import is_numeric_dtype
        dtype = None
        
        tmp_var = df[df[col].notnull()][col]
        if is_numeric_dtype(tmp_var): dtype = 'numeric'
        elif is_string_dtype(tmp_var): dtype =  'string'
        else:
            raise ValueError('Columns in dataframe must be either numeric or strings.  ' +\
                             'Column %s is neither' % (col))
        output = {'dtype' : dtype, 'nunique' : tmp_var.nunique()}
        return output
    

    def _check_binary(self, df, col):
        """
        Return True if every value of `col` is 0 or 1.
        """
        return bool(df[col].isin([0,1]).all())

    def _get_feature_names(self, df):
        """
        Return the columns of `df` that are not the treatment, the outcome or ignored.

        Note: unlike `preprocess`, this does not exclude `text_col`.
        """
        return [c for c in df.columns.values \
                if c not in [self.treatment_col, self.outcome_col]+self.ignore_cols]   

In [None]:
# Render the generated API documentation for the preprocess method.
show_doc(DataframePreprocessor.preprocess)

<h4 id="DataframePreprocessor.preprocess" class="doc_header"><code>DataframePreprocessor.preprocess</code><a href="__main__.py#L43" class="source_link" style="float:right">[source]</a></h4>

> <code>DataframePreprocessor.preprocess</code>(**`df`**, **`training`**=*`False`*, **`min_df`**=*`0.05`*, **`max_df`**=*`0.5`*, **`ngram_range`**=*`(1, 1)`*, **`stop_words`**=*`'english'`*, **`na_cont_value`**=*`-1`*, **`na_cat_value`**=*`'MISSING'`*)

Preprocess a dataframe for causal inference.

In [None]:
import pandas as pd
# Load the tab-separated sample dataset; malformed rows are skipped.
# (`error_bad_lines` was removed in pandas 2.0 -- `on_bad_lines='skip'` is its replacement.)
df = pd.read_csv('sample_data/music_seed50.tsv', sep='\t', on_bad_lines='skip')

In [None]:
# Configure the preprocessor: T_ac is the treatment, Y_sim the outcome, `text` the
# free-text covariate, and only C_true and product are used as other covariates.
pp = DataframePreprocessor(treatment_col='T_ac', outcome_col='Y_sim', 
                           text_col='text', include_cols=['C_true', 'product'])

In [None]:
# Fit the preprocessor on the full dataset.
# Note: `df` is rebound to the cleaned dataframe returned by preprocess.
df, X, Y, T = pp.preprocess(df, training=True)

outcome column (categorical): Y_sim
treatment column: T_ac
numerical/categorical covariates: ['product', 'C_true']
text covariate: text
preprocess time:  1.5564806461334229  sec


In [None]:
# Preview the covariate matrix produced by the training call.
X.head()

Unnamed: 0,C_true,product_audio cd,product_mp3 music,product_vinyl,v_album,v_albums,v_band,v_beautiful,v_best,v_better,v_bought,v_buy,v_cd,v_collection,v_did,v_don,v_excellent,v_fan,v_favorite,v_good,v_got,v_great,v_hear,v_heard,v_just,v_know,v_like,v_listen,v_listening,v_love,v_music,v_new,v_old,v_original,v_really,v_record,v_recording,v_rock,v_song,v_songs,v_sound,v_sounds,v_think,v_time,v_track,v_tracks,v_ve,v_voice,v_way,v_work,v_years
0,0,0,1,0,0.25232,0.0,0.0,0.0,0.0,0.0,0.0,0.850798,0.251679,0.0,0.0,0.386181,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.54225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.625138,0.561398,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.629106,0.0,0.0,0.0,0.0,0.777319,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,1,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.527751,0.0,0.0,0.392572,0.0,0.0,0.372982,0.334952,0.0,0.0,0.56219,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Build a test set containing a single product category ('mp3 music').
test_df = df[df['product']=='mp3 music'].copy()

In [None]:
# Preview the test subset; its index keeps the row labels of `df`.
test_df.head()

Unnamed: 0,index,id,rating,product,text,summary,price,T_true,C_true,Y_sim,negative,positive,T_ac
0,7,1388703,1.0,mp3 music,buy the cd. do not buy the mp3 album. downlo...,Buy the CD. Do not buy the MP3.,13.01,0,0,0,0.548733,0.451267,0
1,8,1388703,5.0,mp3 music,takes me back to my childhood!,Love it!,13.01,1,0,0,0.008373,0.991627,1
3,13,1388703,5.0,mp3 music,keith's music is a timeless message. since hi...,Never Gets Old,13.01,1,0,1,0.038876,0.961124,1
5,20,1377647,5.0,mp3 music,this recording is a great collection of some o...,Wonderful Collection,18.99,1,0,0,0.025035,0.974965,1
10,41,6920055,5.0,mp3 music,"i have enjoyed these songs for decades, and ha...",A Masterpiece,34.98,1,0,0,0.045326,0.954674,1


In [None]:
# Apply the already-fitted preprocessor to the test subset (training=False).
# Note: `X` is rebound to the test covariate matrix.
_, X, _, _ = pp.preprocess(test_df, training=False)

outcome column (categorical): Y_sim
treatment column: T_ac
numerical/categorical covariates: ['product', 'C_true']
text covariate: text
preprocess time:  0.2019202709197998  sec


In [None]:
# Although test_df only contains 'mp3 music', the one-hot columns for every
# product category seen during training must still be present.
assert 'product_audio cd' in X.columns
assert 'product_mp3 music' in X.columns
assert 'product_vinyl' in X.columns

In [None]:
# Preview the covariate matrix produced for the test subset.
X.head()

Unnamed: 0,C_true,product_audio cd,product_mp3 music,product_vinyl,v_album,v_albums,v_amazing,v_awesome,v_band,v_beautiful,v_best,v_buy,v_cd,v_don,v_fan,v_favorite,v_good,v_great,v_hear,v_heard,v_just,v_know,v_like,v_listen,v_listening,v_love,v_lyrics,v_music,v_new,v_old,v_really,v_recommend,v_rock,v_song,v_songs,v_sound,v_think,v_time,v_ve,v_voice,v_way,v_work,v_years
0,0,0,1,0,0.209894,0.0,0.0,0.0,0.0,0.0,0.0,0.833785,0.324038,0.394648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.688648,0.0,0.0,0.0,0.0,0.0,0.0,0.338371,0.0,0.0,0.213894,0.0,0.0,0.0,0.0,0.0,0.0,0.489719,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.354532
5,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.439668,0.0,0.0,0.0,0.0,0.700757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.419458,0.373747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0,0,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333854,0.0,0.420052,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.363059,0.0,0.0,0.439849,0.247908,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.395618,0.0,0.0,0.410911
