In [1]:
# default_exp causalinference

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# Causal Inference

> Causal Inference API

In [3]:
#hide
from nbdev.showdoc import *

In [31]:
#export
import pandas as pd
pd.set_option('display.max_columns', 500)
import time
from causalml.inference.meta import BaseTClassifier, BaseXClassifier, BaseRClassifier, BaseSClassifier
from causalml.inference.meta import BaseTRegressor, BaseXRegressor, BaseRRegressor, BaseSRegressor
from causalml.inference.meta import LRSRegressor
from causalml.propensity import ElasticNetPropensityModel
from causalml.match import NearestNeighborMatch, create_table_one
from scipy import stats
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
import numpy as np
import warnings

# from xgboost import XGBRegressor
# from causalml.inference.meta import XGBTRegressor, MLPTRegressor



# Lookup tables from a metalearner name to the causalml class implementing it.
# The classifier table is used for binary outcomes, the regressor table otherwise.
metalearner_cls_dict = {
    't-learner': BaseTClassifier,
    'x-learner': BaseXClassifier,
    'r-learner': BaseRClassifier,
    's-learner': BaseSClassifier,
}
metalearner_reg_dict = {
    't-learner': BaseTRegressor,
    'x-learner': BaseXRegressor,
    'r-learner': BaseRRegressor,
    's-learner': BaseSRegressor,
}

class CausalInferenceModel:
    """Infers causality from the data contained in `df` using a metalearner.
    
    
    Usage:

    ```python
    >>> cm = CausalInferenceModel(df, 
                                  treatment_col='Is_Male?', 
                                  outcome_col='Post_Shared?', text_col='Post_Text',
                                  ignore_cols=['id', 'email'])
        cm.fit()
    ```
    
    **Parameters:**
    
    * **df** : pandas.DataFrame containing dataset.  A preprocessed copy is stored as `self.df`
               (rows with a missing treatment or outcome are removed); the caller's frame is not modified.
    * **treatment_col** : treatment variable; column should contain binary values: 1 for treated, 0 for untreated.
    * **outcome_col** : outcome variable; column should contain the categorical or numeric outcome values
    * **text_col** : (optional) text column containing the strings (e.g., articles, reviews, emails). 
    * **ignore_cols** : columns to ignore in the analysis (mutually exclusive with `include_cols`)
    * **include_cols** : columns to include as covariates (e.g., possible confounders)
    * **treatment_effect_col** : name of column to hold causal effect estimations.  Does not need to exist.  Created by CausalNLP.
    * **metalearner_type** : metalearner model to use. One of {'t-learner', 's-learner', 'x-learner', 'r-learner'} (Default: 't-learner')
    * **learner** : an instance of a custom learner.  If None, a default learner is used
                    (logistic/linear regression for the s-learner, LightGBM otherwise).
        # Example
         learner = LGBMClassifier(n_estimators=1000)
    * **min_df** : min_df parameter used for text processing using sklearn
    * **max_df** : max_df parameter used for text processing using sklearn
    * **stop_words** : stop words used for text processing (from sklearn)
    * **verbose** : If 1, print informational messages.  If 0, suppress.
    """
    def __init__(self, 
                 df, 
                 treatment_col='treatment', 
                 outcome_col='outcome', 
                 text_col=None,
                 ignore_cols=[],
                 include_cols=[],
                 treatment_effect_col = 'treatment_effect',
                 metalearner_type='t-learner',
                 learner = None,
                 min_df=0.05,
                 max_df=0.5,
                 stop_words='english',
                 verbose=1):
        """
        Validates the arguments, preprocesses a copy of `df` and builds the metalearner.

        Raises `ValueError` if `ignore_cols`/`include_cols` are not lists, if both are
        supplied, if `text_col` is not a column of `df`, or if the treatment/outcome
        column appears in `ignore_cols`.
        """

        self.treatment_col = treatment_col
        self.outcome_col = outcome_col
        self.text_col = text_col
        self.te = treatment_effect_col # created
        self.metalearner_type = metalearner_type
        self.v = verbose
        self.df = df.copy()
        # honor the caller-supplied vectorizer thresholds
        self.min_df = min_df
        self.max_df = max_df
        self.stop_words = stop_words
        if not isinstance(ignore_cols, list):
            raise ValueError('ignore_cols must be a list.')
        if not isinstance(include_cols, list):
            raise ValueError('include_cols must be a list.')
        if ignore_cols and include_cols:
            raise  ValueError('ignore_cols and include_cols are mutually exclusive.  Please choose one.')
        if include_cols:
            # everything that is not explicitly included (or treatment/outcome/text) is ignored
            keep = include_cols + [treatment_col, outcome_col, text_col]
            ignore_cols = [c for c in df.columns.values if c not in keep]
        # store copies so neither the shared default lists nor the caller's lists are aliased
        self.ignore_cols = list(ignore_cols)
        self.include_cols = list(include_cols)
            
        if text_col is not None and text_col not in df:
            raise ValueError(f'You specified text_col="{text_col}", but {text_col} is not a column in df.')
        if self.treatment_col in self.ignore_cols:
            raise ValueError(f'ignore_cols contains the treatment column ({treatment_col})')
        if self.outcome_col in self.ignore_cols:
            raise ValueError(f'ignore_cols contains the outcome column ({outcome_col})')
            
        
        # these are auto-populated by preprocess method
        self.is_classification = True       
        self.feature_names = None
        self.x = None
        self.y = None
        self.treatment = None
        
        # preprocess; keep the cleaned frame so that self.df stays row-aligned with
        # self.x / self.y / self.treatment (fit() writes one prediction per row of self.x)
        self.df = self.preprocess(self.df)

        # setup model
        self.model = self._create_metalearner(metalearner_type=self.metalearner_type,
                                             supplied_learner=learner)

           

    def _create_metalearner(self, metalearner_type='t-learner', supplied_learner=None):
        """
        Builds the metalearner named by `metalearner_type`.

        `supplied_learner` is used as the base learner when given; otherwise a default is
        chosen from the outcome type (`self.is_classification`) and the metalearner type.
        An unknown `metalearner_type` raises `KeyError` from the lookup table.
        """
        # set learner
        if supplied_learner is not None:
            learner = supplied_learner
        elif metalearner_type == 's-learner':
            learner = LogisticRegression() if self.is_classification else LinearRegression()
        else:
            learner = LGBMClassifier() if self.is_classification else LGBMRegressor()
        
        # set metalearner
        registry = metalearner_cls_dict if self.is_classification else metalearner_reg_dict
        metalearner_class = registry[metalearner_type]
        if metalearner_type in ['t-learner', 's-learner']:
            model = metalearner_class(learner=learner,control_name=0)
        else:
            # x- and r-learners take a separate regressor for the effect model
            model = metalearner_class(outcome_learner=learner,
                                      effect_learner=LGBMRegressor(),
                                      control_name=0) 
        return model
        
        
    def preprocess(self, df=None, na_cont_value=-1, na_cat_value='MISSING'):
        """
        Preprocess a dataframe for causal inference.
        If df is None, uses self.df.

        Populates `self.x` (covariates), `self.y` (outcome), `self.treatment`,
        `self.feature_names`, `self.feature_names_one_hot` and `self.is_classification`.
        Missing numeric covariates are filled with `na_cont_value`, missing string
        covariates with `na_cat_value`.

        Returns the cleaned dataframe (headers/strings stripped, rows with a missing
        treatment or outcome removed, treatment/outcome binarized where applicable).
        Raises `ValueError` if `df` is not a DataFrame or a string covariate looks like free text.
        """
        start_time = time.time()
        
        # step 1: check/clean dataframe
        if df is None:
            df = self.df
        if not isinstance(df, pd.DataFrame):
            raise ValueError('df must be a pandas DataFrame')
        strip_if_str = lambda x: x.strip() if isinstance(x, str) else x
        df = df.rename(columns=strip_if_str) # strip headers 
        # DataFrame.map replaces the deprecated DataFrame.applymap in newer pandas
        df = df.map(strip_if_str) if hasattr(pd.DataFrame, 'map') else df.applymap(strip_if_str)  # strip data
        df, _ = self._preprocess_column(df, self.treatment_col, is_treatment=True)
        df, self.is_classification = self._preprocess_column(df, self.outcome_col, is_treatment=False)
        excluded = [self.treatment_col, self.outcome_col, self.text_col] + self.ignore_cols
        self.feature_names = [c for c in df.columns.values if c not in excluded]
        self.x = df[self.feature_names].copy()
        self.y = df[self.outcome_col].copy()
        self.treatment = df[self.treatment_col].copy()
    

        # step 2: fill empty values on x
        col_types = {c: self._check_type(df, c)['dtype'] for c in self.feature_names}
        for c, typ in col_types.items():
            fill_value = na_cat_value if typ == 'string' else na_cont_value
            self.x[c] = self.x[c].fillna(fill_value)
           

        # step 3: one-hot encode categorial features
        n_rows = df.shape[0]
        for c in self.feature_names:
            if col_types[c] != 'string': continue
            # judged on the cleaned frame, so whitespace variants are not counted as distinct values
            if n_rows > 0 and df[c].nunique()/n_rows > 0.5:
                if self.text_col is not None:
                    err_msg = f'Column "{c}" looks like it contains free-form text. ' +\
                    f'Since there is already a text_col specified ({self.text_col}), '+\
                    f'you should probably include this column in the "ignore_cols" list.'
                else:
                    err_msg = f'Column "{c}" looks like it contains free-form text. ' +\
                    f'Please either set text_col="{c}" or add it to "ignore_cols" list.'
                raise ValueError(err_msg)
                    
            self.x = self.x.merge(pd.get_dummies(self.x[c], prefix = c, 
                                                 drop_first=False), 
                                                 left_index=True, right_index=True)
            del self.x[c]
        self.feature_names_one_hot = self.x.columns
        
                        
        # step 4: for text-based confounder, use extracted vocabulary as features
        if self.text_col is not None:
            from sklearn.feature_extraction.text import TfidfVectorizer
            tv = TfidfVectorizer(min_df=self.min_df, max_df=self.max_df, stop_words=self.stop_words)
            # vectorize the cleaned frame so the rows match self.x one-to-one
            v_features = tv.fit_transform(df[self.text_col].fillna(''))
            # get_feature_names() is gone from newer scikit-learn releases
            vocab = tv.get_feature_names_out() if hasattr(tv, 'get_feature_names_out') \
                                               else tv.get_feature_names()
            # reuse self.x's index so the concat is a pure column append
            vocab_df = pd.DataFrame(v_features.toarray(), 
                                    columns = ["v_%s" % (v) for v in vocab],
                                    index=self.x.index)
            self.x = pd.concat([self.x, vocab_df], axis=1)
        outcome_type = 'categorical' if self.is_classification else 'numerical'
        if self.v: print(f'outcome column ({outcome_type}): {self.outcome_col}')
        if self.v: print(f'treatment column: {self.treatment_col}')
        if self.v: print('numerical/categorical covariates: %s' % (self.feature_names))
        if self.v and self.text_col: print('text covariate: %s' % (self.text_col))
        if self.v: print("preprocess time: ", -start_time + time.time()," sec")

        return df
        
        
    def _preprocess_column(self, df, col, is_treatment=True):
        """
        Preprocess treatment and outcome columns.

        Drops rows where `col` is null and, where needed, recodes a two-valued string
        column to 0/1 (values sorted, first -> 0, second -> 1).
        Returns `(df, is_binary)` where `df` is a new frame and `is_binary` tells
        whether `col` now holds only 0/1.  Raises `ValueError` for a treatment column
        that is numeric-but-not-binary or a string column without exactly two values.
        """
        # remove nulls; copy so later column assignment never touches the caller's frame
        df = df[df[col].notnull()].copy()

        # check if already binarized
        if self._check_binary(df, col): return df, True

        # inspect column
        d = self._check_type(df, col)
        typ = d['dtype']
        num = d['nunique']
        
        # process as treatment
        if is_treatment:
            if typ == 'numeric' or (typ == 'string' and num != 2): 
                raise ValueError('Treatment column must contain only two unique values ' +\
                                 'indicating the treated and control groups.')
            df = self._binarize_column(df, col)
        # process as outcome
        else:
            if typ == 'string' and num != 2:
                raise ValueError('If the outcome column is string/categorical, it must '+
                                'contain only two unique values.')
            if typ == 'string':
                df = self._binarize_column(df, col)
        return df, bool(self._check_binary(df, col))


    def _binarize_column(self, df, col):
        """Recodes the two distinct values of `df[col]` to 0/1 in sorted order and returns `df`."""
        values = sorted(df[col].unique())
        # explicit assignment (not chained inplace replace) so the recoding always takes effect
        df[col] = df[col].map(dict(zip(values, [0, 1])))
        if self.v: print('replaced %s in column "%s" with %s' % (values, col, [0,1]))
        return df
        
        
    def _check_type(self, df, col):
        """
        Classifies the non-null values of `df[col]`.

        Returns `{'dtype': 'numeric' | 'string', 'nunique': <distinct non-null values>}`.
        Raises `ValueError` for any other dtype.
        """
        from pandas.api.types import is_string_dtype
        from pandas.api.types import is_numeric_dtype
        
        non_null = df[col].dropna()
        if is_numeric_dtype(non_null): dtype = 'numeric'
        elif is_string_dtype(non_null): dtype =  'string'
        else:
            raise ValueError('Columns in dataframe must be either numeric or strings.  ' +\
                             'Column %s is neither' % (col))
        return {'dtype' : dtype, 'nunique' : non_null.nunique()}
    

    def _check_binary(self, df, col):
        """True when every value of `df[col]` is 0 or 1."""
        return df[col].isin([0,1]).all()        

    def _get_feature_names(self, df):
        """Columns of `df` that are neither treatment, outcome, nor in `ignore_cols`."""
        return [c for c in df.columns.values \
                if c not in [self.treatment_col, self.outcome_col]+self.ignore_cols]
    
    def fit(self):
        """
        Fits a causal inference model and estimates outcome
        with and without treatment for each observation.

        The per-row treatment effect estimates are stored in `self.df` under the
        column named by `treatment_effect_col`.
        """
        if self.v: print("start fitting causal inference model")
        start_time = time.time()
        self.model.fit(self.x.values, self.treatment.values, self.y.values)
        preds = self.predict(self.x)
        self.df[self.te] = preds
        if self.v: print("time to fit causal inference model: ",-start_time + time.time()," sec")
            
    def predict(self, x):
        """
        Estimates the treatment effect for each observation in `x`.

        `x` may be a `pandas.DataFrame` (its `.values` are used) or an array
        that is passed to the model as-is.
        """
        if isinstance(x, pd.DataFrame):
            return self.model.predict(x.values)
        else:
            return self.model.predict(x)
    
    def estimate_ate(self, bool_mask=None):
        """
        Estimates the average treatment effect over `self.df` as the mean of
        the per-observation estimates created by `fit`.

        `bool_mask` optionally restricts the average to a subset of rows
        (i.e., a conditional average).  Returns a dict `{'ate': <mean>}`.
        `fit` must have been called first, since it creates the estimate column.
        """
        df = self.df if bool_mask is None else self.df[bool_mask]
        a = df[self.te].values
        mean = np.mean(a)
        return {'ate' : mean}
        

        
    def minimize_bias(self, caliper = None):
        """
        Builds a matched sample using propensity scores.

        Fits an `ElasticNetPropensityModel` on `self.x`/`self.treatment`, matches rows
        on the resulting score with `NearestNeighborMatch` (`caliper` is forwarded to it),
        prints balance tables before and after matching, and stores the matched sample in
        `self.df_unbiased`, `self.x_unbiased`, `self.y_unbiased` and `self.treatment_unbiased`.

        Returns the matched dataframe.
        """
        print('-------Start bias minimization procedure----------')
        #Join x, y and treatment vectors
        df_match = self.x.merge(self.treatment,left_index=True, right_index=True)
        df_match = df_match.merge(self.y, left_index=True, right_index=True)

        #build propensity model
        pm = ElasticNetPropensityModel(n_fold=3, random_state=42)

        #ps - propensity score
        df_match['ps'] = pm.fit_predict(self.x, self.treatment)

        #Matching model object
        psm = NearestNeighborMatch(replace=False,
                       ratio=1,
                       random_state=423,
                       caliper=caliper)

        balance_features = list(self.feature_names_one_hot)

        #Apply matching model
        # the treatment Series keeps its own column name after the merge,
        # so it must be referenced by self.treatment_col rather than a fixed name
        self.flg_bias = True
        self.df_unbiased = psm.match(data=df_match, 
                                     treatment_col=self.treatment_col,
                                     score_cols=['ps'])
        self.x_unbiased = self.df_unbiased[self.x.columns]
        self.y_unbiased = self.df_unbiased[self.outcome_col]
        self.treatment_unbiased = self.df_unbiased[self.treatment_col]
        print('-------------------MATCHING RESULTS----------------')
        print('-----BEFORE MATCHING-------')
        print(create_table_one(data=df_match,
                                treatment_col=self.treatment_col,
                                features=balance_features))
        print('-----AFTER MATCHING-------')
        print(create_table_one(data=self.df_unbiased,
                                treatment_col=self.treatment_col,
                                features=balance_features))
        return self.df_unbiased


In [32]:
show_doc(CausalInferenceModel.fit)

<h4 id="CausalInferenceModel.fit" class="doc_header"><code>CausalInferenceModel.fit</code><a href="__main__.py#L284" class="source_link" style="float:right">[source]</a></h4>

> <code>CausalInferenceModel.fit</code>()

Fits a causal inference model and estimates outcome
with and without treatment for each observation.

In [34]:
show_doc(CausalInferenceModel.predict)

<h4 id="CausalInferenceModel.predict" class="doc_header"><code>CausalInferenceModel.predict</code><a href="__main__.py#L296" class="source_link" style="float:right">[source]</a></h4>

> <code>CausalInferenceModel.predict</code>(**`x`**)

Estimates the treatment effect for each observation in `x`.

The parameter `x` should be either a `pandas.DataFrame` or a `numpy.ndarray`.

In [35]:
show_doc(CausalInferenceModel.estimate_ate)

<h4 id="CausalInferenceModel.estimate_ate" class="doc_header"><code>CausalInferenceModel.estimate_ate</code><a href="__main__.py#L305" class="source_link" style="float:right">[source]</a></h4>

> <code>CausalInferenceModel.estimate_ate</code>(**`bool_mask`**=*`None`*)

Estimates the treatment effect for each observation in
`self.df`.

The `bool_mask` parameter can be used to estimate the conditional average treatment effect (CATE).
For instance, to estimate the average treatment effect for only those individuals over 18 years of age:

```python
cm.estimate_ate(cm.df['age']>18)

```


## Usage Example: Do social media posts by women get shared more often than those by men?

Let's create a simulated dataset.

In [5]:
import itertools

# Number of posts for each (Is_Male?, Post_Text, Post_Shared?) combination,
# listed in the order produced by itertools.product over three binary flags.
cell_counts = [36, 234, 25, 55, 6, 81, 71, 192]
post_texts = {0: 'I really love my job!', 1: 'My boss is pretty terrible.'}

data = [(*flags, n) for flags, n in zip(itertools.product([0, 1], repeat=3), cell_counts)]
counts_df = pd.DataFrame(data, columns=['Is_Male?', 'Post_Text', 'Post_Shared?', 'N'])

# Expand each combination into N identical rows, then swap the 0/1 text code for a sentence.
# Built as one chain (no in-place replace on a column) so the cell is safe to re-run
# and does not depend on chained-assignment behavior.
original_df = (
    counts_df.loc[counts_df.index.repeat(counts_df['N'])]
    .reset_index(drop=True)
    .drop(columns=['N'])
    .assign(Post_Text=lambda frame: frame['Post_Text'].map(post_texts))
)
df = None  # clear any stale `df`; it is re-created by the Autocoder step
original_df.head()

Unnamed: 0,Is_Male?,Post_Text,Post_Shared?
0,0,I really love my job!,0
1,0,I really love my job!,0
2,0,I really love my job!,0
3,0,I really love my job!,0
4,0,I really love my job!,0


At first glance, it seems like posts by women get shared more often.  More specifically, it appears that being male **reduces** the chance that your post is shared by 4.5 percentage points:

In [6]:
# Share of posts by men that were shared (relative frequency of Post_Shared? == 1).
is_male = original_df['Is_Male?'] == 1
male_probability = original_df.loc[is_male, 'Post_Shared?'].value_counts(normalize=True)[1]
male_probability

0.78

In [7]:
# Share of posts by women that were shared (relative frequency of Post_Shared? == 1).
is_female = original_df['Is_Male?'] == 0
female_probability = original_df.loc[is_female, 'Post_Shared?'].value_counts(normalize=True)[1]
female_probability

0.8257142857142857

In [8]:
male_probability-female_probability

-0.04571428571428571

However, this is inaccurate. In fact, this is an example of [Simpson's Paradox](https://en.wikipedia.org/wiki/Simpson%27s_paradox), and the true causal effect of being male in this simulated dataset is roughly **0.05** (as opposed to **-0.045**), with men's posts being more likely to be shared. The reason is that women in this simulation tend to make more positive posts, which tend to be shared more often here. Post sentiment, then, is a [mediator](https://en.wikipedia.org/wiki/Mediation_(statistics)), which is statistically the same thing as a [confounder](https://en.wikipedia.org/wiki/Confounding).

When controlling for the sentiment of the post (the mediator variable in this dataset), it is revealed that men's posts are, in fact, shared more often (for both negative posts and positive posts). This can be quickly and easily estimated in **CausalNLP**.

### Causal Inference from Text with Autocoders

Let's first use the `Autocoder` to transform the raw text into sentiment.  We can then control for sentiment when estimating the causal effect.

In [9]:
from causalnlp.autocoder import Autocoder
ac = Autocoder()

In [10]:
df = ac.code_sentiment(original_df['Post_Text'].values, original_df, binarize=False, batch_size=16)

In [11]:
df.head()

Unnamed: 0,Is_Male?,Post_Text,Post_Shared?,negative,positive
0,0,I really love my job!,0,0.019191,0.980809
1,0,I really love my job!,0,0.019191,0.980809
2,0,I really love my job!,0,0.019191,0.980809
3,0,I really love my job!,0,0.019191,0.980809
4,0,I really love my job!,0,0.019191,0.980809


When autocoding the raw text for sentiment, we have chosen to keep the raw probabilities (`binarize=False`).  The probabilities can instead be binarized with `binarize=True`.

Next, let's estimate the treatment effects. We will ignore the `positive` and `Post_Text` columns, as their information is captured by the `negative` column in this example.

In [12]:
# Control for sentiment only: the `negative` probability is the sole covariate.
cm = CausalInferenceModel(
    df,
    treatment_col='Is_Male?',
    outcome_col='Post_Shared?',
    include_cols=['negative'],
)
cm.fit()

outcome column (categorical): Post_Shared?
treatment column: Is_Male?
numerical/categorical covariates: ['negative']
preprocess time:  0.014408588409423828  sec
start fitting causal inference model
time to fit causal inference model:  0.5292856693267822  sec


Upon controlling for sentiment, we see that the overall average treatment effect is correctly estimated as roughly 0.05.

In [13]:
ate = cm.estimate_ate()
ate

{'ate': 0.05366850622769351}

**CausalNLP** allows you to easily compute conditional or individualized treatment effects.
For instance, for negative posts, being male increases the chance of your post being shared by about 4 percentage points:

In [14]:
cm.estimate_ate(cm.df['negative']>0.9)

{'ate': 0.042535751074149745}

For positive posts, being male increases the chance of your post being shared by about 6 percentage points:

In [15]:
cm.estimate_ate(cm.df['negative']<0.1)

{'ate': 0.06436468274776497}

In [16]:
# Sanity check: the estimated ATE should land near the true effect of 0.05.
assert 0.05 < ate['ate'] < 0.055

### Causal Inference Using Raw Text as a Confounder/Mediator

In the example above, we approached the problem under the assumption that a specific linguistic property (sentiment) was an important mediator or confounder for which to control. In some cases, there may also be other unknown linguistic properties that are potential confounders/mediators (e.g., topic, politeness, toxic language, readability).  

In **CausalNLP**, we can also use the **raw text** as the potential confounder/mediator.

In [17]:
# Use the raw post text as the covariate; the autocoded sentiment columns are left out.
cm = CausalInferenceModel(
    df,
    treatment_col='Is_Male?',
    outcome_col='Post_Shared?',
    text_col='Post_Text',
    ignore_cols=['negative', 'positive'],
)
cm.fit()

outcome column (categorical): Post_Shared?
treatment column: Is_Male?
numerical/categorical covariates: []
text covariate: Post_Text
preprocess time:  0.01554560661315918  sec
start fitting causal inference model
time to fit causal inference model:  0.06210470199584961  sec


Although we have excluded the **negative** and **positive** columns as extra covariates, you can use traditional categorical/numerical covariates in combination with a text field covariate (if they exist as extra columns in the dataframe).

Here, we see that the same causal estimates are returned, as the text is easy to infer as positive or negative based on their correlations with the outcomes in this problem.

In [18]:
ate = cm.estimate_ate()
ate

{'ate': 0.05366850622769351}

In [19]:
cm.estimate_ate(df['Post_Text'] == 'My boss is pretty terrible.')

{'ate': 0.042535751074149745}

In [20]:
cm.estimate_ate(df['Post_Text'] == 'I really love my job!')

{'ate': 0.06436468274776497}

In [21]:
# Sanity check: using raw text as the covariate should recover the same ATE (about 0.05).
assert 0.05 < ate['ate'] < 0.055

### Causal Inference With Text as a Treatment

Suppose we were interested in estimating the causal impact of **sentiment** on the outcome.  That is, **sentiment** of text is the treatment, and the **gender** is a potential confounder. As we did above, we can use the `Autocoder` to create the treatment variable.  The only difference is that we would supply the `binarize=True` as an argument.

In [22]:
df = ac.code_sentiment(original_df['Post_Text'].values, original_df, binarize=True, batch_size=16)

In [23]:
df.head()

Unnamed: 0,Is_Male?,Post_Text,Post_Shared?,negative,positive
0,0,I really love my job!,0,0,1
1,0,I really love my job!,0,0,1
2,0,I really love my job!,0,0,1
3,0,I really love my job!,0,0,1
4,0,I really love my job!,0,0,1


In [24]:
# Sentiment is now the treatment; gender is the covariate being controlled for.
cm = CausalInferenceModel(
    df,
    treatment_col='positive',
    outcome_col='Post_Shared?',
    include_cols=['Is_Male?'],
)
cm.fit()

outcome column (categorical): Post_Shared?
treatment column: positive
numerical/categorical covariates: ['Is_Male?']
preprocess time:  0.009786844253540039  sec
start fitting causal inference model
time to fit causal inference model:  0.6526179313659668  sec


In [25]:
ate = cm.estimate_ate()
ate

{'ate': 0.19008080596986368}

In [26]:
# Sanity check: the effect of positive sentiment on sharing should be about 0.19.
assert 0.18 < ate['ate'] < 0.2

In [28]:
#hide
from nbdev.export import notebook2script; notebook2script()

Converted 00_causalinference.ipynb.
Converted 01_autocoder.ipynb.
Converted examples.ipynb.
Converted index.ipynb.
