Environment setup

In [None]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell runs, so edits to project modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
from google.colab import drive, userdata
# Mount Google Drive at /content/drive so the project folder stored there is reachable.
drive.mount('/content/drive')

Change the working directory to the project's directory.

In [None]:
import os

# Colab-specific location of the project on the mounted Drive. Relative paths used
# later in the notebook (e.g. requirements.txt, ./data/...) resolve against it.
project_path = "/content/drive/MyDrive/ds/causal-sermons"
os.chdir(project_path)

Add the project's `src` directory to the Python path.

In [None]:

import sys
import os
from pathlib import Path

# Resolve the current working directory (the project directory after the
# os.chdir call earlier in the notebook).
current_dir = Path(os.getcwd())

# Make the project's src/ directory importable. The membership check keeps the
# cell idempotent: re-running it does not pile up duplicate sys.path entries.
if str(current_dir/"src") not in sys.path:
    sys.path.append(str(current_dir/"src"))

In [None]:
!pip install -r requirements.txt

# Experiment

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from comet_ml import Experiment
import matplotlib.pyplot as plt

Initialize experiment logger

In [None]:
# Create the Comet experiment used for all logging below. The API key and
# workspace are read from Colab's userdata store (`comet_key`, `comet_user`)
# rather than being hard-coded in the notebook.
experiment = Experiment(
  api_key=userdata.get('comet_key'),
  project_name="causal-sermons-synth-ratio",
  workspace=userdata.get('comet_user')
)

## Reading synthetic data

In [None]:
# Load the summarised sermons dataset (path is relative to the project directory).
sermons = pd.read_csv('./data/sermons/dataset_summarized_small.csv')
# Trailing expression: display (rows, columns) as the cell output.
sermons.shape

In [None]:
# Preview the first rows to check the loaded columns.
sermons.head()

In [None]:
# Cast the treatment indicator to float, then build the observed outcome:
# rows with treatment == 0 take `outcome_0`, every other row takes `outcome_1`.
sermons['treatment'] = sermons['treatment'].astype(float)
sermons['outcome'] = np.where(
    sermons['treatment'].eq(0),
    sermons['outcome_0'],
    sermons['outcome_1'],
)

In [None]:
# Summary statistics of the numeric columns, including the derived `outcome`.
sermons.describe()

## Preprocessing

In [None]:
# Optional subsample for quick iteration (disabled):
# sermons = sermons.sample(n=1000, random_state=1)

# Keep only rows whose `text` is present, then only those longer than 100 characters.
sermons = sermons.loc[sermons['text'].notnull()]
sermons = sermons.loc[sermons['text'].str.len().gt(100)]

# Display the shape after filtering.
sermons.shape

In [None]:
#sermons = sermons.loc[lambda x: x.num_sermons>5].loc[lambda x: x.portion_voted.notnull()]

Calculate the log ratio of summary length to original text length (in characters), and its z-score.

In [None]:
# ratio   = log(len(text_sum) / len(text)), lengths measured in characters
# ratio_z = z-score of ratio over the whole filtered frame
# NOTE(review): `text_sum` is not null-filtered in the visible preprocessing; a NaN
# ratio would propagate through zscore's default nan_policy. Confirm the column is complete.
sermons['ratio'] = np.log(sermons['text_sum'].str.len().div(sermons['text'].str.len()))
sermons['ratio_z'] = zscore(sermons['ratio'])

In [None]:
# Histogram of the log length ratio, logged to Comet.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(x='ratio', data=sermons, ax=ax)
ax.set(title='Distribution of log(summary length / text length)', xlabel='ratio')
# The first positional parameter of log_figure is the figure *name*, so the
# Figure object is passed by keyword and given an explicit name.
experiment.log_figure(figure_name='ratio_histogram', figure=fig, overwrite=True)

## Training Synthetic estimation

In [None]:
# params
# text_version selects which text column feeds the model:
# 'full', 'text_sum' or 'text_sum_with_first_and_last_5'.
text_version = 'text_sum'
# model_version selects the backbone: 'distilbert' or 'longformer'.
model_version = 'distilbert'
# Passed as `max_length` to CausalModelWrapper.
max_tokens_text = 256
batch_size = 16

In [None]:
# Record the run configuration on the Comet experiment so runs can be compared.
experiment.log_parameters({
    'text_version': text_version,
    'model_version': model_version,
    'max_tokens_text': max_tokens_text,
    'batch_size': batch_size
})

In [None]:
# sermons['Y_0'] = (sermons['trump_minus_clinton'] > 0).astype(int)
# sermons['Y_1'] = sermons['trump_minus_clinton']
# sermons['Y_2'] = sermons['portion_voted']

In [None]:
# Copy the chosen text variant into `text_input`, the column the model reads.
if text_version == 'full':
    sermons['text_input'] = sermons['text']
elif text_version == 'text_sum':
    sermons['text_input'] = sermons['text_sum']
elif text_version == 'text_sum_with_first_and_last_5':
    sermons['text_input'] = sermons['text_sum_with_first_and_last_5']
else:
    # Fail fast: without this branch an unknown value leaves `text_input`
    # unset and surfaces later as a KeyError far from the actual mistake.
    raise ValueError(
        'text_version must be one of full, text_sum, '
        'text_sum_with_first_and_last_5')

In [None]:
# 80/20 split of the row index with a fixed seed, then materialise both partitions.
train_indices, test_indices = train_test_split(
    sermons.index,
    random_state=42,
    test_size=0.2,
)
sermons_train, sermons_test = sermons.loc[train_indices], sermons.loc[test_indices]

In [None]:
# import torch

# torch.cuda.empty_cache()

# import gc
# gc.collect()

In [None]:
from causal_sermons.causal_bert import (
    CausalModelWrapper,
    CausalDistilBert, DistilBertTokenizer,
    CausalLongformer, LongformerTokenizer)
from causal_sermons.ate import get_errors

In [None]:
# Training-split inputs for the causal model.
texts, treatments = sermons_train['text_input'], sermons_train['treatment']
# Double brackets keep these as single-column DataFrames, so `.shape[1]`
# gives the number of confounders / outcomes when the model is built.
confounds, outcomes = sermons_train[['ratio_z']], sermons_train[['outcome']]

In [None]:
# Reject unknown backbones up front, then build the matching model/tokenizer pair.
if model_version not in ('distilbert', 'longformer'):
  raise ValueError('model_version must be either distilbert or longformer')

if model_version == 'distilbert':
  # DistilBERT backbone with heads sized from the outcome / confounder frames.
  model = CausalDistilBert.from_pretrained(
      "distilbert-base-uncased",
      num_outcomes=outcomes.shape[1],
      num_confounders=confounds.shape[1],
      output_attentions=False,
      output_hidden_states=False,
  )
  tokenizer = DistilBertTokenizer.from_pretrained(
      'distilbert-base-uncased', do_lower_case=True)
else:
  # Longformer backbone, configured the same way.
  model = CausalLongformer.from_pretrained(
      "allenai/longformer-base-4096",
      num_outcomes=outcomes.shape[1],
      num_confounders=confounds.shape[1],
      output_attentions=False,
      output_hidden_states=False,
  )
  tokenizer = LongformerTokenizer.from_pretrained(
      'allenai/longformer-base-4096', do_lower_case=True)

In [None]:
# initialize the wrapper for training and inference
# g_weight / Q_weight / mlm_weight are forwarded to CausalModelWrapper unchanged
# (presumably loss weights for the propensity, outcome and masked-LM terms; confirm in causal_bert).
# NOTE(review): os.cpu_count() may return None; confirm the wrapper accepts that for num_workers.
cb = CausalModelWrapper(
    model=model,
    tokenizer=tokenizer,
    g_weight=0.2, Q_weight=0.2, mlm_weight=0.5,
    batch_size=batch_size, max_length=max_tokens_text, num_workers = os.cpu_count())

In [None]:
# training model
# Fits the wrapped model on the training-split inputs prepared above, for 2 epochs.
cb.train(
    texts=texts,
    confounds=confounds,
    treatments=treatments,
    outcomes=outcomes,
    epochs=2)  # train the model

## ATE estimation

In [None]:
def ate_estimation(cb, sermons):
  """Estimate the ATE on `sermons` and compare it with the synthetic ground truth.

  Args:
    cb: fitted model wrapper exposing an `ATE(texts, confounds, treatments, outcomes)` method.
    sermons: DataFrame with the columns `text_input`, `ratio_z`, `treatment`,
      `outcome`, `outcome_0` and `outcome_1`.

  Returns:
    A tuple `(ate_estimators, errors, gt)`: the value returned by `cb.ATE`, the
    result of `get_errors(ate_estimators, gt)`, and the mean of
    `outcome_1 - outcome_0` over the rows of `sermons`.
  """
  model_inputs = dict(
      texts=sermons['text_input'],
      confounds=sermons[['ratio_z']],
      treatments=sermons['treatment'],
      outcomes=sermons[['outcome']],
  )
  ate_estimators = cb.ATE(**model_inputs)

  # Ground truth: average gap between the two potential outcomes. Both sides are
  # single-column frames, so the mean over axis 0 is a length-1 array.
  potential_outcome_gap = sermons[['outcome_1']].values - sermons[['outcome_0']].values
  gt = potential_outcome_gap.mean(axis=0)

  errors = get_errors(ate_estimators, gt)
  return ate_estimators, errors, gt

In [None]:
# ATE estimates, their errors and the ground truth on the training split.
ate_estimators, errors, gt = ate_estimation(cb, sermons_train)

# Trailing expression: display all three results as the cell output.
ate_estimators, errors, gt

In [None]:
# Log the training-split results inside Comet's train context.
with experiment.train():
  experiment.log_metrics(ate_estimators)
  experiment.log_metrics(errors)
  experiment.log_metric('ground_truth', gt)

In [None]:
# test estimators
# Same evaluation on the held-out split; this overwrites the training-split
# values held in ate_estimators / errors / gt.
ate_estimators, errors, gt = ate_estimation(cb, sermons_test)

# Trailing expression: display all three results as the cell output.
ate_estimators, errors, gt

Log the test-set ground truth, ATE estimates and errors.

In [None]:
# Log the held-out results inside Comet's test context.
with experiment.test():
  experiment.log_metric('ground_truth', gt)
  experiment.log_metrics(ate_estimators)
  experiment.log_metrics(errors)

## CATE estimation on test

In [None]:
# Rebind the model inputs to the held-out split (these names previously held the training split).
texts = sermons_test['text_input']
confounds = sermons_test[['ratio_z']]
treatments = sermons_test['treatment']
outcomes = sermons_test[['outcome']]

# Per-row model outputs for the test split.
# NOTE(review): presumably g = propensity scores, Q0/Q1 = predicted outcomes under
# control/treatment, T/Y = observed treatments/outcomes; confirm against
# CausalModelWrapper.inference.
g, Q0, Q1, T, Y = cb.inference(
            texts=texts, confounds=confounds, treatments=treatments, outcomes=outcomes)

In [None]:
# Using DR estimator
from causal_sermons.ate import tau_DR_i

In [None]:
# Per-row treatment-effect estimates from the DR estimator.
ite_DR = tau_DR_i(Q0, Q1, g, T, Y)
# NOTE(review): assumes ite_DR is ordered like the rows of sermons_test (or shares its index); confirm.
sermons_test['ite_DR'] = ite_DR
# Ground-truth individual effect: difference between the two synthetic potential outcomes.
sermons_test['ite_gt'] = sermons_test['outcome_1'] - sermons_test['outcome_0']

In [None]:
# Preview the test split with the new `ite_DR` / `ite_gt` columns.
sermons_test.head()

In [None]:
# Bin the test rows into 10 quantile-based buckets (deciles) of `ratio`;
# the result is an interval-valued categorical column.
sermons_test['ratio_quantile'] = pd.qcut(sermons_test['ratio'], q=10)

In [None]:
# Conditional ATE per ratio decile: mean DR-estimated effect vs. mean ground-truth effect.
# `observed` is passed explicitly because `ratio_quantile` is categorical: the
# implicit default emits a FutureWarning and changes between pandas versions.
cate_estimates = (
    sermons_test
    .groupby('ratio_quantile', observed=True)
    .agg(tau_DR=('ite_DR', 'mean'), tau_gt=('ite_gt', 'mean'))
    .reset_index()
)
# Signed estimation error within each decile.
cate_estimates['error'] = cate_estimates['tau_DR'] - cate_estimates['tau_gt']
cate_estimates

In [None]:
# The first parameter of log_table is the file name; the DataFrame itself goes in
# `tabular_data`. `headers=True` keeps the column names in the logged table.
experiment.log_table('cate_estimates.csv', tabular_data=cate_estimates, headers=True)

End the experiment and release the Colab VM.

In [None]:
# Close the Comet experiment now that all logging for this run is done.
experiment.end()

In [None]:
from google.colab import runtime
# Release the Colab runtime; kernel state is lost after this call, so it must stay the last cell.
runtime.unassign()