In [None]:
# Mount Google Drive at /content/drive so the project folder used in the
# following cells is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Change the working directory to the project directory.

In [None]:
import os

# Project root on the mounted Drive. After this chdir, every relative path in
# the notebook (requirements.txt, ./data/..., src/) resolves against it.
project_path = "/content/drive/MyDrive/ds/causal-sermons"
os.chdir(project_path)

Add the project's `src` directory to the Python path.

In [None]:
import sys
import os
from pathlib import Path

# The current working directory, i.e. the project root selected by the
# os.chdir(project_path) cell (not necessarily where the notebook file lives).
current_dir = Path(os.getcwd())

# Make the packages under <project>/src importable. The membership check keeps
# the cell idempotent: re-running it does not pile duplicate entries onto
# sys.path.
src_dir = str(current_dir/"src")
if src_dir not in sys.path:
  sys.path.append(src_dir)

In [None]:
!pip install -r requirements.txt

# Training the model with some data

Process church data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the sermon results table. The path is relative to the project
# directory the notebook chdir'd into earlier.
sermons = pd.read_csv('./data/sermons/county_results.csv')

In [None]:
# Preview the first five rows
sermons.head(n=5)

In [None]:
# List the available columns
sermons.columns

In [None]:
# Summary statistics of the numeric columns
sermons.describe()

In [None]:
# Number of missing values per column
sermons.isna().sum()

In [None]:
# Row count before any filtering
len(sermons)

## Preprocessing

In [None]:
# Keep only rows with more than 5 sermons and no missing values.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in the following cells write to it directly instead of to a
# possible view of the raw table (which is what triggers pandas'
# SettingWithCopyWarning).
sermons = sermons.query("num_sermons > 5").dropna().copy()

In [None]:
# Second outcome of interest: total turnout divided by population
sermons['portion_voted'] = sermons['total_turnout'].div(sermons['population'])

In [None]:
# Keep at most the first 10000 characters of each concatenated sermon text
sermons['concatenated_sermons'] = sermons['concatenated_sermons'].str[:10000]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Log-transform population and sermon counts before rescaling
sermons['log_population'] = np.log(sermons['population'])
sermons['log_num_sermons'] = np.log(sermons['num_sermons'])

# Min-max scale these columns (MinMaxScaler's default output range is [0, 1])
minmax_cols = ['log_population', 'log_num_sermons', 'rural_urban_idx', 'median_age']
scaler = MinMaxScaler()
sermons[minmax_cols] = scaler.fit_transform(sermons[minmax_cols])

# Divide by 100 -- presumably these two are stored as percentages; confirm
# against the data before relying on the resulting range.
for pct_col in ('perc_with_bachelors', 'unemployment'):
  sermons[pct_col] = sermons[pct_col] / 100

In [None]:
# Constant placeholder columns: 'C_dummy' stands in for the confounders in the
# 'text_only' model version and 'dummy_text' for the text in 'confounds_only'.
sermons = sermons.assign(
  C_dummy=1,
  dummy_text='this is just a dummy text, please ignore',
)

## Training CausalBERT — outcomes (Y): trump_minus_clinton, portion_voted

In [None]:
# Imported in this cell so it works on a fresh kernel (Restart & Run All): the
# notebook's larger import cell only runs after this one, which previously made
# `Experiment` and `userdata` undefined here. Comet's documentation also
# recommends importing comet_ml before the ML frameworks (torch etc.).
from comet_ml import Experiment
from google.colab import userdata

# API key and workspace are read from Colab's user secrets rather than being
# hard-coded in the notebook.
experiment = Experiment(
  api_key=userdata.get('comet_key'),
  project_name="causal-sermons-effect",
  workspace=userdata.get('comet_user')
)

In [None]:
import gc
import torch

from causal_sermons.causal_bert import CausalModelWrapper, CausalDistilBert
from transformers import DistilBertTokenizer
from sklearn.model_selection import StratifiedKFold
from causal_sermons.ate import get_errors, all_ate_estimators, all_ite_estimators, tau_DR_i
from comet_ml import Experiment
from google.colab import userdata

Setting up the variables

In [None]:
# IMPORTANT parameter: selects which inputs the model is given. Must be one of
# 'text_and_confounds', 'text_only' or 'confounds_only'; any other value makes
# the variable-setup cell below raise ValueError.
model_version = 'text_and_confounds'
experiment.log_parameter('model_version', model_version)

In [None]:
outcomes = ['trump_minus_clinton', 'portion_voted']
treatment = 'overall_political_sermons'

# Covariates used whenever real confounders are fed to the model
_county_confounds = [
    'log_num_sermons', 'perc_white',
    'log_population', 'perc_with_bachelors', 'unemployment', 'rural_urban_idx',
    'romney_minus_obama', 'percent_adherents']

# model_version -> (text column, confounder columns)
_inputs_by_version = {
    'text_and_confounds': ('concatenated_sermons', _county_confounds),
    'text_only': ('concatenated_sermons', ['C_dummy']),
    'confounds_only': ('dummy_text', _county_confounds),
}

if model_version not in _inputs_by_version:
  raise ValueError('model_version not recognized')

text, _selected_confounds = _inputs_by_version[model_version]
# Copy so `confounds` is its own list, independent of the lookup table
confounds = list(_selected_confounds)

In [None]:
# Run configuration (all values are logged to the Comet experiment below).
batch_size = 32        # passed to CausalModelWrapper as batch_size
max_tokens_text = 256  # passed to CausalModelWrapper as max_length
num_epochs = 4         # training epochs per fold
n_folds = 5            # number of StratifiedKFold splits
sample_frac = 1        # fraction of rows kept by sermons.sample (1 = all rows)
alpha_c = 0.01         # clipping margin applied to g, Q0 and Q1 after inference

In [None]:
# Record the full run configuration on the Comet experiment
run_config = dict(
    batch_size=batch_size,
    max_tokens_text=max_tokens_text,
    num_epochs=num_epochs,
    n_folds=n_folds,
    sample_frac=sample_frac,
    alpha_c=alpha_c,
    text=text,
    outcomes=outcomes,
    confounds=confounds,
    treatment=treatment,
)
experiment.log_parameters(run_config)

In [None]:
def init_model():
  """Create a new CausalModelWrapper around a pretrained DistilBERT.

  Reads the notebook-level settings `outcomes`, `confounds`, `batch_size` and
  `max_tokens_text`. Model and tokenizer are both loaded from the
  "distilbert-base-uncased" checkpoint.

  Returns:
    The CausalModelWrapper used for training and inference.
  """
  checkpoint = "distilbert-base-uncased"

  causal_model = CausalDistilBert.from_pretrained(
    checkpoint,
    num_outcomes=len(outcomes),
    num_confounders=len(confounds),
    output_attentions=False,
    output_hidden_states=False)

  bert_tokenizer = DistilBertTokenizer.from_pretrained(
      checkpoint, do_lower_case=True)

  # g_weight / Q_weight / mlm_weight are fixed here -- presumably loss-term
  # weights; confirm in CausalModelWrapper. num_workers is set to the CPU count.
  return CausalModelWrapper(
      model=causal_model,
      tokenizer=bert_tokenizer,
      g_weight=0.2, Q_weight=0.2, mlm_weight=0.5,
      batch_size=batch_size, max_length=max_tokens_text, num_workers=os.cpu_count())

In [None]:
# Shuffle the rows (keeping the `sample_frac` share of them) with a fixed seed
sermons = sermons.sample(frac=sample_frac, random_state=0)

# Splitter for the cross-validation loop below; seeded for reproducible folds
kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)

In [None]:
estimates = []
tau_estimates = []

# Cross-fitting: for each fold, train on the training split and compute the
# effect estimates on the held-out split only.
for fold, (train_idx, test_idx) in enumerate(kfold.split(sermons, sermons[treatment])):
  # Clean cache and collect garbage at the start of each fold
  torch.cuda.empty_cache()
  gc.collect()

  train_sermons = sermons.iloc[train_idx]
  test_sermons = sermons.iloc[test_idx]

  # Fresh model per fold so nothing learned on one split carries over
  cb = init_model()

  # training model -- every input must come from the training split.
  # (Previously `outcomes` was taken from the full `sermons` frame, which does
  # not line up row-for-row with the training texts and exposes the held-out
  # outcomes to the model.)
  cb.train(
      texts=train_sermons[text],
      confounds=train_sermons[confounds],
      treatments=train_sermons[treatment],
      outcomes=train_sermons[outcomes],
      epochs=num_epochs)


  # inference on the held-out fold
  g, Q0, Q1, T, Y = cb.inference(
      texts=test_sermons[text],
      confounds=test_sermons[confounds],
      treatments=test_sermons[treatment],
      outcomes=test_sermons[outcomes])

  # Clip g into [alpha_c, 1 - alpha_c] and Q0/Q1 into
  # [-1 + alpha_c, 1 - alpha_c] before they enter the estimators
  g = np.clip(g, alpha_c, 1-alpha_c)
  Q0 = np.clip(Q0, -1 + alpha_c, 1-alpha_c)
  Q1 = np.clip(Q1, -1 + alpha_c, 1-alpha_c)

  ate_estimates = all_ate_estimators(Q0, Q1, g, T, Y)

  experiment.log_metrics(ate_estimates, step=fold)

  # Per-row estimates from tau_DR_i, indexed like the held-out rows so they
  # can be joined back onto `sermons` later
  tau_estimates_df = pd.DataFrame(tau_DR_i(Q0, Q1, g, T, Y), index=sermons.index[test_idx])
  tau_estimates_df.columns = outcomes

  estimates.append(ate_estimates)
  tau_estimates.append(tau_estimates_df)

In [None]:
def proc_estimate(t):
  """Turn one fold's estimates into a tidy DataFrame.

  Args:
    t: a `(fold, estimates)` pair as produced by `enumerate(estimates)`.

  Returns:
    DataFrame with a 'tau' column (the former row labels), one column per
    entry of the notebook-level `outcomes` list, and a 'fold' column.
  """
  fold_id, fold_estimates = t
  table = pd.DataFrame(fold_estimates).transpose()
  table.columns = outcomes
  table['fold'] = fold_id
  return table.rename_axis(index='tau').reset_index()

# One table per fold, stacked into a single frame
estimates_df = pd.concat([proc_estimate(item) for item in enumerate(estimates)])
estimates_df

In [None]:
# Upload the per-fold estimates table to the Comet experiment
experiment.log_table('estimates_df.csv', estimates_df)

In [None]:
# Mean and standard deviation across folds for each 'tau' row label and outcome
agg_spec = {col: ['mean', 'std'] for col in outcomes}
estimates_stats = estimates_df.groupby('tau').agg(agg_spec).round(4)

# Flatten the (outcome, statistic) column pairs into 'outcome_statistic' names
estimates_stats.columns = ['_'.join(pair) for pair in estimates_stats.columns]
estimates_stats = estimates_stats.reset_index()

estimates_stats

In [None]:
# Upload the across-fold summary table to the Comet experiment
experiment.log_table('estimates_stats.csv', estimates_stats)

In [None]:
# Stack the per-fold, per-row estimates and mark their columns with '_tau'
tau_estimates_df = pd.concat(tau_estimates)
tau_estimates_df = tau_estimates_df.add_suffix('_tau')

tau_estimates_df.head()

In [None]:
# Attach the per-row estimates to the data (index-aligned left join)
sermons_tau = sermons.join(tau_estimates_df, how='left')
sermons_tau.head()

In [None]:
# Persist the data with the estimates attached; the file name carries the
# model version so runs of different versions do not overwrite each other.
sermons_tau.to_csv(f'./data/sermons/sermons_tau_{model_version}.csv')

In [None]:
# Close the Comet experiment now that everything has been logged
experiment.end()

In [None]:
# from google.colab import runtime
# runtime.unassign()