In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell executes and code edits are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Colab-only: mount Google Drive at /content/drive so the project files stored
# there are reachable from this runtime.
# NOTE(review): `userdata` is imported but not used in the cells visible here.
from google.colab import drive, userdata
drive.mount('/content/drive')

Change the working directory to the project directory on the mounted Drive.

In [None]:
import os

# Project location on the mounted Drive. Later cells use paths relative to this
# directory (requirements.txt, ./data/...), so the chdir must run first.
project_path = "/content/drive/MyDrive/ds/causal-sermons"
os.chdir(project_path)

Add the project's `src` directory to the Python import path (`sys.path`).

In [None]:
import sys
import os
from pathlib import Path

# The working directory is the project root at this point, so <cwd>/src is the
# package directory that has to be importable.
current_dir = Path(os.getcwd())

# Register <project>/src on the import path only once: an unconditional append
# would add a duplicate entry every time this cell is re-run.
src_dir = str(current_dir / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
!pip install -r requirements.txt

## Reading synthetic data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import zscore
from sklearn.model_selection import train_test_split

In [None]:
# Load the sermons CSV (path is relative to the project directory) and display
# its (rows, columns) shape as the cell output.
sermons = pd.read_csv('./data/sermons/dataset_summarized_small.csv')
sermons.shape

In [None]:
# Preview the first five rows of the loaded frame.
sermons.head(n=5)

In [None]:
# Show the rows whose `text` value is missing.
sermons[sermons['text'].isna()]

In [None]:
# Store the treatment indicator as float.
sermons['treatment'] = sermons['treatment'].astype(float)

# Observed outcome: take `outcome_0` where treatment equals 0, and `outcome_1`
# for every other row.
is_control = sermons['treatment'].eq(0)
sermons['outcome'] = np.where(is_control, sermons['outcome_0'], sermons['outcome_1'])

In [None]:
# Summary statistics of the frame, including the `treatment` and `outcome`
# columns set in the previous cell.
sermons.describe()

# Training the model with some data

Process church data

In [None]:
# import pandas as pd
# import numpy as np
# import seaborn as sns

In [None]:
# reading sermon data
#sermons = pd.read_pickle('./data/sermons/raw/sermoncentral_latest.pkl')

In [None]:
# sermons.head()

## Preprocessing

In [None]:
# Optional down-sampling for quick experiments:
# sermons = sermons.sample(n=10000, random_state=1)

# Keep only rows that have a text, then only those longer than 100 characters.
sermons = (
    sermons
    .loc[lambda df: df['text'].notna()]
    .loc[lambda df: df['text'].str.len().gt(100)]
)

sermons.shape

In [None]:
#sermons = sermons.loc[lambda x: x.num_sermons>5].loc[lambda x: x.portion_voted.notnull()]

Limit and clean text (the next cell does not clean anything; it only adds constant placeholder confounders)

In [None]:
# dummy confounders
sermons['C_1'] = 0.2
sermons['C_2'] = 0.9
sermons['C_3'] = 0.9
sermons['C_4'] = 0.9

## Training Synthetic estimation

In [None]:
# sermons['Y_0'] = (sermons['trump_minus_clinton'] > 0).astype(int)
# sermons['Y_1'] = sermons['trump_minus_clinton']
# sermons['Y_2'] = sermons['portion_voted']

In [None]:
# Split the row labels 80/20 with a fixed seed, then select both subsets by label.
train_indices, test_indices = train_test_split(
    sermons.index,
    test_size=0.2,
    random_state=42,
)
sermons_train, sermons_test = sermons.loc[train_indices], sermons.loc[test_indices]

In [None]:
# import torch

# torch.cuda.empty_cache()

# import gc
# gc.collect()

In [None]:
from causal_sermons.causal_bert import CausalModelWrapper, CausalBert

In [None]:
# Training inputs taken from the training split.
# NOTE(review): `cleaned_text` is not created in the cells visible here;
# presumably it is a column of the loaded CSV. Confirm.
texts, treatments = sermons_train['cleaned_text'], sermons_train['treatment']
# Confounders and outcomes are kept as DataFrames because the model cell reads
# their column counts through `.shape[1]`.
confounds = sermons_train.loc[:, ['C_1', 'C_2', 'C_3', 'C_4']]
outcomes = sermons_train.loc[:, ['outcome']]

In [None]:
# Build the causal model and wrap it for training and inference.
#
# The model class has to be a name that is actually imported: this notebook
# imports only `CausalModelWrapper` and `CausalBert` from
# causal_sermons.causal_bert, so the earlier reference to `CausalDistilBert`
# raised NameError on a fresh kernel.
# NOTE(review): assumes `CausalBert` loads the distilbert checkpoint. Confirm
# against causal_sermons.causal_bert.
model = CausalBert.from_pretrained(
    "distilbert-base-uncased",
    num_outcomes=outcomes.shape[1],  # TODO change this so that it is truly 1
    num_confounders=confounds.shape[1],
    # num_labels=2,  # was a bert parameter used in the dragon, not needed anymore
    output_attentions=False,
    output_hidden_states=False,
)

cb = CausalModelWrapper(
    model=model,
    g_weight=0.2, Q_weight=0.2, mlm_weight=0.5,
    batch_size=32, max_length=256,
)

In [None]:
# Fit the wrapped model on the training inputs for four epochs.
cb.train(texts=texts, confounds=confounds, treatments=treatments, outcomes=outcomes, epochs=4)

In [None]:
# Estimate the ATE on the held-out split.
# The test inputs get their own names so the training inputs (`texts`,
# `confounds`, `outcomes`) are not overwritten: with shared names, re-running the
# training cell after this one would silently train on test rows paired with the
# training `treatments`.
texts_test = sermons_test['cleaned_text']
confounds_test = sermons_test[['C_1', 'C_2', 'C_3', 'C_4']]
outcomes_test = sermons_test[['outcome']]

cb.ATE(
    texts=texts_test,
    confounds=confounds_test,
    outcomes=outcomes_test)

The ground-truth ATE on the test split, computed as the mean of `outcome_1 - outcome_0`

In [None]:
# Mean per-row difference between the two outcome columns on the test split.
sermons_test['outcome_1'].sub(sermons_test['outcome_0']).mean()