In [None]:
# Mount Google Drive at /content/drive; the project directory used by the
# following cells lives under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Change the working directory to the project directory.

In [None]:
import os

# Project root on the mounted Drive. Later cells use paths relative to the
# working directory (requirements.txt, ./data/..., src/), so they depend on
# this chdir having been executed first.
project_path = "/content/drive/MyDrive/ds/causal-sermons"
os.chdir(project_path)

Add the project's `src` directory to the Python path so that its modules can be imported.

In [None]:
import sys
import os
from pathlib import Path

# Kernel's current working directory (the project root once the earlier
# os.chdir cell has run).
current_dir = Path.cwd()

# Make the project's `src` directory importable. The membership check keeps
# this cell idempotent: re-running it does not append duplicate entries to
# sys.path.
src_dir = str(current_dir / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
!pip install -r requirements.txt

# Training the model with some data

Process church data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns

In [None]:
# Load the raw sermon data from a pickle file.
# NOTE(review): unpickling can execute arbitrary code, so only load this file
# if it comes from a trusted source.
sermons = pd.read_pickle('./data/sermons/raw/sermoncentral_latest.pkl')

In [None]:
# Preview the first rows of the loaded data.
sermons.head()

In [None]:
# Number of rows in the raw data, before the filtering done below.
len(sermons)

## Preprocessing

In [None]:
# Keep only the rows whose num_sermons exceeds 5. The explicit .copy() makes
# the result an independent DataFrame, so the column assignments in later
# cells do not trigger pandas' SettingWithCopyWarning.
sermons = sermons.query("num_sermons > 5").copy()

In [None]:
# Constant column: C is 1 for every row. It is passed as the second argument
# to cb.train and the first argument to cb.ATE further down.
# NOTE(review): presumably a placeholder confounder — confirm.
sermons['C'] = 1

For starters, strip non-alphanumeric characters and keep only the first 1000 words of each text.

In [None]:
import re

def clean_and_limit_text(text, max_words=1000):
    """Strip non-alphanumeric characters and truncate text to a word limit.

    Every character that is not an ASCII letter, an ASCII digit, or
    whitespace is deleted (so "don't" becomes "dont"), runs of whitespace are
    collapsed to single spaces, and only the first ``max_words`` words are
    kept.

    Args:
        text: The raw text. A value that is not a ``str`` (for example
            ``None`` or a float NaN from a missing cell) is treated as
            missing text instead of raising ``TypeError``.
        max_words: Maximum number of words to keep. Defaults to 1000;
            ``None`` keeps all words.

    Returns:
        The cleaned words joined by single spaces, or an empty string when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    # Delete everything except ASCII letters, ASCII digits and whitespace.
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # split() without arguments splits on any run of whitespace and ignores
    # leading/trailing whitespace, so no separate collapsing pass is needed.
    words = cleaned_text.split()[:max_words]

    return ' '.join(words)

In [None]:
# Clean and truncate each concatenated sermon text, element by element.
sermons['cleaned_sermons'] = sermons['concatenated_sermons'].map(clean_and_limit_text)

## Training Causal Bert (Y: trump_minus_clinton > 0)

In [None]:
# Binary outcome: 1 where trump_minus_clinton is strictly positive, else 0.
sermons['Y'] = sermons['trump_minus_clinton'].gt(0).astype(int)

In [None]:
from causal_sermons.causal_bert import CausalBertWrapper

In [None]:
# Build a fresh wrapper, used for both training and inference.
cb = CausalBertWrapper(
    batch_size=32,
    g_weight=0.1,
    Q_weight=0.1,
    mlm_weight=1,
    max_length=256,
)

# Fit for 4 epochs on the cleaned text, the constant C column, the
# overall_political_sermons column and the binary Y column.
# NOTE(review): positional order is presumably (texts, confounds, treatments,
# outcomes) — verify against CausalBertWrapper.train.
cb.train(
    sermons['cleaned_sermons'],
    sermons['C'],
    sermons['overall_political_sermons'],
    sermons['Y'],
    epochs=4,
)

In [None]:
# Call the trained wrapper's ATE method on the C column and the cleaned text,
# with platt_scaling enabled.
# NOTE(review): presumably returns the average treatment effect estimate —
# confirm against CausalBertWrapper.ATE.
cb.ATE(sermons['C'], sermons['cleaned_sermons'], platt_scaling=True)

## Training Causal Bert (Y: turnout)

In [None]:
# Overwrite the Y column with portion_voted for the turnout model.
# NOTE(review): portion_voted is presumably a continuous share rather than a
# 0/1 label — confirm that CausalBertWrapper supports a non-binary outcome.
sermons['Y'] = sermons['portion_voted']

In [None]:
from causal_sermons.causal_bert import CausalBertWrapper

In [None]:
# Build a fresh wrapper for the turnout outcome; this rebinds `cb`.
cb = CausalBertWrapper(
    batch_size=32,
    g_weight=0.1,
    Q_weight=0.1,
    mlm_weight=1,
    max_length=256,
)

# Fit for 4 epochs on the cleaned text, the constant C column, the
# overall_political_sermons column and the current Y column.
# NOTE(review): positional order is presumably (texts, confounds, treatments,
# outcomes) — verify against CausalBertWrapper.train.
cb.train(
    sermons['cleaned_sermons'],
    sermons['C'],
    sermons['overall_political_sermons'],
    sermons['Y'],
    epochs=4,
)

In [None]:
# Call the trained wrapper's ATE method on the C column and the cleaned text,
# with platt_scaling enabled.
# NOTE(review): presumably returns the average treatment effect estimate —
# confirm against CausalBertWrapper.ATE.
cb.ATE(sermons['C'], sermons['cleaned_sermons'], platt_scaling=True)