In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Change the working directory to the project's directory.

In [None]:
import os

# Project root on the mounted Drive. The relative paths used later in this
# notebook (./data/..., requirements.txt) are resolved against this directory.
project_path = "/content/drive/MyDrive/ds/causal-sermons"
os.chdir(project_path)

Add the project's `src` directory to the Python path.

In [None]:
import sys
import os
from pathlib import Path

# Current working directory of the kernel (the project root once the
# os.chdir(project_path) cell has run).
current_dir = Path.cwd()

# Make the project's src/ directory importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
src_dir = str(current_dir / "src")
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [None]:
!pip install -r requirements.txt

# Training the model with some data

Process church data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import altair as alt
import json

In [None]:
# Load the raw sermon dataset from the project's data directory.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
sermons = pd.read_pickle('./data/sermons/raw/sermoncentral_latest.pkl')

In [None]:
# Preview the first rows of the loaded frame.
sermons.head()

In [None]:
# (rows, columns) of the loaded frame.
sermons.shape

## Sermon data processing

In [None]:
# Remove the 'concatenated_sermons' column from the working frame.
# errors='ignore' keeps this cell re-runnable: `sermons` is rebound here, so a
# second execution would otherwise raise KeyError on the already-removed column.
sermons = sermons.drop(columns='concatenated_sermons', errors='ignore')

Limit text, for starters

In [None]:
# sermons['Y'] = (sermons['trump_minus_clinton'] > 0).astype(int)

Get a percentage of political sermons

In [None]:
# Inspect the first raw 'sermon_pol_vector' entry (the next cell decodes it with json.loads).
sermons['sermon_pol_vector'].iloc[0]

In [None]:
# Decode each 'sermon_pol_vector' entry with json.loads and store its mean
# in the new 'sermon_pol_p' column.
sermons['sermon_pol_p'] = sermons['sermon_pol_vector'].map(
    lambda encoded: np.mean(json.loads(encoded))
)

## Visualizations

Distribution of target and treatment

In [None]:
# Histogram of 'trump_minus_clinton', binned in steps of 0.1.
(
    alt.Chart(sermons)
    .mark_bar()
    .encode(
        x=alt.X(
            'trump_minus_clinton:Q',
            bin=alt.Bin(step=0.1),
            title='Trump - Clinton proportion',
        ),
        y=alt.Y('count()', title='Number of counties'),
    )
    .configure_axis(labelFontSize=18, titleFontSize=18)
    .configure_title(fontSize=20)
)

In [None]:
# Histogram of 'portion_voted', binned in steps of 0.05.
(
    alt.Chart(sermons)
    .mark_bar()
    .encode(
        x=alt.X(
            'portion_voted:Q',
            bin=alt.Bin(step=0.05),
            title='Voter turnout',
        ),
        y=alt.Y('count()', title='Number of counties'),
    )
    .configure_axis(labelFontSize=18, titleFontSize=18)
    .configure_title(fontSize=20)
)

In [None]:
# Histogram of 'sermon_pol_p' (bins of 0.05), restricted to rows with more
# than 5 sermons. Built here without display; it is shown in a later cell.
sermon_pol_chart = (
    alt.Chart(sermons.loc[lambda frame: frame['num_sermons'] > 5])
    .mark_bar()
    .encode(
        x=alt.X(
            'sermon_pol_p:Q',
            bin=alt.Bin(step=0.05),
            title='% Political Sermons',
        ),
        y=alt.Y('count()', title='Number of counties'),
    )
    .properties(
        title="% Political Sermons per county",
        width=500,
        height=150,
    )
)

In [None]:
# Histogram of 'num_sermons' in bins of 50. Built here without display;
# it is shown in a later cell.
num_sermons_chart = (
    alt.Chart(sermons)
    .mark_bar()
    .encode(
        x=alt.X(
            'num_sermons:Q',
            bin=alt.Bin(step=50),
            title='Number of sermons',
        ),
        y=alt.Y('count()', title='Number of counties'),
    )
    .properties(
        title="Number of sermons per county",
        width=500,
        height=150,
    )
)

In [None]:
# Place the two histograms side by side, then apply shared font sizes for display.
sermons_dists = alt.hconcat(num_sermons_chart, sermon_pol_chart)

sermons_dists.configure_axis(
    labelFontSize=18,
    titleFontSize=18,
).configure_title(fontSize=20)

Number of counties with and without more than 5 sermons

In [None]:
# Count non-null 'county' values on each side of the "more than 5 sermons" threshold.
(
    sermons
    .assign(more_than_5_sermons=lambda frame: frame['num_sermons'] > 5)
    .groupby('more_than_5_sermons')['county']
    .count()
)

Plot the target and intervention spatially

In [None]:
# county2fips = pd.read_csv('./data/sermons/raw/fips2county.tsv', sep='\t')
# county2fips['county'] = county2fips['CountyName'].str.upper() + ', ' + county2fips['StateAbbr']
# county2fips['fips'] = county2fips['CountyFIPS'].astype(str).str.zfill(5)
# county2fips.head()

In [None]:
# sermons = sermons.merge(county2fips[['county', 'fips']], on='county')
# sermons.head()

In [None]:
import geopandas as gpd

# Load the county GeoJSON and expose its GEOID column under the name 'fips',
# which is the key used to join it with `sermons`.
counties = gpd.read_file('./data/sermons/raw/county_2016.geojson').assign(
    fips=lambda shapes: shapes['GEOID']
)

counties.sample(3)


In [None]:
# Attach each county's geometry to the sermon rows via 'fips'. The merge uses
# pandas' default inner join, so rows without a matching 'fips' are dropped.
sermons_gdf = gpd.GeoDataFrame(
    pd.merge(sermons, counties[['fips', 'geometry']], on='fips')
)
sermons_gdf.sample(3)

In [None]:
# Map of 'trump_minus_clinton' coloured with the 'blueorange' scheme,
# drawn in the albersUsa projection.
trump_minus_clinton_chart = (
    alt.Chart(sermons_gdf)
    .mark_geoshape()
    .encode(
        color=alt.Color(
            'trump_minus_clinton:Q',
            scale=alt.Scale(scheme='blueorange'),
            legend=alt.Legend(title='Trump - Clinton'),
        ),
        # tooltip=["fips:N", "trump_minus_clinton:Q"]  (currently disabled)
    )
    .project(type='albersUsa')
    .properties(title="Trump - Clinton proportion")
)

In [None]:
# Map of 'percent_adherents' in the albersUsa projection.
adherents_chart = (
    alt.Chart(sermons_gdf)
    .mark_geoshape()
    .encode(
        color="percent_adherents:Q",
        # tooltip=["fips:N", "trump_minus_clinton:Q"]  (currently disabled)
    )
    .project(type='albersUsa')
    .properties(title="Sermon adherents")
)

In [None]:
# Map of 'sermon_pol_p' in the albersUsa projection.
# NOTE(review): this rebinds `sermon_pol_chart`, which an earlier cell used for
# the histogram; re-running that earlier concatenation cell after this one
# would pick up the map instead. Consider a distinct name.
sermon_pol_chart = (
    alt.Chart(sermons_gdf)
    .mark_geoshape()
    .encode(
        color="sermon_pol_p:Q",
        # tooltip=["fips:N", "trump_minus_clinton:Q"]  (currently disabled)
    )
    .project(type='albersUsa')
    .properties(title="% Political Sermons")
)

In [None]:
# Map of 'portion_voted' in the albersUsa projection.
turnout_chart = (
    alt.Chart(sermons_gdf)
    .mark_geoshape()
    .encode(
        color="portion_voted:Q",
        # tooltip=["fips:N", "trump_minus_clinton:Q"]  (currently disabled)
    )
    .project(type='albersUsa')
    .properties(title="Voter turnout")
)

In [None]:
# Lay the three maps out in one row. Colour scales and legends are resolved
# independently so each map keeps its own range. `adherents_chart` is not
# part of this figure.
concatenated_chart = (
    alt.hconcat(trump_minus_clinton_chart, turnout_chart, sermon_pol_chart)
    .resolve_scale(color='independent')
    .resolve_legend(color='independent')
    .configure_title(fontSize=20)
)

concatenated_chart

In [None]:
# Among rows with more than 5 sermons: mean of the indicator
# (trump_minus_clinton > 0), grouped by 'overall_political_sermons' (aliased T).
(
    sermons
    .query("num_sermons>5")
    .assign(
        T=lambda frame: frame['overall_political_sermons'],
        Y=lambda frame: frame['trump_minus_clinton'] > 0,
    )
    .groupby('T')['Y']
    .mean()
)

In [None]:
# NOTE(review): hand-typed literals — presumably the two group means displayed
# by the previous cell. TODO: confirm, and derive the difference from that
# result so it stays in sync with the data.
0.78 - 0.95

In [None]:
# Among rows with more than 5 sermons: mean 'portion_voted',
# grouped by 'overall_political_sermons' (aliased T).
(
    sermons
    .query("num_sermons>5")
    .assign(
        T=lambda frame: frame['overall_political_sermons'],
        Y=lambda frame: frame['portion_voted'],
    )
    .groupby('T')['Y']
    .mean()
)

In [None]:
# NOTE(review): hand-typed literals — presumably the two group means displayed
# by the previous cell. TODO: confirm, and derive the difference from that
# result so it stays in sync with the data.
0.405 - 0.407