# Exercise 4.1: Search engine study: Estimate the causal effect of ice-cream consumption on swimming

The following code reproduces Table 4.1. Your task is to estimate the causal effect. 

In [1]:
import math
import numpy as np
import pandas as pd

In [2]:
# Parameters of the data-generating process, keyed by temperature level.
temp_vals = ["LOW", "MEDIUM", "HIGH"]
icecream_vals = ["LOW", "HIGH"]
# Fraction of samples with HIGH ice-cream queries at each temperature level.
fraction_high_icecream_queries = dict(zip(temp_vals, [0.0001, 0.5, 0.9]))
# Mean of the swimming-query distribution at each temperature level.
mean_swimming_queries = dict(zip(temp_vals, [505, 2150, 4750]))

In [3]:
# Function to generate the data
def generate_dataset(num_samples=30000,
                     variance_swimming_queries=1000,
                     seed=None):
    """Generate a shuffled synthetic dataset of temperature, ice-cream and swimming queries.

    The samples are split evenly across the levels in ``temp_vals``. Within each
    temperature level, a fixed fraction (``fraction_high_icecream_queries``) of
    the samples is labelled ``"HIGH"`` for ice-cream queries and the rest
    ``"LOW"``; swimming queries are drawn from a normal distribution whose mean
    depends only on the temperature level (``mean_swimming_queries``).

    Parameters
    ----------
    num_samples : int
        Requested number of rows. It is rounded down to a multiple of
        ``len(temp_vals)`` so that every temperature level gets the same
        number of rows.
    variance_swimming_queries : float
        Variance of the swimming-query distribution within each temperature level.
    seed : int or None
        If given, a dedicated ``np.random.RandomState(seed)`` is used so the
        result is reproducible. If ``None`` (default), the global ``np.random``
        state is used, exactly as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``temp_timeseries`` and ``icecream_timeseries`` (ordered
        categoricals) and ``swimming_timeseries`` (float), in random row order.
    """
    num_levels = len(temp_vals)
    samples_per_temp = int(num_samples) // num_levels
    # The arrays below hold exactly this many entries; the shuffle must use the
    # same count or it indexes past the end when num_samples is not divisible
    # by the number of temperature levels.
    total_samples = samples_per_temp * num_levels

    # Same call sequence (three normal draws, then one permutation) on either
    # generator, so seeding the global state beforehand still works.
    rng = np.random if seed is None else np.random.RandomState(seed)

    temp_arr = np.repeat(temp_vals, samples_per_temp)

    icecream_queries = []
    for temp in temp_vals:
        num_high = int(fraction_high_icecream_queries[temp] * samples_per_temp)
        num_low = samples_per_temp - num_high
        icecream_queries.extend(["HIGH"] * num_high + ["LOW"] * num_low)
    icecream_queries = np.array(icecream_queries, dtype=object)

    std_swimming_queries = math.sqrt(variance_swimming_queries)
    swimming_queries = np.concatenate([
        rng.normal(mean_swimming_queries[temp], std_swimming_queries, samples_per_temp)
        for temp in temp_vals
    ])

    # Specifying the anomalous value: index 0 is the first LOW-temperature row,
    # which with the default num_samples is also the single HIGH ice-cream row
    # of that stratum. Skipped when there are no rows at all.
    if total_samples > 0:
        swimming_queries[0] = 560

    timeseries_indices = rng.permutation(total_samples)

    # Build the categorical columns directly. Assigning them afterwards with
    # `df.loc[:, col] = ...` writes in place on pandas >= 2.0 and leaves the
    # column as plain `object`, losing the LOW < MEDIUM < HIGH ordering.
    return pd.DataFrame({
        'temp_timeseries': pd.Categorical(
            temp_arr[timeseries_indices], categories=temp_vals, ordered=True),
        'icecream_timeseries': pd.Categorical(
            icecream_queries[timeseries_indices], categories=icecream_vals, ordered=True),
        'swimming_timeseries': swimming_queries[timeseries_indices],
    })

In [4]:
# Summarizing the data and reproducing Table 4.1 from the book.
# Fix the global RNG so the table is identical on every Restart & Run All.
np.random.seed(0)
data = generate_dataset()
# String aliases avoid the pandas deprecation of passing np.size / np.mean to
# .agg and give integer frequencies. observed=False keeps every
# (temperature, ice-cream) combination in the table, even if it has no rows.
data.groupby(["temp_timeseries", "icecream_timeseries"], observed=False).agg(
    frequency=("swimming_timeseries", "size"),
    mean_swimming_queries=("swimming_timeseries", "mean"))

Unnamed: 0_level_0,Unnamed: 1_level_0,frequency,mean_swimming_queries
temp_timeseries,icecream_timeseries,Unnamed: 2_level_1,Unnamed: 3_level_1
LOW,LOW,9999.0,505.447944
LOW,HIGH,1.0,560.0
MEDIUM,LOW,5000.0,2150.17876
MEDIUM,HIGH,5000.0,2149.736517
HIGH,LOW,1000.0,4750.14645
HIGH,HIGH,9000.0,4750.213717


In [5]:
# TODO: Using the simple stratification estimator, calculate the average causal effect of
# icecream consumption on the number of swimming queries.

In [6]:
# TODO: Can you think of a better method to estimate the causal effect using this data?