<a href="https://colab.research.google.com/github/antoniotre86/causal-inference/blob/main/notebooks/Simulate_data_for_Causal_Inference_with_dowhy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

from google.colab import drive

In [3]:
# Mount Google Drive at /drive; the save cells further down write their CSV files under this path.
drive.mount("/drive")

Mounted at /drive


# Scenario 1

We randomly generate *occupation* and *age_years* first as they don't depend on other variables. 

We then generate *exercise* based on *occupation* and *age* assuming that:
- stormtroopers exercise more than radar technicians
- people under 40 exercise more than people aged 40 or older

We generate the *healthy_diet* variable based on *occupation*, assuming radar technicians have a better diet.

Finally, we generate *cholesterol* levels assuming that:
- it's lower with a *healthy diet* and *exercise*
- it's lower for people less than 40 years of *age*.

In [5]:
# Scenario 1: simulate a sample in which exercise lowers cholesterol.
# NOTE: the sample depends on the order of the np.random calls below
# (choice -> normal -> randn -> rand -> randn); keep that order when editing.
np.random.seed(333)

N = 1000

# True effect of exercise on cholesterol in this scenario
actual_effect_s1 = -1

data_s1 = pd.DataFrame(columns=["exercise", "cholesterol", "occupation", "age_years", "healthy_diet"])

# --- Variables that do not depend on anything else -------------------------
data_s1["occupation"] = np.random.choice(["stormtrooper", "radar_technician"], N)
# Age is truncated to a multiple of 10
data_s1["age_years"] = (np.random.normal(40, 10, N) / 10).astype(int) * 10

# Boolean masks reused by the rules below
is_trooper = data_s1["occupation"] == "stormtrooper"
is_technician = data_s1["occupation"] == "radar_technician"
is_under_40 = data_s1["age_years"] < 40

# --- Exercise ---------------------------------------------------------------
# Noisy score: +1 for stormtroopers, -1 for radar technicians, +1 for under-40s
exercise_score = np.random.randn(N) + is_trooper.astype(int) - is_technician.astype(int)
exercise_score = exercise_score + is_under_40.astype(int)
# Binarise: True when the score is above the sample mean
data_s1["exercise"] = exercise_score > exercise_score.mean()

# --- Diet -------------------------------------------------------------------
# Probability of a healthy diet: 0.1 for stormtroopers, 0.9 for radar technicians
healthy_diet_probability = 0.1 * is_trooper + 0.9 * is_technician
data_s1["healthy_diet"] = np.random.rand(N) < healthy_diet_probability

# --- Cholesterol ------------------------------------------------------------
# Random base level, then: -3 with a healthy diet, -1 when under 40,
# plus the effect of exercise
cholesterol_s1 = 5 + np.random.randn(N) ** 2
cholesterol_s1 = cholesterol_s1 - 3 * data_s1["healthy_diet"]
cholesterol_s1 = cholesterol_s1 - is_under_40.astype(int)
cholesterol_s1 = cholesterol_s1 + actual_effect_s1 * data_s1["exercise"]
data_s1["cholesterol"] = cholesterol_s1

## Save

In [7]:
# Write the Scenario 1 sample as CSV under the /drive mount (header row kept, row index dropped).
data_s1.to_csv(
    "/drive/My Drive/Colab Notebooks/data/scenario_1.csv",
    index=False,
    header=True,
)

# Scenario 2: exercise does not affect cholesterol

In [10]:
# Scenario 2: same covariates as Scenario 1 (copied from data_s1); only the
# cholesterol column is regenerated, this time with no effect of exercise.
np.random.seed(333)

N = 1000

# Exercise does not affect cholesterol in this scenario
actual_effect_s2 = 0.0

data_s2 = data_s1.copy()

# Random base level, then: -3 with a healthy diet, -1 when under 40,
# plus the (zero) effect of exercise
cholesterol_s2 = 5 + np.random.randn(N) ** 2
cholesterol_s2 = cholesterol_s2 - 3 * data_s2["healthy_diet"]
cholesterol_s2 = cholesterol_s2 - 1 * (data_s2["age_years"] < 40)
cholesterol_s2 = cholesterol_s2 + actual_effect_s2 * data_s2["exercise"]
data_s2["cholesterol"] = cholesterol_s2

## Save

In [11]:
# Write the Scenario 2 sample as CSV under the /drive mount (header row kept, row index dropped).
data_s2.to_csv(
    "/drive/My Drive/Colab Notebooks/data/scenario_2.csv",
    index=False,
    header=True,
)