<a href="https://colab.research.google.com/github/YounSooKimTech/Coding_Test_202312/blob/main/Data_Simulation_JoyKim.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Written by Joy Kim


# Reference Papers:
- Saelee, R., Zell, E., Murthy, B. P., Castro-Roman, P., Fast, H., Meng, L., ... & Murthy, N. (2022). Disparities in COVID-19 vaccination coverage between urban and rural counties—United States, December 14, 2020–January 31, 2022. Morbidity and Mortality Weekly Report, 71(9), 335.

- Lomeli, A., Escoto, A. A., Reyes, B., Burola, M. L. M., Tinoco-Calvillo, S., Villegas, I., ... & Seifert, M. (2023). Factors associated with COVID-19 vaccine uptake in a US/Mexico border community: demographics, previous influenza vaccination, and trusted sources of health information. Frontiers in Public Health, 11.

## Baseline Survey Data

In [None]:
# Create baseline data through a random process.
# The ratios (p) of demographic variables are determined from the American Community Survey or the US Census Dataset.

import pandas as pd
import numpy as np

np.random.seed(123)  # fixed seed so the simulated baseline is reproducible

# Number of simulated baseline respondents; every column has this length.
n_baseline = 5000

# Each column is an independent draw from the seeded global NumPy RNG.
# The draws are made in the same order as the columns appear in the table;
# reordering them would change the simulated values.
respondent_ids = range(1, n_baseline + 1)
age = np.random.randint(18, 65, n_baseline)  # upper bound is exclusive: 18-64
sex = np.random.choice(['Male', 'Female'], n_baseline, p=[0.5, 0.5])
ethnicity = np.random.choice(
    ["White", "Black", "Latino", "Asian", "Others"],
    n_baseline,
    p=[0.6, 0.12, 0.19, 0.06, 0.03],
)
university_degree = np.random.choice(["No", "Yes"], n_baseline, p=[0.7, 0.3])
flu_history = np.random.choice(['No', 'Yes'], n_baseline, p=[0.5, 0.5])
living_area = np.random.choice(['Rural', 'Urban'], n_baseline, p=[0.2, 0.8])
before_perception = np.random.randint(1, 6, n_baseline)  # integers 1-5

baseline = pd.DataFrame({
    'ID': respondent_ids,
    'Age': age,
    'Sex': sex,
    'Ethnicity': ethnicity,
    'University_Degree': university_degree,
    'Flu_Vaccination_History': flu_history,
    'Living_Area': living_area,
    'Before_Perception': before_perception,
})

# Show how many respondents fall on each Before_Perception level (1-5).
perception_counts = baseline['Before_Perception'].value_counts().sort_index()
print(perception_counts)

# Save the data
baseline.to_csv("baseline_survey.csv", index=False)

1    1008
2    1018
3     979
4    1027
5     968
Name: Before_Perception, dtype: int64


## Random Assignment for 5000 participants

In [None]:
# Random assignment:
# Every participant ID (1-5000) receives one of three study arms -- Reason,
# Emotion, or Control -- each drawn independently with probability 1/3.
# Because the draws are independent, the arm sizes are only approximately equal.

arms = ['Reason', 'Emotion', 'Control']
arm_probs = [1/3, 1/3, 1/3]
participant_ids = range(1, 5001)

# One draw per participant from the global NumPy RNG.
assigned_arm = np.random.choice(arms, len(participant_ids), p=arm_probs)

random_assignment = pd.DataFrame({
    'ID': participant_ids,
    'Group': assigned_arm,
})

# Report how many participants landed in each arm.
group_sizes = random_assignment["Group"].value_counts()
print(group_sizes)

# Save the data
random_assignment.to_csv("random_survey.csv", index=False)

Control    1689
Emotion    1675
Reason     1636
Name: Group, dtype: int64


## Endline Survey

In [None]:
# Two dependent variables (DVs) are simulated for the endline survey.
# After_Perception: integer rating from 1 to 5. Per the study design this is
# the likelihood (intention) measure, where 5 is the highest likelihood of
# vaccination and 1 the lowest -- presumably the same scale as the baseline
# Before_Perception item.
# Vaccinated: the behavior of actually getting vaccinated within one year,
# coded as 1 or 0 (presumably 1 = vaccinated).
# The follow-up is assumed to reach at most 4500 of the 5000 participants.

n_followup = 4500
id_pool = range(1, 5001)

# Draw the followed-up IDs without replacement so no participant appears twice,
# then draw both outcomes; the draw order matches the column order of the table.
followup_ids = np.random.choice(id_pool, n_followup, replace=False)
after_perception = np.random.randint(1, 6, n_followup)  # integers 1-5
vaccinated = np.random.choice([1, 0], n_followup)  # uniform over the two codes

endline = pd.DataFrame({
    'ID': followup_ids,
    'After_Perception': after_perception,
    'Vaccinated': vaccinated,
})

# Show how many respondents fall on each After_Perception level (1-5).
after_counts = endline['After_Perception'].value_counts().sort_index()
print(after_counts)

# Save the data
endline.to_csv("endline_survey.csv", index=False)

1    906
2    925
3    927
4    870
5    872
Name: After_Perception, dtype: int64
