# Autism Data
Source: https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9128411/table/T2/?report=objectonly

In [42]:
# Import standard-library helpers
import sys
from io import StringIO
from pathlib import Path

# Import Splinter, BeautifulSoup and pandas
import pandas as pd
from bs4 import BeautifulSoup
from splinter import Browser

# Import local modules
sys.path.append("../Local_Modules/")
import codebook


In [25]:
# Start a Splinter-driven Chrome session and load the table-only view of the article
url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9128411/table/T2/?report=objectonly'
browser = Browser('chrome')
browser.visit(url)

In [26]:
# Snapshot the page source held by the browser and parse it with the 'html.parser' backend
html = browser.html
soup = BeautifulSoup(html, features='html.parser')

In [27]:
# End the automated browsing session.
# The page source was already read from `browser.html` before this point,
# so the parsed `soup` does not need the browser to stay open.
browser.quit()

In [28]:
# Collect every <table> element found in the parsed page
tables = soup.find_all('table')

# Fail here with a clear message instead of an opaque IndexError further down
if not tables:
    raise ValueError('No <table> elements were found in the scraped page')

# Convert each HTML table into a DataFrame.
# The markup is wrapped in StringIO because passing a literal HTML string to
# pd.read_html is deprecated in recent pandas releases; read_html returns a
# list of frames, and each single-table snippet yields exactly one.
dataframes = [pd.read_html(StringIO(str(html_table)))[0] for html_table in tables]

## Table
State estimated autism spectrum disorder prevalence among adults ages 18–84 years, cases, and associated 95% simulation interval (SI)

In [29]:
# Preview the first DataFrame parsed from the page's <table> elements
dataframes[0]

Unnamed: 0,State,Cases,95% SI,Prevalence,95% SI.1
0,Alabama,78072,"61,527, 96,435",2.12,"1.67, 2.61"
1,Alaska,12000,"9559, 14,849",2.19,"1.74, 2.71"
2,Arizona,119924,"95,618, 147,485",2.29,"1.82, 2.81"
3,Arkansas,45569,"35,644, 56,735",2.03,"1.59, 2.53"
4,California,701669,"563,358, 863,471",2.36,"1.89, 2.90"
5,Colorado,96917,"78,736, 117,790",2.28,"1.85, 2.77"
6,Connecticut,65337,"51,985, 81,354",2.37,"1.89, 2.96"
7,Delaware,16683,"13,191, 20,742",2.26,"1.79, 2.81"
8,District of Columbia,11700,"9281, 14,425",2.1,"1.67, 2.59"
9,Florida,329131,"259,573, 407,473",2.03,"1.60, 2.51"


In [30]:
# Work on an independent copy so the scraped table in `dataframes` stays untouched
autism_df = dataframes[0].copy()

# Drop row with total amounts so only the per-state rows remain.
# The explicit copy keeps the column assignments below from targeting a slice.
autism_df = autism_df.loc[autism_df['State'] != 'Total'].copy()

# Split the prevalence 95% interval ("low, high") into two columns and convert
# them to floats; without the conversion the bounds would be stored as strings
autism_df[['Low_95%', 'High_95%']] = (
    autism_df['95% SI.1'].str.split(', ', expand=True).astype(float)
)

# Remove the case counts, their interval, and the now-redundant combined interval column
autism_df = autism_df.drop(columns=['Cases', '95% SI', '95% SI.1'])

In [31]:
# Inspect the last rows to check the clean-up (the 'Total' row was filtered out above)
autism_df.tail()

Unnamed: 0,State,Prevalence,Low_95%,High_95%
46,Virginia,2.41,1.94,2.94
47,Washington,2.13,1.7,2.65
48,West Virginia,2.07,1.62,2.58
49,Wisconsin,2.23,1.8,2.73
50,Wyoming,2.26,1.79,2.78


In [40]:
# Replace State names by state 2-letter code
for index, row in autism_df.iterrows():
    autism_df.loc[index,'State'] = codebook.codes_states[autism_df.loc[index,'State']]

In [43]:
# Save DataFrame as CSV file
csv_out = Path('../clean_data/clean_autism.csv')

# Create the output folder when it is missing; to_csv does not create parent directories
csv_out.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the DataFrame's row index out of the file
autism_df.to_csv(csv_out, index=False)