### Web Scraping the Data!

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

**Code to scrape California population by County**

In [2]:
# Scrape the California county population table into a list of row dicts
# (`row_dict_list`), one dict per county with Rank / County / Population.
url = 'https://www.california-demographics.com/counties_by_population'

# A timeout keeps the notebook from hanging forever if the site stops responding.
res = requests.get(url, timeout = 30)
print(res.status_code)
# Fail loudly on a 4xx/5xx response instead of trying to parse an error page.
res.raise_for_status()

# beautiful soup! The parser is named explicitly so the parse does not depend
# on which optional parsers happen to be installed (and bs4 does not warn).
soup = BeautifulSoup(res.content, 'html.parser')

# find the first html table on the page
table = soup.find('table')
if table is None:
    raise ValueError('No <table> element found at {}'.format(url))

# header labels exactly as they appear on the page
columns = [th.text for th in table.find_all('th')]

# drop the first row (the header) and the last row
# NOTE(review): the last row is presumably a footer/notes row - confirm against the page
rows = table.find_all('tr')[1:-1]

row_dict_list = []

for row in rows:
    # the first three cells of each row are rank, county name and population
    cells = row.find_all('td')
    rank, county, pop = (cell.text.strip() for cell in cells[:3])

    row_dict_list.append({
        'Rank': int(rank),
        # "Los Angeles County" -> "LOS ANGELES"
        'County': county.replace(' County', '').upper(),
        # "10,105,518" -> 10105518
        'Population': int(pop.replace(',', '')),
    })

200


In [3]:
# turn the scraped row dictionaries into a dataframe (one row per county)
ca_counties = pd.DataFrame(row_dict_list)
print(len(ca_counties), 'rows; ')
# preview the first rows as the cell output
ca_counties.head()

58 rows; 


Unnamed: 0,County,Population,Rank
0,LOS ANGELES,10105518,1
1,SAN DIEGO,3343364,2
2,ORANGE,3185968,3
3,RIVERSIDE,2450758,4
4,SAN BERNARDINO,2171603,5


In [4]:
# write the California table to disk; index = False leaves the row index out of the file
ca_counties.to_csv(path_or_buf = './data/ca_counties_by_pop.csv', index = False)

**Code to scrape West Virginia population by County**

In [5]:
# Scrape the West Virginia county population table into a list of row dicts
# (`wv_row_dict_list`), one dict per county with Rank / County / Population.
url = 'https://www.westvirginia-demographics.com/counties_by_population'

# A timeout keeps the notebook from hanging forever if the site stops responding.
res_wv = requests.get(url, timeout = 30)
print(res_wv.status_code)
# Fail loudly on a 4xx/5xx response instead of trying to parse an error page.
res_wv.raise_for_status()

# beautiful soup! The parser is named explicitly so the parse does not depend
# on which optional parsers happen to be installed (and bs4 does not warn).
soup_wv = BeautifulSoup(res_wv.content, 'html.parser')

# find the first html table on the page
table_wv = soup_wv.find('table')
if table_wv is None:
    raise ValueError('No <table> element found at {}'.format(url))

# header labels exactly as they appear on the page; read from this page's own
# table (`table_wv`) rather than a table left over from another cell
columns = [th.text for th in table_wv.find_all('th')]

# drop the first row (the header) and the last row
# NOTE(review): the last row is presumably a footer/notes row - confirm against the page
rows_wv = table_wv.find_all('tr')[1:-1]

wv_row_dict_list = []

for row in rows_wv:
    # the first three cells of each row are rank, county name and population
    cells = row.find_all('td')
    rank, county, pop = (cell.text.strip() for cell in cells[:3])

    wv_row_dict_list.append({
        'Rank': int(rank),
        # "Kanawha County" -> "KANAWHA"
        'County': county.replace(' County', '').upper(),
        # "180,454" -> 180454
        'Population': int(pop.replace(',', '')),
    })

200


In [6]:
# turn the scraped row dictionaries into a dataframe (one row per county)
wv_counties = pd.DataFrame(wv_row_dict_list)
print(len(wv_counties), 'rows; ')
# preview the first rows as the cell output
wv_counties.head()

55 rows; 


Unnamed: 0,County,Population,Rank
0,KANAWHA,180454,1
1,BERKELEY,117123,2
2,MONONGALIA,106420,3
3,CABELL,93224,4
4,WOOD,84203,5


In [7]:
# write the West Virginia table to disk; index = False leaves the row index out of the file
# NOTE(review): the file name spells "virgina" - kept as-is because other code may read this exact path
wv_counties.to_csv(path_or_buf = './data/west_virgina_pop_by_county.csv', index = False)

#### Opioid Deaths per State per year
**1999 - 2017**

Download a separate CSV for each year of CDC opioid mortality data from the [KFF opioid overdose death rates page](https://www.kff.org/other/state-indicator/opioid-overdose-death-rates/?currentTimeframe=18&sortModel=%7B%22colId%22:%22Location%22,%22sort%22:%22asc%22%7D)

In [8]:
# years of data - 1999 through 2017 (range() stops before 2018)
years = range(1999, 2018)

# lookup from a 0-based position (0, 1, 2, ...) to the year at that position;
# the position matches the number in each downloaded file's name
year_to_url_key = dict(enumerate(years))
print(year_to_url_key)

{0: 1999, 1: 2000, 2: 2001, 3: 2002, 4: 2003, 5: 2004, 6: 2005, 7: 2006, 8: 2007, 9: 2008, 10: 2009, 11: 2010, 12: 2011, 13: 2012, 14: 2013, 15: 2014, 16: 2015, 17: 2016, 18: 2017}


In [9]:
# Load one CSV per year and combine them into a single wide table (`opioids`)
# with one row per location and four columns per year.

# Number of leading data rows kept from every file; anything after them is
# dropped (the files carry notes besides the data).
# NOTE(review): 52 is presumably United States + 50 states + DC - confirm
N_LOCATION_ROWS = 52

# one dataframe per year, in chronological order
dfs = []

# Walk the index -> year mapping itself rather than a hard-coded range(0, 19),
# so the loop cannot drift out of sync with `years`.
for i, year in sorted(year_to_url_key.items()):
    data = './cdc_opioid_mortality/raw_data ({}).csv'.format(i)

    # csv format has notes - the column header sits on the third line and we
    # only want the rows with location data
    df = pd.read_csv(data, header = 2).iloc[:N_LOCATION_ROWS].copy()

    # changing the column names so every year gets its own, unique columns
    df.columns = ['location',
                  '{}_opioid_deaths'.format(year),
                  '{}_drug_deaths'.format(year),
                  '{}_change_opioid'.format(year),
                  '{}_change_drug'.format(year)]

    # adding to the list of dfs
    dfs.append(df)

# Put the yearly frames side by side in one step. Indexing on `location` makes
# pandas line rows up by location name rather than by row position, so a file
# whose rows are ordered differently cannot silently shift values onto the
# wrong location.
opioids = (
    pd.concat([df.set_index('location') for df in dfs], axis = 1)
    .rename_axis('location')
    .reset_index()
)

In [10]:
# preview the first five rows of the combined table
opioids.head(n = 5)

Unnamed: 0,location,1999_opioid_deaths,1999_drug_deaths,1999_change_opioid,1999_change_drug,2000_opioid_deaths,2000_drug_deaths,2000_change_opioid,2000_change_drug,2001_opioid_deaths,...,2015_change_opioid,2015_change_drug,2016_opioid_deaths,2016_drug_deaths,2016_change_opioid,2016_change_drug,2017_opioid_deaths,2017_drug_deaths,2017_change_opioid,2017_change_drug
0,United States,2.9,6.1,NSD,NSD,3.0,6.2,0.03,0.02,3.3,...,0.156,0.109,13.3,19.8,0.28,0.21,14.9,21.7,0.12,0.1
1,Alabama,0.8,3.9,NSD,NSD,1.0,4.5,0.25,0.15,1.3,...,0.089,0.033,7.5,16.2,0.23,0.03,9.0,18.0,0.2,0.11
2,Alaska,4.0,7.5,NSD,NSD,4.0,7.1,0.0,-0.05,NR,...,0.038,-0.048,12.5,16.8,0.14,0.05,13.9,20.2,0.11,0.2
3,Arizona,4.7,10.6,NSD,NSD,4.8,10.6,0.02,0.0,5.2,...,0.159,0.044,11.4,20.3,0.12,0.07,13.5,22.2,0.18,0.09
4,Arkansas,1.1,4.4,NSD,NSD,0.8,5.4,-0.27,0.23,1.1,...,0.143,0.095,5.9,14.0,-0.18,0.01,6.5,15.5,0.1,0.11


In [11]:
# write the combined table to disk; index = False leaves the row index out of the file
opioids.to_csv(path_or_buf = './data/ods_by_state_1999_to_2017.csv', index = False)