## my capstone data

### start by reading in SNAP and Child Nutrition data - national level plus county and state population from Census data


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
from bs4 import BeautifulSoup
import folium
from folium.plugins import MarkerCluster
from folium.plugins import FastMarkerCluster
import lxml.html as lh
import csv

# SNAP data

In [2]:
#read in csv files
#snap benefit national summary with participation (rounded thousands) average benefits per person in dollars and 
#costs rounded in millions as benefits, other costs and total costs (sum)census
snap_national = pd.read_csv('..\data\snap_national_summary_1969_2019.csv')                            
snap_national.head()

Unnamed: 0,year,avg_participation_t,avg_benefit_person,benefit_m,cost_other_m,cost_total_m
0,1969,2878,6.63,228.8,21.7,250.5
1,1970,4340,10.55,549.7,27.2,576.9
2,1971,9368,13.55,1522.7,53.2,1575.9
3,1972,11109,13.48,1797.3,69.4,1866.7
4,1973,12166,14.6,2131.4,76.0,2207.4


In [3]:
snap_national.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 6 columns):
year                   51 non-null int64
avg_participation_t    51 non-null int64
avg_benefit_person     51 non-null float64
benefit_m              51 non-null float64
cost_other_m           51 non-null float64
cost_total_m           51 non-null float64
dtypes: float64(4), int64(2)
memory usage: 2.5 KB


In [4]:
# comment out after use until need again
# snap_national.to_csv('../data/snap_national.csv', index = False)

# Census data -  start with state and get national totals then append 2000 and 2010 data

In [5]:
#read in csv files
#census population by state with fips code 1790-1990
census_state = pd.read_csv('..\data\pop_census_state_1790_1990.csv')
census_state.head()

Unnamed: 0,state,1990,1980,1970,1960,1950,1940,1930,1920,1910,...,1850,1840,1830,1820,1810,1800,1790,first_census,no_change_since,fips
0,Alabama,4040587,3893888,3444165,3266740,3061743,2832961,2646248,2348174,2138093,...,771623,590756,309527,127901,9046,1250,---,1800,1820,1
1,Alaska,550043,401851,300382,226167,128643,72524,59278,55036,64356,...,---,---,---,---,---,---,---,1880,1880,2
2,Arizona,3665228,2718215,1770900,1302161,749587,499261,435573,334162,204354,...,---,---,---,---,---,---,---,1860,1870,4
3,Arkansas,2350725,2286435,1923295,1786272,1909511,1949387,1854482,1752204,1574449,...,209897,97574,30388,14273,1062,---,---,1810,1830,5
4,California,29760021,23667902,19953134,15717204,10586223,6907387,5677251,3426861,2377549,...,92597,---,---,---,---,---,---,1850,1860,6


In [6]:
census_state.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 25 columns):
state              51 non-null object
1990               51 non-null int64
1980               51 non-null int64
1970               51 non-null int64
1960               51 non-null int64
1950               51 non-null int64
1940               51 non-null int64
1930               51 non-null int64
1920               51 non-null int64
1910               51 non-null int64
1900               51 non-null int64
1890               51 non-null object
1880               51 non-null object
1870               51 non-null object
1860               51 non-null object
1850               51 non-null object
1840               51 non-null object
1830               51 non-null object
1820               51 non-null object
1810               51 non-null object
1800               51 non-null object
1790               51 non-null object
first_census       51 non-null int64
no_change_since    51 non-null int

In [7]:
# need to clean census by state 
# melt into tidy data with state, year, population and fips code columns
# clean object dtypes for 1790-1890 optional as I won't be using those years

In [8]:
census_state_pop = pd.melt(frame=census_state, id_vars=['state','first_census','no_change_since','fips'],
value_vars =['1990','1980','1970','1960','1950','1940','1930','1920','1910','1900','1890','1880','1870','1860','1850',
             '1840','1830','1820','1810','1800','1790'],
var_name='year', value_name='population')
census_state_pop.population=pd.to_numeric(census_state_pop.population, errors='coerce').fillna(0).astype(np.int64)
census_state_pop.year=pd.to_numeric(census_state_pop.year, errors='coerce').fillna(0).astype(np.int64)
census_state_pop.head() 

Unnamed: 0,state,first_census,no_change_since,fips,year,population
0,Alabama,1800,1820,1,1990,4040587
1,Alaska,1880,1880,2,1990,550043
2,Arizona,1860,1870,4,1990,3665228
3,Arkansas,1810,1830,5,1990,2350725
4,California,1850,1860,6,1990,29760021


In [9]:
census_state_pop=census_state_pop[['state','year','population','fips','first_census','no_change_since']]
census_state_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1071 entries, 0 to 1070
Data columns (total 6 columns):
state              1071 non-null object
year               1071 non-null int64
population         1071 non-null int64
fips               1071 non-null int64
first_census       1071 non-null int64
no_change_since    1071 non-null int64
dtypes: int64(5), object(1)
memory usage: 50.3+ KB


In [10]:
# comment out after use until need again
census_state_pop.to_csv('../data/census_state_pop.csv', index = False)

## get a national US Population table

In [11]:
# also aggregate census_state-pop to total by year national census census_pop
census_pop=census_state_pop.groupby(['year'])['population'].sum().reset_index(name='us_population')
census_pop.tail()

Unnamed: 0,year,us_population
16,1950,151325798
17,1960,179323175
18,1970,203211926
19,1980,226545805
20,1990,248709873


In [12]:
# add in two more census years 2000 and 2010 and an estimate for 2020
census_pop=census_pop.append({'year':2000, 'us_population':281401351}, ignore_index=True)
census_pop=census_pop.append({'year':2010, 'us_population':308745538}, ignore_index=True)
census_pop=census_pop.append({'year':2020, 'us_population':331002651}, ignore_index=True)
census_pop.tail()

Unnamed: 0,year,us_population
19,1980,226545805
20,1990,248709873
21,2000,281401351
22,2010,308745538
23,2020,331002651


In [13]:
census_pop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 2 columns):
year             24 non-null int64
us_population    24 non-null int64
dtypes: int64(2)
memory usage: 464.0 bytes


In [14]:
# comment out after use until need again
# census_pop.to_csv('../data/census_pop.csv', index = False)

## need to melt 2000 and 2010 population into a year column and total by state but first get clean county version I may not use since the historical version will take a lot of cleaning

In [15]:
# read in .csv file
# population-change-counties_2010 to get census by state for 2000 and 2010
census_county_2010=pd.read_csv('..\data\population-change-counties_2010.csv')
census_county_2010.head()

Unnamed: 0,CTYNAME,STNAME,CENSUSPOP2000,CENSUSPOP2010,NPOPCHG_2010,PPOPCHG_2010
0,Autauga County,Alabama,43671,54571,10900,24.95935518
1,Baldwin County,Alabama,140415,182265,41850,29.80450807
2,Barbour County,Alabama,29038,27457,-1581,-5.444589848
3,Bibb County,Alabama,20826,22915,2089,10.03073082
4,Blount County,Alabama,51024,57322,6298,12.34321104


In [16]:
census_county_2010=pd.melt(frame=census_county_2010, id_vars=['CTYNAME','STNAME'],
                           value_vars=['CENSUSPOP2000','CENSUSPOP2010'],
                           var_name='year', value_name='population')
census_county_2010.head() 

Unnamed: 0,CTYNAME,STNAME,year,population
0,Autauga County,Alabama,CENSUSPOP2000,43671
1,Baldwin County,Alabama,CENSUSPOP2000,140415
2,Barbour County,Alabama,CENSUSPOP2000,29038
3,Bibb County,Alabama,CENSUSPOP2000,20826
4,Blount County,Alabama,CENSUSPOP2000,51024


In [17]:
census_county_2010['year'].replace({'CENSUSPOP2000':'2000', 'CENSUSPOP2010':'2010'}, inplace=True)
census_county_2010.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6442 entries, 0 to 6441
Data columns (total 4 columns):
CTYNAME       6442 non-null object
STNAME        6442 non-null object
year          6442 non-null object
population    6442 non-null object
dtypes: object(4)
memory usage: 201.4+ KB


In [18]:
# census_county_2010['population'] = census_county_2010['population'].str.replace(',', '')
# census_county_2010['population'] = census_county_2010['population'].astype(int)
census_county_2010['year'] = census_county_2010['year'].astype(int)
census_county_2010.rename(columns = {'CTYNAME':'county','STNAME':'state'}, inplace = True) 
census_county_2010.head()

Unnamed: 0,county,state,year,population
0,Autauga County,Alabama,2000,43671
1,Baldwin County,Alabama,2000,140415
2,Barbour County,Alabama,2000,29038
3,Bibb County,Alabama,2000,20826
4,Blount County,Alabama,2000,51024


In [19]:
census_county_2010['population'] = census_county_2010['population'].str.replace(',','')
census_county_2010['population'] = census_county_2010['population'].str.replace('X','0')
#census_county_2010['population'] = census_county_2010['population'].astype(int)
#census_county_2010['population'] = census_county_2010['population'].str.strip(',').str.join('').astype(float)
census_county_2010.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6442 entries, 0 to 6441
Data columns (total 4 columns):
county        6442 non-null object
state         6442 non-null object
year          6442 non-null int32
population    6442 non-null object
dtypes: int32(1), object(3)
memory usage: 176.2+ KB


In [20]:
census_county_2010['population'] = census_county_2010['population'].replace('', 0)
census_county_2010.sort_values(by ='population')
census_county_2010['population'] = census_county_2010['population'].astype(float)
#census_county_2010['population'] = census_county_2010['population'].astype(int)
census_county_2010.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6442 entries, 0 to 6441
Data columns (total 4 columns):
county        6442 non-null object
state         6442 non-null object
year          6442 non-null int32
population    6442 non-null float64
dtypes: float64(1), int32(1), object(2)
memory usage: 176.2+ KB


In [21]:
census_county_2010.head()

Unnamed: 0,county,state,year,population
0,Autauga County,Alabama,2000,43671.0
1,Baldwin County,Alabama,2000,140415.0
2,Barbour County,Alabama,2000,29038.0
3,Bibb County,Alabama,2000,20826.0
4,Blount County,Alabama,2000,51024.0


## group by state for 2000 and 2010 US population

In [22]:
# need population to be a number
census_state_2010=census_county_2010.groupby(['state','year'])['population'].sum().reset_index(
    name='population')
census_state_2010.head()

Unnamed: 0,state,year,population
0,Alabama,2000,4447100.0
1,Alabama,2010,4779736.0
2,Alaska,2000,610666.0
3,Alaska,2010,710231.0
4,Arizona,2000,5130632.0


In [23]:
census_state_2010.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 3 columns):
state         104 non-null object
year          104 non-null int64
population    104 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.5+ KB


In [24]:
# Drop Puerto Rico rows and drop un-needed columns
indexNames = census_state_2010[ census_state_2010['state'] == 'Puerto Rico' ].index
 
# Delete these row indexes from dataFrame
census_state_2010.drop(indexNames , inplace=True)

## Combine previous census_state_pop with census_state_2010 to append the two additional census results for 2000 and 2010 by state and reduce to columns state year and population

In [25]:
census_state_pop['year'] = census_state_pop['year'].astype(int)
census_state_pop_all= pd.merge(census_state_pop, census_state_2010, on=['state','year','population'], how='outer')
census_state_pop_all.head()

Unnamed: 0,state,year,population,fips,first_census,no_change_since
0,Alabama,1990,4040587,1.0,1800.0,1820.0
1,Alaska,1990,550043,2.0,1880.0,1880.0
2,Arizona,1990,3665228,4.0,1860.0,1870.0
3,Arkansas,1990,2350725,5.0,1810.0,1830.0
4,California,1990,29760021,6.0,1850.0,1860.0


In [26]:
census_state_pop_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1173 entries, 0 to 1172
Data columns (total 6 columns):
state              1173 non-null object
year               1173 non-null int64
population         1173 non-null int64
fips               1071 non-null float64
first_census       1071 non-null float64
no_change_since    1071 non-null float64
dtypes: float64(3), int64(2), object(1)
memory usage: 64.1+ KB


In [27]:
pd.merge(census_state_pop, census_state_2010, how='left', left_on=['state','year','population'],
         right_on=['state','year','population'])
census_state_pop_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1173 entries, 0 to 1172
Data columns (total 6 columns):
state              1173 non-null object
year               1173 non-null int64
population         1173 non-null int64
fips               1071 non-null float64
first_census       1071 non-null float64
no_change_since    1071 non-null float64
dtypes: float64(3), int64(2), object(1)
memory usage: 64.1+ KB


## Drop columns not needed in joined df


In [28]:
census_state_pop_all= census_state_pop_all.drop(['fips', 'first_census', 'no_change_since'], axis=1)
census_state_pop_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1173 entries, 0 to 1172
Data columns (total 3 columns):
state         1173 non-null object
year          1173 non-null int64
population    1173 non-null int64
dtypes: int64(2), object(1)
memory usage: 36.7+ KB


In [30]:
# comment out after use until need again
#census_state_pop_all.to_csv('../data/census_state_pop_all.csv', index = False)

# Civilian Employment data 

In [31]:
#read in csv files
#civilian unemployment 2000-2020 by month by age and race
civilian_unemployment = pd.read_csv(r'C:\Users\annru\Documents\Capstone\data\civ_unemployment_2000_2020.csv')
civilian_unemployment.head()

Unnamed: 0,Month,Grouped,Total,Men_over_20,Women_over_20,teens_16to19,White,Black,Asian,Hispanic,trim,len
0,Apr 2000,3.83.13.512.63.47.0 5.5,3.8,3.1,3.5,12.6,3.4,7.0,5.5,,3.83.13.512.63.47.0 5.5,23
1,May 2000,4.03.33.712.83.57.7 5.8,4.0,3.3,3.7,12.8,3.5,7.7,5.8,,4.03.33.712.83.57.7 5.8,23
2,June 2000,4.03.23.712.33.47.8 5.6,4.0,3.2,3.7,12.3,3.4,7.8,5.6,,4.03.23.712.33.47.8 5.6,23
3,July 2000,4.03.33.713.43.57.7 5.8,4.0,3.3,3.7,13.4,3.5,7.7,5.8,,4.03.33.713.43.57.7 5.8,23
4,Aug 2000,4.13.33.814.03.67.9 5.9,4.1,3.3,3.8,14.0,3.6,7.9,5.9,,4.13.33.814.03.67.9 5.9,23


In [32]:
civilian_unemployment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 12 columns):
    Month        241 non-null object
Grouped          241 non-null object
Total            241 non-null float64
Men_over_20      241 non-null float64
Women_over_20    241 non-null float64
teens_16to19     241 non-null float64
White            241 non-null float64
Black            241 non-null float64
Asian            241 non-null float64
Hispanic         208 non-null float64
trim             241 non-null object
len              241 non-null int64
dtypes: float64(8), int64(1), object(3)
memory usage: 22.7+ KB


In [33]:
civilian_unemployment.tail()

Unnamed: 0,Month,Grouped,Total,Men_over_20,Women_over_20,teens_16to19,White,Black,Asian,Hispanic,trim,len
236,Feb 2010,9.810.38.025.68.916.18.212.7,9.8,10.3,8.0,25.6,8.9,16.1,8.2,12.7,9.810.38.025.68.916.18.212.7,28
237,Mar 2010,9.910.28.126.28.916.87.612.9,9.9,10.2,8.1,26.2,8.9,16.8,7.6,12.9,9.910.28.126.28.916.87.612.9,28
238,Apr 2010,9.910.28.325.49.016.67.012.5,9.9,10.2,8.3,25.4,9.0,16.6,7.0,12.5,9.910.28.325.49.016.67.012.5,28
239,Oct 2009,10.010.48.027.29.215.87.612.8,10.0,10.4,8.0,27.2,9.2,15.8,7.6,12.8,10.010.48.027.29.215.87.612.8,29
240,Apr 2020,14.713.015.531.914.216.714.518.9,14.7,13.0,15.5,31.9,14.2,16.7,14.5,18.9,14.713.015.531.914.216.714.518.9,32


In [34]:
# df = df[df.columns.drop(col_to_drop)]
cols_to_drop = ['Grouped', 'trim','len']
civilian_unemployment=civilian_unemployment[civilian_unemployment.columns.drop(cols_to_drop)]
civilian_unemployment=civilian_unemployment
civilian_unemployment.head()

Unnamed: 0,Month,Total,Men_over_20,Women_over_20,teens_16to19,White,Black,Asian,Hispanic
0,Apr 2000,3.8,3.1,3.5,12.6,3.4,7.0,5.5,
1,May 2000,4.0,3.3,3.7,12.8,3.5,7.7,5.8,
2,June 2000,4.0,3.2,3.7,12.3,3.4,7.8,5.6,
3,July 2000,4.0,3.3,3.7,13.4,3.5,7.7,5.8,
4,Aug 2000,4.1,3.3,3.8,14.0,3.6,7.9,5.9,


In [35]:
civilian_unemployment.columns=civilian_unemployment.columns.str.strip().str.lower()
civilian_unemployment.head()

Unnamed: 0,month,total,men_over_20,women_over_20,teens_16to19,white,black,asian,hispanic
0,Apr 2000,3.8,3.1,3.5,12.6,3.4,7.0,5.5,
1,May 2000,4.0,3.3,3.7,12.8,3.5,7.7,5.8,
2,June 2000,4.0,3.2,3.7,12.3,3.4,7.8,5.6,
3,July 2000,4.0,3.3,3.7,13.4,3.5,7.7,5.8,
4,Aug 2000,4.1,3.3,3.8,14.0,3.6,7.9,5.9,


In [36]:
civilian_unemployment[['month','year']]=civilian_unemployment.month.apply(lambda x: pd.Series(str(x).split(" "))) 


In [37]:
civilian_unemployment.columns

Index(['month', 'total', 'men_over_20', 'women_over_20', 'teens_16to19',
       'white', 'black', 'asian', 'hispanic', 'year'],
      dtype='object')

In [38]:
civilian_unemployment=civilian_unemployment[['month', 'year', 'total', 'men_over_20', 'women_over_20', 'teens_16to19',
       'white', 'black', 'asian', 'hispanic']]
civilian_unemployment.head()

Unnamed: 0,month,year,total,men_over_20,women_over_20,teens_16to19,white,black,asian,hispanic
0,Apr,2000,3.8,3.1,3.5,12.6,3.4,7.0,5.5,
1,May,2000,4.0,3.3,3.7,12.8,3.5,7.7,5.8,
2,June,2000,4.0,3.2,3.7,12.3,3.4,7.8,5.6,
3,July,2000,4.0,3.3,3.7,13.4,3.5,7.7,5.8,
4,Aug,2000,4.1,3.3,3.8,14.0,3.6,7.9,5.9,


In [39]:
civilian_unemployment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 10 columns):
month            241 non-null object
year             241 non-null object
total            241 non-null float64
men_over_20      241 non-null float64
women_over_20    241 non-null float64
teens_16to19     241 non-null float64
white            241 non-null float64
black            241 non-null float64
asian            241 non-null float64
hispanic         208 non-null float64
dtypes: float64(8), object(2)
memory usage: 18.9+ KB


In [40]:
civilian_unemployment['year'] = civilian_unemployment['year'].astype(int)

In [41]:
civilian_unemployment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 10 columns):
month            241 non-null object
year             241 non-null int32
total            241 non-null float64
men_over_20      241 non-null float64
women_over_20    241 non-null float64
teens_16to19     241 non-null float64
white            241 non-null float64
black            241 non-null float64
asian            241 non-null float64
hispanic         208 non-null float64
dtypes: float64(8), int32(1), object(1)
memory usage: 18.0+ KB


In [42]:
# comment out after use until need again
# civilian_unemployment.to_csv('../data/civ_unemploy.csv', index = False)

# read in Food expenditures 1997-2019 from USDA and web scrape the table

In [None]:
# web scrape population numbers 
url = 'https://www.worldometers.info/world-population/us-population/'
url = f'{url}'

In [None]:
# pull data from the page
resp = requests.get(url)

# if successful, the print statement should be 200
print(resp.status_code)

In [None]:
# Establish the "soup" object so to read the website content
soup = BeautifulSoup(resp.text, 'html.parser')

In [None]:
pop_demographics = soup.find(id='worldometers_728x90_970x90_BTF') \
    .find('div', {'class': 'table-responsive'}) 

In [None]:
# Use the soup.. are there pagination elements? (none)
len(soup.findAll('div', {'class': 'table table-striped table-bordered table-hover table-condensed table-list'}))

In [None]:
pop_demographics = soup.find('table', {'class': 'table table-striped table-bordered table-hover table-condensed table-list'})
pop_demographics

In [None]:
tables = soup.find_all('table')
table_rows = tables[1].find_all('tr')
for tr in table_rows:
     print (tr)

In [None]:
# There are three tables on the page - I want the second one tables[1] ** these are my table heders
tables = soup.find_all('table')
table_head = tables[1].find_all('th')
for th in table_head:
     print (th)

In [None]:
# There are three tables on the page - I want the second one tables[1] ** these are my table contents
content=[]
tables = soup.find_all('table')
content = tables[1].find_all('td')
for td in content:
     print (td)        

In [None]:
pop_dem = pd.DataFrame(columns=range(0,12))
pop_dem = pd.DataFrame(pop_dem, columns=['year', 'population', 'yearly_pct_chg', 'yearly_chg','migrants_net','median_age',
                                    'fertility_rate','density','urban_pop_pct','urban_population',
                                    'country_share_worldpop','world_population','us_global_rank'])
# pop_dem = pd.DataFrame(pop_dem[content])
pop_dem

In [None]:
pop_dem.info()

## STUCK!! how do I get my table_rows  (contents) into the data frame?

## easy way to scrape per Taylor my savior

In [43]:
res = requests.get("https://www.worldometers.info/world-population/us-population/")
print(res.status_code)

200


### this reads in all tables on the page!

In [44]:
# read in tables on the page for world-population
dataframes=pd.read_html(res.text)

In [45]:
dataframes[1]

Unnamed: 0,Year,Population,Yearly % Change,Yearly Change,Migrants (net),Median Age,Fertility Rate,Density (P/Km²),Urban Pop %,Urban Population,Country's Share of World Pop,World Population,U.S.Global Rank
0,2020,331002651,0.59 %,1937734,954806,38.3,1.78,36,82.8 %,273975139,4.25 %,7794798739,3
1,2019,329064917,0.60 %,1968652,954806,37.7,1.85,36,82.5 %,271365914,4.27 %,7713468100,3
2,2018,327096265,0.62 %,2011509,954806,37.7,1.85,36,82.2 %,268786714,4.29 %,7631091040,3
3,2017,325084756,0.64 %,2068761,954806,37.7,1.85,36,81.9 %,266243516,4.31 %,7547858925,3
4,2016,323015995,0.67 %,2137685,954806,37.7,1.85,35,81.7 %,263743312,4.33 %,7464022049,3
5,2015,320878310,0.76 %,2373367,992343,37.6,1.88,35,81.4 %,261287811,4.35 %,7379797139,3
6,2010,309011475,0.93 %,2803593,1085751,36.9,2.06,34,80.7 %,249297076,4.44 %,6956823603,3
7,2005,294993511,0.93 %,2656520,1066979,36.1,2.04,32,80.0 %,235892407,4.51 %,6541907027,3
8,2000,281710909,1.22 %,3309433,1771991,35.2,2.0,31,79.1 %,222927913,4.59 %,6143493823,3
9,1995,265163745,1.01 %,2608687,892751,34.0,2.03,29,77.4 %,205240402,4.62 %,5744212979,3


In [56]:
pop_world_share=dataframes[1]
pop_world_share.head()

Unnamed: 0,Year,GDP Nominal (Current USD),GDP Real (Inflation adj.),GDP change,GDP per capita,Pop. change,Population
0,2017,"$19,485,394,000,000","$17,348,625,758,200",2.27%,"$53,366",0.64 %,325084756
1,2016,"$18,707,189,000,000","$16,972,347,070,400",1.49%,"$52,543",0.67 %,323015995
2,2015,"$18,219,297,000,000","$16,710,458,234,000",2.86%,"$52,077",0.69 %,320878310
3,2014,"$17,521,747,000,000","$16,242,525,613,600",2.57%,"$50,969",0.72 %,318673411
4,2013,"$16,784,851,000,000","$15,853,794,839,100",1.68%,"$50,107",0.75 %,316400538


In [57]:
# comment out after use until need again
#pop_world_share.to_csv('../data/pop_world_share.csv', index = False)

## more from Worldometers

In [46]:
# get the third table as a dataframe - will drop the first NaN row 
dataframes[2]

Unnamed: 0,Year,Population,Yearly % Change,Yearly Change,Migrants (net),Median Age,Fertility Rate,Density (P/Km²),Urban Pop %,Urban Population,Country's Share of World Pop,World Population,U.S.Global Rank
0,,,,,,,,,,,,,
1,2020.0,331002651.0,0.62 %,2024868.0,954806.0,38.3,1.78,36.0,82.8 %,273975139.0,4.25 %,7794799000.0,3.0
2,2025.0,340399601.0,0.56 %,1879390.0,922456.0,39.1,1.78,37.0,84.4 %,287421363.0,4.16 %,8184437000.0,3.0
3,2030.0,349641876.0,0.54 %,1848455.0,982310.0,39.9,1.78,38.0,86.1 %,301000560.0,4.09 %,8548487000.0,3.0
4,2035.0,358690999.0,0.51 %,1809825.0,1046821.0,40.9,1.78,39.0,87.5 %,313969203.0,4.04 %,8887524000.0,3.0
5,2040.0,366572154.0,0.44 %,1576231.0,1062465.0,41.6,1.78,40.0,88.9 %,325949179.0,3.98 %,9198847000.0,3.0
6,2045.0,373343348.0,0.37 %,1354239.0,1073911.0,42.2,1.78,41.0,90.2 %,336913503.0,3.94 %,9481803000.0,3.0
7,2050.0,379419102.0,0.32 %,1215151.0,,42.7,1.78,41.0,91.5 %,347346215.0,3.90 %,9735034000.0,4.0


In [47]:
pop_forecast=dataframes[2]
pop_forecast=pop_forecast.drop([0],axis=0)
pop_forecast.columns = pop_forecast.columns.str.strip().str.lower()
pop_forecast.columns = pop_forecast.columns.str.replace(' ', '_')
pop_forecast.head()

Unnamed: 0,year,population,yearly_%_change,yearly_change,migrants_(net),median_age,fertility_rate,density_(p/km²),urban_pop_%,urban_population,country's_share_of_world_pop,world_population,u.s.global_rank
1,2020.0,331002651.0,0.62 %,2024868.0,954806.0,38.3,1.78,36.0,82.8 %,273975139.0,4.25 %,7794799000.0,3.0
2,2025.0,340399601.0,0.56 %,1879390.0,922456.0,39.1,1.78,37.0,84.4 %,287421363.0,4.16 %,8184437000.0,3.0
3,2030.0,349641876.0,0.54 %,1848455.0,982310.0,39.9,1.78,38.0,86.1 %,301000560.0,4.09 %,8548487000.0,3.0
4,2035.0,358690999.0,0.51 %,1809825.0,1046821.0,40.9,1.78,39.0,87.5 %,313969203.0,4.04 %,8887524000.0,3.0
5,2040.0,366572154.0,0.44 %,1576231.0,1062465.0,41.6,1.78,40.0,88.9 %,325949179.0,3.98 %,9198847000.0,3.0


In [48]:
pop_forecast.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 1 to 7
Data columns (total 13 columns):
year                            7 non-null float64
population                      7 non-null float64
yearly_%_change                 7 non-null object
yearly_change                   7 non-null float64
migrants_(net)                  6 non-null float64
median_age                      7 non-null float64
fertility_rate                  7 non-null float64
density_(p/km²)                 7 non-null float64
urban_pop_%                     7 non-null object
urban_population                7 non-null float64
country's_share_of_world_pop    7 non-null object
world_population                7 non-null float64
u.s.global_rank                 7 non-null float64
dtypes: float64(10), object(3)
memory usage: 784.0+ bytes


In [49]:
pop_forecast.isnull().values.any()
pop_forecast.isnull().sum().sum()
# nan_rows = pop_forecast[pop_forecast['Population'].isnull()]

1

In [55]:
# comment out after use until need again
#pop_forecast.to_csv('../data/pop_forecast.csv', index = False)

## read in GDP data - but only through 2017

In [50]:
# GDP US
res = requests.get("https://www.worldometers.info/gdp/us-gdp/")
print(res.status_code)

200


In [51]:
# read in tables on the page for US GDP
dataframes=pd.read_html(res.text)

In [52]:
dataframes[1]

Unnamed: 0,Year,GDP Nominal (Current USD),GDP Real (Inflation adj.),GDP change,GDP per capita,Pop. change,Population
0,2017,"$19,485,394,000,000","$17,348,625,758,200",2.27%,"$53,366",0.64 %,325084756
1,2016,"$18,707,189,000,000","$16,972,347,070,400",1.49%,"$52,543",0.67 %,323015995
2,2015,"$18,219,297,000,000","$16,710,458,234,000",2.86%,"$52,077",0.69 %,320878310
3,2014,"$17,521,747,000,000","$16,242,525,613,600",2.57%,"$50,969",0.72 %,318673411
4,2013,"$16,784,851,000,000","$15,853,794,839,100",1.68%,"$50,107",0.75 %,316400538
5,2012,"$16,197,007,000,000","$15,567,037,390,000",2.22%,"$49,570",0.79 %,314043885
6,2011,"$15,542,582,000,000","$15,224,554,065,500",1.60%,"$48,862",0.83 %,311584047
7,2010,"$14,992,052,000,000","$14,992,052,000,000",2.53%,"$48,516",0.88 %,309011475
8,2009,"$14,418,739,000,000","$14,594,842,181,900",-2.78%,"$47,648",0.93 %,306307567
9,2008,"$14,718,582,000,000","$15,011,490,541,400",-0.29%,"$49,464",0.96 %,303486012


In [53]:
us_gdp=dataframes[1]
us_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 7 columns):
Year                         24 non-null int64
GDP Nominal (Current USD)    24 non-null object
GDP Real (Inflation adj.)    24 non-null object
GDP change                   24 non-null object
GDP per capita               24 non-null object
Pop. change                  24 non-null object
Population                   24 non-null int64
dtypes: int64(2), object(5)
memory usage: 1.4+ KB


In [54]:
# comment out after use until need again
#us_gdp.to_csv('../data/us_gdp.csv', index = False)