This notebook reads raw population CSV files from my local drive, converts them into DataFrames, cleans them, and writes the cleaned results back to my local drive as pickle files. It also downloads the US county GeoJSON file and saves a local copy.

In [1]:
import pandas as pd
import numpy as np
from urllib.request import urlopen
import json

# Hardcoded, machine-specific Windows folders: raw_path holds the downloaded input files,
# clean_path receives the cleaned outputs written by this notebook.
# The cells below build file names by plain string concatenation (e.g. raw_path + "file.csv"),
# so both values must end with a path separator. A raw string literal cannot end in a single
# backslash, so each literal ends with two; both backslash characters are kept in the value.
# NOTE(review): these absolute paths only exist on the author's machine - adjust them (or derive
# them from a configurable data directory) before running the notebook anywhere else.
raw_path = r"C:\Users\adiad\Anaconda3\envs\CovidApp36\covidapp\data_raw\\"
clean_path = r"C:\Users\adiad\Anaconda3\envs\CovidApp36\covidapp\data_clean\\"

# data is from https://www2.census.gov/programs-surveys/popest/datasets/2010-2019/counties/totals/co-est2019-alldata.csv
us_pop_raw = pd.read_csv(raw_path + "co-est2019-alldata.csv", encoding='cp1252')

# FIPS code = zero-padded 2-character STATE code followed by zero-padded 3-character COUNTY code.
us_fips = us_pop_raw.STATE.astype(str).str.zfill(2) + us_pop_raw.COUNTY.astype(str).str.zfill(3)
# Rows with COUNTY == 0 get a missing FIPS instead of a code ending in "000"
# (in the displayed output these are the rows where County repeats the State name).
us_fips = us_fips.where(us_pop_raw.COUNTY != 0)

# Build the cleaned frame as one chain of copy-returning steps. The previous version sliced the
# frame with df[[...]] and then assigned to its columns through attributes, which is the pattern
# that triggers pandas' SettingWithCopyWarning.
us_pop_df = (
    us_pop_raw
    .assign(FIPS=us_fips)
    .loc[:, ["STNAME", "CTYNAME", "FIPS", "POPESTIMATE2019"]]
    .rename(columns={
        "STNAME": "State",
        "CTYNAME": "County",
        "POPESTIMATE2019": "Population",
    })
    .astype({"State": "category", "County": "category", "FIPS": "category"})
)

us_pop_df.to_pickle(clean_path + "us_pop_df.pkl")
us_pop_df.head(3)

Unnamed: 0,State,County,FIPS,Population
0,Alabama,Alabama,,4903185
1,Alabama,Autauga County,1001.0,55869
2,Alabama,Baldwin County,1003.0,223234


In [2]:
# data from https://en.wikipedia.org/wiki/List_of_countries_by_population_%28United_Nations%29
world_pop_df = (
    pd.read_csv(raw_path + "Nation_Pop_Wikipedia_2020_04_08.csv", encoding='cp1252')
    .loc[:, ["Country or area", "Population_2019"]]
    .rename(columns={"Country or area": "Country", "Population_2019": "Population"})
    # Fix: the previous code assigned the categorical to `world_pop_df.State`. This frame has no
    # "State" column, so that only set a plain Python attribute on the object and left "Country"
    # unconverted. Convert the actual "Country" column instead.
    .astype({"Country": "category"})
)

world_pop_df.to_pickle(clean_path + "world_pop_df.pkl")
world_pop_df.head(3)

Unnamed: 0,Country,Population
0,China,1433783686
1,India,1366417754
2,United States,329064917


In [3]:
# data from https://en.wikipedia.org/wiki/Provinces_of_China#List_of_province-level_divisions
china_pop_df = pd.read_csv(raw_path + "China_Pop_2020-04_08.csv")

# Drop the " Province" / " Municipality" suffixes from the division names.
# Only these two suffixes are removed; any other suffix is left as-is.
for name_suffix in (" Province", " Municipality"):
    china_pop_df["Province"] = china_pop_df["Province"].str.replace(name_suffix, "")

# Rename the two columns to the State / Population layout used by the other cells.
china_pop_df.columns = ["State", "Population"]

# Tag every row with its country and store both label columns as categoricals.
china_pop_df = (
    china_pop_df
    .assign(Country="China")
    .astype({"State": "category", "Country": "category"})
)
china_pop_df.head(30)

Unnamed: 0,State,Population,Country
0,Anhui,59500510,China
1,Beijing,19612368,China
2,Chongqing,28846170,China
3,Fujian,36894216,China
4,Guangdong,104303132,China
5,Gansu,25575254,China
6,Guangxi Zhuang Autonomous Region,46026629,China
7,Guizhou,34746468,China
8,Henan,94023567,China
9,Hubei,57237740,China


In [4]:
# data from https://en.wikipedia.org/wiki/States_and_territories_of_Australia
australia_pop_df = pd.read_csv(raw_path + "Australia_Pop_2020_04_09.csv")

# Rename the two columns to the State / Population layout used by the other cells.
australia_pop_df.columns = ["State", "Population"]

# Tag every row with its country and store both label columns as categoricals.
australia_pop_df = (
    australia_pop_df
    .assign(Country="Australia")
    .astype({"State": "category", "Country": "category"})
)
australia_pop_df.head(3)

Unnamed: 0,State,Population,Country
0,New South Wales,8089526.0,Australia
1,Queensland,5095100.0,Australia
2,South Australia,1751693.0,Australia


In [5]:
# data from https://en.wikipedia.org/wiki/Population_of_Canada_by_province_and_territory
canada_pop_df = pd.read_csv(raw_path + "Canada_Pop_2020_04_08.csv")

# Rename the two columns to the State / Population layout used by the other cells.
canada_pop_df.columns = ["State", "Population"]

# Tag every row with its country and store both label columns as categoricals.
canada_pop_df = (
    canada_pop_df
    .assign(Country="Canada")
    .astype({"State": "category", "Country": "category"})
)
canada_pop_df.head(3)

Unnamed: 0,State,Population,Country
0,Ontario,13448494,Canada
1,Quebec,8164361,Canada
2,British Columbia,4648055,Canada


In [6]:
# Stack the three per-country frames (Australia, then China, then Canada) into one table with a
# fresh 0..n-1 index. pd.concat replaces the chained DataFrame.append calls: append is deprecated
# in pandas 1.4 and removed in pandas 2.0, and one concat also avoids the intermediate copy.
# NOTE(review): the State/Country categoricals have different categories in each input frame, so
# pandas presumably falls back to object dtype for them in the combined frame (the same as with
# append) - re-cast to "category" here if downstream code relies on that dtype.
australia_china_canada_pop_df = pd.concat(
    [australia_pop_df, china_pop_df, canada_pop_df],
    ignore_index=True,
)
australia_china_canada_pop_df.to_pickle(clean_path + "australia_china_canada_pop_df.pkl")
australia_china_canada_pop_df.head(3)

Unnamed: 0,State,Population,Country
0,New South Wales,8089526.0,Australia
1,Queensland,5095100.0,Australia
2,South Australia,1751693.0,Australia


In [7]:
# Download the county GeoJSON from the plotly datasets repository and parse it into Python objects.
geojson_url = 'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
with urlopen(geojson_url) as resp:
    counties = json.load(resp)

# Keep a local, human-readable copy (4-space indent, non-ASCII characters written unescaped).
geojson_out = clean_path +"us_county_geo.json"
with open(geojson_out, "w", encoding="utf-8") as out_file:
    json.dump(counties, out_file, ensure_ascii=False, indent=4)