# ETL_EDA
This file records the process of acquiring raw data, traforming them, and loading them into a MongoDB. The data are store (almost) in their raw form.

In [1]:
import requests
import pandas as pd
import numpy as np
from io import StringIO
import pymongo

In [2]:
all_states = np.array(['Washington', 'Wisconsin', 'Wyoming', 'Illinois', 'California',
       'Arizona', 'Massachusetts', 'Texas', 'Nebraska', 'Utah', 'Oregon',
       'Florida', 'New York', 'Rhode Island', 'Georgia', 'New Hampshire',
       'North Carolina', 'New Jersey', 'Colorado', 'Maryland', 'Nevada',
       'Tennessee', 'Hawaii', 'Indiana', 'Kentucky', 'Minnesota',
       'Oklahoma', 'Pennsylvania', 'South Carolina',
       'District of Columbia', 'Kansas', 'Missouri', 'Vermont',
       'Virginia', 'Connecticut', 'Iowa', 'Louisiana', 'Ohio', 'Michigan',
       'South Dakota', 'Arkansas', 'Delaware', 'Mississippi',
       'New Mexico', 'North Dakota', 'Alaska', 'Maine', 'Alabama',
       'Idaho', 'Montana', 'Puerto Rico', 'Virgin Islands', 'Guam',
       'West Virginia', 'Northern Mariana Islands'], dtype=object)

## Extract, Transform and Load (ETL)

### 1. Raw Data from New York Times
The [dataset](https://github.com/nytimes/covid-19-data) is a continuously updated txt file in csv format. It contains the covid case and death in the U.S. over time. It includes a cumulative series data at the national, state and county levels.

### 2. Raw Data to Documents/Dicts
Using pandas, it is simple to parse a in-memory string. The first few lines of description need to be skipped. Datetime conversion is made and blank lines are dropped. Now the data can be easily converted to a list of dicts which is what we want for the MongoDB.

### 3. Upsert MongoDB
If we fetch the data frequently, there are lots of duplicate data entry between each run. The de-duplication happens at insertion. The MongoDB API to use is `collection.replace_one(filter=..., replacement=..., upsert=True)`. The statement matches a document in MongoDB with `filter`, replaces it with `replacement` if the document exists or inserts `replacement` into the database if `filter` matches nothing. 

In [3]:
urls = {
    'covid-us': "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us.csv",
    'covid-us-state': "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-states.csv",
    'covid-us-county': "https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv",
    'mask-use-by-county': "https://raw.githubusercontent.com/nytimes/covid-19-data/master/mask-use/mask-use-by-county.csv"
}

filters = {
    'covid-us': ['date'],
    'covid-us-state': ['date', 'state'],
    'covid-us-county': ['date', 'county'],
    'mask-use-by-county': ['COUNTYFP']
}

In [4]:
def data_insert(level, url):
    """Request the data from url, and insert the data into
    Mongodb database.
    
    Parameters
    ==========
    level: str
        geographical level for covid tracker
    url: str
        raw data link.
    """
    ## Initialize the Mongodb database
    client = pymongo.MongoClient()
    
    
    ## Raw data from the websites
    req = requests.get(url, timeout=0.5)
    req.raise_for_status()
    text = req.text
    
    ## Raw data to documents/dictionaries.
    df = pd.read_csv(StringIO(text), delimiter=',')
    df.columns = df.columns.str.strip()             # remove space in columns name
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])
    df.dropna(inplace=True)
    
    ## Upsert MongoDB
    db = client.get_database("covid-us")
    collection = db.get_collection(level)
    update_count = 0
    if level == 'covid-us-county':
        for state in all_states:
            df_state = df[df['state'] == state]
        for record in df_state.to_dict('records'):
            filter_ = {_:record[_] for _ in filters[level]}
            result = collection.replace_one(
                filter=filter_,                             # locate the document if exists
                replacement=record,                         # latest document
                upsert=True)                                # update if exists, insert if not
            if result.matched_count > 0:
                update_count += 1
    else:
        for record in df.to_dict('records'):
            filter_ = {_:record[_] for _ in filters[level]}
            result = collection.replace_one(
                filter=filter_,                             # locate the document if exists
                replacement=record,                         # latest document
                upsert=True)                                # update if exists, insert if not
            if result.matched_count > 0:
                update_count += 1
    print(f"{level.split('-')[-1]}:",
          f"rows={df.shape[0]}, update={update_count}, "
          f"insert={df.shape[0]-update_count}")

Web scrape the data and insert the data into Mongoda database.

In [5]:
%%time
for level, url in urls.items():
    data_insert(level, url)

us: rows=303, update=303, insert=0
state: rows=14369, update=14369, insert=0
county: rows=738157, update=256, insert=737901
county: rows=3142, update=3142, insert=0
CPU times: user 6.16 s, sys: 370 ms, total: 6.53 s
Wall time: 1min 6s


### Retrieve Mongodb database

In [6]:
client = pymongo.MongoClient()
db = client.get_database("covid-us")

# national data 
collection = db.get_collection("covid-us")

# state-level data
# collection = db.get_collection("covid-us-state")

# # county-level data
# collection = db.get_collection("covid-us-county")

# # mask-use by county
# collection = db.get_collection("mask-use-by-county")

data = list(collection.find())
df = pd.DataFrame.from_records(data)    
df.drop('_id', axis=1, inplace=True)
df.tail()

Unnamed: 0,date,cases,deaths
298,2020-11-14,10978295,245460
299,2020-11-15,11113482,246083
300,2020-11-16,11279747,246879
301,2020-11-17,11441484,248486
302,2020-11-18,11613875,250409
