In [2]:
from read_s3_boto import *

In [53]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col
import pandas as pd
import os

# Reuse the active Spark session if there is one; otherwise start one named "EDA".
spark = SparkSession.builder.appName("EDA").getOrCreate()

In [30]:
# S3 bucket and key prefixes of the three source datasets. They are passed to
# get_matching_s3_keys / get_spark_dataframes in the cells that follow.
bucket = "covid19-lake"
# County-level case/death counts (CSV).
cases_pref = "rearc-covid-19-nyt-data-in-usa/csv/us-counties"
# State-level daily testing figures (JSON).
tests_pref = "rearc-covid-19-testing-data/json/states_daily"
# County population estimates (CSV).
pop_pref = "static-datasets/csv/CountyPopulation"

In [31]:
# Look up the S3 keys under each prefix for the given file type.
# NOTE(review): get_matching_s3_keys comes from the wildcard import of
# read_s3_boto; the third argument is presumably a key-suffix filter and the
# return type (list vs. generator) is not visible here — confirm. These *_key
# names are not referenced by the cells shown in this section.
cases_key = get_matching_s3_keys(bucket, cases_pref, "csv")
tests_key = get_matching_s3_keys(bucket, tests_pref, "json")
pop_key = get_matching_s3_keys(bucket, pop_pref, "csv")

In [34]:
# Load each dataset through the read_s3_boto helper; the results are used as
# Spark DataFrames (show / printSchema / select) in the cells below.
# NOTE(review): the two trailing string arguments look like a key-suffix filter
# and a reader format — confirm against read_s3_boto.
cases_df = get_spark_dataframes(spark, bucket, cases_pref, "csv", "csv")
tests_df = get_spark_dataframes(spark, bucket, tests_pref, "json", "json")
pop_df = get_spark_dataframes(spark, bucket, pop_pref, "csv", "csv")

In [55]:
### initialize helper functions

###function to grab all numeric variables for model training
def getNumericColumns(df):
  """Return the names of the numeric columns of a Spark DataFrame.

  Args:
    df: Object exposing ``dtypes`` as an iterable of
      ``(column_name, type_string)`` pairs, as a PySpark DataFrame does.

  Returns:
    list[str]: Names of the columns whose type string is an integer,
    floating-point or decimal type, in the order they appear in ``df.dtypes``.
  """
  # DataFrame.dtypes uses Spark's simple type strings: a LongType column is
  # reported as "bigint" (printSchema shows it as "long"), and decimals carry
  # their precision, e.g. "decimal(10,2)". Matching only "int"/"float"/"double"
  # would silently skip those columns.
  numeric_types = {"tinyint", "smallint", "int", "bigint", "float", "double"}
  return [
    name
    for name, dtype in df.dtypes
    if dtype in numeric_types or dtype.startswith("decimal")
  ]

###function to display NaN counts by column
def countNaNs(df):
  """Build a summary DataFrame of NaN counts, one output column per input column.

  Args:
    df: Spark DataFrame to inspect.

  Returns:
    The result of ``df.select`` over one ``count(when(isnan(...)))`` aggregate
    per column of ``df``, each aliased to the original column name.
  """
  nan_counters = []
  for column_name in df.columns:
    # when() without otherwise() yields null for non-matching rows, and
    # count() skips nulls, so only rows flagged by isnan() are counted.
    flagged = when(isnan(column_name), column_name)
    nan_counters.append(count(flagged).alias(column_name))
  return df.select(nan_counters)

##count nulls
def countNulls(df):
  """Build a summary DataFrame of null counts, one output column per input column.

  Args:
    df: Spark DataFrame to inspect.

  Returns:
    The result of ``df.select`` over one ``count(when(col(...).isNull()))``
    aggregate per column of ``df``, each aliased to the original column name.
  """
  null_counters = []
  for column_name in df.columns:
    # when() without otherwise() yields null for non-matching rows, and
    # count() skips nulls, so only rows where the column is null are counted.
    flagged = when(col(column_name).isNull(), column_name)
    null_counters.append(count(flagged).alias(column_name))
  return df.select(null_counters)

###function to print all correlations b/n features and target (unvectorized)
def printCorrs(df, features, target):
  """Print the correlation between each feature column and the target column.

  Args:
    df: Object providing ``corr(col1, col2)``, e.g. a Spark DataFrame.
    features: Iterable of feature column names (strings).
    target: Name of the target column (string).

  Returns:
    None. One line is printed per feature.
  """
  for feature_name in features:
    label = "Correlation for target variable " + target + " and " + feature_name + ": "
    print(label + str(df.corr(feature_name, target)))
    
def getCorr(df, feature, target):
  """Return a DataFrame holding the correlation between two columns.

  Args:
    df: Spark DataFrame containing both columns.
    feature: Name of the feature column.
    target: Name of the target column.

  Returns:
    The result of ``df.select(corr(feature, target))``; call ``.show()`` or
    ``.first()`` on it to obtain the coefficient.
  """
  # Imported locally: the notebook's import cell only brings in isnan, when,
  # count and col from pyspark.sql.functions, so a bare `corr` would raise
  # NameError here.
  from pyspark.sql.functions import corr
  return df.select(corr(feature, target))

### Checking Table Schemas

In [70]:
# Preview the first five rows of the cases table, then print its schema.
cases_df.show(5)
cases_df.printSchema()

+----------+---------+----------+-------+-----+------+
|      date|   county|     state|   fips|cases|deaths|
+----------+---------+----------+-------+-----+------+
|2020-01-21|Snohomish|Washington|53061.0|    1|     0|
|2020-01-22|Snohomish|Washington|53061.0|    1|     0|
|2020-01-23|Snohomish|Washington|53061.0|    1|     0|
|2020-01-24|     Cook|  Illinois|17031.0|    1|     0|
|2020-01-24|Snohomish|Washington|53061.0|    1|     0|
+----------+---------+----------+-------+-----+------+
only showing top 5 rows

root
 |-- date: string (nullable = true)
 |-- county: string (nullable = true)
 |-- state: string (nullable = true)
 |-- fips: double (nullable = true)
 |-- cases: long (nullable = true)
 |-- deaths: long (nullable = true)



In [69]:
# Preview five rows of a handful of key columns from the tests table, then
# print the full schema (all columns, not just the selected ones).
tests_df.select("date", "state", "positive", "negative", "death", "hospitalized","total").show(5)
tests_df.printSchema()

+--------+-----+--------+---------+------+------------+---------+
|    date|state|positive| negative| death|hospitalized|    total|
+--------+-----+--------+---------+------+------------+---------+
|20200921|   AK|  7838.0| 420807.0|  45.0|         NaN| 428645.0|
|20200921|   AL|145780.0| 928112.0|2439.0|     16487.0|1073892.0|
|20200921|   AR| 76364.0| 817238.0|1197.0|      4986.0| 893602.0|
|20200921|   AS|     0.0|   1571.0|   0.0|         NaN|   1571.0|
|20200921|   AZ|214251.0|1176711.0|5478.0|     21878.0|1390962.0|
+--------+-----+--------+---------+------+------------+---------+
only showing top 5 rows

root
 |-- date: long (nullable = true)
 |-- state: string (nullable = true)
 |-- positive: double (nullable = true)
 |-- negative: double (nullable = true)
 |-- death: double (nullable = true)
 |-- total: double (nullable = true)
 |-- hash: string (nullable = true)
 |-- dateChecked: string (nullable = true)
 |-- totalTestResults: double (nullable = true)
 |-- fips: long (nullabl

In [68]:
# Preview the first five rows of the population table, then print its schema.
pop_df.show(5)
pop_df.printSchema()

+--------------+----+-------+-------+------------------------+
|            Id| Id2| County|  State|Population Estimate 2018|
+--------------+----+-------+-------+------------------------+
|0500000US01001|1001|Autauga|Alabama|                   55601|
|0500000US01003|1003|Baldwin|Alabama|                  218022|
|0500000US01005|1005|Barbour|Alabama|                   24881|
|0500000US01007|1007|   Bibb|Alabama|                   22400|
|0500000US01009|1009| Blount|Alabama|                   57840|
+--------------+----+-------+-------+------------------------+
only showing top 5 rows

root
 |-- Id: string (nullable = true)
 |-- Id2: long (nullable = true)
 |-- County: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Population Estimate 2018: long (nullable = true)



#### Analysis

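Initial analysis shows that date formats differ across the data sources: `date` is a `yyyy-MM-dd` string in the cases table but a `yyyyMMdd` integer in the tests table. These will have to be normalized, either via a date dimension table or by adding a datetime column to the respective fact tables.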

Additionally, the cases and population tables use full state names while the tests table uses two-letter abbreviations. This will also have to be normalized via a dimension table to allow consistency and joins across the tables.

### Checking for Nulls/Invalids

In [58]:
# Build the per-column null and NaN count summaries for the cases table.
cases_nulls = countNulls(cases_df)
cases_NaNs = countNaNs(cases_df)

# Display both summaries, each under its own banner.
print("=================Null values for Covid_Case Table=================")
cases_nulls.show()
print("\n=================NaN values for Covid_Case Table=================")
cases_NaNs.show()


+----+------+-----+----+-----+------+
|date|county|state|fips|cases|deaths|
+----+------+-----+----+-----+------+
|   0|     0|    0|   0|    0|     0|
+----+------+-----+----+-----+------+


+----+------+-----+----+-----+------+
|date|county|state|fips|cases|deaths|
+----+------+-----+----+-----+------+
|   0|     0|    0|5363|    0|     0|
+----+------+-----+----+-----+------+



In [71]:
# Build the per-column null and NaN count summaries for the tests table.
tests_nulls = countNulls(tests_df)
tests_NaNs = countNaNs(tests_df)

print("=================Null values for Covid_Test Table=================")
tests_nulls.show()
print("\n=================NaN values for Covid_Test Table=================")
# The NaN summary is shown in two halves to keep each printed table readable.
# "total" is included so that every column of the tests table is covered.
tests_NaNs.select("date", "state", "positive", "negative", "death", "total", "hash", "dateChecked", "totalTestResults").show()
tests_NaNs.select("fips", "deathIncrease", "hospitalizedIncrease", "negativeIncrease", "positiveIncrease","totalTestResultsIncrease","hospitalized", "pending").show()

+----+-----+--------+--------+-----+-----+----+-----------+----------------+----+-------------+--------------------+----------------+----------------+------------------------+------------+-------+
|date|state|positive|negative|death|total|hash|dateChecked|totalTestResults|fips|deathIncrease|hospitalizedIncrease|negativeIncrease|positiveIncrease|totalTestResultsIncrease|hospitalized|pending|
+----+-----+--------+--------+-----+-----+----+-----------+----------------+----+-------------+--------------------+----------------+----------------+------------------------+------------+-------+
|   0|    0|       0|       0|    0|    0|   0|          0|               0|   0|            0|                   0|               0|               0|                       0|           0|      0|
+----+-----+--------+--------+-----+-----+----+-----------+----------------+----+-------------+--------------------+----------------+----------------+------------------------+------------+-------+


+----+-----+-

In [74]:
# Build the per-column null and NaN count summaries for the population table.
pop_nulls = countNulls(pop_df)
pop_NaNs = countNaNs(pop_df)

# Display both summaries, each under its own banner.
print("=================Null values for Populations Table=================")
pop_nulls.show()
print("\n=================NaN values for Populations Table=================")
pop_NaNs.show()

+---+---+------+-----+------------------------+
| Id|Id2|County|State|Population Estimate 2018|
+---+---+------+-----+------------------------+
|  0|  0|     0|    0|                       0|
+---+---+------+-----+------------------------+


+---+---+------+-----+------------------------+
| Id|Id2|County|State|Population Estimate 2018|
+---+---+------+-----+------------------------+
|  0|  0|     0|    0|                       0|
+---+---+------+-----+------------------------+



#### Analysis

There are a handful of missing values in the tests and cases tables (for example, 5,363 NaN `fips` values in the cases table), but the count is small enough (several thousand missing out of hundreds of thousands of rows) that the affected rows can be dropped. This can be handled in ETL preprocessing.

### Other EDA

In [80]:
# Number of rows per state in the cases table, largest first (top 50 shown).
(
    cases_df
    .groupBy("state")
    .count()
    .sort("count", ascending=False)
    .show(50)
)

+--------------+-----+
|         state|count|
+--------------+-----+
|         Texas|41426|
|       Georgia|29049|
|      Virginia|23453|
|      Kentucky|20692|
|      Missouri|19516|
|North Carolina|17908|
|      Illinois|17889|
|     Tennessee|17232|
|          Iowa|17170|
|       Indiana|16778|
|        Kansas|16173|
|          Ohio|15954|
|     Minnesota|15171|
|   Mississippi|14971|
|      Michigan|14886|
|      Arkansas|13508|
|      Oklahoma|13202|
|      Nebraska|13164|
|     Wisconsin|12834|
|       Florida|12630|
|  Pennsylvania|12308|
|       Alabama|12179|
|     Louisiana|11968|
|      Colorado|11193|
|   Puerto Rico|11076|
|      New York|10950|
|    California|10943|
|  South Dakota|10018|
| West Virginia| 9291|
|South Carolina| 8537|
|  North Dakota| 8154|
|    Washington| 7436|
|       Montana| 7127|
|         Idaho| 6819|
|        Oregon| 6232|
|    New Mexico| 5545|
|          Utah| 4949|
|      Maryland| 4670|
|    New Jersey| 4239|
|       Wyoming| 4038|
|        Al