# Azure Databricks COVID-19 Data Analysis
### Full notebook available at: https://aka.ms/DatabricksBuild2020
#### This notebook is for demonstration purposes only; please don't treat any of the information in it as accurate or as scientific fact.

In [2]:
%fs ls "/databricks-datasets/"

path,name,size
dbfs:/databricks-datasets/,databricks-datasets/,0
dbfs:/databricks-datasets/COVID/,COVID/,0
dbfs:/databricks-datasets/README.md,README.md,976
dbfs:/databricks-datasets/Rdatasets/,Rdatasets/,0
dbfs:/databricks-datasets/SPARK_README.md,SPARK_README.md,3359
dbfs:/databricks-datasets/adult/,adult/,0
dbfs:/databricks-datasets/airlines/,airlines/,0
dbfs:/databricks-datasets/amazon/,amazon/,0
dbfs:/databricks-datasets/asa/,asa/,0
dbfs:/databricks-datasets/atlas_higgs/,atlas_higgs/,0


In [3]:
%run ./COVID-ETL

In [4]:
%sql
-- Browse every column of the jhu_daily_covid table.
SELECT *
  FROM jhu_daily_covid

FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,process_date
,,Anhui,Mainland China,1/22/2020 17:00,,,1.0,,,,,2020-01-22
,,Beijing,Mainland China,1/22/2020 17:00,,,14.0,,,,,2020-01-22
,,Chongqing,Mainland China,1/22/2020 17:00,,,6.0,,,,,2020-01-22
,,Fujian,Mainland China,1/22/2020 17:00,,,1.0,,,,,2020-01-22
,,Gansu,Mainland China,1/22/2020 17:00,,,,,,,,2020-01-22
,,Guangdong,Mainland China,1/22/2020 17:00,,,26.0,,,,,2020-01-22
,,Guangxi,Mainland China,1/22/2020 17:00,,,2.0,,,,,2020-01-22
,,Guizhou,Mainland China,1/22/2020 17:00,,,1.0,,,,,2020-01-22
,,Hainan,Mainland China,1/22/2020 17:00,,,4.0,,,,,2020-01-22
,,Hebei,Mainland China,1/22/2020 17:00,,,1.0,,,,,2020-01-22


In [5]:
%sql
-- Daily counts for King (Admin2) in Washington, oldest process_date first.
SELECT process_date,
       Admin2,
       Confirmed,
       Deaths,
       Recovered,
       Active
  FROM jhu_daily_pop
 WHERE Province_State = 'Washington'
   AND Admin2 = 'King'
 ORDER BY process_date

process_date,Admin2,Confirmed,Deaths,Recovered,Active
2020-03-22,King,1040,75,0,0
2020-03-23,King,1170,87,0,0
2020-03-24,King,1170,87,0,0
2020-03-25,King,1359,101,0,0
2020-03-26,King,1577,109,0,0
2020-03-27,King,1577,109,0,0
2020-03-28,King,2077,136,0,0
2020-03-29,King,2159,141,0,0
2020-03-30,King,2161,144,0,0
2020-03-31,King,2330,150,0,0


In [6]:
# Query behind the df_usa view. For each jhu_daily_pop row it emits:
#   - confirmed_per100K / deaths_per100K: counts scaled by 100000 / POPESTIMATE2019
#     and truncated to int (POPESTIMATE2019 is presumably the 2019 population
#     estimate -- confirm against the jhu_daily_pop definition);
#   - admin2 and province_state renamed to county and state;
#   - process_date_num: process_date with its dashes removed, cast to integer.
# Rows without a FIPS code or coordinates, and rows whose lat or long_ is 0,
# are filtered out.
usa_per_capita_sql = """
  select fips, 
         cast(100000.*Confirmed/POPESTIMATE2019 as int) as confirmed_per100K, 
         cast(100000.*Deaths/POPESTIMATE2019 as int) as deaths_per100K, 
         recovered, 
         active, 
         lat, 
         long_, 
         admin2 as county, 
         province_state as state, 
         process_date, 
         cast(replace(process_date, '-', '') as integer) as process_date_num 
 from jhu_daily_pop 
 where lat is not null and long_ is not null and fips is not null and (lat <> 0 and long_ <> 0)
"""

df_usa = spark.sql(usa_per_capita_sql)

# Expose the result to later SQL cells under the name df_usa.
df_usa.createOrReplaceTempView("df_usa")

In [7]:
# Earliest process_date in df_usa; every day_num below is counted from it.
process_date_zero = spark.sql("select min(process_date) from df_usa where fips is not null").collect()[0][0]
if process_date_zero is None:
    # min() over an empty view comes back as NULL/None. Stop here with a clear
    # message instead of failing while the SQL text is being assembled.
    raise ValueError("df_usa has no rows, so there is no baseline date for day_num")

# day_num = DAY_NUM_OFFSET + days elapsed since process_date_zero.
DAY_NUM_OFFSET = 100


def _daily_max_query(key_columns, metric, baseline_date, offset=DAY_NUM_OFFSET):
    """Return SQL giving, per key and process_date, the max of `metric` and its day_num.

    key_columns   -- df_usa column names to group by, e.g. ["fips"]
    metric        -- df_usa column whose per-group maximum is kept
    baseline_date -- the process_date that maps to day_num == offset
    offset        -- constant added to the day difference
    """
    keys = ", ".join(key_columns)
    # str() yields a 'YYYY-MM-DD' literal even when collect() returned a
    # datetime.date (DATE column) rather than a string.
    return """
select {keys}, {offset} + datediff(process_date, '{baseline}') as day_num, {metric}
  from (
     select {keys}, process_date, max({metric}) as {metric}
       from df_usa
      group by {keys}, process_date
) x """.format(keys=keys, offset=offset, baseline=str(baseline_date), metric=metric)


# Confirmed cases per 100K, one row per (fips, day).
df_usa_conf = spark.sql(_daily_max_query(["fips"], "confirmed_per100K", process_date_zero))
df_usa_conf.createOrReplaceTempView("df_usa_conf")

# Deaths per 100K, one row per (lat, long_, day).
df_usa_deaths = spark.sql(_daily_max_query(["lat", "long_"], "deaths_per100K", process_date_zero))
df_usa_deaths.createOrReplaceTempView("df_usa_deaths")

In [8]:
%run ./prepare-chart

### Note how long these queries took to run (~7.7 s and ~18 s).
### Let's see if we can do better with Delta Lake.

In [10]:
# Save all of jhu_daily_pop in Delta format, partitioned by Province_State,
# replacing whatever is already stored at the target path.
(
    spark.sql("select * from jhu_daily_pop")
    .write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Province_State")
    .save("/tmp/kyweller/COVID_DeltaLake/jhu_daily_pop_delta/")
)

In [11]:
%sql
-- Re-register the table so it points at the Delta files under the path below.
DROP TABLE IF EXISTS jhu_daily_pop_deltalake;

CREATE TABLE jhu_daily_pop_deltalake
 USING DELTA
 LOCATION '/tmp/kyweller/COVID_DeltaLake/jhu_daily_pop_delta/';

-- Compact the table's files and cluster rows by process_date.
OPTIMIZE jhu_daily_pop_deltalake
 ZORDER BY (process_date);

path,metrics
,"List(51, 2499, List(5157, 65494, 20982.0, 51, 1070112), List(3639, 14670, 6324.0, 2499, 15805399), 0, List(minCubeSize(107374182400), List(0, 0), List(2499, 15805399), 0, List(2499, 15805399), 0), 1)"


In [12]:
%sql
-- Daily counts for King (Admin2) in Washington, read from the Delta-backed table.
-- The files behind this table were written partitioned by Province_State,
-- so the state filter can restrict the scan to a single partition.
SELECT process_date,
       Admin2,
       Confirmed,
       Deaths,
       Recovered,
       Active
  FROM jhu_daily_pop_deltalake
 WHERE Province_State = 'Washington'
   AND Admin2 = 'King'
 ORDER BY process_date

process_date,Admin2,Confirmed,Deaths,Recovered,Active
2020-03-22,King,1040,75,0,0
2020-03-23,King,1170,87,0,0
2020-03-24,King,1170,87,0,0
2020-03-25,King,1359,101,0,0
2020-03-26,King,1577,109,0,0
2020-03-27,King,1577,109,0,0
2020-03-28,King,2077,136,0,0
2020-03-29,King,2159,141,0,0
2020-03-30,King,2161,144,0,0
2020-03-31,King,2330,150,0,0


In [13]:
%sql
-- Same King / Washington query, run again on the original jhu_daily_pop table
-- so its run time can be compared with the Delta-backed table.
SELECT process_date,
       Admin2,
       Confirmed,
       Deaths,
       Recovered,
       Active
  FROM jhu_daily_pop
 WHERE Province_State = 'Washington'
   AND Admin2 = 'King'
 ORDER BY process_date

process_date,Admin2,Confirmed,Deaths,Recovered,Active
2020-03-22,King,1040,75,0,0
2020-03-23,King,1170,87,0,0
2020-03-24,King,1170,87,0,0
2020-03-25,King,1359,101,0,0
2020-03-26,King,1577,109,0,0
2020-03-27,King,1577,109,0,0
2020-03-28,King,2077,136,0,0
2020-03-29,King,2159,141,0,0
2020-03-30,King,2161,144,0,0
2020-03-31,King,2330,150,0,0


# Appendix
# -
# -
# -
# -
# -
# -
# -
# -
# -
# -

In [15]:
# Save df_usa_conf and df_usa_deaths in Delta format, each to its own
# directory, overwriting anything already stored there.
delta_targets = [
    (df_usa_conf, "/tmp/kyweller/COVID_DeltaLake/df_usa_conf/"),
    (df_usa_deaths, "/tmp/kyweller/COVID_DeltaLake/df_usa_deaths/"),
]
for frame, target_path in delta_targets:
    frame.write.format("delta").mode("overwrite").save(target_path)

In [16]:
%sql
-- Confirmed-cases table: re-register it over its Delta directory, then
-- compact the files and cluster rows by day_num.
DROP TABLE IF EXISTS df_usa_conf_delta;

CREATE TABLE df_usa_conf_delta
 USING DELTA
 LOCATION '/tmp/kyweller/COVID_DeltaLake/df_usa_conf/';

OPTIMIZE df_usa_conf_delta
 ZORDER BY (day_num);

-- Deaths table: same three steps.
DROP TABLE IF EXISTS df_usa_deaths_delta;

CREATE TABLE df_usa_deaths_delta
 USING DELTA
 LOCATION '/tmp/kyweller/COVID_DeltaLake/df_usa_deaths/';

OPTIMIZE df_usa_deaths_delta
 ZORDER BY (day_num);

In [17]:
%run ./prepare-chart-DeltaLake