# Notebook shortcuts

### shift+enter = Run cell and move to the next one

### ctrl+alt+P = insert cell above
### ctrl+alt+N = insert cell below

### ctrl+alt+up = move cell up
### ctrl+alt+down = move cell down


For more shortcuts, see https://docs.microsoft.com/en-us/azure/databricks/notebooks/notebooks-use

# Start here if the Lake has been mounted to this cluster

Check that the file structure is set up

In [4]:
display(dbutils.fs.ls("/mnt/coviddata/"))
        

path,name,size
dbfs:/mnt/coviddata/04-20-2020.txt,04-20-2020.txt,386595
dbfs:/mnt/coviddata/5a32172c-c259-4d99-a36b-7f69d73b844f,5a32172c-c259-4d99-a36b-7f69d73b844f,355018
dbfs:/mnt/coviddata/UID_ISO_FIPS_LookUp_Table.csv,UID_ISO_FIPS_LookUp_Table.csv,355018
dbfs:/mnt/coviddata/data_92dda754-047f-4ce5-ba65-380053b7d0a7_406e314c-6fea-457f-8550-2353d6224396.txt,data_92dda754-047f-4ce5-ba65-380053b7d0a7_406e314c-6fea-457f-8550-2353d6224396.txt,276136
dbfs:/mnt/coviddata/inputs/,inputs/,0
dbfs:/mnt/coviddata/outputs/,outputs/,0


Check all the required files are visible

In [6]:
display(dbutils.fs.ls("/mnt/coviddata/inputs/"))

path,name,size
dbfs:/mnt/coviddata/inputs/01-22-2020.csv,01-22-2020.csv,1886
dbfs:/mnt/coviddata/inputs/04-02-2020.csv,04-02-2020.csv,243956
dbfs:/mnt/coviddata/inputs/04-03-2020.csv,04-03-2020.csv,327070
dbfs:/mnt/coviddata/inputs/04-04-2020.csv,04-04-2020.csv,312226
dbfs:/mnt/coviddata/inputs/04-12-2020.csv,04-12-2020.csv,305548
dbfs:/mnt/coviddata/inputs/04-15-2020.csv,04-15-2020.csv,312551
dbfs:/mnt/coviddata/inputs/04-20-2020.csv,04-20-2020.csv,317177
dbfs:/mnt/coviddata/inputs/04-22-2020.csv,04-22-2020.csv,319302
dbfs:/mnt/coviddata/inputs/04-27-2020.csv,04-27-2020.csv,325150
dbfs:/mnt/coviddata/inputs/04-28-2020.txt,04-28-2020.txt,397693


# Wrangle the Doctor Data

In [8]:
filepath2="/mnt/coviddata/inputs/DoctorCountLatest.csv"


### Infer the schema and load the data into a Spark DataFrame.
### Cache the data for faster operations

In [10]:
# Read the doctor-count CSV without treating any row as a header (columns are
# therefore named _c0, _c1, ...) and let Spark infer the column types.
# comment="I" makes the CSV reader skip every line that starts with "I"
# (presumably this is how the file's header line is dropped -- TODO confirm).
doctorraw = (
    spark.read
    .option("header", "false")
    .option("inferSchema", "true")
    .option("comment", "I")
    .csv(filepath2)
)
# Cache the frame: the following cells query it several times.
doctorraw.cache()

### Check if schema was inferred correctly

In [12]:
doctorraw.printSchema()

### Display the data in a nice readable format

In [14]:
display(doctorraw)

_c0,_c1,_c2,_c3,_c4,_c5,_c6,_c7,_c8,_c9,_c10,_c11,_c12,_c13,_c14,_c15,_c16,_c17,_c18,_c19,_c20,_c21,_c22
22092804,HWF_0001,COUNTRY,AGO,YEAR,1997,,,,,,,,,0.592,0.592,,,,2020-02-12 09:20:25.0000000 +00:00,1997,1996-12-31 23:00:00.0000000 +00:00,1997-12-30 23:00:00.0000000 +00:00
22092805,HWF_0001,COUNTRY,AGO,YEAR,2004,,,,,,,,,0.621,0.621,,,,2020-02-12 09:20:25.0000000 +00:00,2004,2003-12-31 23:00:00.0000000 +00:00,2004-12-30 23:00:00.0000000 +00:00
22092806,HWF_0001,COUNTRY,AGO,YEAR,2009,,,,,,,,,1.313,1.313,,,,2020-02-12 09:20:25.0000000 +00:00,2009,2008-12-31 23:00:00.0000000 +00:00,2009-12-30 23:00:00.0000000 +00:00
22092807,HWF_0001,COUNTRY,AGO,YEAR,2017,,,,,,,,,2.146,2.146,,,,2020-02-12 09:20:25.0000000 +00:00,2017,2016-12-31 23:00:00.0000000 +00:00,2017-12-30 23:00:00.0000000 +00:00
22092808,HWF_0001,COUNTRY,BDI,YEAR,2004,,,,,,,,,0.28,0.28,,,,2020-02-12 09:20:25.0000000 +00:00,2004,2003-12-31 23:00:00.0000000 +00:00,2004-12-30 23:00:00.0000000 +00:00
22092809,HWF_0001,COUNTRY,BDI,YEAR,2010,,,,,,,,,0.482,0.482,,,,2020-02-12 09:20:25.0000000 +00:00,2010,2009-12-31 23:00:00.0000000 +00:00,2010-12-30 23:00:00.0000000 +00:00
22092810,HWF_0001,COUNTRY,BDI,YEAR,2011,,,,,,,,,0.399,0.399,,,,2020-02-12 09:20:25.0000000 +00:00,2011,2010-12-31 23:00:00.0000000 +00:00,2011-12-30 23:00:00.0000000 +00:00
22092811,HWF_0001,COUNTRY,BDI,YEAR,2012,,,,,,,,,0.539,0.539,,,,2020-02-12 09:20:25.0000000 +00:00,2012,2011-12-31 23:00:00.0000000 +00:00,2012-12-30 23:00:00.0000000 +00:00
22092812,HWF_0001,COUNTRY,BDI,YEAR,2013,,,,,,,,,0.581,0.581,,,,2020-02-12 09:20:25.0000000 +00:00,2013,2012-12-31 23:00:00.0000000 +00:00,2013-12-30 23:00:00.0000000 +00:00
22092813,HWF_0001,COUNTRY,BDI,YEAR,2014,,,,,,,,,0.619,0.619,,,,2020-02-12 09:20:25.0000000 +00:00,2014,2013-12-31 23:00:00.0000000 +00:00,2014-12-30 23:00:00.0000000 +00:00


### focus on the columns we want to work with

In [16]:
display(doctorraw.select("_c3","_c5","_c15"))

_c3,_c5,_c15
AGO,1997,0.592
AGO,2004,0.621
AGO,2009,1.313
AGO,2017,2.146
BDI,2004,0.28
BDI,2010,0.482
BDI,2011,0.399
BDI,2012,0.539
BDI,2013,0.581
BDI,2014,0.619


In [17]:
doctorraw.select("_c3","_c5","_c15").show()

### Filter the data so that, for each country in the list, only the doctors-per-10k count for the most recent year remains. We only need the latest year.

In [19]:
# For each country (_c3) keep the most recent year (_c5) together with the
# doctors-per-10k value (_c15) reported for THAT year.
#
# Calling .max("_c5", "_c15") aggregates the two columns independently, which
# pairs the latest year with the highest value ever reported. Those are not the
# same thing whenever the figure dropped over time (e.g. BDI: 0.482 in 2010 but
# 0.399 in 2011).
#
# max() over a struct compares its fields left to right, so the year decides
# which row wins and the value travels along with it. If a country has several
# rows for the same year, the largest value among them is kept.
#
# The output columns are deliberately still called "max(_c5)" and "max(_c15)"
# so that the rename cell that follows keeps working unchanged.
from pyspark.sql import functions as F

doctorlatest = (
    doctorraw
    .groupBy("_c3")
    .agg(F.max(F.struct("_c5", "_c15")).alias("latest"))
    .select(
        "_c3",
        F.col("latest._c5").alias("max(_c5)"),
        F.col("latest._c15").alias("max(_c15)"),
    )
)

### just confirm that Australia exists in the data set we're pulling

In [21]:
doctorlatest.filter("_c3= 'AUS'").show()

In [22]:
display(doctorlatest)

_c3,max(_c5),max(_c15)
NIU,2008,23.529
HTI,2018,2.384
BRB,2017,24.843
LVA,2017,46.934
POL,2017,24.188
ZMB,2018,11.867
JAM,2017,13.061
BRA,2018,21.652
ARM,2017,44.023
MOZ,2018,0.838


### rename the columns

In [24]:
# Replace the positional / aggregate column names with readable ones.
doctorlatest = (
    doctorlatest
    .withColumnRenamed("_c3", "COUNTRY")
    .withColumnRenamed("max(_c5)", "YEAR")
    .withColumnRenamed("max(_c15)", "DoctorsPer10k")
)

In [25]:
doctorlatest.printSchema()

In [26]:
doctorlatest.printSchema()

### save it to csv on our data lake

In [28]:
#doctorlatest.write.mode('overwrite').option("header","true").csv('/mnt/coviddata/outputs/DoctorCountLatestYear')

### Load the country code data

In [30]:
filepath3="/mnt/coviddata/inputs/UID_ISO_FIPS_LookUp_Table.csv"

In [31]:
# Load the country-code lookup table; the first row supplies the column names
# and Spark infers the column types.
countrycodes = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(filepath3)
)

In [32]:
countrycodes.printSchema()

In [33]:
countrycodes.show()

### We only need the country region and the ISO3 code

In [35]:
countrycodeiso3=countrycodes.select("iso3","Country_Region").distinct()

In [36]:
countrycodeiso3.filter("iso3='AUS'").show()

In [37]:
#countrycodeiso3.write.mode('overwrite').csv('/mnt/coviddata/outputs/CountryCodesISO3')

# Wrangle the COVID Data

### load the covid data and summarize by country
### join the summarized data with the count of doctors and country codes

In [39]:
# Creating widgets for leveraging parameters, and printing the parameters

# Declare the four text widgets that carry the notebook parameters.
# Each one gets an empty default value and an empty label.
dbutils.widgets.text("input", "", "")
dbutils.widgets.text("fileDate", "", "")
dbutils.widgets.text("name", "", "")
dbutils.widgets.text("name2", "", "")

# Read the current value of every widget.
# NOTE(review): y, z, u and v are not referenced by any other cell visible here.
y = dbutils.widgets.get("input")
z = dbutils.widgets.get("fileDate")
u = dbutils.widgets.get("name")
v = dbutils.widgets.get("name2")

# Echo each parameter so its value is visible in the run output.
print("Param -'input':")
print(y)

print("Param -'fileDate':")
print(z)

print("Param -'name':")
print(u)

print("Param -'name2':")
print(v)

In [40]:
filepath = "/mnt/coviddata/inputs/latestcovidcount.csv"

### infer the schema and load the data into a spark dataframe

In [42]:
# Load the COVID counts; the first row supplies the column names and Spark
# infers the column types.
covidraw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(filepath)
)


In [43]:
display(covidraw)

FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key,Incidence_Rate,Case-Fatality_Ratio
45001.0,Abbeville,South Carolina,US,2020-08-02T04:34:47.000+0000,34.22333378,-82.46170658,288,7,0,281,"Abbeville, South Carolina, US",1174.2161699351734,2.430555555555556
22001.0,Acadia,Louisiana,US,2020-08-02T04:34:47.000+0000,30.2950649,-92.41419698,2331,71,0,2260,"Acadia, Louisiana, US",3756.950600370699,3.045903045903046
51001.0,Accomack,Virginia,US,2020-08-02T04:34:47.000+0000,37.76707161,-75.63234615,1077,15,0,1062,"Accomack, Virginia, US",3332.7144448570366,1.392757660167131
16001.0,Ada,Idaho,US,2020-08-02T04:34:47.000+0000,43.4526575,-116.24155159999998,8004,62,0,7942,"Ada, Idaho, US",1662.0049959820346,0.7746126936531734
19001.0,Adair,Iowa,US,2020-08-02T04:34:47.000+0000,41.33075609,-94.47105874,20,0,0,20,"Adair, Iowa, US",279.6420581655481,0.0
21001.0,Adair,Kentucky,US,2020-08-02T04:34:47.000+0000,37.10459774,-85.28129668,201,19,0,182,"Adair, Kentucky, US",1046.765961878971,9.45273631840796
29001.0,Adair,Missouri,US,2020-08-02T04:34:47.000+0000,40.19058551,-92.60078167,134,0,0,134,"Adair, Missouri, US",528.7456102276764,0.0
40001.0,Adair,Oklahoma,US,2020-08-02T04:34:47.000+0000,35.88494195,-94.65859267,289,5,0,284,"Adair, Oklahoma, US",1302.1537352437597,1.7301038062283738
8001.0,Adams,Colorado,US,2020-08-02T04:34:47.000+0000,39.87432092,-104.3362578,6018,171,0,5847,"Adams, Colorado, US",1163.0761024388264,2.8414755732801598
16003.0,Adams,Idaho,US,2020-08-02T04:34:47.000+0000,44.89333571,-116.4545247,18,0,0,18,"Adams, Idaho, US",419.18956683744767,0.0


In [44]:
# Total the confirmed, death and recovered counts per country/region.
# The result columns are named sum(Confirmed), sum(Deaths) and sum(Recovered).
covidlatest = (
    covidraw
    .select("Country_Region", "Confirmed", "Deaths", "Recovered")
    .groupBy("Country_Region")
    .sum("Confirmed", "Deaths", "Recovered")
)

In [45]:
display(covidlatest)

Country_Region,sum(Confirmed),sum(Deaths),sum(Recovered)
Chad,936,75,813
Russia,843890,14034,645316
Paraguay,5485,52,3786
Yemen,1730,494,862
Senegal,10284,209,6822
Sweden,80422,5743,0
Cabo Verde,2480,24,1837
Guyana,430,20,185
Burma,353,6,297
Eritrea,279,0,225


In [46]:
#covidlatest.write.mode('overwrite').csv('/mnt/coviddata/outputs/CovidLatest')

In [47]:
doctorlatest.show()

In [48]:
countrycodeiso3.show()

In [49]:
from pyspark.sql.functions import col

# Attach the country/region name to each doctor row by matching the doctor
# data's COUNTRY code against the lookup table's iso3 code (inner join, so
# codes without a match are dropped).
doctoriso3 = doctorlatest.join(
    countrycodeiso3,
    on=col("COUNTRY") == col("iso3"),
    how="inner",
)

In [50]:
doctoriso3.select("COUNTRY","YEAR","DoctorsPer10k","Country_Region").filter("COUNTRY = 'AUS'").show()

In [51]:
# Combine the per-country COVID totals with the doctor counts by country/region
# name (inner join). Both inputs contribute a Country_Region column, so the
# joined frame carries two of them.
coviddoctors = covidlatest.join(
    doctoriso3,
    on=doctoriso3["Country_Region"] == covidlatest["Country_Region"],
    how="inner",
)

In [52]:
coviddoctors.show()

In [53]:
coviddoctorselect = coviddoctors.select(covidlatest.Country_Region,"sum(Confirmed)","sum(Deaths)","sum(Recovered)","COUNTRY","YEAR","DoctorsPer10k")

In [54]:
# Give the aggregate and code columns their final, export-friendly names.
coviddoctorfinal = (
    coviddoctorselect
    .withColumnRenamed("sum(Confirmed)", "Confirmed")
    .withColumnRenamed("sum(Deaths)", "Deaths")
    .withColumnRenamed("sum(Recovered)", "Recovered")
    .withColumnRenamed("COUNTRY", "Iso3")
    .withColumnRenamed("YEAR", "YearOfDoctorCount")
)

In [55]:
coviddoctorfinal.printSchema()

In [56]:
# Write the combined result as CSV with a header row, replacing whatever is
# already stored under this output path.
coviddoctorfinal.write.mode("overwrite").option("header", "true").csv("/mnt/coviddata/outputs/CovidDoctorCombined")

In [57]:
#dbutils.fs.rm("/mnt/coviddata/outputs/final",True)
#dbutils.fs.mkdirs("/mnt/coviddata/outputs/final")

### define where you want your final data to go

In [59]:
# Start from an empty destination folder: remove it recursively (second
# argument True), then create it again.
dbutils.fs.rm("/mnt/coviddata/outputs/finalVJ",True)
dbutils.fs.mkdirs("/mnt/coviddata/outputs/finalVJ")

### For the data factory to correctly copy these shards to Synapse, we need to remove any unneeded files from the output directory. Since trying to find files that begin with "_" seems to throw a Java exception, we instead look for all the .csv files, copy them to a clean directory, and then point the factory to that directory as the source.

In [61]:
%scala

//CHANGE THIS FOLDER DEPENDING ON YOUR DATA
// Destination folder for the cleaned-up copy. It must end with "/" because the
// file name is appended to it directly when building the target path.
val fileprefix= "/mnt/coviddata/outputs/finalVJ/"
// List the combined-output folder and keep only the entries whose name ends
// with "csv"; everything else in that folder is left out of the copy.
val partition_path = dbutils.fs.ls("/mnt/coviddata/outputs/CovidDoctorCombined")
     .filter(file=>file.name.endsWith("csv"))//(0).path

// Copy each selected file into the destination folder under its original name.
partition_path.foreach { file => dbutils.fs.cp(file.path,fileprefix+file.name)}

//partition_path.show()

//partition_path.toDF().foreach { file => dbutils.fs.cp(file(0).toString,)}//.toString, true)}


//dbutils.fs.cp(partition_path,fileprefix+".tab")

//dbutils.fs.rm(fileprefix+".tmp",recurse=true)

In [62]:
display(dbutils.fs.ls('/mnt/coviddata/outputs/finalVJ/'))

path,name,size
dbfs:/mnt/coviddata/outputs/finalVJ/part-00000-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4894-1-c000.csv,part-00000-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4894-1-c000.csv,125
dbfs:/mnt/coviddata/outputs/finalVJ/part-00002-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4896-1-c000.csv,part-00002-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4896-1-c000.csv,113
dbfs:/mnt/coviddata/outputs/finalVJ/part-00003-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4897-1-c000.csv,part-00003-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4897-1-c000.csv,114
dbfs:/mnt/coviddata/outputs/finalVJ/part-00004-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4898-1-c000.csv,part-00004-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4898-1-c000.csv,119
dbfs:/mnt/coviddata/outputs/finalVJ/part-00009-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4903-1-c000.csv,part-00009-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4903-1-c000.csv,189
dbfs:/mnt/coviddata/outputs/finalVJ/part-00010-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4904-1-c000.csv,part-00010-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4904-1-c000.csv,122
dbfs:/mnt/coviddata/outputs/finalVJ/part-00011-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4905-1-c000.csv,part-00011-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4905-1-c000.csv,174
dbfs:/mnt/coviddata/outputs/finalVJ/part-00012-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4906-1-c000.csv,part-00012-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4906-1-c000.csv,114
dbfs:/mnt/coviddata/outputs/finalVJ/part-00016-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4910-1-c000.csv,part-00016-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4910-1-c000.csv,124
dbfs:/mnt/coviddata/outputs/finalVJ/part-00017-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4911-1-c000.csv,part-00017-tid-8261168128314198120-d20532ca-c7cf-4cd5-9bfc-eb83d87768af-4911-1-c000.csv,113
