# Notebook shortcuts

### shift+enter = Run cell and move to the next one

### ctrl+alt+P = insert cell above
### ctrl+alt+N = insert cell below

### ctrl+alt+up = move cell up
### ctrl+alt+down = move cell down


For more shortcuts, see https://docs.microsoft.com/en-us/azure/databricks/notebooks/notebooks-use

# Start here if the Lake has been mounted to this cluster

Check that the file structure is set up

In [4]:
display(dbutils.fs.ls("/mnt/coviddata/"))
        

path,name,size
dbfs:/mnt/coviddata/inputs/,inputs/,0
dbfs:/mnt/coviddata/outputs/,outputs/,0


Check all the required files are visible

In [6]:
display(dbutils.fs.ls("/mnt/coviddata/inputs/"))

# Wrangle the Doctor Data

In [8]:
filepath2="/mnt/coviddata/inputs/DoctorCountLatest.csv"


### Infer the schema and load the data into a Spark DataFrame.
### Cache the data for faster operations.

In [10]:
# header='false' makes Spark treat every line as data and name the columns
# _c0, _c1, ...; inferSchema='true' asks it to work out the column types.
csv_reader = spark.read.options(header='false', inferSchema='true')
doctorraw = csv_reader.csv(filepath2)
# Keep the frame cached: the cells below read it several times.
doctorraw.cache()

### Check if schema was inferred correctly

In [12]:
doctorraw.printSchema()

### Display the data in a nice readable format

In [14]:
display(doctorraw)

### focus on the columns we want to work with

In [16]:
display(doctorraw.select("_c3","_c5","_c15"))

In [17]:
doctorraw.select("_c3","_c5","_c15").show()

### Reduce the data to one row per country: the doctors-per-10k count for the most recent year available for that country. We only need the latest year.

In [19]:
from pyspark.sql import functions as F

# For each country code (_c3) keep the most recent year (_c5) together with the
# doctors-per-10k value (_c15) recorded for that same year.
#
# Taking max() of the two columns independently would pair the latest year
# with the country's highest value from *any* year. Structs are compared field
# by field, so max(struct(year, value)) selects the latest year first and only
# uses the value to break ties between rows of that year.
#
# The result columns keep the names "max(_c5)" and "max(_c15)" that
# groupBy().max() would have produced, because the rename step relies on them.
latest_row = F.max(F.struct("_c5", "_c15")).alias("latest")
doctorlatest = (
    doctorraw
    .groupBy("_c3")
    .agg(latest_row)
    .select(
        "_c3",
        F.col("latest._c5").alias("max(_c5)"),
        F.col("latest._c15").alias("max(_c15)"),
    )
)

### just confirm that Australia exists in the data set we're pulling

In [21]:
doctorlatest.filter("_c3= 'AUS'").show()

In [22]:
display(doctorlatest)

### rename the columns

In [24]:
# Replace the positional / aggregate column names with descriptive ones.
# withColumnRenamed is a no-op when the old name is absent, so re-running this
# cell is harmless.
doctor_column_names = {
    "_c3": 'COUNTRY',
    "max(_c5)": 'YEAR',
    "max(_c15)": 'DoctorsPer10k',
}
for old_name, new_name in doctor_column_names.items():
    doctorlatest = doctorlatest.withColumnRenamed(old_name, new_name)

In [25]:
doctorlatest.printSchema()

In [26]:
doctorlatest.printSchema()

### save it to csv on our data lake

In [28]:
# Persist the per-country doctor counts as CSV (with a header row), replacing
# whatever a previous run left in the target folder.
doctorlatest.write.csv(
    '/mnt/coviddata/outputs/DoctorCountLatestYear',
    mode='overwrite',
    header="true",
)

### Load the country code data

In [30]:
filepath3="/mnt/coviddata/inputs/UID_ISO_FIPS_LookUp_Table.csv"

In [31]:
# Read the lookup table; the first line supplies the column names and Spark
# infers the column types.
countrycodes = spark.read.csv(filepath3, header='true', inferSchema='true')

In [32]:
countrycodes.printSchema()

In [33]:
countrycodes.show()

### just want country region and iso3

In [35]:
# Reduce the lookup table to the unique (iso3, Country_Region) pairs.
iso3_and_region = countrycodes.select("iso3", "Country_Region")
countrycodeiso3 = iso3_and_region.dropDuplicates()

In [36]:
countrycodeiso3.filter("iso3='AUS'").show()

In [37]:
# Persist the iso3 / country lookup as CSV, replacing any previous output.
# NOTE(review): no header row is written here, unlike the other CSV outputs
# in this notebook - confirm that is what the consumer expects.
countrycodeiso3.write.csv(
    '/mnt/coviddata/outputs/CountryCodesISO3',
    mode='overwrite',
)

# Wrangle the COVID Data

### Load the COVID data and summarize it by country
### Join the summarized data with the doctor counts and the country codes

In [39]:
# Create the text widgets through which parameters are passed into the
# notebook, and echo the current value of each one.

def _read_widget(widget_name):
    """Create a text widget and return its current value.

    The widget is created with an empty default value and an empty label.
    The parameter name and its value are printed on two separate lines.
    """
    dbutils.widgets.text(widget_name, "", "")
    value = dbutils.widgets.get(widget_name)
    print("Param -'" + widget_name + "':")
    print(value)
    return value


y = _read_widget("input")
z = _read_widget("fileDate")
u = _read_widget("name")
v = _read_widget("name2")

In [40]:
filepath = "/mnt/coviddata/inputs/latestcovidcount.csv"

### infer the schema and load the data into a spark dataframe

In [42]:
# Read the latest COVID counts; the first line supplies the column names and
# Spark infers the column types.
covidraw = spark.read.csv(filepath, header='true', inferSchema='true')


In [43]:
display(covidraw)

In [44]:
# Total the confirmed, death and recovered counts per Country_Region.
# The result columns are named sum(Confirmed), sum(Deaths), sum(Recovered).
count_columns = ["Confirmed", "Deaths", "Recovered"]
covidlatest = (
    covidraw
    .select("Country_Region", *count_columns)
    .groupBy("Country_Region")
    .sum(*count_columns)
)

In [45]:
display(covidlatest)

In [46]:
#covidlatest.write.mode('overwrite').csv('/mnt/coviddata/outputs/CovidLatest')

In [47]:
doctorlatest.show()

In [48]:
countrycodeiso3.show()

In [49]:
from pyspark.sql.functions import col

# Attach the country/region name to each doctor-count row by matching the
# doctor data's country code against the lookup table's iso3 code.
# Inner join: rows without a matching code on the other side are dropped.
codes_match = col("COUNTRY") == col("iso3")
doctoriso3 = doctorlatest.join(countrycodeiso3, on=codes_match, how="inner")

In [50]:
doctoriso3.select("COUNTRY","YEAR","DoctorsPer10k","Country_Region").filter("COUNTRY = 'AUS'").show()

In [51]:
# Combine the COVID totals with the doctor counts on the country/region name.
# Both inputs carry a Country_Region column, so the joined frame has two of
# them; refer to one through its parent frame to disambiguate.
same_region = covidlatest["Country_Region"] == doctoriso3["Country_Region"]
coviddoctors = covidlatest.join(doctoriso3, on=same_region, how="inner")

In [52]:
coviddoctors.show()

In [53]:
# Keep one Country_Region column (the one from the COVID side) plus the
# totals and the doctor-count fields.
output_columns = [
    covidlatest["Country_Region"],
    "sum(Confirmed)",
    "sum(Deaths)",
    "sum(Recovered)",
    "COUNTRY",
    "YEAR",
    "DoctorsPer10k",
]
coviddoctorselect = coviddoctors.select(*output_columns)

In [54]:
# Give the output columns their final names: strip the sum(...) wrappers and
# make the doctor-data fields self-describing.
final_column_names = [
    ('sum(Confirmed)', 'Confirmed'),
    ('sum(Deaths)', 'Deaths'),
    ('sum(Recovered)', 'Recovered'),
    ('COUNTRY', 'Iso3'),
    ('YEAR', 'YearOfDoctorCount'),
]
coviddoctorfinal = coviddoctorselect
for current_name, final_name in final_column_names:
    coviddoctorfinal = coviddoctorfinal.withColumnRenamed(current_name, final_name)

In [55]:
coviddoctorfinal.printSchema()

In [56]:
# Persist the combined COVID / doctor data as CSV (with a header row),
# replacing whatever a previous run left in the target folder.
coviddoctorfinal.write.csv(
    "/mnt/coviddata/outputs/CovidDoctorCombined",
    mode="overwrite",
    header="true",
)

In [57]:
#dbutils.fs.mkdirs("/mnt/coviddata/outputs/final")

### For the data factory to correctly copy these shards to Synapse, we need to remove any unneeded files from the output directory. Since trying to find files that begin with "_" seems to throw a Java exception, we instead look for all the .csv files, copy them to a clean directory, and then point the factory to that directory as the source.

In [59]:
%scala

// Folder the downstream copy reads from; it should hold only CSV shards.
val fileprefix = "/mnt/coviddata/outputs/final/"

// Start from an empty target folder. Every overwrite of CovidDoctorCombined
// produces part files with new names, so without this step the shards copied
// by earlier runs would pile up here and be loaded again downstream.
dbutils.fs.rm(fileprefix, recurse = true)
dbutils.fs.mkdirs(fileprefix)

// Select only the CSV part files from the Spark output folder; anything else
// Spark wrote alongside them is left behind.
val partition_path = dbutils.fs.ls("/mnt/coviddata/outputs/CovidDoctorCombined")
  .filter(file => file.name.endsWith("csv"))

// Copy each shard into the clean folder under its original file name.
partition_path.foreach { file => dbutils.fs.cp(file.path, fileprefix + file.name) }

In [60]:
display(dbutils.fs.ls('/mnt/coviddata/outputs/final/'))