# Read the customer data from csvFiles on the ADLS Gen2 Database

In [0]:
%fs ls /mnt/Gen2

path,name,size,modificationTime
dbfs:/mnt/Gen2/CustMarketSegmentAgg/,CustMarketSegmentAgg/,0,1669189577000
dbfs:/mnt/Gen2/Orders.csv,Orders.csv,217,1669119196000
dbfs:/mnt/Gen2/customer/,customer/,0,1669185295000


In [0]:
#Read csv file into dataframe

dfSeoulFloating = spark.read.format('csv').options(header='true').load('dbfs:/databricks-datasets/COVID/coronavirusdataset/SeoulFloating.csv')
display(dfSeoulFloating.count())

display (dfSeoulFloating.limit(10))
display (dfSeoulFloating.printSchema())

1084800

date,hour,birth_year,sex,province,city,fp_num
2020-01-01,0,20,female,Seoul,Dobong-gu,19140
2020-01-01,0,20,male,Seoul,Dobong-gu,19950
2020-01-01,0,20,female,Seoul,Dongdaemun-gu,25450
2020-01-01,0,20,male,Seoul,Dongdaemun-gu,27050
2020-01-01,0,20,female,Seoul,Dongjag-gu,28880
2020-01-01,0,20,male,Seoul,Dongjag-gu,30350
2020-01-01,0,20,female,Seoul,Eunpyeong-gu,27750
2020-01-01,0,20,male,Seoul,Eunpyeong-gu,27910
2020-01-01,0,20,female,Seoul,Gangbuk-gu,19490
2020-01-01,0,20,male,Seoul,Gangbuk-gu,21940


### Connect to COSMOSDB and Create the Database and the Container

In [0]:
# Install library com.azure.cosmos.spark - azure-cosmos-spark_3-2_2-12 onto cluster
# Create an Azure Cosmos with Provisioned Storage
# create Databasename and container name in CosmosDB
cosmosEndpoint = "https://carao2023azurecosmosdb.documents.azure.com:443/"
cosmosMasterKey = "nuiWJRdzZ7teNVYJerwtbdyHespqu77b155j0a2AjX40NSDUfYxNmUuZbSS06HPHCBg7lx04L4sTACDbZr73wQ=="
cosmosDatabaseName = "Covid"
cosmosContainerName = "SouthKoreaCovid"
spark.conf.set("spark.sql.catalog.cosmosCatalog", "com.azure.cosmos.spark.CosmosCatalog")
spark.conf.set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountEndpoint", cosmosEndpoint)
spark.conf.set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountKey", cosmosMasterKey)
spark.sql("CREATE DATABASE IF NOT EXISTS cosmosCatalog.{};".format(cosmosDatabaseName))
spark.sql("CREATE TABLE IF NOT EXISTS cosmosCatalog.{}.{} using cosmos.oltp TBLPROPERTIES(partitionKeyPath = '/id', manualThroughput = '500')".format(cosmosDatabaseName, cosmosContainerName))

Out[4]: DataFrame[]

In [0]:
#Writing DataFrame to Cosmos DB. If the Comos DB RU's are less then it will take quite some time to write 150K records. We are using save mode as append.
#
#Set the write configuration
writeCfg = {
  "spark.cosmos.accountEndpoint": cosmosEndpoint,
  "spark.cosmos.accountKey": cosmosMasterKey,
  "spark.cosmos.database": cosmosDatabaseName,
  "spark.cosmos.container": cosmosContainerName,
  "spark.cosmos.write.strategy": "ItemOverwrite",
}

In [0]:
#ingest the data
dfSeoulFloating.toDF("date","hour","birth_year","sex","province","city","id")\
   .write\
   .format("cosmos.oltp")\
   .options(**writeCfg)\
   .mode("APPEND")\
   .save()


In [0]:
#Set the read configuration
readCfg = {
  "spark.cosmos.accountEndpoint": cosmosEndpoint,
  "spark.cosmos.accountKey": cosmosMasterKey,
  "spark.cosmos.database": cosmosDatabaseName,
  "spark.cosmos.container": cosmosContainerName,
  "spark.cosmos.read.inferSchema.enabled" : "false"
}
#Read the data into a Spark dataframe and print the count
query_df = spark.read.format("cosmos.oltp").options(**readCfg).load()
print(query_df.count())

display (query_df.limit(10))


6805


_rawBody,id,_ts
"{""date"":""2020-05-21"",""hour"":""0"",""birth_year"":""70"",""sex"":""male"",""province"":""Seoul"",""city"":""Yeongdeungpo-gu"",""id"":""13040"",""_rid"":""T81yAPBaBVgBAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVgBAAAAAAAAAA==/"",""_etag"":""\""03002626-0000-0700-0000-637f10ab0000\"""",""_attachments"":""attachments/"",""_ts"":1669271723}",13040,1669271723
"{""date"":""2020-05-15"",""hour"":""9"",""birth_year"":""60"",""sex"":""male"",""province"":""Seoul"",""city"":""Jongno-gu"",""id"":""15680"",""_rid"":""T81yAPBaBVgIAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVgIAAAAAAAAAA==/"",""_etag"":""\""03005b20-0000-0700-0000-637f109c0000\"""",""_attachments"":""attachments/"",""_ts"":1669271708}",15680,1669271708
"{""date"":""2020-02-15"",""hour"":""22"",""birth_year"":""20"",""sex"":""male"",""province"":""Seoul"",""city"":""Seodaemun-gu"",""id"":""27800"",""_rid"":""T81yAPBaBVgLAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVgLAAAAAAAAAA==/"",""_etag"":""\""03009726-0000-0700-0000-637f10ad0000\"""",""_attachments"":""attachments/"",""_ts"":1669271725}",27800,1669271725
"{""date"":""2020-02-15"",""hour"":""8"",""birth_year"":""30"",""sex"":""female"",""province"":""Seoul"",""city"":""Mapo-gu"",""id"":""35540"",""_rid"":""T81yAPBaBVgNAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVgNAAAAAAAAAA==/"",""_etag"":""\""0300962d-0000-0700-0000-637f10bf0000\"""",""_attachments"":""attachments/"",""_ts"":1669271743}",35540,1669271743
"{""date"":""2020-03-12"",""hour"":""8"",""birth_year"":""70"",""sex"":""male"",""province"":""Seoul"",""city"":""Dongdaemun-gu"",""id"":""13190"",""_rid"":""T81yAPBaBVgSAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVgSAAAAAAAAAA==/"",""_etag"":""\""03001a2e-0000-0700-0000-637f10c00000\"""",""_attachments"":""attachments/"",""_ts"":1669271744}",13190,1669271744
"{""date"":""2020-05-28"",""hour"":""20"",""birth_year"":""20"",""sex"":""female"",""province"":""Seoul"",""city"":""Geumcheon-gu"",""id"":""16520"",""_rid"":""T81yAPBaBVgWAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVgWAAAAAAAAAA==/"",""_etag"":""\""0300952c-0000-0700-0000-637f10bc0000\"""",""_attachments"":""attachments/"",""_ts"":1669271740}",16520,1669271740
"{""date"":""2020-03-28"",""hour"":""17"",""birth_year"":""20"",""sex"":""female"",""province"":""Seoul"",""city"":""Gangdong-gu"",""id"":""25430"",""_rid"":""T81yAPBaBVguAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVguAAAAAAAAAA==/"",""_etag"":""\""0300641e-0000-0700-0000-637f10970000\"""",""_attachments"":""attachments/"",""_ts"":1669271703}",25430,1669271703
"{""date"":""2020-05-24"",""hour"":""6"",""birth_year"":""30"",""sex"":""male"",""province"":""Seoul"",""city"":""Seodaemun-gu"",""id"":""22690"",""_rid"":""T81yAPBaBVgvAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVgvAAAAAAAAAA==/"",""_etag"":""\""03000a28-0000-0700-0000-637f10b00000\"""",""_attachments"":""attachments/"",""_ts"":1669271728}",22690,1669271728
"{""date"":""2020-01-04"",""hour"":""3"",""birth_year"":""70"",""sex"":""male"",""province"":""Seoul"",""city"":""Geumcheon-gu"",""id"":""8710"",""_rid"":""T81yAPBaBVgxAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVgxAAAAAAAAAA==/"",""_etag"":""\""020027f4-0000-0700-0000-637f10290000\"""",""_attachments"":""attachments/"",""_ts"":1669271593}",8710,1669271593
"{""date"":""2020-02-15"",""hour"":""12"",""birth_year"":""20"",""sex"":""male"",""province"":""Seoul"",""city"":""Gangseo-gu"",""id"":""37480"",""_rid"":""T81yAPBaBVhAAAAAAAAAAA=="",""_self"":""dbs/T81yAA==/colls/T81yAPBaBVg=/docs/T81yAPBaBVhAAAAAAAAAAA==/"",""_etag"":""\""0300a81e-0000-0700-0000-637f10980000\"""",""_attachments"":""attachments/"",""_ts"":1669271704}",37480,1669271704
