In [None]:
import pandas as pd
import pyspark.sql.functions
from pyspark.sql.functions import when
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, col, lit, lower
from pyspark.sql.types import StringType,DecimalType
from pyspark.sql.functions import input_file_name, substring
import matplotlib.pyplot as plt

## Extract and Transform

In [None]:
# Load the raw indicators CSV into a pandas DataFrame and print its column summary.
raw_indicators_path = '/dbfs/mnt/capstone-group2-data/datain/rawdata/indicators.csv'
df = pd.read_csv(raw_indicators_path)
df.info()

In [None]:
## drop unnecessary columns
# Columns that are not carried forward into the cleaned data set.
unused_columns = [
    "Quartile Range",
    "Suppression Flag",
    "Low CI",
    "High CI",
    "Confidence Interval",
]
df = df.drop(columns=unused_columns)

In [None]:
## drop na values
# Keep only rows with no missing value in any column, then print the
# refreshed column summary.
df = df.dropna(axis="index", how="any")
df.info()

## Load
#### to file

In [None]:
from pyspark.sql import SparkSession
#Create PySpark SparkSession
# getOrCreate() hands back the already-running session when one exists.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkByExamples.com")
    .getOrCreate()
)
#Create PySpark DataFrame from Pandas
# `df` is rebound from a pandas frame to a Spark frame here. Guard the
# conversion so that re-running this cell does not hand createDataFrame a
# Spark DataFrame, which it rejects.
if isinstance(df, pd.DataFrame):
    df = spark.createDataFrame(df)
df.printSchema()
df.show()

In [None]:
# Mount the capstone container
from config import storageAccount
from config import storageContainer
from config import clientSecret
from config import clientid

mount_point = "/mnt/capstone-group2-data/dataout"

# OAuth client-credentials settings passed to the mount as extra_configs.
configs = {"fs.azure.account.auth.type": "OAuth",
   "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
   "fs.azure.account.oauth2.client.id": clientid,
   "fs.azure.account.oauth2.client.secret": clientSecret,
   "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
   "fs.azure.createRemoteFileSystemDuringInitialization": "true"}

# Remove an existing mount at this mount point before mounting again. The
# unmount only runs when the mount is actually present, so any other failure
# surfaces instead of being silently swallowed by a bare `except`.
if any(existing.mountPoint == mount_point for existing in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

# Mount the container root at mount_point.
dbutils.fs.mount(
    source = f"abfss://{storageContainer}@{storageAccount}.dfs.core.windows.net/",
    mount_point = mount_point,
    extra_configs = configs)

In [None]:
# List the contents of the dataout mount.
display(dbutils.fs.ls("/mnt/capstone-group2-data/dataout"))

In [None]:
# Persist the cleaned frame as CSV with a header row, replacing any earlier output.
clean_indicators_out = "/mnt/capstone-group2-data/dataout/cleandata/cleanIndicators"
(
    df.write
    .option("header", "true")
    .mode("overwrite")
    .csv(clean_indicators_out)
)

## Load
#### to SQL database

In [None]:
# Mount the capstone container
from config import storageAccount
from config import storageContainer
from config import clientSecret
from config import clientid

mount_point = "/mnt/capstone-group2-data/datain"

# OAuth client-credentials settings passed to the mount as extra_configs.
configs = {"fs.azure.account.auth.type": "OAuth",
   "fs.azure.account.oauth.provider.type": "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
   "fs.azure.account.oauth2.client.id": clientid,
   "fs.azure.account.oauth2.client.secret": clientSecret,
   "fs.azure.account.oauth2.client.endpoint": "https://login.microsoftonline.com/d46b54b2-a652-420b-aa5a-2ef7f8fc706e/oauth2/token",
   "fs.azure.createRemoteFileSystemDuringInitialization": "true"}

# Remove an existing mount at this mount point before mounting again. The
# unmount only runs when the mount is actually present, so any other failure
# surfaces instead of being silently swallowed by a bare `except`.
if any(existing.mountPoint == mount_point for existing in dbutils.fs.mounts()):
    dbutils.fs.unmount(mount_point)

# Mount the container root at mount_point.
dbutils.fs.mount(
    source = f"abfss://{storageContainer}@{storageAccount}.dfs.core.windows.net/",
    mount_point = mount_point,
    extra_configs = configs)

In [None]:
# Reload the cleaned indicators CSV (first row is the header) as a Spark DataFrame.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("/mnt/capstone-group2-data/datain/cleandata/cleanIndicators")
)

In [None]:
#Age breakout table
# Distinct Subgroup values of the "By Age" rows, sorted and exposed as AgeLabel.
age_df = (
    df.where(col("Group") == "By Age")
    .select(col("Subgroup").alias("AgeLabel"))
    .distinct()
    .orderBy("AgeLabel")
)
display(age_df)

In [None]:
#Load into SQL Database
# Connection settings come from the project's config module.
from config import server, database, user, password

table = "dbo.Age"

# Append the age labels to the target table over JDBC.
(
    age_df.write
    .format("jdbc")
    .mode("append")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .save()
)

In [None]:
#Indicator breakout table
# Distinct Indicator values, exposed as IndicatorLabel.
ind_df = df.select(col("Indicator").alias("IndicatorLabel")).distinct()
display(ind_df)

In [None]:
#Load into SQL Database
table = "dbo.Indicator"

# Append the indicator labels to the target table over JDBC.
(
    ind_df.write
    .format("jdbc")
    .mode("append")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .save()
)

In [None]:
#Group breakout table
# Distinct Group values, sorted and exposed as GroupLabel.
group_df = (
    df.select(col("Group").alias("GroupLabel"))
    .distinct()
    .orderBy("GroupLabel")
)
display(group_df)

In [None]:
#Load into SQL Database
# The table name is bracketed in the dbtable option.
table = "dbo.[Group]"

# Append the group labels to the target table over JDBC.
(
    group_df.write
    .format("jdbc")
    .mode("append")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .save()
)

In [None]:
#Education breakout table
# Distinct Subgroup values of the "By Education" rows, sorted and exposed as EducationLabel.
edu_df = (
    df.where(col("Group") == "By Education")
    .select(col("Subgroup").alias("EducationLabel"))
    .distinct()
    .orderBy("EducationLabel")
)
display(edu_df)

In [None]:
#Load into SQL Database
table = "dbo.Education"

# Append the education labels to the target table over JDBC.
(
    edu_df.write
    .format("jdbc")
    .mode("append")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .save()
)

In [None]:
#Week breakout table
# One row per distinct "Week Label", ordered by week number.
# The CSV is read with only the header option, so "Week" arrives as a string;
# cast it to int so the order is numeric (1, 2, ..., 10) rather than
# lexicographic ("1", "10", "2"). The ordering is computed while "Week" is
# still part of the frame, instead of sorting on a column that the projection
# has already dropped.
week_df = (
    df.groupBy("Week Label")
    .agg(pyspark.sql.functions.min(col("Week").cast("int")).alias("WeekNumber"))
    .orderBy("WeekNumber")
    .select(col("Week Label").alias("WeekLabel"))
)
display(week_df)

In [None]:
#Load into SQL Database
table = "dbo.Week"

# Append the week labels to the target table over JDBC.
(
    week_df.write
    .format("jdbc")
    .mode("append")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .save()
)

In [None]:
#State breakout table
# Distinct Subgroup values of the "By State" rows, sorted and exposed as StateLabel.
state_df = (
    df.where(col("Group") == "By State")
    .select(col("Subgroup").alias("StateLabel"))
    .distinct()
    .orderBy("StateLabel")
)
display(state_df)

In [None]:
#Load into SQL Database
table = "dbo.State"

# Append the state labels to the target table over JDBC.
(
    state_df.write
    .format("jdbc")
    .mode("append")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .save()
)

In [None]:
# Race breakout table
# Distinct Subgroup values of the "By Race/Hispanic ethnicity" rows, sorted
# and exposed as RaceLabel.
race_df = (
    df.where(col("Group") == "By Race/Hispanic ethnicity")
    .select(col("Subgroup").alias("RaceLabel"))
    .distinct()
    .orderBy("RaceLabel")
)

# Extra "All races" row, appended after the sorted labels. union() matches
# columns by position, so the result keeps the RaceLabel column name.
allrace = [["All races"]]
allrace_df = spark.createDataFrame(allrace)

# After the union, relabel "Hispanic or Latino"; every other label passes through.
race_df = race_df.union(allrace_df).withColumn(
    "RaceLabel",
    when(col("RaceLabel") == "Hispanic or Latino", "Hispanic or Latino, any race")
    .otherwise(col("RaceLabel")),
)

display(race_df)

In [None]:
#Load into SQL Database
table = "dbo.Race"

# Append the race labels to the target table over JDBC.
(
    race_df.write
    .format("jdbc")
    .mode("append")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .save()
)

## Making the InsuranceCoverage table with joins

In [None]:
# Discard the source "State" column and preview the result.
columns_to_discard = ("State",)
df = df.drop(*columns_to_discard)
display(df)

In [None]:
# Spread Subgroup into one column per breakdown. Each new column holds the
# Subgroup value only on rows of the matching Group and is null elsewhere.
df = (
    df
    # Sex values are lower-cased.
    .withColumn("Sex", lower(when(col("Group") == "By Gender", col("Subgroup"))))
    .withColumn(
        "Race",
        when(
            col("Group") == "By Race/Hispanic ethnicity",
            # The Race breakout loaded into dbo.Race renames this label to
            # "Hispanic or Latino, any race". Apply the same relabel here so
            # the later join on RaceLabel still matches these rows instead
            # of leaving them unmatched.
            when(col("Subgroup") == "Hispanic or Latino", "Hispanic or Latino, any race")
            .otherwise(col("Subgroup")),
        ),
    )
    .withColumn("Age", when(col("Group") == "By Age", col("Subgroup")))
    .withColumn("Education", when(col("Group") == "By Education", col("Subgroup")))
    .withColumn("State", when(col("Group") == "By State", col("Subgroup")))
)
display(df)

In [None]:
table = "dbo.Sex"

# Read dbo.Sex from SQL Server into a Spark DataFrame over JDBC.
sex_df = (
    spark.read
    .format("jdbc")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .load()
)

# Preview the loaded rows.
sex_df.show()

In [None]:
# Left join keeps every fact row; rows without a matching SexLabel get nulls.
df = df.join(sex_df, on=df["Sex"] == sex_df["SexLabel"], how="left")

In [None]:
table = "dbo.Age"

# Read dbo.Age from SQL Server into a Spark DataFrame over JDBC.
age_df = (
    spark.read
    .format("jdbc")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .load()
)

# Preview the loaded rows.
age_df.show()

In [None]:
# Left join keeps every fact row; rows without a matching AgeLabel get nulls.
df = df.join(age_df, on=df["Age"] == age_df["AgeLabel"], how="left")

In [None]:
table = "dbo.[Group]"

# Read dbo.[Group] from SQL Server into a Spark DataFrame over JDBC.
group_df = (
    spark.read
    .format("jdbc")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .load()
)

# Preview the loaded rows.
group_df.show()

In [None]:
# Left join keeps every fact row; rows without a matching GroupLabel get nulls.
df = df.join(group_df, on=df["Group"] == group_df["GroupLabel"], how="left")

In [None]:
table = "dbo.Indicator"

# Read dbo.Indicator from SQL Server into a Spark DataFrame over JDBC.
indicator_df = (
    spark.read
    .format("jdbc")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .load()
)

# Preview the loaded rows.
indicator_df.show()

In [None]:
# Left join keeps every fact row; rows without a matching IndicatorLabel get nulls.
df = df.join(
    indicator_df,
    on=df["Indicator"] == indicator_df["IndicatorLabel"],
    how="left",
)

In [None]:
table = "dbo.Race"

# Read dbo.Race from SQL Server into a Spark DataFrame over JDBC.
race_df = (
    spark.read
    .format("jdbc")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .load()
)

# Preview the loaded rows.
race_df.show()

In [None]:
# Left join keeps every fact row; rows without a matching RaceLabel get nulls.
df = df.join(race_df, on=df["Race"] == race_df["RaceLabel"], how="left")

In [None]:
table = "dbo.Education"

# Read dbo.Education from SQL Server into a Spark DataFrame over JDBC.
education_df = (
    spark.read
    .format("jdbc")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .load()
)

# Preview the loaded rows.
education_df.show()

In [None]:
# Left join keeps every fact row; rows without a matching EducationLabel get nulls.
df = df.join(
    education_df,
    on=df["Education"] == education_df["EducationLabel"],
    how="left",
)

In [None]:
table = "dbo.State"

# Read dbo.State from SQL Server into a Spark DataFrame over JDBC.
state_df = (
    spark.read
    .format("jdbc")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .load()
)

# Preview the loaded rows.
state_df.show()

In [None]:
# Left join keeps every fact row; rows without a matching StateLabel get nulls.
df = df.join(state_df, on=df["State"] == state_df["StateLabel"], how="left")

In [None]:
table = "dbo.Week"

# Read dbo.Week from SQL Server into a Spark DataFrame over JDBC.
week_df = (
    spark.read
    .format("jdbc")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .load()
)

# Preview the loaded rows.
week_df.show()

In [None]:
# Left join keeps every fact row; rows without a matching WeekLabel get nulls.
df = df.join(
    week_df,
    on=col('Week Label') == week_df["WeekLabel"],
    how="left",
)
display(df)

In [None]:
# Drop the descriptive source columns and the *Label columns brought in by the joins.
source_columns = [
    "Subgroup", "Indicator", "Group", "Age", "Sex", "Race",
    "Education", "State", "Week", "Week Label",
]
label_columns = [
    "IndicatorLabel", "GroupLabel", "AgeLabel", "SexLabel",
    "RaceLabel", "EducationLabel", "StateLabel", "WeekLabel",
]
df = df.drop(*source_columns, *label_columns)

In [None]:
# Rename the measure column from Value to DataValue, then preview the frame.
df = df.withColumnRenamed(existing='Value', new='DataValue')
display(df)

In [None]:
#Load to SQL Database
table = "dbo.InsuranceCoverage"

# Append the joined frame to the target table over JDBC.
(
    df.write
    .format("jdbc")
    .mode("append")
    .options(
        url=f"jdbc:sqlserver://{server}:1433;databaseName={database};",
        dbtable=table,
        user=user,
        password=password,
        driver="com.microsoft.sqlserver.jdbc.SQLServerDriver",
    )
    .save()
)