In [1]:
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('Create Delta Table') \
    .config("spark.jars", "/home/ubuntu/Downloads/postgresql-42.6.0.jar") \
    .config("spark.jars.packages", "io.delta:delta-core_2.12:2.4.0") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()



23/11/14 11:15:00 WARN Utils: Your hostname, ubuntu-ThinkPad-T480 resolves to a loopback address: 127.0.1.1; using 192.168.1.111 instead (on interface wlp3s0)
23/11/14 11:15:00 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-dfb9489c-740e-4ef0-8f19-baeed8024137;1.0
	confs: [default]
	found io.delta#delta-core_2.12;2.4.0 in central
	found io.delta#delta-storage;2.4.0 in central
	found org.antlr#antlr4-runtime;4.9.3 in central
:: resolution report :: resolve 203ms :: artifacts dl 13ms
	:: modules in use:
	io.delta#delta-core_2.12;2.4.0 from central in [default]
	io.delta#delta-storage;2.4.0 from central in [default]
	org.antlr#antlr4-runtime;4.9.3 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   

In [None]:

from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType

# Define the schema for the data
schema = StructType([
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("date_of_birth", DateType(), True),
    StructField("gender", StringType(), True),
    StructField("address", StringType(), True),
    StructField("phone_number", StringType(), True),
    StructField("email", StringType(), True),
    StructField("diagnosis", StringType(), True),
    StructField("admission_date", DateType(), True),
    StructField("discharged", BooleanType(), True)
])
patient_df = spark.read.format("csv").option("delimeter", ",").option("header", True).schema(schema).load("data.csv")

In [None]:
patient_df.repartition(1000).write.mode(saveMode="overwrite").format("delta").save("delta/patients")



In [2]:
delta_patient = spark.read.format("delta").load("delta/patients")
from pyspark.sql import SQLContext

sqlContext = SQLContext(spark)
sqlContext.registerDataFrameAsTable(delta_patient, "patients")



In [3]:
import time 

s = time.time()
sqlContext.sql("SELECT * FROM patients WHERE admission_date REGEXP '01-21'")
print(time.time() - s)

0.19359064102172852


In [4]:
s = time.time()
sqlContext.sql("SELECT * FROM patients WHERE admission_date REGEXP '2020-01-21' and diagnosis='Fever' and gender='Male' and first_name='Steven'")
print(time.time() - s)

0.05654597282409668


In [5]:
s = time.time()
sqlContext.sql("SELECT * FROM patients WHERE admission_date REGEXP '01-21' and diagnosis='Fever' and gender='Male'")
print(time.time() - s)

0.029584884643554688


In [6]:
s = time.time()
sqlContext.sql("SELECT diagnosis, COUNT(diagnosis) as diagnosis_count FROM patients GROUP BY diagnosis")
print(time.time() - s)

0.09287500381469727


In [None]:
from pyspark.sql.types import FloatType
from faker import Faker

fake = Faker()

practitioner_schema = StructType([
    StructField("practitioner_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("specialty", StringType(), True),
    StructField("experience_years", FloatType(), True),
    StructField("location", StringType(), True),
    StructField("email", StringType(), True)
])

# Generate Fake Practitioners
num_practitioners = 10000

practitioners = []
for id,_ in enumerate(range(num_practitioners)):
    practitioner_id = "PRAC" + str(num_practitioners + id)
    name = fake.name()
    specialty = fake.random.choice(["Cardiologist", "Dermatologist", "Pediatrician", "Orthopedic Surgeon", "Psychiatrist"])
    experience_years = float(fake.random.randrange(1,20))
    address = fake.address().replace('\n', '').replace(',', '')
    email = fake.email()

    practitioners.append((practitioner_id, name, specialty, experience_years, address, email))

# Create DataFrame from practitioners
practitioner_df = spark.createDataFrame(practitioners, schema=practitioner_schema)
practitioner_df.repartition(100).write.mode(saveMode="overwrite").format("delta").save("delta/practitioners")

In [7]:

delta_practitioner = spark.read.format("delta").load("delta/practitioners")
from pyspark.sql import SQLContext

sqlContext = SQLContext(spark)
sqlContext.registerDataFrameAsTable(delta_practitioner, "practitioners")

In [None]:
import random
from pyspark.sql.functions import udf

random_id_gen = udf(lambda: "PRAC"+str(random.randint(10000,11111)))
patient_df = patient_df.withColumn("practitioner", random_id_gen())
patient_df.repartition(1000).write.mode(saveMode="overwrite").format("delta").option("overwriteSchema", "true").save("delta/patients")


In [None]:
delta_patient = spark.read.format("delta").load("delta/patients")
from pyspark.sql import SQLContext

sqlContext = SQLContext(spark)
sqlContext.registerDataFrameAsTable(delta_patient, "patients")

In [8]:
s = time.time()
spark.sql("SELECT * FROM patients INNER JOIN practitioners ON practitioners.practitioner_id=patients.practitioner")
print(time.time() - s)

0.04899191856384277


In [9]:
s = time.time()
sqlContext.sql("SELECT * FROM practitioners;")
print(time.time() - s)

0.014677762985229492


In [10]:
s = time.time()
sqlContext.sql("SELECT * FROM patients WHERE diagnosis = 'Headache';")
print(time.time() - s)

0.016487836837768555


In [11]:
s = time.time()
sqlContext.sql("SELECT * FROM practitioners WHERE experience_years > 10;")
print(time.time() - s)

0.0245363712310791


In [12]:
s = time.time()
sqlContext.sql("SELECT gender, COUNT(*) FROM patients GROUP BY gender;")
print(time.time() - s)

0.020940065383911133


In [13]:
s = time.time()
sqlContext.sql("SELECT specialty, AVG(experience_years) FROM practitioners GROUP BY specialty;")
print(time.time() - s)

0.01909327507019043


In [14]:
s = time.time()
sqlContext.sql("SELECT * FROM patients WHERE admission_date < '2023-01-01';")
print(time.time() - s)

0.02260136604309082


In [15]:
s = time.time()
sqlContext.sql("SELECT * FROM practitioners WHERE location LIKE '%City%';")
print(time.time() - s)

0.034147024154663086


In [16]:
s = time.time()
sqlContext.sql("SELECT diagnosis, COUNT(*) FROM patients GROUP BY diagnosis;")
print(time.time() - s)

0.014020681381225586


In [19]:
s = time.time()
sqlContext.sql("SELECT * FROM patients as p JOIN practitioners as pr ON p.address = pr.location;")
print(time.time() - s)

0.026125431060791016


In [20]:
s = time.time()
sqlContext.sql("SELECT * FROM patients WHERE MONTH(admission_date) = 5;")
print(time.time() - s)

0.03163766860961914


In [21]:
s = time.time()
sqlContext.sql("SELECT * FROM practitioners WHERE experience_years BETWEEN 5 AND 15;")
print(time.time() - s)

0.017019271850585938


In [22]:
s = time.time()
sqlContext.sql("SELECT * FROM patients as p JOIN practitioners as pr ON p.diagnosis = pr.specialty;")
print(time.time() - s)

0.019042491912841797


In [23]:
s = time.time()
sqlContext.sql("SELECT YEAR(admission_date) AS year, COUNT(*) FROM patients GROUP BY year;")
print(time.time() - s)

0.030568838119506836


In [24]:
s = time.time()
sqlContext.sql("SELECT * FROM practitioners ORDER BY experience_years DESC LIMIT 1;")
print(time.time() - s)

0.023092031478881836


In [25]:
s = time.time()
sqlContext.sql("SELECT * FROM patients WHERE discharged = false;")
print(time.time() - s)

0.017248868942260742


In [26]:
s = time.time()
sqlContext.sql("SELECT * FROM practitioners WHERE email LIKE '%.com';")
print(time.time() - s)

0.012778282165527344


In [28]:
s = time.time()
sqlContext.sql("SELECT * FROM patients WHERE practitioner = '123';")
print(time.time() - s)

0.015375614166259766


In [29]:
s = time.time()
sqlContext.sql("SELECT * FROM practitioners WHERE specialty = 'Cardiologist' AND location = 'Hospital A';")
print(time.time() - s)

0.015847444534301758


In [30]:
s = time.time()
sqlContext.sql("SELECT * FROM patients as p JOIN practitioners as pr ON p.gender = pr.name;")
print(time.time() - s)

0.014188766479492188


In [31]:
s = time.time()
sqlContext.sql("SELECT * FROM patients INNER JOIN practitioners ON patients.practitioner = practitioners.practitioner_id;")
print(time.time() - s)

0.018751859664916992


In [32]:
s = time.time()
sqlContext.sql("SELECT * FROM patients RIGHT JOIN practitioners ON patients.practitioner = practitioners.practitioner_id;")
print(time.time() - s)

0.02447676658630371


In [33]:
s = time.time()
sqlContext.sql("SELECT * FROM patients FULL OUTER JOIN practitioners ON patients.practitioner = practitioners.practitioner_id;")
print(time.time() - s)

0.015737295150756836


In [34]:
s = time.time()
sqlContext.sql("SELECT * FROM patients CROSS JOIN practitioners;")
print(time.time() - s)

0.014001846313476562


In [36]:
s = time.time()
sqlContext.sql("SELECT * FROM patients as p1 INNER JOIN patients as p2 ON p1.practitioner = p2.practitioner AND p1.first_name != p2.first_name;")
print(time.time() - s)

0.026643037796020508
