In [1]:
from pyspark.sql import SparkSession

# Initialize SparkSession
spark = SparkSession.builder.master("local[*]") \
    .appName("Create Iceberg Table") \
    .config("spark.jars", "/home/ubuntu/Downloads/postgresql-42.6.0.jar") \
    .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.4_2.12:1.4.1") \
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
    .config("spark.sql.catalog.spark_catalog.type", "hive") \
    .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.local.type", "hadoop") \
    .config("spark.sql.catalog.local.warehouse", "./iceberg-partition") \
    .config("spark.hadoop.iceberg.writer.spec-id", "-1") \
    .getOrCreate()




23/11/14 15:34:16 WARN Utils: Your hostname, ubuntu-ThinkPad-T480 resolves to a loopback address: 127.0.1.1; using 192.168.1.111 instead (on interface wlp3s0)
23/11/14 15:34:16 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.4_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-29a0a2db-f807-4d1c-bab0-97288fff68b9;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.4_2.12;1.4.1 in central
:: resolution report :: resolve 144ms :: artifacts dl 6ms
	:: modules in use:
	org.apache.iceberg#iceberg-spark-runtime-3.4_2.12;1.4.1 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   1   |   0   |   0   |   0   ||   1   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.sp

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, DateType, BooleanType

# Define the schema for the data
schema = StructType([
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("date_of_birth", DateType(), True),
    StructField("gender", StringType(), True),
    StructField("address", StringType(), True),
    StructField("phone_number", StringType(), True),
    StructField("email", StringType(), True),
    StructField("diagnosis", StringType(), True),
    StructField("admission_date", DateType(), True),
    StructField("discharged", BooleanType(), True)
])
patient_df = spark.read.format("csv").option("delimeter", ",").option("header", True).schema(schema).load("data.csv")

In [None]:

patient_df.repartition(1000).writeTo("local.patient_iceberg").createOrReplace()

In [3]:
import time 

s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg WHERE admission_date REGEXP '01-21'")
print(time.time() - s)

0.06533265113830566


In [4]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg WHERE admission_date REGEXP '2020-01-21' and diagnosis='Fever' and gender='Male' and first_name='Steven'")
print(time.time() - s)

0.11518120765686035


In [5]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg WHERE admission_date REGEXP '01-21' and diagnosis='Fever' and gender='Male'")
print(time.time() - s)

0.05190753936767578


In [6]:
s = time.time()
spark.sql("SELECT diagnosis, COUNT(diagnosis) as diagnosis_count FROM local.patient_iceberg GROUP BY diagnosis")
print(time.time() - s)

0.11379551887512207


In [None]:
from pyspark.sql.types import FloatType
from faker import Faker

fake = Faker()

practitioner_schema = StructType([
    StructField("practitioner_id", StringType(), True),
    StructField("name", StringType(), True),
    StructField("specialty", StringType(), True),
    StructField("experience_years", FloatType(), True),
    StructField("location", StringType(), True),
    StructField("email", StringType(), True)
])

# Generate Fake local.practitioner_iceberg
num_practitioners = 10000

practitioners = []
for id,_ in enumerate(range(num_practitioners)):
    practitioner_id = "PRAC" + str(num_practitioners + id)
    name = fake.name()
    specialty = fake.random.choice(["Cardiologist", "Dermatologist", "Pediatrician", "Orthopedic Surgeon", "Psychiatrist"])
    experience_years = float(fake.random.randrange(1,20))
    address = fake.address().replace('\n', '').replace(',', '')
    email = fake.email()

    practitioners.append((practitioner_id, name, specialty, experience_years, address, email))

# Create DataFrame from local.practitioner_iceberg
practitioner_df = spark.createDataFrame(local.practitioner_iceberg, schema=practitioner_schema)



In [None]:
practitioner_df.repartition(100).writeTo("local.practitioner_iceberg").createOrReplace()

In [None]:
import random
from pyspark.sql.functions import udf

random_id_gen = udf(lambda: "PRAC"+str(random.randint(10000,99999)))
patient_df = patient_df.withColumn("practitioner", random_id_gen())

In [None]:
patient_df.repartition(1000).writeTo("local.patient_iceberg").createOrReplace()

In [7]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg INNER JOIN local.practitioner_iceberg on local.practitioner_iceberg.practitioner_id=local.patient_iceberg.practitioner")
print(time.time() - s)

0.09034490585327148


In [8]:
s = time.time()
spark.sql("SELECT * FROM local.practitioner_iceberg;")
print(time.time() - s)

0.02079916000366211


In [9]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg WHERE diagnosis = 'Headache';")
print(time.time() - s)

0.030780315399169922


In [10]:
s = time.time()
spark.sql("SELECT * FROM local.practitioner_iceberg WHERE experience_years > 10;")
print(time.time() - s)

0.04002499580383301


In [11]:
s = time.time()
spark.sql("SELECT gender, COUNT(*) FROM local.patient_iceberg GROUP BY gender;")
print(time.time() - s)

0.030773162841796875


In [12]:
s = time.time()
spark.sql("SELECT specialty, AVG(experience_years) FROM local.practitioner_iceberg GROUP BY specialty;")
print(time.time() - s)

0.027812719345092773


In [13]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg WHERE admission_date < '2023-01-01';")
print(time.time() - s)

0.03177285194396973


In [14]:
s = time.time()
spark.sql("SELECT * FROM local.practitioner_iceberg WHERE location LIKE '%City%';")
print(time.time() - s)

0.04801654815673828


In [15]:
s = time.time()
spark.sql("SELECT diagnosis, COUNT(*) FROM local.patient_iceberg GROUP BY diagnosis;")
print(time.time() - s)

0.021311521530151367


In [17]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg p JOIN local.practitioner_iceberg pr ON p.address = pr.location;")
print(time.time() - s)

0.04553055763244629


In [18]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg WHERE MONTH(admission_date) = 5;")
print(time.time() - s)

0.05259132385253906


In [19]:
s = time.time()
spark.sql("SELECT * FROM local.practitioner_iceberg WHERE experience_years BETWEEN 5 AND 15;")
print(time.time() - s)

0.020527362823486328


In [20]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg p JOIN local.practitioner_iceberg pr ON p.diagnosis = pr.specialty;")
print(time.time() - s)

0.03374624252319336


In [21]:
s = time.time()
spark.sql("SELECT YEAR(admission_date) AS year, COUNT(*) FROM local.patient_iceberg GROUP BY year;")
print(time.time() - s)

0.03134918212890625


In [22]:
s = time.time()
spark.sql("SELECT * FROM local.practitioner_iceberg ORDER BY experience_years DESC LIMIT 1;")
print(time.time() - s)

0.03145790100097656


In [23]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg WHERE discharged = false;")
print(time.time() - s)

0.0253751277923584


In [24]:
s = time.time()
spark.sql("SELECT * FROM local.practitioner_iceberg WHERE email LIKE '%.com';")
print(time.time() - s)

0.014177322387695312


In [26]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg WHERE practitioner = '123';")
print(time.time() - s)

0.019626379013061523


In [27]:
s = time.time()
spark.sql("SELECT * FROM local.practitioner_iceberg WHERE specialty = 'Cardiologist' AND location = 'Hospital A';")
print(time.time() - s)

0.021912574768066406


In [28]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg p JOIN local.practitioner_iceberg pr ON p.gender = pr.name;")
print(time.time() - s)

0.027526378631591797


In [30]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg INNER JOIN local.practitioner_iceberg ON local.patient_iceberg.practitioner = local.practitioner_iceberg.practitioner_id;")
print(time.time() - s)

0.02884960174560547


In [31]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg RIGHT JOIN local.practitioner_iceberg ON local.patient_iceberg.practitioner = local.practitioner_iceberg.practitioner_id;")
print(time.time() - s)

0.024173974990844727


In [32]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg FULL OUTER JOIN local.practitioner_iceberg ON local.patient_iceberg.practitioner = local.practitioner_iceberg.practitioner_id;")
print(time.time() - s)

0.024791955947875977


In [33]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg CROSS JOIN local.practitioner_iceberg;")
print(time.time() - s)

0.021206140518188477


In [38]:
s = time.time()
spark.sql("SELECT * FROM local.patient_iceberg p1 INNER JOIN local.patient_iceberg p2 ON p1.practitioner = p2.practitioner AND p1.fiRST_name != p2.fiRST_name;")
print(time.time() - s)

0.03283858299255371
