# AWS Glue Studio Notebook
##### You are now running a AWS Glue Studio notebook; To start using your notebook you need to start an AWS Glue Interactive Session.


#### Optional: Run this cell to see available notebook commands ("magics").


In [None]:
%help

####  Run this cell to set up and start your interactive session.


In [3]:
%idle_timeout 2880
%glue_version 3.0
%worker_type G.1X
%number_of_workers 10

import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job

from awsglue.dynamicframe import DynamicFrame
from pyspark.sql.functions import col

## @params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
job.commit()

You are already connected to a glueetl session 19d32ec2-e064-45c4-ac7b-b50e92500cd7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Current idle_timeout is 2880 minutes.
idle_timeout has been set to 2880 minutes.


You are already connected to a glueetl session 19d32ec2-e064-45c4-ac7b-b50e92500cd7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Setting Glue version to: 3.0


You are already connected to a glueetl session 19d32ec2-e064-45c4-ac7b-b50e92500cd7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous worker type: G.1X
Setting new worker type to: G.1X


You are already connected to a glueetl session 19d32ec2-e064-45c4-ac7b-b50e92500cd7.

No change will be made to the current session that is set as glueetl. The session configuration change will apply to newly created sessions.


Previous number of workers: 10
Setting new number of workers to: 10
GlueArgumentError: the following arguments are required: --JOB_NAME


In [29]:
output_path = 's3://imba-charlie-test-111/data-partition/orders_partitioned/'

# Load orders from imbadb
orders = glueContext.create_dynamic_frame.from_catalog(database="imbadb", table_name="orders")
orders_df = orders.toDF()




### Table `orders` repartitionByRange 
- across user_id
- 100 bins

In [5]:
orders_df.show()

+--------+-------+--------+------------+---------+-----------------+----------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|
+--------+-------+--------+------------+---------+-----------------+----------------------+
| 2539329|      1|   prior|           1|        2|                8|                  null|
| 2398795|      1|   prior|           2|        3|                7|                  15.0|
|  473747|      1|   prior|           3|        3|               12|                  21.0|
| 2254736|      1|   prior|           4|        4|                7|                  29.0|
|  431534|      1|   prior|           5|        4|               15|                  28.0|
| 3367565|      1|   prior|           6|        2|                7|                  19.0|
|  550135|      1|   prior|           7|        1|                9|                  20.0|
| 3108588|      1|   prior|           8|        1|               14|            

In [30]:
# Perform range partitioning on the "user_id" column
min_user_id = orders_df.selectExpr("min(user_id)").collect()[0][0]
max_user_id = orders_df.selectExpr("max(user_id)").collect()[0][0]
print(min_user_id, type(min_user_id),max_user_id, type(max_user_id))

1 <class 'int'> 206209 <class 'int'>


In [31]:
num_partitions = 100
partition_size = (max_user_id - min_user_id + 1) // num_partitions

orders_df = orders_df.repartitionByRange(
    num_partitions, col("user_id")
)




In [32]:
orders_df.show()

+--------+-------+--------+------------+---------+-----------------+----------------------+
|order_id|user_id|eval_set|order_number|order_dow|order_hour_of_day|days_since_prior_order|
+--------+-------+--------+------------+---------+-----------------+----------------------+
| 2539329|      1|   prior|           1|        2|                8|                  null|
| 2398795|      1|   prior|           2|        3|                7|                  15.0|
|  473747|      1|   prior|           3|        3|               12|                  21.0|
| 2254736|      1|   prior|           4|        4|                7|                  29.0|
|  431534|      1|   prior|           5|        4|               15|                  28.0|
| 3367565|      1|   prior|           6|        2|                7|                  19.0|
|  550135|      1|   prior|           7|        1|                9|                  20.0|
| 3108588|      1|   prior|           8|        1|               14|            

In [34]:
orders_df.write.mode("overwrite").parquet(output_path)




In [33]:
# Count the number of records in each partition
counts_per_partition = orders_df.rdd.glom().map(len).collect()
print(counts_per_partition)

[33954, 34423, 32636, 32972, 31579, 35339, 32462, 33193, 31087, 35305, 37391, 35356, 34823, 37393, 30778, 37784, 32734, 30672, 33738, 35327, 34930, 34797, 34060, 34951, 34891, 33516, 32953, 35034, 31932, 35362, 34925, 30982, 33951, 34357, 34768, 30555, 35866, 33612, 34079, 31265, 39076, 32663, 34246, 34399, 36448, 35689, 33496, 35059, 34613, 34429, 29920, 35235, 34358, 36361, 31766, 38014, 39583, 32805, 34622, 35542, 33444, 33008, 36674, 35330, 32728, 31840, 33307, 34747, 32670, 34264, 34081, 36593, 32433, 33111, 37975, 36326, 33769, 34832, 34467, 34759, 33539, 32164, 35835, 32049, 36253, 36301, 35869, 33399, 33822, 33832, 30694, 36931, 36421, 33526, 35291, 31939, 33897, 33290, 30954, 34663]


In [37]:
# Calculate the average number of records per partition
total_count = sum([count for count in counts_per_partition])
avg_count_per_partition = total_count / 100
print(f"Average number of records per partition: {avg_count_per_partition:.2f}")

Average number of records per partition: 34210.83


In [38]:
# Calculate the standard deviation of the number of records per partition
import math
std_dev = math.sqrt(sum([(count - avg_count_per_partition) ** 2 for count in counts_per_partition]) / 100)
print(f"Standard deviation of the number of records per partition: {std_dev:.2f}")

# Calculate the coefficient of variation (CV) of the number of records per partition
cv = std_dev / avg_count_per_partition
print(f"Coefficient of variation of the number of records per partition: {cv:.2f}")

Standard deviation of the number of records per partition: 1916.87
Coefficient of variation of the number of records per partition: 0.06


In [39]:
from pyspark.sql.functions import countDistinct

distinct_user_ids = orders_df.select(countDistinct("user_id")).collect()[0][0]
print("Distinct user IDs:", distinct_user_ids)

Distinct user IDs: 206209


### Table `order_product` repartitionByRange
- across product_id
- 50 bins

In [None]:
# Load order_product from imbadb
output_path = 's3://imba-charlie-test-111/data-partition/op_partitioned/'
op = glueContext.create_dynamic_frame.from_catalog(database="imbadb", table_name="order_product")
op_df = op.toDF()

In [41]:
print("Hello")

Hello


### Table `pda` repartition
- merged table of products, departements, aisles,
- across department_id (21 distinct values)

In [47]:
products_df = glueContext.create_dynamic_frame.from_catalog(database="imbadb", table_name="products")
departments_df = glueContext.create_dynamic_frame.from_catalog(database="imbadb", table_name="departments")
aisles_df = glueContext.create_dynamic_frame.from_catalog(database="imbadb", table_name="aisles")




In [48]:
# Convert the dynamic frames to data frames
products_df = products_df.toDF()
departments_df = departments_df.toDF()
aisles_df = aisles_df.toDF()




In [51]:
# register DataFrames as temporary views
products_df.createOrReplaceTempView("products")
departments_df.createOrReplaceTempView("departments")
aisles_df.createOrReplaceTempView("aisles")




In [61]:
pda_df = spark.sql("""
    SELECT p.product_id, p.product_name, p.department_id, d.department, p.aisle_id, a.aisle
    FROM products p
    LEFT JOIN departments d ON p.department_id = d.department_id
    LEFT JOIN aisles a ON p.aisle_id = a.aisle_id
""")

pda_df.show()

+----------+--------------------+-------------+----------+--------+--------------------+
|product_id|        product_name|department_id|department|aisle_id|               aisle|
+----------+--------------------+-------------+----------+--------+--------------------+
|        14|Fresh Scent Dishw...|           17| household|      74|     dish detergents|
|        48|School Glue Washa...|           17| household|      87|      more household|
|        57|     Flat Toothpicks|           17| household|     111|plates bowls cups...|
|        71|Ultra 7 Inch Poly...|           17| household|     111|plates bowls cups...|
|       105|Easy Grab 9x13 Ob...|           17| household|      10|    kitchen supplies|
|       111|Fabric Softener G...|           17| household|      75|             laundry|
|       134|Stain Release Boo...|           17| household|      75|             laundry|
|       153|Fabric Refresher ...|           17| household|      75|             laundry|
|       186|Fresh Sce

In [62]:
output_path_pda = 's3://imba-charlie-test-111/data-partition/pda_partitioned/'
pda_df.write.mode("overwrite").partitionBy("department_id").parquet(output_path_pda)


