<h2> Imports & Configuration </h2>

In [1]:
from IPython.core.interactiveshell import InteractiveShell
import warnings

# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Suppress all Python warnings for the remainder of the session.
# NOTE(review): both calls install a catch-all "ignore" filter, so one of them
# looks redundant -- confirm before removing.
warnings.filterwarnings(action='ignore')
warnings.simplefilter(action='ignore')

In [2]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [5]:
# Create (or reuse) a Spark session running locally with master "local[4]".
spark = (
    SparkSession.builder
    .master("local[4]")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and restrict Spark's log
# output to ERROR-level messages.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [6]:
# Optional override of the shuffle partition count. It is left commented out,
# so the session keeps whatever spark.sql.shuffle.partitions value it already has.
# spark.conf.set("spark.sql.shuffle.partitions", "3")
# Turn Adaptive Query Execution off for this session.
# NOTE(review): presumably so AQE's runtime re-optimisation does not mask the
# partition/join skew examined in the later sections -- confirm intent.
spark.conf.set("spark.sql.adaptive.enabled", "false")

<h2> Simulating Uniform Dataset </h2>

In [7]:
# Build a single-column integer DataFrame holding 0..999999 from a local
# Python list, then preview the first three rows without truncation.
df_uniform = spark.createDataFrame(
    data=list(range(1000000)),
    schema=IntegerType(),
)
df_uniform.show(n=3, truncate=False)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+
|value|
+-----+
|0    |
|1    |
|2    |
+-----+
only showing top 3 rows



                                                                                

In [8]:
# Row count per physical partition of df_uniform: group directly on the
# partition id (exposed as column "partition") and list partitions in order.
(
    df_uniform
    .groupBy(F.spark_partition_id().alias("partition"))
    .count()
    .sort("partition")
    .show()
)

                                                                                

+---------+------+
|partition| count|
+---------+------+
|        0|249856|
|        1|249856|
|        2|249856|
|        3|250432|
+---------+------+



<h2> Skewed Dataset </h2>

In [9]:
# One large frame (1,000,000 ids) and two tiny frames (10 ids each), each
# forced into a single partition.
df0 = spark.range(1000000).repartition(1)
df1 = spark.range(10).repartition(1)
df2 = spark.range(10).repartition(1)

# Union the three frames step by step, then preview the first three rows.
df_skew = df0.union(df1)
df_skew = df_skew.union(df2)
df_skew.show(n=3, truncate=False)

+---+
|id |
+---+
|0  |
|1  |
|2  |
+---+
only showing top 3 rows



In [10]:
# Row count per physical partition of df_skew: group directly on the
# partition id (exposed as column "partition") and list partitions in order.
(
    df_skew
    .groupBy(F.spark_partition_id().alias("partition"))
    .count()
    .sort("partition")
    .show()
)



+---------+-------+
|partition|  count|
+---------+-------+
|        0|1000000|
|        1|     10|
|        2|     10|
+---------+-------+



                                                                                

<h2> Join Skews </h2>

In [11]:
# Transactions side of the join (parquet, path relative to the working directory).
transactions_file = "../../data/data_skew/transactions.parquet"
df_transactions = spark.read.parquet(transactions_file)

# Customers side of the join.
customer_file = "../../data/data_skew/customers.parquet"
df_customers = spark.read.parquet(customer_file)

In [12]:
# Show the transactions schema, then the first five rows with full column values.
df_transactions.printSchema()
df_transactions.show(n=5, truncate=False)

root
 |-- cust_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- expense_type: string (nullable = true)
 |-- amt: string (nullable = true)

+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+
|cust_id   |start_date|end_date  |txn_id         |date      |year|month|day|expense_type |amt   |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+
|C2AU14903J|2013-04-01|2019-05-01|TRGA5GWBO0CY0F3|2014-02-17|2014|2    |17 |Motor/Travel |58.25 |
|C2AU14903J|2013-04-01|2019-05-01|THCM719A8W1I5MT|2017-05-13|2017|5    |13 |Entertainment|27.03 |
|C2AU14903J|2013-04-01|2019-05-01|TZGMSM7SEKCLIAK|2014-09-12|2014|9    |12 |Entertainment|61.12 |
|C2AU14903J|2013-04-01|2

In [13]:
# Show the customers schema, then the first five rows with full column values.
df_customers.printSchema()
df_customers.show(n=5, truncate=False)

root
 |-- cust_id: string (nullable = true)
 |-- distinct_txns: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- zip: string (nullable = true)

+----------+-------------+-------------+---+------+----------+-----+
|cust_id   |distinct_txns|name         |age|gender|birthday  |zip  |
+----------+-------------+-------------+---+------+----------+-----+
|C000BK8N2S|6949         |Aaron Abbott |34 |Female|7/13/1991 |97823|
|C005K7U9RE|6540         |Aaron Austin |37 |Female|12/16/2004|30332|
|C006CT8BVO|6356         |Aaron Barnes |29 |Female|3/11/1977 |23451|
|C007YEYTX9|7445         |Aaron Barrett|31 |Male  |7/9/1998  |46613|
|C00B971T1J|7532         |Aaron Becker |54 |Male  |11/24/1979|40284|
+----------+-------------+-------------+---+------+----------+-----+
only showing top 5 rows



In [14]:
# Distinct transaction count per customer, largest first; a single cust_id
# far above the rest indicates a skewed join key.
(
    df_transactions
    .groupBy("cust_id")
    .agg(F.countDistinct("txn_id").alias("ct"))
    .sort(F.col("ct").desc())
    .show(n=5, truncate=False)
)

                                                                                

+----------+--------+
|cust_id   |ct      |
+----------+--------+
|C0YDPQWPBJ|43551962|
|CXD6UZEGKS|6999    |
|CP2GC38KPG|6999    |
|CQZK7HS7HL|6999    |
|C1MZ9FNHAN|6999    |
+----------+--------+
only showing top 5 rows



In [15]:
# A threshold of -1 disables automatic broadcast joins.
# NOTE(review): presumably so the join below is planned as a shuffle-based
# join and the skew on cust_id becomes visible -- confirm intent.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [16]:
# Inner-join each transaction to its customer record on the shared cust_id key.
df_txn_details = df_transactions.join(df_customers, on="cust_id", how="inner")

In [17]:
# count() is an action: this is where the join defined above is actually executed.
df_txn_details.count()

                                                                                

127125002

In [18]:
# Shut down the Spark session now that the notebook is finished with it.
spark.stop()