<h2> Imports & Configuration </h2>

In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

import warnings
# Install blanket "ignore" filters so Python warnings are not shown in cell output.
# NOTE(review): both calls below register an ignore-everything filter; one of them looks redundant — confirm.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')
import time

In [2]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [3]:
# Create (or reuse) a local Spark session that may use every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Keep a handle on the SparkContext and only surface ERROR-level log lines.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

23/06/30 16:39:58 WARN Utils: Your hostname, Afaques-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.2 instead (on interface en0)
23/06/30 16:39:58 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/06/30 16:39:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [4]:
# Optional override of the shuffle partition count, left disabled:
# spark.conf.set("spark.sql.shuffle.partitions", "3")
# Turn off Adaptive Query Execution for this session.
# NOTE(review): presumably so AQE does not re-optimise the skewed joins measured below — confirm.
spark.conf.set("spark.sql.adaptive.enabled", "false")

# What is the final objective?
- Find the `count` of transactions made by each user in their home `city`.

# Join Skews

In [5]:
# Parquet inputs, addressed relative to this notebook's directory.
transactions_file = "../../data/data_skew/transactions.parquet"
customer_file = "../../data/data_skew/customers.parquet"

# Read both datasets (transactions first, then customers) into DataFrames.
df_transactions, df_customers = (
    spark.read.parquet(parquet_path)
    for parquet_path in (transactions_file, customer_file)
)

                                                                                

In [6]:
# Column names/types of the transactions data, followed by five untruncated sample rows.
df_transactions.printSchema()
df_transactions.show(n=5, truncate=False)

root
 |-- cust_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- expense_type: string (nullable = true)
 |-- amt: string (nullable = true)
 |-- city: string (nullable = true)

+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|cust_id   |start_date|end_date  |txn_id         |date      |year|month|day|expense_type |amt   |city       |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|C0YDPQWPBJ|2010-07-01|2018-12-01|TZ5SMKZY9S03OQJ|2018-10-07|2018|10   |7  |Entertainment|10.42 |boston     |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYIAPPNU066CJ5R|2016-03-27|2016|3    |27 |Motor/Travel |44.34 |portland   |
|C0YDPQWPBJ|2010-07-01|201

In [7]:
# Column names/types of the customers data, followed by five untruncated sample rows.
df_customers.printSchema()
df_customers.show(n=5, truncate=False)

root
 |-- cust_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- zip: string (nullable = true)
 |-- city: string (nullable = true)

+----------+-------------+---+------+----------+-----+-----------+
|cust_id   |name         |age|gender|birthday  |zip  |city       |
+----------+-------------+---+------+----------+-----+-----------+
|C007YEYTX9|Aaron Abbott |34 |Female|7/13/1991 |97823|boston     |
|C00B971T1J|Aaron Austin |37 |Female|12/16/2004|30332|chicago    |
|C00WRSJF1Q|Aaron Barnes |29 |Female|3/11/1977 |23451|denver     |
|C01AZWQMF3|Aaron Barrett|31 |Male  |7/9/1998  |46613|los_angeles|
|C01BKUFRHA|Aaron Becker |54 |Male  |11/24/1979|40284|san_diego  |
+----------+-------------+---+------+----------+-----+-----------+
only showing top 5 rows



In [8]:
# Distinct transactions per customer, largest first: shows how unevenly
# the join key `cust_id` is distributed.
(
    df_transactions
    .groupBy("cust_id")
    .agg(F.countDistinct(F.col("txn_id")).alias("ct"))
    .orderBy("ct", ascending=False)
    .show(n=5, truncate=False)
)



+----------+--------+
|cust_id   |ct      |
+----------+--------+
|C0YDPQWPBJ|17539732|
|C89FCEGPJP|7999    |
|C3KUDEN3KO|7999    |
|CBW3FMEAU7|7999    |
|CHNFNR89ZV|7998    |
+----------+--------+
only showing top 5 rows





In [9]:
# A threshold of -1 disables automatic broadcast joins.
# NOTE(review): presumably done to force a shuffle-based join so the key skew is observable — confirm.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [10]:
# Inner join of every transaction with its customer record, keyed on `cust_id` only.
df_txn_details = df_transactions.join(df_customers, on="cust_id", how="inner")

In [11]:
# Time a full evaluation of the `cust_id` join.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; unlike time.time() it is unaffected by system clock adjustments.
start_time = time.perf_counter()
# Bare expression: count() triggers the join and the notebook echoes the row count.
df_txn_details.count()
print(f"time taken: {time.perf_counter() - start_time}")

                                                                                

39790092

time taken: 11.673514127731323


# Changing Join Condition

In [12]:
# Distinct transactions per (customer, city) pair, largest first: shows how
# the key distribution looks once `city` is added to the grouping key.
(
    df_transactions
    .groupBy("cust_id", "city")
    .agg(F.countDistinct(F.col("txn_id")).alias("ct"))
    .orderBy("ct", ascending=False)
    .show(n=10, truncate=False)
)



+----------+-------------+-------+
|cust_id   |city         |ct     |
+----------+-------------+-------+
|C0YDPQWPBJ|portland     |1756379|
|C0YDPQWPBJ|los_angeles  |1755910|
|C0YDPQWPBJ|denver       |1755398|
|C0YDPQWPBJ|san_francisco|1754952|
|C0YDPQWPBJ|seattle      |1754184|
|C0YDPQWPBJ|chicago      |1753398|
|C0YDPQWPBJ|boston       |1752906|
|C0YDPQWPBJ|san_diego    |1752767|
|C0YDPQWPBJ|philadelphia |1752140|
|C0YDPQWPBJ|new_york     |1751698|
+----------+-------------+-------+
only showing top 10 rows





In [13]:
# Inner join keyed on both `cust_id` and `city`, so only transactions whose
# city equals the customer's own city survive.
df_txn_details_2 = df_transactions.join(
    df_customers, on=["cust_id", "city"], how="inner"
)

In [14]:
# Time a full evaluation of the (`cust_id`, `city`) join.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; unlike time.time() it is unaffected by system clock adjustments.
start_time = time.perf_counter()
# Bare expression: count() triggers the join and the notebook echoes the row count.
df_txn_details_2.count()
print(f"time taken: {time.perf_counter() - start_time}")

                                                                                

3978769

time taken: 6.950979232788086


# Count of Txns in Home City

In [15]:
# Final answer: number of distinct transactions each customer made in the
# city recorded on their customer record (first ten rows shown).
(
    df_txn_details_2
    .groupBy("cust_id", "city")
    .agg(F.countDistinct(F.col("txn_id")).alias("txn_ct"))
    .show(n=10, truncate=False)
)



+----------+-------------+------+
|cust_id   |city         |txn_ct|
+----------+-------------+------+
|C0GZ0JZFVQ|philadelphia |727   |
|CBW3FMEAU7|new_york     |805   |
|CHD1KYWASO|san_francisco|732   |
|CIE3KWI9UN|portland     |782   |
|CLTLBERRQV|los_angeles  |737   |
|CM60NRREKK|new_york     |774   |
|CQH21DQJQH|new_york     |734   |
|CRCOQVJSOW|san_francisco|798   |
|CUNG96IOA6|seattle      |746   |
|CVNS15OIG2|seattle      |744   |
+----------+-------------+------+
only showing top 10 rows



[Stage 20:>                                                         (0 + 1) / 1]                                                                                

In [None]:
# Uncomment to shut down the Spark session once you are finished with the notebook.
# spark.stop()