In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

In [3]:
import os

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [4]:
# Local-mode session used by every demo below: `local[*]` master, a 10g
# driver, Kryo serialisation, and file-scan partitions capped at
# 268435456 bytes (256 MiB).
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.files.maxPartitionBytes", "268435456")
    .config("spark.driver.memory", "10g")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and only log ERROR-level
# messages so cell outputs stay readable.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/08/19 10:51:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


<h1> Topics </h1>

1. Reading Files (parquet)
2. Narrow Operations
   - `filter`
   - `withColumn`: adding/modifying a column
   - `select`: selecting relevant columns
3. Wide Operations
   - Joins
     - Sort Merge Join
     - Broadcast Join
   - GroupBy
     - `count`
     - `sum`
     - `countDistinct`

In [5]:
# Directory holding the input Parquet datasets. The default is the original
# author's local checkout; set the PYSPARK_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
data_dir = os.environ.get(
    "PYSPARK_DATA_DIR",
    "/Users/bhushanchowdary/Documents/GitHub/pyspark/Optimization/data/data_skew",
)

# Load the transactions dataset (one row per transaction, see the preview below).
transactions_file = os.path.join(data_dir, "transactions.parquet")
df_transactions = spark.read.parquet(transactions_file)

In [6]:
# Number of partitions behind the transactions DataFrame after the Parquet read.
df_transactions.rdd.getNumPartitions()

10

In [7]:
# Preview the first five transactions without truncating long cell values.
df_transactions.show(n=5, truncate=False)

+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|cust_id   |start_date|end_date  |txn_id         |date      |year|month|day|expense_type |amt   |city       |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|C0YDPQWPBJ|2010-07-01|2018-12-01|TZ5SMKZY9S03OQJ|2018-10-07|2018|10   |7  |Entertainment|10.42 |boston     |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYIAPPNU066CJ5R|2016-03-27|2016|3    |27 |Motor/Travel |44.34 |portland   |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TETSXIK4BLXHJ6W|2011-04-11|2011|4    |11 |Entertainment|3.18  |chicago    |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TQKL1QFJY3EM8LO|2018-02-22|2018|2    |22 |Groceries    |268.97|los_angeles|
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYL6DFP09PPXMVB|2010-10-16|2010|10   |16 |Entertainment|2.66  |chicago    |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
only showing top 5 rows

In [8]:
# Directory holding the input Parquet datasets. The default is the original
# author's local checkout; set the PYSPARK_DATA_DIR environment variable to
# run the notebook on another machine without editing this cell.
data_dir = os.environ.get(
    "PYSPARK_DATA_DIR",
    "/Users/bhushanchowdary/Documents/GitHub/pyspark/Optimization/data/data_skew",
)

# Load the customers dataset that is filtered and joined in the cells below.
customer_data = os.path.join(data_dir, "customers.parquet")
df_customer = spark.read.parquet(customer_data)

In [9]:
# Narrow operations only (filter / withColumn / select): keep Boston
# customers, derive first and last name from `name`, add 5 to `age`, and
# project the columns of interest.
name_parts = F.split(F.col("name"), " ")

df_narrow_transform = (
    df_customer
    .where(F.col("city") == "boston")
    .withColumn("first_name", name_parts[0])
    .withColumn("last_name", name_parts[1])
    .withColumn("age", F.col("age") + 5)
    .select("cust_id", "first_name", "last_name", "age", "gender", "birthday")
)

# Force execution through the `noop` format so the plan can be inspected.
# NOTE(review): the save path appears to be ignored by this sink - confirm.
(
    df_narrow_transform.write
    .format("noop")
    .mode("overwrite")
    .save("../data/test/df_narrow_transform.parquet")
)

In [10]:
# Preview seven transformed customer rows without truncating cell values.
df_narrow_transform.show(n=7, truncate=False)

+----------+----------+---------+---+------+---------+
|cust_id   |first_name|last_name|age|gender|birthday |
+----------+----------+---------+---+------+---------+
|C007YEYTX9|Aaron     |Abbott   |39 |Female|7/13/1991|
|C08XAQUY73|Aaron     |Lambert  |59 |Female|11/5/1966|
|C094P1VXF9|Aaron     |Lindsey  |29 |Male  |9/21/1990|
|C097SHE1EF|Aaron     |Lopez    |27 |Female|4/18/2001|
|C0DTC6436T|Aaron     |Schwartz |57 |Female|7/9/1962 |
|C0R42FPHRH|Abbie     |Reyes    |68 |Male  |10/8/1995|
|C0RZV4BH7T|Abbie     |Stevenson|41 |Male  |2/10/1971|
+----------+----------+---------+---+------+---------+
only showing top 7 rows


In [12]:
# Customers whose age, cast to an integer, is greater than 50.
df_customer_gt_50 = df_customer.where(F.col("age").cast("int") > 50)

# Force execution through the `noop` format so the plan can be inspected.
# NOTE(review): the save path appears to be ignored by this sink - confirm.
(
    df_customer_gt_50.write
    .format("noop")
    .mode("overwrite")
    .save("../data/test/df_customer_gt_50.parquet")
)

# Wide Transformations
1. Joins
   - Sort Merge Join
   - Broadcast Join
2. GroupBy
   - `count`
   - `countDistinct`
   - `sum`

In [13]:
# Turn automatic broadcast joins off (-1 disables the size threshold) so the
# next join is not planned as a broadcast join.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", -1)

In [15]:
# Inner equi-join of transactions to customers on the shared `cust_id` key.
# Automatic broadcasting was switched off above, so this is the sort-merge
# join demo.
df_joined = df_transactions.join(df_customer, on="cust_id", how="inner")

In [16]:
# Force execution of the join through the `noop` format.
# NOTE(review): the save path appears to be ignored by this sink - confirm.
(
    df_joined.write
    .format("noop")
    .mode("overwrite")
    .save("../data/test/df_joined.parquet")
)

                                                                                

In [17]:
# Re-enable automatic broadcast joins with a 10485760-byte (10 MiB) threshold.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", 10485760)

In [19]:
# Same inner join on `cust_id`, but with an explicit broadcast hint on the
# customers side.
df_broadcast_joined = df_transactions.join(
    F.broadcast(df_customer), on="cust_id", how="inner"
)

In [20]:
# Force execution of the broadcast join through the `noop` format.
# NOTE(review): the save path appears to be ignored by this sink - confirm.
(
    df_broadcast_joined.write
    .format("noop")
    .mode("overwrite")
    .save("../data/test/df_broadcast_joined.parquet")
)

                                                                                

In [21]:
# Number of distinct transaction ids per city (a GroupBy, i.e. one of the
# wide operations listed above).
df_txn_per_city = df_transactions.groupBy("city").agg(
    F.countDistinct(F.col("txn_id")).alias("txn_count")
)

In [22]:
# Preview five per-city counts without truncating cell values.
df_txn_per_city.show(n=5, truncate=False)



+---------+---------+
|city     |txn_count|
+---------+---------+
|san_diego|3977780  |
|chicago  |3979023  |
|denver   |3980274  |
|boston   |3978268  |
|seattle  |3980022  |
+---------+---------+
only showing top 5 rows


                                                                                

In [23]:
# Shut the SparkSession down now that all demos have run.
spark.stop()