# Imports & Configuration

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

In [3]:
from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

In [4]:
# Local-mode session: "local[*]" master, driver memory set to 18g.
spark = (
    SparkSession.builder
    .master("local[*]")
    .config("spark.driver.memory", "18g")
    .getOrCreate()
)

# Keep a handle on the SparkContext and restrict logging to errors only.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

23/07/17 22:34:32 WARN Utils: Your hostname, Afaques-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.0.0.6 instead (on interface en0)
23/07/17 22:34:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/07/17 22:34:32 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Reading File

In [5]:
# Path (relative to the notebook) of the transactions Parquet dataset.
transactions_file = "../data/data_skew/transactions.parquet"
df_transactions = spark.read.format("parquet").load(transactions_file)

                                                                                

In [6]:
# Preview the first 5 transactions without truncating column values.
df_transactions.show(n=5, truncate=False)

+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|cust_id   |start_date|end_date  |txn_id         |date      |year|month|day|expense_type |amt   |city       |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
|C0YDPQWPBJ|2010-07-01|2018-12-01|TZ5SMKZY9S03OQJ|2018-10-07|2018|10   |7  |Entertainment|10.42 |boston     |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYIAPPNU066CJ5R|2016-03-27|2016|3    |27 |Motor/Travel |44.34 |portland   |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TETSXIK4BLXHJ6W|2011-04-11|2011|4    |11 |Entertainment|3.18  |chicago    |
|C0YDPQWPBJ|2010-07-01|2018-12-01|TQKL1QFJY3EM8LO|2018-02-22|2018|2    |22 |Groceries    |268.97|los_angeles|
|C0YDPQWPBJ|2010-07-01|2018-12-01|TYL6DFP09PPXMVB|2010-10-16|2010|10   |16 |Entertainment|2.66  |chicago    |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-----------+
only showing top 5 rows

In [7]:
# Path (relative to the notebook) of the customers Parquet dataset.
customers_file = "../data/data_skew/customers.parquet"
df_customers = spark.read.format("parquet").load(customers_file)

In [8]:
# Preview the first 5 customers without truncating column values.
df_customers.show(n=5, truncate=False)

+----------+-------------+---+------+----------+-----+-----------+
|cust_id   |name         |age|gender|birthday  |zip  |city       |
+----------+-------------+---+------+----------+-----+-----------+
|C007YEYTX9|Aaron Abbott |34 |Female|7/13/1991 |97823|boston     |
|C00B971T1J|Aaron Austin |37 |Female|12/16/2004|30332|chicago    |
|C00WRSJF1Q|Aaron Barnes |29 |Female|3/11/1977 |23451|denver     |
|C01AZWQMF3|Aaron Barrett|31 |Male  |7/9/1998  |46613|los_angeles|
|C01BKUFRHA|Aaron Becker |54 |Male  |11/24/1979|40284|san_diego  |
+----------+-------------+---+------+----------+-----+-----------+
only showing top 5 rows



**Note**: Even though the DataFrame shows `13` partitions with `df_transactions.rdd.getNumPartitions()`, the Parquet file is small enough to be read as a single split. Therefore, Spark shows only 1 task for the read operation in the Spark UI.

In [9]:
# Number of partitions backing the transactions DataFrame as read from disk.
df_transactions.rdd.getNumPartitions()

13

# Narrow Transformations
- `filter` rows where `city='boston'`
- `add` new columns: `first_name` and `last_name` (split from `name`)
- `alter` an existing column: add 5 to the `age` column
- `select` relevant columns

In [10]:
# Narrow transformations only: a row filter followed by a single projection
# that derives the name parts, shifts `age` by 5 and keeps the relevant columns.
df_narrow_transform = (
    df_customers
    .where(F.col("city") == "boston")
    .select(
        "cust_id",
        F.split("name", " ")[0].alias("first_name"),
        F.split("name", " ")[1].alias("last_name"),
        (F.col("age") + F.lit(5)).alias("age"),
        "gender",
        "birthday",
    )
)

df_narrow_transform.show(n=7, truncate=False)

+----------+----------+---------+----+------+---------+
|cust_id   |first_name|last_name|age |gender|birthday |
+----------+----------+---------+----+------+---------+
|C007YEYTX9|Aaron     |Abbott   |39.0|Female|7/13/1991|
|C08XAQUY73|Aaron     |Lambert  |59.0|Female|11/5/1966|
|C094P1VXF9|Aaron     |Lindsey  |29.0|Male  |9/21/1990|
|C097SHE1EF|Aaron     |Lopez    |27.0|Female|4/18/2001|
|C0DTC6436T|Aaron     |Schwartz |57.0|Female|7/9/1962 |
|C0R42FPHRH|Abbie     |Reyes    |68.0|Male  |10/8/1995|
|C0RZV4BH7T|Abbie     |Stevenson|41.0|Male  |2/10/1971|
+----------+----------+---------+----+------+---------+
only showing top 7 rows



In [11]:
# Customers older than 50; `age` is cast to int before the comparison.
df_customer_gt_50 = df_customers.where(F.col("age").cast("int") > 50)
df_customer_gt_50.show(n=9, truncate=False)

+----------+--------------+---+------+----------+-----+------------+
|cust_id   |name          |age|gender|birthday  |zip  |city        |
+----------+--------------+---+------+----------+-----+------------+
|C01BKUFRHA|Aaron Becker  |54 |Male  |11/24/1979|40284|san_diego   |
|C01WMZQ7PN|Aaron Brady   |51 |Female|8/20/1994 |52204|philadelphia|
|C021567NJZ|Aaron Briggs  |57 |Male  |3/10/1990 |22008|philadelphia|
|C02JNTM46B|Aaron Chambers|51 |Male  |1/6/2001  |63337|new_york    |
|C030A69V1L|Aaron Clarke  |55 |Male  |4/28/1999 |77176|philadelphia|
|C034RB2MQ6|Aaron Ford    |63 |Male  |7/8/1988  |90592|chicago     |
|C03U340T3R|Aaron Gardner |59 |Female|3/18/1975 |31502|denver      |
|C044XUK8IK|Aaron Gibbs   |58 |Male  |5/17/1999 |67547|seattle     |
|C0694GX5HW|Aaron Guerrero|62 |Male  |8/17/1976 |39243|denver      |
+----------+--------------+---+------+----------+-----+------------+
only showing top 9 rows



# Wide Transformations
1. Repartition
2. Coalesce
3. Joins
4. GroupBy
   - `count`
   - `countDistinct`
   - `sum`

## 1. Repartition

In [12]:
# Partition count of the transactions DataFrame before repartitioning.
df_transactions.rdd.getNumPartitions()

13

In [13]:
# Redistribute the transactions across 50 partitions.
df_repartitioned = df_transactions.repartition(numPartitions=50)

In [14]:
# Trigger the repartition and preview 5 rows without truncating values.
df_repartitioned.show(n=5, truncate=False)



+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-------------+
|cust_id   |start_date|end_date  |txn_id         |date      |year|month|day|expense_type |amt   |city         |
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-------------+
|C0CZCC9UCH|2011-12-01|2019-03-01|TTU5VP07BG1OO6Y|2015-09-17|2015|9    |17 |Education    |49.76 |los_angeles  |
|C0YDPQWPBJ|2011-11-01|null      |TKQ16P00JYR4D4R|2017-03-06|2017|3    |6  |Entertainment|147.19|denver       |
|CHPV658EGQ|2012-03-01|2020-06-01|THIDSLZ2NRS622G|2017-01-11|2017|1    |11 |Entertainment|12.01 |los_angeles  |
|C0W0YS75TQ|2011-03-01|2019-06-01|TX2MW4UTN1DIMQ4|2013-09-08|2013|9    |8  |Motor/Travel |39.18 |portland     |
|C0YDPQWPBJ|2010-12-01|2019-07-01|TZRGI6BM0CTAC2V|2016-09-07|2016|9    |7  |Groceries    |367.73|san_francisco|
+----------+----------+----------+---------------+----------+----+-----+---+-------------+------+-------------+

                                                                                

## 2. Coalesce

In [15]:
# Print the logical and physical plans for coalescing down to 5 partitions.
df_transactions.coalesce(numPartitions=5).explain(extended=True)

== Parsed Logical Plan ==
Repartition 5, false
+- Relation [cust_id#0,start_date#1,end_date#2,txn_id#3,date#4,year#5,month#6,day#7,expense_type#8,amt#9,city#10] parquet

== Analyzed Logical Plan ==
cust_id: string, start_date: string, end_date: string, txn_id: string, date: string, year: string, month: string, day: string, expense_type: string, amt: string, city: string
Repartition 5, false
+- Relation [cust_id#0,start_date#1,end_date#2,txn_id#3,date#4,year#5,month#6,day#7,expense_type#8,amt#9,city#10] parquet

== Optimized Logical Plan ==
Repartition 5, false
+- Relation [cust_id#0,start_date#1,end_date#2,txn_id#3,date#4,year#5,month#6,day#7,expense_type#8,amt#9,city#10] parquet

== Physical Plan ==
Coalesce 5
+- *(1) ColumnarToRow
   +- FileScan parquet [cust_id#0,start_date#1,end_date#2,txn_id#3,date#4,year#5,month#6,day#7,expense_type#8,amt#9,city#10] Batched: true, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex(1 paths)[file:/Users/afaqueahmad/Documents/youtube/spar

## 3. Joins

In [16]:
# A threshold of -1 disables automatic broadcast joins for subsequent queries.
spark.conf.set(key="spark.sql.autoBroadcastJoinThreshold", value=-1)

In [17]:
# Inner-join every transaction with its customer record on `cust_id`.
df_joined = df_transactions.join(df_customers, on="cust_id", how="inner")

In [18]:
# Execute the join and preview 5 rows without truncating values.
df_joined.show(n=5, truncate=False)



+----------+----------+--------+---------------+----------+----+-----+---+-------------+-------+-------------+------------+---+------+---------+-----+------+
|cust_id   |start_date|end_date|txn_id         |date      |year|month|day|expense_type |amt    |city         |name        |age|gender|birthday |zip  |city  |
+----------+----------+--------+---------------+----------+----+-----+---+-------------+-------+-------------+------------+---+------+---------+-----+------+
|C00WRSJF1Q|2012-11-01|null    |TXNU40MYVB3QXBU|2018-11-01|2018|11   |1  |Motor/Travel |2129.82|san_diego    |Aaron Barnes|29 |Female|3/11/1977|23451|denver|
|C00WRSJF1Q|2012-11-01|null    |TKGK0XNNTDI0MPX|2014-08-06|2014|8    |6  |Groceries    |126.65 |boston       |Aaron Barnes|29 |Female|3/11/1977|23451|denver|
|C00WRSJF1Q|2012-11-01|null    |T1QLRMJWEYOP8YD|2015-09-10|2015|9    |10 |Entertainment|28.94  |new_york     |Aaron Barnes|29 |Female|3/11/1977|23451|denver|
|C00WRSJF1Q|2012-11-01|null    |T7YCEUYHV6FCVRR|2020

                                                                                

## 4. GroupBy

In [19]:
# Print the column names, types and nullability of the transactions DataFrame.
df_transactions.printSchema()

root
 |-- cust_id: string (nullable = true)
 |-- start_date: string (nullable = true)
 |-- end_date: string (nullable = true)
 |-- txn_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)
 |-- expense_type: string (nullable = true)
 |-- amt: string (nullable = true)
 |-- city: string (nullable = true)



### GroupBy Count

In [20]:
# Number of transaction rows per city.
df_city_counts = df_transactions.groupBy("city").count()

In [21]:
# Run the aggregation and preview 5 cities without truncating values.
df_city_counts.show(n=5, truncate=False)



+---------+-------+
|city     |count  |
+---------+-------+
|san_diego|3977780|
|chicago  |3979023|
|denver   |3980274|
|boston   |3978268|
|seattle  |3980022|
+---------+-------+
only showing top 5 rows



                                                                                

### GroupBy Count Distinct 

In [22]:
# Number of distinct transaction ids per city.
distinct_txn_count = F.countDistinct("txn_id").alias("txn_count")
df_txn_per_city = df_transactions.groupBy("city").agg(distinct_txn_count)

In [23]:
# Run the aggregation and preview 5 cities without truncating values.
df_txn_per_city.show(n=5, truncate=False)



+---------+---------+
|city     |txn_count|
+---------+---------+
|san_diego|3977780  |
|chicago  |3979023  |
|denver   |3980274  |
|boston   |3978268  |
|seattle  |3980022  |
+---------+---------+
only showing top 5 rows



                                                                                

In [24]:
# Total transaction amount per city.
total_amount = F.sum("amt").alias("txn_amt")
df_txn_amt_city = df_transactions.groupBy("city").agg(total_amount)

In [25]:
# Print the logical and physical plans for the per-city sum aggregation.
df_txn_amt_city.explain(extended=True)

== Parsed Logical Plan ==
'Aggregate ['city], ['city, sum('amt) AS txn_amt#399]
+- Relation [cust_id#0,start_date#1,end_date#2,txn_id#3,date#4,year#5,month#6,day#7,expense_type#8,amt#9,city#10] parquet

== Analyzed Logical Plan ==
city: string, txn_amt: double
Aggregate [city#10], [city#10, sum(cast(amt#9 as double)) AS txn_amt#399]
+- Relation [cust_id#0,start_date#1,end_date#2,txn_id#3,date#4,year#5,month#6,day#7,expense_type#8,amt#9,city#10] parquet

== Optimized Logical Plan ==
Aggregate [city#10], [city#10, sum(cast(amt#9 as double)) AS txn_amt#399]
+- Project [amt#9, city#10]
   +- Relation [cust_id#0,start_date#1,end_date#2,txn_id#3,date#4,year#5,month#6,day#7,expense_type#8,amt#9,city#10] parquet

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- HashAggregate(keys=[city#10], functions=[sum(cast(amt#9 as double))], output=[city#10, txn_amt#399])
   +- Exchange hashpartitioning(city#10, 200), ENSURE_REQUIREMENTS, [id=#416]
      +- HashAggregate(keys=[city#10], function

In [26]:
# Stop the Spark session at the end of the notebook.
spark.stop()