# Learning PySpark 
### Video series

### Packt Publishing

**Author**: Tomasz Drabas
**Date**:   2018-02-01





# Section 5: Data Processing with Spark DataFrames

In this section we will look at processing data using Spark DataFrames.

# Read the data

In [2]:
import pyspark
sc = pyspark.SparkContext()

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("create").getOrCreate()

In [45]:
import pandas
import os
date_cols = ['OrderDate']
df = pandas.read_excel('sample_data.xlsx', parse_dates=date_cols)
sample_df_inferred= spark.createDataFrame(df)
sample_df_inferred.printSchema()
sample_df_inferred.show(4)

root
 |-- OrderDate: timestamp (nullable = true)
 |-- Region: string (nullable = true)
 |-- Rep: string (nullable = true)
 |-- Item: string (nullable = true)
 |-- Units: long (nullable = true)
 |-- UnitCost: double (nullable = true)
 |-- Total: double (nullable = true)

+-------------------+-------+-------+------+-----+--------+------+
|          OrderDate| Region|    Rep|  Item|Units|UnitCost| Total|
+-------------------+-------+-------+------+-----+--------+------+
|2016-01-06 00:00:00|   East|  Jones|Pencil|   95|    1.99|189.05|
|2017-03-02 00:00:00|Central| Kivell|Binder|   50|   19.99| 999.5|
|2016-02-09 00:00:00|Central|Jardine|Pencil|   36|    4.99|179.64|
|2016-02-26 00:00:00|Central|   Gill|   Pen|   27|   19.99|539.73|
+-------------------+-------+-------+------+-----+--------+------+
only showing top 4 rows



In [16]:
import pyspark.sql.functions as f

sample_df_inferred = spark.read.csv(
    'sample_data.csv'
    , header=True
    , inferSchema = True
)

sample_df_inferred = (
    sample_df_inferred
    .withColumn('OrderDate'
                , f.to_date('OrderDate', 'MM/dd/yy')
               )
)

sample_df_inferred.show(4)

sample_df_inferred.select(to_date(sample_df_inferred.OrderDate).alias('new_date')).show()

+---------+-------+-------+------+-----+--------+------+
|OrderDate| Region|    Rep|  Item|Units|UnitCost| Total|
+---------+-------+-------+------+-----+--------+------+
|     null|   East|  Jones|Pencil|   95|    1.99|189.05|
|     null|Central| Kivell|Binder|   50|   19.99| 999.5|
|     null|Central|Jardine|Pencil|   36|    4.99|179.64|
|     null|Central|   Gill|   Pen|   27|   19.99|539.73|
+---------+-------+-------+------+-----+--------+------+
only showing top 4 rows



## Dropping Columns

In [49]:
#Using Select
sample_df_inferred.select('Rep', 'Item', 'Units','UnitCost').show(4)

+-------+------+-----+--------+
|    Rep|  Item|Units|UnitCost|
+-------+------+-----+--------+
|  Jones|Pencil|   95|    1.99|
| Kivell|Binder|   50|   19.99|
|Jardine|Pencil|   36|    4.99|
|   Gill|   Pen|   27|   19.99|
+-------+------+-----+--------+
only showing top 4 rows



In [50]:
#Using Drop
sample_df_inferred.drop('OrderDate', 'Region', 'Total').show(4)

+-------+------+-----+--------+
|    Rep|  Item|Units|UnitCost|
+-------+------+-----+--------+
|  Jones|Pencil|   95|    1.99|
| Kivell|Binder|   50|   19.99|
|Jardine|Pencil|   36|    4.99|
|   Gill|   Pen|   27|   19.99|
+-------+------+-----+--------+
only showing top 4 rows



## Renaming columns

In [51]:
# Using Select
sample_df_inferred.select(f.col('OrderDate').alias('Date'),
                          f.col('Region').alias('Location')).show(4)

+-------------------+--------+
|               Date|Location|
+-------------------+--------+
|2016-01-06 00:00:00|    East|
|2017-03-02 00:00:00| Central|
|2016-02-09 00:00:00| Central|
|2016-02-26 00:00:00| Central|
+-------------------+--------+
only showing top 4 rows



In [52]:
#Using WithColumn Method
(
    sample_df_inferred
    .withColumnRenamed('OrderDate', 'Date')
    .withColumnRenamed('Region', 'Location')
    .show(4)
)

+-------------------+--------+-------+------+-----+--------+------+
|               Date|Location|    Rep|  Item|Units|UnitCost| Total|
+-------------------+--------+-------+------+-----+--------+------+
|2016-01-06 00:00:00|    East|  Jones|Pencil|   95|    1.99|189.05|
|2017-03-02 00:00:00| Central| Kivell|Binder|   50|   19.99| 999.5|
|2016-02-09 00:00:00| Central|Jardine|Pencil|   36|    4.99|179.64|
|2016-02-26 00:00:00| Central|   Gill|   Pen|   27|   19.99|539.73|
+-------------------+--------+-------+------+-----+--------+------+
only showing top 4 rows



## Dropping Observations

In [53]:
import numpy as np

sample_df_broken_rdd = (
    sample_df_inferred
    .rdd
    .map(lambda row: 
         row[:5] + 
         ((None, None) if np.random.rand() < 0.2 else tuple(row[5:]))
    )
)

sample_df_broken = (
    spark
    .createDataFrame(
        sample_df_broken_rdd
        , sample_df_inferred.columns
    )
)

sample_df_broken.dropna(subset=['OrderDate']).show(4)

+-------------------+-------+-------+------+-----+--------+------+
|          OrderDate| Region|    Rep|  Item|Units|UnitCost| Total|
+-------------------+-------+-------+------+-----+--------+------+
|2016-01-06 00:00:00|   East|  Jones|Pencil|   95|    1.99|189.05|
|2017-03-02 00:00:00|Central| Kivell|Binder|   50|    null|  null|
|2016-02-09 00:00:00|Central|Jardine|Pencil|   36|    null|  null|
|2016-02-26 00:00:00|Central|   Gill|   Pen|   27|   19.99|539.73|
+-------------------+-------+-------+------+-----+--------+------+
only showing top 4 rows



## Filling Missing Values

In [39]:
sample_df_broken.show(5)

+-------------------+-------+-------+------+-----+--------+------+
|          OrderDate| Region|    Rep|  Item|Units|UnitCost| Total|
+-------------------+-------+-------+------+-----+--------+------+
|2016-01-06 00:00:00|   East|  Jones|Pencil|   95|    1.99|189.05|
|2017-03-02 00:00:00|Central| Kivell|Binder|   50|   19.99| 999.5|
|2016-02-09 00:00:00|Central|Jardine|Pencil|   36|    4.99|179.64|
|2016-02-26 00:00:00|Central|   Gill|   Pen|   27|   19.99|539.73|
|2016-03-15 00:00:00|   West|Sorvino|Pencil|   56|    2.99|167.44|
+-------------------+-------+-------+------+-----+--------+------+
only showing top 5 rows



## Finding Missing Values

In [40]:
avg_unitCost = (
    sample_df_broken
    .select('UnitCost')
    .agg(
        f.mean(f.col('UnitCost'))
        .alias('UnitCost')
    ).toPandas()
    .to_dict('records')
)

sample_df_fixed = (
    sample_df_broken
    .fillna(*avg_unitCost)
    .withColumn('Total', f.col('Units') * f.col('UnitCost'))
)

sample_df_fixed.show(4)

+-------------------+-------+-------+------+-----+------------------+------------------+
|          OrderDate| Region|    Rep|  Item|Units|          UnitCost|             Total|
+-------------------+-------+-------+------+-----+------------------+------------------+
|2016-01-06 00:00:00|   East|  Jones|Pencil|   95|              1.99|            189.05|
|2017-03-02 00:00:00|Central| Kivell|Binder|   50|             19.99| 999.4999999999999|
|2016-02-09 00:00:00|Central|Jardine|Pencil|   36|              4.99|179.64000000000001|
|2016-02-26 00:00:00|Central|   Gill|   Pen|   27|22.593611111111116| 610.0275000000001|
+-------------------+-------+-------+------+-----+------------------+------------------+
only showing top 4 rows



# Filtering Data

In [60]:
sample_df_inferred.select("Item").distinct().show()

+-------+
|   Item|
+-------+
|   Desk|
| Binder|
|    Pen|
|Pen Set|
| Pencil|
+-------+



In [65]:
# Using where method
sample_df_inferred.where('Item = "Pencil"').show(4)

+-------------------+-------+-------+------+-----+--------+------+
|          OrderDate| Region|    Rep|  Item|Units|UnitCost| Total|
+-------------------+-------+-------+------+-----+--------+------+
|2016-01-06 00:00:00|   East|  Jones|Pencil|   95|    1.99|189.05|
|2016-02-09 00:00:00|Central|Jardine|Pencil|   36|    4.99|179.64|
|2016-03-15 00:00:00|   West|Sorvino|Pencil|   56|    2.99|167.44|
|2016-04-18 00:00:00|Central|Andrews|Pencil|   75|    1.99|149.25|
+-------------------+-------+-------+------+-----+--------+------+
only showing top 4 rows



In [66]:
# Using Filter Method
sample_df_inferred.filter('Item = "Pencil"').show(4)

+-------------------+-------+-------+------+-----+--------+------+
|          OrderDate| Region|    Rep|  Item|Units|UnitCost| Total|
+-------------------+-------+-------+------+-----+--------+------+
|2016-01-06 00:00:00|   East|  Jones|Pencil|   95|    1.99|189.05|
|2016-02-09 00:00:00|Central|Jardine|Pencil|   36|    4.99|179.64|
|2016-03-15 00:00:00|   West|Sorvino|Pencil|   56|    2.99|167.44|
|2016-04-18 00:00:00|Central|Andrews|Pencil|   75|    1.99|149.25|
+-------------------+-------+-------+------+-----+--------+------+
only showing top 4 rows



In [64]:
sample_df_inferred.where(f.col("Item").isin({"Desk","Pen"})).show(10)

+-------------------+-------+-------+----+-----+--------+------+
|          OrderDate| Region|    Rep|Item|Units|UnitCost| Total|
+-------------------+-------+-------+----+-----+--------+------+
|2016-02-26 00:00:00|Central|   Gill| Pen|   27|   19.99|539.73|
|2016-09-01 00:00:00|Central|  Smith|Desk|    2|   125.0| 250.0|
|2016-10-22 00:00:00|   East|  Jones| Pen|   64|    8.99|575.36|
|2016-11-08 00:00:00|   East| Parent| Pen|   15|   19.99|299.85|
|2017-04-27 00:00:00|   East| Howard| Pen|   96|    4.99|479.04|
|2017-06-17 00:00:00|Central| Kivell|Desk|    5|   125.0| 625.0|
|2017-08-24 00:00:00|   West|Sorvino|Desk|    3|   275.0| 825.0|
|2017-09-27 00:00:00|   West|Sorvino| Pen|   76|    1.99|151.24|
+-------------------+-------+-------+----+-----+--------+------+



# Aggregating data in DataFrames

In [69]:
sample_df_inferred.groupby('Rep', 'Region').count().orderBy('count', ascending=False).show()

+--------+-------+-----+
|     Rep| Region|count|
+--------+-------+-----+
|   Jones|   East|    8|
|    Gill|Central|    5|
| Jardine|Central|    5|
| Andrews|Central|    4|
|  Kivell|Central|    4|
| Sorvino|   West|    4|
|  Parent|   East|    3|
|   Smith|Central|    3|
|  Morgan|Central|    3|
|Thompson|   West|    2|
|  Howard|   East|    2|
+--------+-------+-----+



In [70]:
(
    sample_df_inferred
    .groupby('Item')
    .agg(
          f.sum('Units').alias('UnitsTotal')
        , f.sum('Total').alias('GrandTotal')
        , f.avg('Total').alias('AvgPerTransaction')
    )
    .show()
)

+-------+----------+----------+------------------+
|   Item|UnitsTotal|GrandTotal| AvgPerTransaction|
+-------+----------+----------+------------------+
|   Desk|        10|    1700.0| 566.6666666666666|
| Binder|       722|   9577.65|            638.51|
|    Pen|       278|   2045.22|           409.044|
|Pen Set|       395|   4169.87| 595.6957142857143|
| Pencil|       716|   2135.14|164.24153846153845|
+-------+----------+----------+------------------+



In [83]:
sample_df_inferred.groupby('Rep')\
     .agg(f.sum('Units').alias('UnitsTotal'),
     f.round(f.sum('Total'),2).alias('GrandTotal'),
     f.round(f.avg('Total'),2).alias('AvgPerTransaction')).\
     orderBy('GrandTotal',ascending=False).show(5)

+-------+----------+----------+-----------------+
|    Rep|UnitsTotal|GrandTotal|AvgPerTransaction|
+-------+----------+----------+-----------------+
| Kivell|       193|   3109.44|           777.36|
| Parent|       170|    3102.3|           1034.1|
|Jardine|       281|   2812.19|           562.44|
|  Jones|       396|   2363.04|           295.38|
|   Gill|       213|   1749.87|           349.97|
+-------+----------+----------+-----------------+
only showing top 5 rows



In [89]:
df=sample_df_inferred.groupby('Rep','Item')\
     .agg(f.sum('Units').alias('UnitsTotal'),
     f.round(f.sum('Total'),2).alias('GrandTotal'),
     f.round(f.avg('Total'),2).alias('AvgPerTransaction')).\
     orderBy('GrandTotal',ascending=False)

In [92]:
df1=df.toPandas()
df1.head()

Unnamed: 0,Rep,Item,UnitsTotal,GrandTotal,AvgPerTransaction
0,Jardine,Binder,105,1933.95,966.98
1,Parent,Binder,81,1619.19,1619.19
2,Kivell,Pen Set,138,1484.94,742.47
3,Smith,Binder,87,1305.0,1305.0
4,Parent,Pen Set,74,1183.26,1183.26


# Selecting Data
## .select(...)

In [94]:
(
    sample_df_inferred
    .select('Rep','Total')
    .show(5)
)

+-------+------+
|    Rep| Total|
+-------+------+
|  Jones|189.05|
| Kivell| 999.5|
|Jardine|179.64|
|   Gill|539.73|
|Sorvino|167.44|
+-------+------+
only showing top 5 rows



## .sql(...)

In [100]:
sample_df_inferred.createOrReplaceTempView('sample_df_view')

In [101]:
spark.sql('''
    SELECT OrderDate
        , Rep
        , Region
        , Total
    FROM sample_df_view
    ORDER BY Rep
        , OrderDate
''').show(8)

+-------------------+-------+-------+------+
|          OrderDate|    Rep| Region| Total|
+-------------------+-------+-------+------+
|2016-04-18 00:00:00|Andrews|Central|149.25|
|2017-04-10 00:00:00|Andrews|Central|131.34|
|2017-10-30 00:00:00|Andrews|Central| 18.06|
|2017-12-21 00:00:00|Andrews|Central|139.72|
|2016-02-26 00:00:00|   Gill|Central|539.73|
|2017-01-15 00:00:00|   Gill|Central|413.54|
|2017-05-14 00:00:00|   Gill|Central| 68.37|
|2017-05-30 00:00:00|   Gill|Central| 719.2|
+-------------------+-------+-------+------+
only showing top 8 rows



In [109]:
spark.sql('''
select Rep, Region, sum(Total) as GrandTotal
FROM sample_df_view
group by Rep, Region
order by GrandTotal DESC
''').show(5)

+-------+-------+------------------+
|    Rep| Region|        GrandTotal|
+-------+-------+------------------+
| Kivell|Central|           3109.44|
| Parent|   East|            3102.3|
|Jardine|Central|           2812.19|
|  Jones|   East|           2363.04|
|   Gill|Central|1749.8700000000001|
+-------+-------+------------------+
only showing top 5 rows



# Transforming Data

In [110]:
commission = spark.createDataFrame(
    sc.parallelize([
          ('Central', 0.033)
        , ('East',    0.032)
        , ('West',    0.034)
    ])
    , ['Region', 'Commission']
)

In [115]:
sample_df_inferred.join(commission, on=['Region'], how='left_outer').show(6)

+-------+-------------------+-------+------+-----+--------+------+----------+
| Region|          OrderDate|    Rep|  Item|Units|UnitCost| Total|Commission|
+-------+-------------------+-------+------+-----+--------+------+----------+
|Central|2017-03-02 00:00:00| Kivell|Binder|   50|   19.99| 999.5|     0.033|
|Central|2016-02-09 00:00:00|Jardine|Pencil|   36|    4.99|179.64|     0.033|
|Central|2016-02-26 00:00:00|   Gill|   Pen|   27|   19.99|539.73|     0.033|
|Central|2016-04-18 00:00:00|Andrews|Pencil|   75|    1.99|149.25|     0.033|
|Central|2016-05-05 00:00:00|Jardine|Pencil|   90|    4.99| 449.1|     0.033|
|Central|2016-06-25 00:00:00| Morgan|Pencil|   90|    4.99| 449.1|     0.033|
+-------+-------------------+-------+------+-----+--------+------+----------+
only showing top 6 rows



In [116]:
(
    sample_df_inferred
    .join(commission, on=['Region'], how='left_outer')
    .withColumn('CommissionValue', f.round(f.col('Total') * f.col('Commission')))
    .show(4)
)

+-------+-------------------+-------+------+-----+--------+------+----------+---------------+
| Region|          OrderDate|    Rep|  Item|Units|UnitCost| Total|Commission|CommissionValue|
+-------+-------------------+-------+------+-----+--------+------+----------+---------------+
|Central|2017-03-02 00:00:00| Kivell|Binder|   50|   19.99| 999.5|     0.033|           33.0|
|Central|2016-02-09 00:00:00|Jardine|Pencil|   36|    4.99|179.64|     0.033|            6.0|
|Central|2016-02-26 00:00:00|   Gill|   Pen|   27|   19.99|539.73|     0.033|           18.0|
|Central|2016-04-18 00:00:00|Andrews|Pencil|   75|    1.99|149.25|     0.033|            5.0|
+-------+-------------------+-------+------+-----+--------+------+----------+---------------+
only showing top 4 rows



# Printing

In [117]:
#Using Show
(
    sample_df_inferred
    .select('Region', 'Rep')
    .show(4)
)

+-------+-------+
| Region|    Rep|
+-------+-------+
|   East|  Jones|
|Central| Kivell|
|Central|Jardine|
|Central|   Gill|
+-------+-------+
only showing top 4 rows



In [120]:
#Using Take, RDD Method, row objects

(
    sample_df_inferred
    .select('Region', 'Rep')
    .take(4)
)

[Row(Region='East', Rep='Jones'),
 Row(Region='Central', Rep='Kivell'),
 Row(Region='Central', Rep='Jardine'),
 Row(Region='Central', Rep='Gill')]

# Sorting Data

In [122]:
#Using Order by
(
    sample_df_inferred
    .orderBy('Rep', 'Region')
    .show(4)
)

+-------------------+-------+-------+------+-----+--------+------+
|          OrderDate| Region|    Rep|  Item|Units|UnitCost| Total|
+-------------------+-------+-------+------+-----+--------+------+
|2017-10-30 00:00:00|Central|Andrews|Pencil|   14|    1.29| 18.06|
|2017-04-10 00:00:00|Central|Andrews|Pencil|   66|    1.99|131.34|
|2016-04-18 00:00:00|Central|Andrews|Pencil|   75|    1.99|149.25|
|2017-12-21 00:00:00|Central|Andrews|Binder|   28|    4.99|139.72|
+-------------------+-------+-------+------+-----+--------+------+
only showing top 4 rows



In [123]:
# Using Sort
(
    sample_df_inferred
    .sort('Rep', 'Region')
    .show(4)
)

+-------------------+-------+-------+------+-----+--------+------+
|          OrderDate| Region|    Rep|  Item|Units|UnitCost| Total|
+-------------------+-------+-------+------+-----+--------+------+
|2017-10-30 00:00:00|Central|Andrews|Pencil|   14|    1.29| 18.06|
|2017-04-10 00:00:00|Central|Andrews|Pencil|   66|    1.99|131.34|
|2016-04-18 00:00:00|Central|Andrews|Pencil|   75|    1.99|149.25|
|2017-12-21 00:00:00|Central|Andrews|Binder|   28|    4.99|139.72|
+-------------------+-------+-------+------+-----+--------+------+
only showing top 4 rows



# Saving Data

## CSV

In [124]:
# Saved as Folder, Partition
(
    sample_df_inferred
    .write
    .mode('overwrite')
    .csv('../data/sample_data_inferred.csv')
)

## Parquet

In [21]:
(
    sample_df_inferred
    .write
    .parquet(
        '../data/sample_data_inferred.parquet'
        , mode='overwrite'
        , partitionBy='Rep'
        , compression='gzip'
    )
)

## JSON

In [22]:
(
    sample_df_inferred
    .write
    .json(
        '../data/sample_data_inferred.json'
        , mode='overwrite'
        , dateFormat='yyyy-mm-dd'
        , compression='gzip'
    )
)

# Pitfalls of using pure Python UDFs

![alt text][logo]

[logo]: https://raw.githubusercontent.com/drabastomek/learningPySpark_video/master/common/images/udf.png


In [137]:
def calculateCommission(value, commissionPercent):
    return value * commissionPercent

In [129]:
import time
start = time.time()
(
    sample_df_inferred
    .join(commission, on=['Region'], how='left_outer')
    .withColumn('CommissionValue', calculateCommission(f.col('Total'), f.col('Commission')))
    .show(4)
)
end = time.time()
print(end - start)

+-------+-------------------+-------+------+-----+--------+------+----------+---------------+
| Region|          OrderDate|    Rep|  Item|Units|UnitCost| Total|Commission|CommissionValue|
+-------+-------------------+-------+------+-----+--------+------+----------+---------------+
|Central|2017-03-02 00:00:00| Kivell|Binder|   50|   19.99| 999.5|     0.033|        32.9835|
|Central|2016-02-09 00:00:00|Jardine|Pencil|   36|    4.99|179.64|     0.033|        5.92812|
|Central|2016-02-26 00:00:00|   Gill|   Pen|   27|   19.99|539.73|     0.033|       17.81109|
|Central|2016-04-18 00:00:00|Andrews|Pencil|   75|    1.99|149.25|     0.033|        4.92525|
+-------+-------------------+-------+------+-----+--------+------+----------+---------------+
only showing top 4 rows

7.692247629165649


In [141]:
from pyspark.sql import SQLContext
#SQLContext.registerFunction('comm',calculateCommission)
#SQLContext.registerFunction("comm", calculateCommission)

# Repartitioning data

In [133]:
# Using underlying RDD
sample_df_inferred.rdd.getNumPartitions()

8

In [135]:
sample_df_repartitioned = sample_df_inferred.repartition(4, 'Rep')
sample_df_repartitioned.rdd.getNumPartitions()

4