In [1]:
import os, sys, json, collections, itertools
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# file:// URI prefix built from the notebook's current working directory.
local_dir = "file:///{d}/".format(d=os.getcwd())

# Create the session through the documented builder. getOrCreate() reuses a
# session/context that is already running, so this cell can be re-run safely
# and always leaves BOTH `spark` and `sc` defined (the previous
# `'sc' not in globals()` guard left `spark` undefined whenever `sc` already
# existed without it).
spark = SparkSession.builder\
    .master('local')\
    .appName('appName')\
    .getOrCreate()
sc = spark.sparkContext

## DataFrames fundamentals

Each record in the "dessert" dataset describes a group visit at a restaurant. Read the data and answer the questions below.

In [2]:
# (name in the CSV header, name used in the rest of the notebook)
renamed_columns = [
    ('day.of.week', 'weekday'),
    ('num.of.guests', 'num_of_guests'),
    ('dessert', 'purchase'),
    ('hour', 'shift'),
]

# Read the file with its header row, let Spark infer the column types,
# and discard the 'id' column.
dessert = spark.read.csv("dessert.csv", inferSchema=True, header=True).drop('id')
for old_name, new_name in renamed_columns:
    dessert = dessert.withColumnRenamed(old_name, new_name)
dessert.show(5)

+--------+-------------+----------+-----+--------+
| weekday|num_of_guests|     shift|table|purchase|
+--------+-------------+----------+-----+--------+
|  Monday|            2|   evening|   13|    true|
|Saturday|            4|     night|   19|    true|
|Saturday|            4|after-noon|   12|   false|
|Thursday|            3|     night|    5|    true|
|  Monday|            7|      noon|   10|    true|
+--------+-------------+----------+-----+--------+
only showing top 5 rows



In [3]:
# Print the column names, types and nullability of the loaded DataFrame.
dessert.printSchema()

root
 |-- weekday: string (nullable = true)
 |-- num_of_guests: integer (nullable = true)
 |-- shift: string (nullable = true)
 |-- table: integer (nullable = true)
 |-- purchase: boolean (nullable = true)



The DataFrameReader object used above can be confusing, so below I show how to first load the data as an RDD and then convert it into a DataFrame. During this process we also remove the header using a combination of _zipWithIndex()_ and _filter()_ (taken from [here][1]). Looking at the file tells us the "schema", which is applied by the second _map()_.

[1]: http://stackoverflow.com/a/31798247/3121900

In [4]:
def _typed_fields(fields):
    """Select the five data fields of a split CSV line and cast them."""
    return [fields[1],             # weekday
            int(fields[2]),        # num_of_guests
            fields[3],             # shift
            int(fields[4]),        # table
            fields[5] == 'TRUE']   # purchase


# Split every line on commas and pair it with its line number.
split_lines = sc.textFile("dessert.csv").map(lambda line: line.split(','))
indexed_lines = split_lines.zipWithIndex()

# Line 0 is the header: keep only the lines after it, then cast the fields.
data_lines = indexed_lines.filter(lambda pair: pair[1] > 0)
dessert_rdd = data_lines.map(lambda pair: _typed_fields(pair[0]))

columns = ['weekday', 'num_of_guests', 'shift', 'table', 'purchase']
dessert = spark.createDataFrame(dessert_rdd, schema=columns)
dessert.show(5)

+--------+-------------+----------+-----+--------+
| weekday|num_of_guests|     shift|table|purchase|
+--------+-------------+----------+-----+--------+
|  Monday|            2|   evening|   13|    true|
|Saturday|            4|     night|   19|    true|
|Saturday|            4|after-noon|   12|   false|
|Thursday|            3|     night|    5|    true|
|  Monday|            7|      noon|   10|    true|
+--------+-------------+----------+-----+--------+
only showing top 5 rows



### Question 1

How many groups purchased a dessert?

In [5]:
# The boolean 'purchase' column serves directly as the row filter.
purchased = dessert['purchase']
dessert.filter(purchased).count()

573

### Question 2

How many groups purchased a dessert on Mondays?

In [6]:
# Keep rows that are both on a Monday and marked as a purchase.
on_monday = dessert['weekday'] == 'Monday'
dessert.filter(on_monday & dessert['purchase']).count()

66

### Question 3

How many _visitors_ purchased a dessert?

In [7]:
# Restrict to purchasing groups, then aggregate over the whole table:
# total number of guests and the mean table number.
purchasing_groups = dessert.filter(dessert['purchase'])
purchasing_groups.agg({'num_of_guests': 'sum', 'table': 'mean'}).show()

+------------------+------------------+
|sum(num_of_guests)|        avg(table)|
+------------------+------------------+
|              1913|11.849912739965095|
+------------------+------------------+



### Question 4

For each weekday - how many groups purchased a dessert?

In [8]:
# Per weekday, among purchasing groups: number of groups (counted through
# the 'shift' column) and total number of guests.
(dessert
    .filter(dessert['purchase'])
    .groupBy('weekday')
    .agg({'shift': 'count', 'num_of_guests': 'sum'})
    .show())

+---------+------------+------------------+
|  weekday|count(shift)|sum(num_of_guests)|
+---------+------------+------------------+
|Wednesday|          91|               297|
|  Tuesday|          89|               306|
|   Friday|          77|               281|
| Thursday|          69|               238|
| Saturday|         128|               411|
|   Monday|          66|               211|
|   Sunday|          53|               169|
+---------+------------+------------------+



### Question 5

Add to _dessert_ a new column called 'no_purchase' holding the negation of 'purchase'.

In [9]:
# Logical NOT of the boolean 'purchase' column, stored as 'no_purchase'.
not_purchased = ~dessert['purchase']
dessert = dessert.withColumn('no_purchase', not_purchased)
dessert.show(5)

+--------+-------------+----------+-----+--------+-----------+
| weekday|num_of_guests|     shift|table|purchase|no_purchase|
+--------+-------------+----------+-----+--------+-----------+
|  Monday|            2|   evening|   13|    true|      false|
|Saturday|            4|     night|   19|    true|      false|
|Saturday|            4|after-noon|   12|   false|       true|
|Thursday|            3|     night|    5|    true|      false|
|  Monday|            7|      noon|   10|    true|      false|
+--------+-------------+----------+-----+--------+-----------+
only showing top 5 rows



### Question 6

Create a pivot table showing how the purchases were influenced by the size of the group.

In [10]:
# Contingency table: one row per group size, one column per weekday.
# NOTE(review): the question asks how purchases relate to group size, but this
# cross-tabulates against 'weekday'; crosstab('num_of_guests', 'purchase')
# looks like the intended call — confirm, then re-run to refresh the output.
dessert.crosstab('num_of_guests', 'weekday').show()

+---------------------+------+------+--------+------+--------+-------+---------+
|num_of_guests_weekday|Friday|Monday|Saturday|Sunday|Thursday|Tuesday|Wednesday|
+---------------------+------+------+--------+------+--------+-------+---------+
|                    5|    28|     4|      22|     8|       6|      9|       14|
|                   10|     3|     3|       0|     2|       1|      2|        2|
|                    1|    32|    11|      15|     8|       7|     17|       17|
|                    6|     7|     4|      13|     4|       7|      7|        6|
|                    9|     2|     1|       2|     0|       1|      2|        1|
|                    2|    66|    38|      69|    36|      35|     38|       36|
|                    7|     4|     3|       7|     2|       3|      2|        3|
|                    3|    51|    22|      50|    17|      19|     23|       20|
|                    8|     0|     1|       2|     1|       2|      0|        2|
|                    4|    4

## Exercise 2:
Read the file "weights" into a DataFrame and answer the following questions:
1. Create a new DataFrame containing only the males and call it _males_.
1. How many males are in the table? What are the mean height and weight of the males?
1. What is the height of the tallest female who is older than 40?
1. Create a new DataFrame with two columns: the age, and the average weight of the people of that age.

## User defined functions

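Plain Python functions cannot be applied directly to DataFrame columns. They have to be wrapped as user-defined functions (UDFs) with a declared return type, because Python values need to be converted to Spark's (Scala-side) data types.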

In [11]:
import pyspark.sql.functions as F
import pyspark.sql.types as T
from inspect import signature

def str_reverese(s):
    """Return the characters of ``s`` in reverse order.

    ``None`` is passed through unchanged: Spark hands ``None`` to a UDF for
    null cells, and ``reversed(None)`` would otherwise raise a TypeError.
    """
    if s is None:
        return None
    return "".join(reversed(s))

# Wrap the plain Python function as a Spark UDF that returns strings.
# F.udf() is the public factory; the UserDefinedFunction constructor is not
# meant to be called directly.
str_reverese_udf = F.udf(str_reverese, T.StringType())

In [12]:
# Apply the UDF to the 'shift' column and store the result as a new column.
reversed_shift = str_reverese_udf(dessert['shift'])
dessert = dessert.withColumn('shift_reversed', reversed_shift)
dessert.show()

+---------+-------------+----------+-----+--------+-----------+--------------+
|  weekday|num_of_guests|     shift|table|purchase|no_purchase|shift_reversed|
+---------+-------------+----------+-----+--------+-----------+--------------+
|   Monday|            2|   evening|   13|    true|      false|       gnineve|
| Saturday|            4|     night|   19|    true|      false|         thgin|
| Saturday|            4|after-noon|   12|   false|       true|    noon-retfa|
| Thursday|            3|     night|    5|    true|      false|         thgin|
|   Monday|            7|      noon|   10|    true|      false|          noon|
|   Friday|            2|   evening|   19|   false|       true|       gnineve|
|   Monday|            2|   evening|   22|    true|      false|       gnineve|
|   Sunday|            4|      noon|    4|    true|      false|          noon|
| Thursday|            4|   evening|    9|   false|       true|       gnineve|
| Thursday|            1|      noon|   17|    true| 

### A little hack

I wrote a convenience decorator that derives the UDF's return type from Python 3 type annotations.

This code is **unofficial**, it uses **undocumented** features of spark, but it is very useful.

In [15]:
def udf(f):
    """Decorator: turn a return-annotated Python function into a Spark UDF.

    The Spark return type is looked up from ``f``'s return annotation in
    ``T._type_mappings`` (a private pyspark table), so ``f`` must carry a
    return annotation that is a key of that table (e.g. ``-> str``); a
    failed lookup propagates to the caller as-is.
    """
    returnType = T._type_mappings[signature(f).return_annotation]()
    # Use the public factory rather than calling the UserDefinedFunction
    # constructor directly.
    return F.udf(f, returnType)

In [16]:
@udf
def str_reverese(s)->str:
    """Return ``s`` reversed; ``None`` (a null cell) is passed through."""
    if s is None:
        return None
    return "".join(reversed(s))

# Apply the UDF produced by the decorator above. The previous version called
# the older `str_reverese_udf`, so the decorator was never actually exercised.
dessert = dessert.withColumn('shift_reversed2', str_reverese(F.col("shift")))
dessert.show()

+---------+-------------+----------+-----+--------+-----------+--------------+---------------+
|  weekday|num_of_guests|     shift|table|purchase|no_purchase|shift_reversed|shift_reversed2|
+---------+-------------+----------+-----+--------+-----------+--------------+---------------+
|   Monday|            2|   evening|   13|    true|      false|       gnineve|        gnineve|
| Saturday|            4|     night|   19|    true|      false|         thgin|          thgin|
| Saturday|            4|after-noon|   12|   false|       true|    noon-retfa|     noon-retfa|
| Thursday|            3|     night|    5|    true|      false|         thgin|          thgin|
|   Monday|            7|      noon|   10|    true|      false|          noon|           noon|
|   Friday|            2|   evening|   19|   false|       true|       gnineve|        gnineve|
|   Monday|            2|   evening|   22|    true|      false|       gnineve|        gnineve|
|   Sunday|            4|      noon|    4|    true

For more complex types, use:
https://github.com/urigoren/decorators4DS/blob/master/decorators4DS/pyspark_udf.py

## Exercise 3:

Write a user-defined-function that calculates the total amount of calories consumed.

1. Hamburger = 300 cal
1. Icecream ball = 130 cal

In [23]:
# Semicolon-separated file with a header row; column types are inferred.
diet = spark.read.csv("diet.txt",
                      header=True, inferSchema=True, sep=';')
# Show the frame that was just loaded (`sdf` was never defined in this notebook).
diet.show()

+---+----------+-----------+----------+---------------+---------+
| id|jogging.km|spinning.hr|hamburgers|ice.cream.balls|change.kg|
+---+----------+-----------+----------+---------------+---------+
|  1|       269|         29|         6|             35|     -7.5|
|  2|        79|         10|        13|             23|     -0.9|
|  3|       112|         46|         4|             22|     -6.1|
|  4|       172|         27|        14|             28|     -4.7|
|  5|       273|         31|        29|             47|     -7.0|
|  6|        60|         12|        23|             11|      1.2|
|  7|       270|         13|        28|             11|     -6.8|
|  8|       284|         36|        14|             21|     -8.9|
|  9|       198|         35|        18|             17|     -6.1|
| 10|       189|          4|        15|             44|     -1.3|
| 11|        18|          2|         3|              9|      0.7|
| 12|        61|         39|         7|             25|     -2.0|
| 13|     