In [28]:
import pyspark
import pandas as pd
import numpy as np
from pydataset import data
from pyspark.sql.functions import col, expr
from pyspark.sql.functions import lit
np.random.seed(333)

In [7]:
# Obtain the Spark session used by every later cell: getOrCreate() returns
# the already-active session if there is one, otherwise it starts a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/02/11 13:41:36 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/02/11 13:41:37 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
22/02/11 13:41:37 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


## Exercise 1:

In [17]:
# Build a single-key dictionary: 'language' maps to a list of
# programming-language names (one future dataframe column).
dictionary = {
    'language': [
        'Python',
        'SQL',
        'Swift',
        'C',
        'LOLCODE',
        'Rockstar',
        'TrumpScript',
        'Chicken',
    ],
}

In [18]:
# Display the dictionary to verify its key and the list of values.
dictionary

{'language': ['Python',
  'SQL',
  'Swift',
  'C',
  'LOLCODE',
  'Rockstar',
  'TrumpScript',
  'Chicken']}

In [19]:
# Turn the dictionary into a pandas dataframe; its single key becomes
# the single column name.
df = pd.DataFrame(data=dictionary)

In [20]:
# Display the pandas dataframe to verify its contents.
df

Unnamed: 0,language
0,Python
1,SQL
2,Swift
3,C
4,LOLCODE
5,Rockstar
6,TrumpScript
7,Chicken


In [21]:
# Convert the pandas dataframe into a Spark dataframe using the active session.
spark_df = spark.createDataFrame(df)

In [22]:
# Evaluating a Spark dataframe on its own displays only its column names and
# types, not its rows; use .show() to see data.
spark_df

DataFrame[language: string]

In [23]:
# Print the Spark dataframe's schema: each column's name, type and nullability.
spark_df.printSchema()

root
 |-- language: string (nullable = true)



In [24]:
# Report the Spark dataframe's dimensions as rows x columns.
# count() gives the number of rows; columns is the list of column names.
row_total = spark_df.count()
column_total = len(spark_df.columns)
print("DataFrame shape: ", row_total, " x ", column_total)

DataFrame shape:  8  x  1


In [25]:
# Show only the first five rows of the Spark dataframe.
spark_df.show(n=5)

+--------+
|language|
+--------+
|  Python|
|     SQL|
|   Swift|
|       C|
| LOLCODE|
+--------+
only showing top 5 rows



## Exercise 2:

Load the mpg dataset as a Spark dataframe.
a. Create one output column that contains a message like the example below for each record.

For example, the data should produce rows like this:

'The 1999 audi a4 has a 4 cylinder engine.'

In [26]:
# Load the 'mpg' dataset through pydataset's data() helper.
mpg = data('mpg')

In [27]:
# Inspect the dataset as returned by pydataset, before it is converted to Spark.
mpg

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
...,...,...,...,...,...,...,...,...,...,...,...
230,volkswagen,passat,2.0,2008,4,auto(s6),f,19,28,p,midsize
231,volkswagen,passat,2.0,2008,4,manual(m6),f,21,29,p,midsize
232,volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize
233,volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize


In [31]:
# Rebind `mpg` to a Spark dataframe built from the same pydataset table.
# NOTE(review): this reuses the name of the earlier non-Spark `mpg` object,
# so that object is no longer reachable as `mpg` after this cell runs.
mpg = spark.createDataFrame(data('mpg'))

In [32]:
# Evaluating the Spark dataframe lists its column names and types.
mpg

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]

In [34]:
# Note: The pyspark avg and mean functions are aliases of each other
# NOTE(review): importing sum, min and max by name rebinds Python's built-in
# sum(), min() and max() to the Spark column functions in this namespace.
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean

In [42]:
# List the column names available on the Spark mpg dataframe.
mpg.columns

['manufacturer',
 'model',
 'displ',
 'year',
 'cyl',
 'trans',
 'drv',
 'cty',
 'hwy',
 'fl',
 'class']

In [61]:
# Preview manufacturer and model joined by a single space, exposed as 'model'.
mpg.select(
    concat(col('manufacturer'), lit(' '), col('model')).alias('model')
).show()

+--------------------+
|               model|
+--------------------+
|             audi a4|
|             audi a4|
|             audi a4|
|             audi a4|
|             audi a4|
|             audi a4|
|             audi a4|
|     audi a4 quattro|
|     audi a4 quattro|
|     audi a4 quattro|
|     audi a4 quattro|
|     audi a4 quattro|
|     audi a4 quattro|
|     audi a4 quattro|
|     audi a4 quattro|
|     audi a6 quattro|
|     audi a6 quattro|
|     audi a6 quattro|
|chevrolet c1500 s...|
|chevrolet c1500 s...|
+--------------------+
only showing top 20 rows



In [77]:
# Build one human-readable sentence per record, e.g.
# 'The 1999 audi a4 has a 4 cylinder engine.'
# year and cyl are bigint columns; concat renders them as text in the message.
# The result is aliased so the output header is a short column name instead
# of the auto-generated concat(...) expression.
mpg.select(
    concat(
        lit('The '),
        col('year'),
        lit(' '),
        col('manufacturer'),
        lit(' '),
        col('model'),
        lit(' has a '),
        col('cyl'),
        lit(' cylinder engine.'),
    ).alias('message')
).show(truncate=False)

+--------------------------------------------------------------------------------+
|concat(The , year,  , manufacturer,  , model,  , has a , cyl,  cylinder engine.)|
+--------------------------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.                                       |
|The 1999 audi a4 has a 4 cylinder engine.                                       |
|The 2008 audi a4 has a 4 cylinder engine.                                       |
|The 2008 audi a4 has a 4 cylinder engine.                                       |
|The 1999 audi a4 has a 6 cylinder engine.                                       |
|The 1999 audi a4 has a 6 cylinder engine.                                       |
|The 2008 audi a4 has a 6 cylinder engine.                                       |
|The 1999 audi a4 quattro has a 4 cylinder engine.                               |
|The 1999 audi a4 quattro has a 4 cylinder engine.                               |
|The

In [98]:
from pyspark.sql.functions import when, length

In [80]:
# Classify each transmission: values of `trans` that start with "auto" are
# labelled "auto", every other value is labelled "manual".
mpg.select(
    mpg.trans,
    when(mpg.trans.like("auto%"), "auto")
    .otherwise("manual")
    .alias('when+like'),
).show()

+----------+---------+
|     trans|when+like|
+----------+---------+
|  auto(l5)|     auto|
|manual(m5)|   manual|
|manual(m6)|   manual|
|  auto(av)|     auto|
|  auto(l5)|     auto|
|manual(m5)|   manual|
|  auto(av)|     auto|
|manual(m5)|   manual|
|  auto(l5)|     auto|
|manual(m6)|   manual|
|  auto(s6)|     auto|
|  auto(l5)|     auto|
|manual(m5)|   manual|
|  auto(s6)|     auto|
|manual(m6)|   manual|
|  auto(l5)|     auto|
|  auto(s6)|     auto|
|  auto(s6)|     auto|
|  auto(l4)|     auto|
|  auto(l4)|     auto|
+----------+---------+
only showing top 20 rows



## Exercise 3:

In [82]:
# Load the 'tips' dataset from pydataset and convert it to a Spark dataframe.
tips = spark.createDataFrame(data("tips"))

In [84]:
# List the column names available on the Spark tips dataframe.
tips.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

### a. What percentage of observations are smokers?

In [105]:
# Share of observations per smoker status. The denominator is taken from the
# dataframe itself rather than a hard-coded row count, so the result stays
# correct if the dataset changes.
# Note: the values are fractions of 1 (e.g. 0.38), not 0-100 percentages.
tips.groupBy('smoker').count().withColumn('percentage', col('count') / tips.count()).show()

+------+-----+-------------------+
|smoker|count|         percentage|
+------+-----+-------------------+
|    No|  151| 0.6188524590163934|
|   Yes|   93|0.38114754098360654|
+------+-----+-------------------+



### b. Create a column that contains the tip percentage

In [108]:
# Show each tip next to its bill, plus the tip as a fraction of the bill.
tips.select(
    'tip',
    'total_bill',
    (col('tip') / col('total_bill')).alias("ratio"),
).show()

+----+----------+-------------------+
| tip|total_bill|              ratio|
+----+----------+-------------------+
|1.01|     16.99|0.05944673337257211|
|1.66|     10.34|0.16054158607350097|
| 3.5|     21.01|0.16658733936220846|
|3.31|     23.68| 0.1397804054054054|
|3.61|     24.59|0.14680764538430255|
|4.71|     25.29|0.18623962040332148|
| 2.0|      8.77|0.22805017103762829|
|3.12|     26.88|0.11607142857142858|
|1.96|     15.04|0.13031914893617022|
|3.23|     14.78| 0.2185385656292287|
|1.71|     10.27| 0.1665043816942551|
| 5.0|     35.26|0.14180374361883155|
|1.57|     15.42|0.10181582360570687|
| 3.0|     18.43|0.16277807921866522|
|3.02|     14.83|0.20364126770060686|
|3.92|     21.58|0.18164967562557924|
|1.67|     10.33| 0.1616650532429816|
|3.71|     16.29|0.22774708410067526|
| 3.5|     16.97|0.20624631703005306|
|3.35|     20.65|0.16222760290556903|
+----+----------+-------------------+
only showing top 20 rows



### c. Calculate the average tip percentage for each combination of sex and smoker.

In [113]:
# Mean tip-to-bill ratio for every sex/smoker combination:
# one row per sex, one column per smoker value.
(
    tips
    .select('tip', 'total_bill', 'sex', 'smoker')
    .withColumn("ratio", col('tip') / col('total_bill'))
    .groupBy('sex')
    .pivot('smoker')
    .mean('ratio')
    .show()
)

+------+------------------+-------------------+
|   sex|                No|                Yes|
+------+------------------+-------------------+
|Female|0.1569209707691836|0.18215035269941032|
|  Male|0.1606687151291298|0.15277117520248512|
+------+------------------+-------------------+

