### Lecture

In [1]:
import pandas as pd
import numpy as np
import pyspark

In [2]:
# make the spark object 
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [3]:
spark

In [3]:
import pyspark.sql.functions as F

In [4]:
spark.range(10).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [4]:
np.random.seed(456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [6]:
df = spark.createDataFrame(pandas_dataframe)

In [7]:
# shows us the object not the actual df
df

DataFrame[n: bigint, group: string]

In [8]:
# have to call a method to actually see the df
df.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
|  5|    c|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    b|
| 10|    b|
| 11|    a|
| 12|    b|
| 13|    a|
| 14|    b|
| 15|    b|
| 16|    c|
| 17|    c|
| 18|    a|
| 19|    c|
+---+-----+



In [9]:
# we'll see this is a column object 
df.group

Column<b'group'>

In [10]:
type(df.group)

pyspark.sql.column.Column

In [11]:
# dataframe object, spark is still being lazy
df.select(df.group)

DataFrame[group: string]

In [12]:
df.select(df.group).show()

+-----+
|group|
+-----+
|    b|
|    b|
|    c|
|    a|
|    c|
|    c|
|    a|
|    b|
|    a|
|    b|
|    b|
|    a|
|    b|
|    a|
|    b|
|    b|
|    c|
|    c|
|    a|
|    c|
+-----+



In [13]:
df.select(df.n).show()

+---+
|  n|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
| 10|
| 11|
| 12|
| 13|
| 14|
| 15|
| 16|
| 17|
| 18|
| 19|
+---+



In [14]:
df.select(df.n + 1).show()

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
|      9|
|     10|
|     11|
|     12|
|     13|
|     14|
|     15|
|     16|
|     17|
|     18|
|     19|
|     20|
+-------+



In [15]:
# this is a transformation
n_increment = df.n + 1
n_increment

Column<b'(n + 1)'>

In [16]:
# .show() is the action
df.select(n_increment).show()

+-------+
|(n + 1)|
+-------+
|      1|
|      2|
|      3|
|      4|
|      5|
|      6|
|      7|
|      8|
|      9|
|     10|
|     11|
|     12|
|     13|
|     14|
|     15|
|     16|
|     17|
|     18|
|     19|
|     20|
+-------+



In [None]:
# the fact that the transformation can exist independently of each other 
# spark df are immutable 

In [17]:
# transformation
df.describe()

DataFrame[summary: string, n: string, group: string]

In [19]:
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [21]:
df.describe().n

Column<b'n'>

In [23]:
df.describe().select('n', 'summary').show()

+-----------------+-------+
|                n|summary|
+-----------------+-------+
|               20|  count|
|              9.5|   mean|
|5.916079783099616| stddev|
|                0|    min|
|               19|    max|
+-----------------+-------+



In [26]:
from pydataset import data
data('mpg')

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
6,audi,a4,2.8,1999,6,manual(m5),f,18,26,p,compact
7,audi,a4,3.1,2008,6,auto(av),f,18,27,p,compact
8,audi,a4 quattro,1.8,1999,4,manual(m5),4,18,26,p,compact
9,audi,a4 quattro,1.8,1999,4,auto(l5),4,16,25,p,compact
10,audi,a4 quattro,2.0,2008,4,manual(m6),4,20,28,p,compact


In [27]:
mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [28]:
# can reference columns through sting or df.column name
mpg.select('model', 'manufacturer', 'hwy').show()

+------------------+------------+---+
|             model|manufacturer|hwy|
+------------------+------------+---+
|                a4|        audi| 29|
|                a4|        audi| 29|
|                a4|        audi| 31|
|                a4|        audi| 30|
|                a4|        audi| 26|
|                a4|        audi| 26|
|                a4|        audi| 27|
|        a4 quattro|        audi| 26|
|        a4 quattro|        audi| 25|
|        a4 quattro|        audi| 28|
|        a4 quattro|        audi| 27|
|        a4 quattro|        audi| 25|
|        a4 quattro|        audi| 25|
|        a4 quattro|        audi| 25|
|        a4 quattro|        audi| 25|
|        a6 quattro|        audi| 24|
|        a6 quattro|        audi| 25|
|        a6 quattro|        audi| 23|
|c1500 suburban 2wd|   chevrolet| 20|
|c1500 suburban 2wd|   chevrolet| 15|
+------------------+------------+---+
only showing top 20 rows



In [31]:
mpg.select('model', 'manufacturer', mpg.hwy.alias("highway_mileage")).show()

+------------------+------------+---------------+
|             model|manufacturer|highway_mileage|
+------------------+------------+---------------+
|                a4|        audi|             29|
|                a4|        audi|             29|
|                a4|        audi|             31|
|                a4|        audi|             30|
|                a4|        audi|             26|
|                a4|        audi|             26|
|                a4|        audi|             27|
|        a4 quattro|        audi|             26|
|        a4 quattro|        audi|             25|
|        a4 quattro|        audi|             28|
|        a4 quattro|        audi|             27|
|        a4 quattro|        audi|             25|
|        a4 quattro|        audi|             25|
|        a4 quattro|        audi|             25|
|        a4 quattro|        audi|             25|
|        a6 quattro|        audi|             24|
|        a6 quattro|        audi|             25|


In [32]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [34]:
mpg.select(((mpg.cty + mpg.hwy)/ 2).alias('avg_mileage')).show()

+-----------+
|avg_mileage|
+-----------+
|       23.5|
|       25.0|
|       25.5|
|       25.5|
|       21.0|
|       22.0|
|       22.5|
|       22.0|
|       20.5|
|       24.0|
|       23.0|
|       20.0|
|       21.0|
|       21.0|
|       20.0|
|       19.5|
|       21.0|
|       19.5|
|       17.0|
|       13.0|
+-----------+
only showing top 20 rows



In [37]:
# transformation
avg_mileage_col = ((mpg.cty + mpg.hwy)/ 2).alias('avg_mileage')
# .show() action
mpg.select('*', avg_mileage_col).show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+-----------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|avg_mileage|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+-----------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|       23.5|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|       25.0|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|       25.5|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|       25.5|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|       21.0|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|       22.0|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|       22.5|
|        audi|        a4 quattro|  1.8|1

In [38]:
# string --> string, double --> float, long --> int
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [39]:
from pyspark.sql.functions import col

In [40]:
col

<function pyspark.sql.functions._create_function.<locals>._(col)>

In [42]:
mpg.select(col('hwy'))

DataFrame[hwy: bigint]

### Spark API Mini Exercises

1. Spark Dataframe Basics

    - Use the starter code above to create a pandas dataframe.
    - Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.
    - Show the first 3 rows of the dataframe.
    - Show the first 7 rows of the dataframe.
    - View a summary of the data using .describe.
    - Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.
    - Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.
    - Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.
    - Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [43]:
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [44]:
# Use the starter code above to create a pandas dataframe.
pandas_dataframe

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False
5,0.532338,y,False
6,1.350188,z,False
7,0.861211,x,False
8,1.478686,z,True
9,-1.045377,y,True


In [46]:
# Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [48]:
# Show the first 3 rows of the dataframe.
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [49]:
# Show the first 7 rows of the dataframe.
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [50]:
# View a summary of the data using .describe.
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



In [51]:
# Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.
df.select('n', 'abool').show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [52]:
# Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.
df.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [53]:
# Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.
df.select('group', df.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



In [54]:
# Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.
df.select('group', df.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



2. Column Manipulation

    - Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named df

    - Use .select to add 4 to the n column. Show the results.

    - Subtract 5 from the n column and view the results.

    - Multiply the n column by 2. View the results along with the original numbers.

    - Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.

    - Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.

    - What happens when you run the code below?
        - df.group + df.abool
    
    - What happens when you run the code below? What is the difference between this and the previous code sample?
        - df.select(df.group + df.abool)
        
    - Try adding various other columns together. What are the results of combining the different data types?

In [181]:
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [182]:
pandas_dataframe

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False
5,0.532338,y,False
6,1.350188,z,False
7,0.861211,x,False
8,1.478686,z,True
9,-1.045377,y,True


In [183]:
# Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named df
data = spark.createDataFrame(pandas_dataframe)
data

DataFrame[n: double, group: string, abool: boolean]

In [184]:
# Use .select to add 4 to the n column. Show the results.
data.select(data.n + 4).show()

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
| 4.532337888294546|
| 5.350187899722527|
|  4.86121137416932|
| 5.478685737435897|
| 2.954622869461466|
|3.2110109750484512|
| 2.738394054680931|
| 4.562846785281032|
|3.7566737481144377|
| 4.913740704859677|
| 4.317350922736336|
| 4.127303280206981|
| 6.150382967381113|
| 4.606288656896298|
|3.9732283500135592|
+------------------+



In [185]:
# Subtract 5 from the n column and view the results.
data.select(data.n - 5).show()

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
| -4.467662111705454|
|-3.6498121002774733|
|  -4.13878862583068|
| -3.521314262564103|
| -6.045377130538534|
| -5.788989024951549|
| -6.261605945319069|
| -4.437153214718968|
| -5.243326251885563|
| -4.086259295140323|
| -4.682649077263664|
| -4.872696719793019|
|-2.8496170326188874|
| -4.393711343103702|
| -5.026771649986441|
+-------------------+



In [186]:
# Multiply the n column by 2. View the results along with the original numbers.
data.select(data.n, data.n*2).show()

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
|  0.5323378882945463|  1.0646757765890926|
|  1.3501878997225267|  2.7003757994450535|
|  0.8612113741693206|  1.7224227483386412|
|  1.4786857374358966|   2.957371474871793|
| -1.0453771305385342| -2.0907542610770684|
| -0.7889890249515489| -1.5779780499030978|
|  -1.261605945319069|  -2.523211890638138|
|  0.5628467852810314|  1.1256935705620628|
|-0.24332625188556253|-0.48665250377112507|
|  0.9137407048596775|   1.827481409719355|
| 0.31735092273633597|  0.6347018454726719|
| 0.12730328020698067| 0.25460656041396135|
|  2.1503829673811126|   4.300765934762225|
|  0.6062886568962988|  1.2125773137925977|
|-0.02677164998644...|-0.0535432

In [187]:
# Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.
n2 = (data.n * -1).alias('n2')
data.select(data.n, n2).show()

+--------------------+--------------------+
|                   n|                  n2|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
|  1.3451017084510097| -1.3451017084510097|
|  0.5323378882945463| -0.5323378882945463|
|  1.3501878997225267| -1.3501878997225267|
|  0.8612113741693206| -0.8612113741693206|
|  1.4786857374358966| -1.4786857374358966|
| -1.0453771305385342|  1.0453771305385342|
| -0.7889890249515489|  0.7889890249515489|
|  -1.261605945319069|   1.261605945319069|
|  0.5628467852810314| -0.5628467852810314|
|-0.24332625188556253| 0.24332625188556253|
|  0.9137407048596775| -0.9137407048596775|
| 0.31735092273633597|-0.31735092273633597|
| 0.12730328020698067|-0.12730328020698067|
|  2.1503829673811126| -2.1503829673811126|
|  0.6062886568962988| -0.6062886568962988|
|-0.02677164998644...|0.02677164

In [188]:
# Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.
n3 = (data.n**2).alias('n3')
data.select(data.n, n2, n3).show(5)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [72]:
# What happens when you run the code below?
data.group + data.abool
# it shows a column object

Column<b'(group + abool)'>

In [92]:
# What happens when you run the code below? What is the difference between this and the previous code sample?
data.select((data.group + data.abool))
# is will show an error because it is lazy, it hasn't actually performed the aggregation

AnalysisException: "cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;;\n'Project [(cast(group#1094 as double) + abool#1095) AS (group + abool)#1229]\n+- LogicalRDD [n#1093, group#1094, abool#1095], false\n"

In [None]:
# Try adding various other columns together. What are the results of combining the different data types?


### Lecture

In [93]:
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [94]:
from pyspark.sql.functions import expr

In [100]:
mpg.select(expr('hwy AS highway_average')).show(5)

+---------------+
|highway_average|
+---------------+
|             29|
|             29|
|             31|
|             30|
|             26|
+---------------+
only showing top 5 rows



In [96]:
# allows you to write sql queries instead of python
mpg.createOrReplaceTempView('mpg')

In [99]:
spark.sql("""
SELECT * FROM mpg
""").show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [101]:
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [103]:
mpg.dtypes

[('manufacturer', 'string'),
 ('model', 'string'),
 ('displ', 'double'),
 ('year', 'bigint'),
 ('cyl', 'bigint'),
 ('trans', 'string'),
 ('drv', 'string'),
 ('cty', 'bigint'),
 ('hwy', 'bigint'),
 ('fl', 'string'),
 ('class', 'string')]

In [109]:
mpg.select(mpg.manufacturer, mpg.model, mpg.cyl, mpg.cyl.cast('string').alias('cyl_string')).show()

+------------+------------------+---+----------+
|manufacturer|             model|cyl|cyl_string|
+------------+------------------+---+----------+
|        audi|                a4|  4|         4|
|        audi|                a4|  4|         4|
|        audi|                a4|  4|         4|
|        audi|                a4|  4|         4|
|        audi|                a4|  6|         6|
|        audi|                a4|  6|         6|
|        audi|                a4|  6|         6|
|        audi|        a4 quattro|  4|         4|
|        audi|        a4 quattro|  4|         4|
|        audi|        a4 quattro|  4|         4|
|        audi|        a4 quattro|  4|         4|
|        audi|        a4 quattro|  6|         6|
|        audi|        a4 quattro|  6|         6|
|        audi|        a4 quattro|  6|         6|
|        audi|        a4 quattro|  6|         6|
|        audi|        a6 quattro|  6|         6|
|        audi|        a6 quattro|  6|         6|
|        audi|      

In [111]:
# be careful with this 
mpg.select(mpg.manufacturer.cast('double')).show(5)

+------------+
|manufacturer|
+------------+
|        null|
|        null|
|        null|
|        null|
|        null|
+------------+
only showing top 5 rows



In [113]:
from pyspark.sql.functions import min, max

In [116]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [117]:
mpg.select(min(mpg.cyl).alias('min_cyl'), max(mpg.hwy)).show(5)

+-------+--------+
|min_cyl|max(hwy)|
+-------+--------+
|      4|      44|
+-------+--------+



In [118]:
spark.sql('SELECT min(cyl), max(hwy) FROM mpg').show(5)

+--------+--------+
|min(cyl)|max(hwy)|
+--------+--------+
|       4|      44|
+--------+--------+



### Spark API Mini Exercises

3. Spark SQL

    - Use the starter code above to re-create a spark dataframe.
    - Turn your dataframe into a table that can be queried with spark SQL. Name the table my_df. Answer the rest of the questions in this section with a spark sql query (spark.sql) against my_df. After each step, view the first 7 records from the dataframe.
    - Write a query that shows all of the columns from your dataframe.
    - Write a query that shows just the n and abool columns from the dataframe.
    - Write a query that shows just the n and group columns. Rename the group column to g.
    - Write a query that selects n, and creates two new columns: n2, the original n values halved, and n3: the original n values minus 1.
    - What happens if you make a SQL syntax error in your query?

In [119]:
np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [121]:
pandas_dataframe.head()

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


In [132]:
my_df = spark.createDataFrame(pandas_dataframe)
my_df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [134]:
# # Turn your dataframe into a table that can be queried with spark SQL. Name the table my_df. Answer the rest of the questions in this section with a spark sql query (spark.sql) against my_df. After each step, view the first 7 records from the dataframe.
my_df.createOrReplaceTempView('my_df')
my_df.show()

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
|  0.8612113741693206|    x|false|
|  1.4786857374358966|    z| true|
| -1.0453771305385342|    y| true|
| -0.7889890249515489|    x|false|
|  -1.261605945319069|    y|false|
|  0.5628467852810314|    y| true|
|-0.24332625188556253|    y| true|
|  0.9137407048596775|    y|false|
| 0.31735092273633597|    x|false|
| 0.12730328020698067|    z|false|
|  2.1503829673811126|    y| true|
|  0.6062886568962988|    x|false|
|-0.02677164998644...|    x| true|
+--------------------+-----+-----+



In [135]:
# Write a query that shows all of the columns from your dataframe.
spark.sql("""
SELECT * FROM my_df
""").show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [136]:
# Write a query that shows just the n and abool columns from the dataframe.
spark.sql("""
SELECT n, abool FROM my_df
""").show(7)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
|  0.5323378882945463|false|
|  1.3501878997225267|false|
+--------------------+-----+
only showing top 7 rows



In [137]:
# Write a query that shows just the n and group columns. Rename the group column to g.
spark.sql("""
SELECT n, group AS g FROM my_df
""").show(7)

+--------------------+---+
|                   n|  g|
+--------------------+---+
|  -0.712390662050588|  z|
|   0.753766378659703|  x|
|-0.04450307833805...|  z|
| 0.45181233874578974|  y|
|  1.3451017084510097|  z|
|  0.5323378882945463|  y|
|  1.3501878997225267|  z|
+--------------------+---+
only showing top 7 rows



In [138]:
# Write a query that selects n, and creates two new columns: n2, the original n values halved, and n3: the original n values minus 1.
spark.sql("""
SELECT n, (n / 2) AS n2, (n - 1) AS n3 FROM my_df
""").show()

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|  -0.356195331025294|  -1.712390662050588|
|   0.753766378659703|  0.3768831893298515|-0.24623362134029703|
|-0.04450307833805...|-0.02225153916902...| -1.0445030783380536|
| 0.45181233874578974| 0.22590616937289487| -0.5481876612542103|
|  1.3451017084510097|  0.6725508542255049| 0.34510170845100974|
|  0.5323378882945463| 0.26616894414727316| -0.4676621117054537|
|  1.3501878997225267|  0.6750939498612634| 0.35018789972252673|
|  0.8612113741693206|  0.4306056870846603| -0.1387886258306794|
|  1.4786857374358966|  0.7393428687179483|  0.4786857374358966|
| -1.0453771305385342| -0.5226885652692671|  -2.045377130538534|
| -0.7889890249515489|-0.39449451247577444| -1.7889890249515488|
|  -1.261605945319069| -0.6308029726595346|  -2.261605945319069|
|  0.5628467852810314|  0

In [141]:
# What happens if you make a SQL syntax error in your query?
spark.sql("""
SELECT * AS my_df
""")
# the error message shows the error in the syntax

AnalysisException: "Invalid usage of '*' in expression 'alias';"

4. Type casting

    - Use the starter code above to re-create a spark dataframe.

    - Use .printSchema to view the datatypes in your dataframe.

    - Use .dtypes to view the datatypes in your dataframe.

    - What is the difference between the two code samples below?
        - df.abool.cast('int')
        - df.select(df.abool.cast('int')).show()

    - Use .select and .cast to convert the abool column to an integer type. View the results.

    - Convert the group column to a integer data type and view the results. What happens?

    - Convert the n column to a integer data type and view the results. What happens?

    - Convert the abool column to a string data type and view the results. What happens?

In [142]:
np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [144]:
pandas_dataframe.head(5)

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


In [148]:
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [149]:
# Use .printSchema to view the datatypes in your dataframe.
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



In [154]:
# Use .dtypes to view the datatypes in your dataframe.
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [155]:
# What is the difference between the two code samples below?
df.abool.cast('int')
# This code shows the object changing the dtype

Column<b'CAST(abool AS INT)'>

In [162]:
df.select(df.abool.cast('int')).show(5)
# This code shows the column with the dtype changed 

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 5 rows



In [161]:
# Use .select and .cast to convert the abool column to an integer type. View the results.
df.select(df.abool.cast('int')).show(5)
# the value changed to a numeric value, 1 or 0

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 5 rows



In [163]:
# Convert the group column to a integer data type and view the results. What happens?
df.select(df.group.cast('int')).show(5)
# group is a string dtype so it becomes null becasue can't convert to int

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
+-----+
only showing top 5 rows



In [164]:
# Convert the n column to a integer data type and view the results. What happens?
df.select(df.n.cast('int')).show(5)

+---+
|  n|
+---+
|  0|
|  0|
|  0|
|  0|
|  1|
+---+
only showing top 5 rows



In [167]:
# Convert the abool column to a string data type and view the results. What happens?
df.select(df.abool.cast('string')).show(5)

+-----+
|abool|
+-----+
|false|
|false|
|false|
|false|
|false|
+-----+
only showing top 5 rows



5. Built-in Functions

    - Use the starter code above to re-create a spark dataframe.
    - Import the necessary functions from pyspark.sql.functions
    - Find the highest n value.
    - Find the lowest n value.
    - Find the average n value.
    - Use concat to change the group column to say, e.g. "Group: x" or "Group: y"
    - Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"

In [168]:
np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [169]:
pandas_dataframe.head()

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


In [171]:
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [172]:
# Import the necessary functions from pyspark.sql.functions
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean

In [175]:
# Find the highest n value.
df.select(max('n')).show(5)

+------------------+
|            max(n)|
+------------------+
|2.1503829673811126|
+------------------+



In [177]:
# Find the lowest n value.
df.select(min('n')).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



In [180]:
# Find the average n value.
df.select(mean('n')).show()
df.select(avg('n')).show()
# both work the same

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885217|
+------------------+

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885217|
+------------------+



In [None]:
from pyspark.sql.functions import lit

In [219]:
# Use concat to change the group column to say, e.g. "Group: x" or "Group: y"
df.select(concat(lit('Group:'), df.group, lit(" "), df.n)).show(5)

+---------------------------+
|concat(Group:, group,  , n)|
+---------------------------+
|       Group:z -0.712390...|
|       Group:x 0.7537663...|
|       Group:z -0.044503...|
|       Group:y 0.4518123...|
|       Group:z 1.3451017...|
+---------------------------+
only showing top 5 rows



In [223]:
# Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"
df.select(concat(df.group, lit(":"), df.n)).show(5)

+--------------------+
| concat(group, :, n)|
+--------------------+
|z:-0.712390662050588|
| x:0.753766378659703|
|z:-0.044503078338...|
|y:0.4518123387457...|
|z:1.3451017084510097|
+--------------------+
only showing top 5 rows

