In [76]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the 20 random rows are identical on every run.
np.random.seed(13)

# The draw order (n, then group, then abool) determines the values produced
# from the seed, so the columns are listed in that order.
pandas_dataframe = pd.DataFrame(
    dict(
        n=np.random.randn(20),
        group=np.random.choice(["x", "y", "z"], size=20),
        abool=np.random.choice([True, False], size=20),
    )
)

In [77]:
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [78]:
sdf = spark.createDataFrame(pandas_dataframe)
sdf

DataFrame[n: double, group: string, abool: boolean]

### 1. Spark DataFrame Basics

#### Show the first 3 rows of the dataframe.

In [79]:
sdf.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



#### Show the first 7 rows of the dataframe.


In [80]:
sdf.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



#### What is the difference between .show and .head?

In [81]:
sdf.head(3)

[Row(n=-0.712390662050588, group='z', abool=False),
 Row(n=0.753766378659703, group='x', abool=False),
 Row(n=-0.044503078338053455, group='z', abool=False)]

In [82]:
sdf.head(7)

[Row(n=-0.712390662050588, group='z', abool=False),
 Row(n=0.753766378659703, group='x', abool=False),
 Row(n=-0.044503078338053455, group='z', abool=False),
 Row(n=0.45181233874578974, group='y', abool=False),
 Row(n=1.3451017084510097, group='z', abool=False),
 Row(n=0.5323378882945463, group='y', abool=False),
 Row(n=1.3501878997225267, group='z', abool=False)]

#### View a summary of the data using .describe.


In [83]:
sdf.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



#### Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.

In [84]:
sdf.select('n', 'abool').show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



#### Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.

In [85]:
sdf.select('group', 'abool').show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



#### Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.

In [86]:
sdf.select('group', sdf.abool.alias('a_boolean_value')).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



#### Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.

In [87]:
sdf.select('group', sdf.n.alias('a_numeric_value')).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



### 2. Column Manipulation

#### ii. Use .select to add 4 to the n column. Show the results.

In [88]:
sdf.select(sdf.n + 4).show(3)

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
+------------------+
only showing top 3 rows



#### iii. Subtract 5 from the n column and view the results.


In [89]:
sdf.select(sdf.n - 5).show(3)

+------------------+
|           (n - 5)|
+------------------+
|-5.712390662050588|
|-4.246233621340297|
|-5.044503078338053|
+------------------+
only showing top 3 rows



#### iv. Multiply the n column by 2. View the results along with the original numbers.


In [90]:
sdf.select(sdf.n, sdf.n * 2).show(3)

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
+--------------------+--------------------+
only showing top 3 rows



#### v. Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.


In [91]:
# Rebuild sdf with group, abool and n (in that order) plus n2 = n * -1.
# sdf is rebound here, so every later cell sees the n2 column.
sdf = sdf.select(
    sdf.group,
    sdf.abool,
    sdf.n,
    (sdf.n * -1).alias('n2'),
)
sdf.show(n=4)

+-----+-----+--------------------+--------------------+
|group|abool|                   n|                  n2|
+-----+-----+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|
|    x|false|   0.753766378659703|  -0.753766378659703|
|    z|false|-0.04450307833805...|0.044503078338053455|
|    y|false| 0.45181233874578974|-0.45181233874578974|
+-----+-----+--------------------+--------------------+
only showing top 4 rows



#### vi. Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see n, n2, and n3.


In [92]:
# Append n3 = n squared and rebind sdf so later cells can use it.
# withColumn replaces an existing column of the same name, so re-running this
# cell does not leave sdf with two n3 columns (select('*', ...) would, which
# makes later references to sdf.n3 ambiguous).
sdf = sdf.withColumn('n3', sdf.n ** 2)
sdf.show(5)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 5 rows



#### vii. What happens when you run the code below?

`df.group + df.abool`

In [93]:
sdf.group + sdf.abool

Column<'(group + abool)'>

#### viii. What happens when you run the code below? What is the difference between this and the previous code sample?

`df.select(df.group + df.abool)`

In [94]:
# Unlike the bare column expression above, .select analyses the expression
# immediately (no .show() is needed to trigger the failure), and a string
# column cannot be added to a boolean column. Catch the error and print it so
# the demonstration does not abort a Restart & Run All.
from pyspark.sql.utils import AnalysisException

try:
    sdf.select(sdf.group + sdf.abool)
except AnalysisException as err:
    print(f'AnalysisException: {err}')

AnalysisException: cannot resolve '(CAST(group AS DOUBLE) + abool)' due to data type mismatch: differing types in '(CAST(group AS DOUBLE) + abool)' (double and boolean).;
'Project [unresolvedalias((cast(group#1085 as double) + abool#1086), Some(org.apache.spark.sql.Column$$Lambda$3270/0x0000000801279840@4e01bedb))]
+- Project [group#1085, abool#1086, n#1084, n2#1389, POWER(n#1084, cast(2 as double)) AS n3#1411]
   +- Project [group#1085, abool#1086, n#1084, (n#1084 * cast(-1 as double)) AS n2#1389]
      +- LogicalRDD [n#1084, group#1085, abool#1086], false


#### ix. Try adding various other columns together. What are the results of combining the different data types?

In [95]:
sdf.select(sdf.group + sdf.n).show(3)

+-----------+
|(group + n)|
+-----------+
|       null|
|       null|
|       null|
+-----------+
only showing top 3 rows



In [97]:
# Adding a boolean column to a double column is rejected with a data type
# mismatch. Catch the error and print it so the cells below still run during
# a Restart & Run All.
from pyspark.sql.utils import AnalysisException

try:
    sdf.select(sdf.abool + sdf.n).show(3)
except AnalysisException as err:
    print(f'AnalysisException: {err}')

AnalysisException: cannot resolve '(abool + n)' due to data type mismatch: differing types in '(abool + n)' (boolean and double).;
'Project [unresolvedalias((abool#1086 + n#1084), Some(org.apache.spark.sql.Column$$Lambda$3270/0x0000000801279840@4e01bedb))]
+- Project [group#1085, abool#1086, n#1084, n2#1389, POWER(n#1084, cast(2 as double)) AS n3#1411]
   +- Project [group#1085, abool#1086, n#1084, (n#1084 * cast(-1 as double)) AS n2#1389]
      +- LogicalRDD [n#1084, group#1085, abool#1086], false


In [None]:
sdf.select(sdf.n3 + sdf.n).show(3)

### 3. Type Casting

#### ii. Use .printSchema to view the datatypes in your dataframe.


In [None]:
sdf.printSchema()

#### iii. Use .dtypes to view the datatypes in your dataframe

In [None]:
sdf.dtypes

#### iv. What is the difference between the two code samples below?
`df.abool.cast('int')`

`df.select(df.abool.cast('int')).show()`

In [None]:
sdf.abool.cast('int')

In [None]:
sdf.select(sdf.abool.cast('int')).show(3)

#### v. Use .select and .cast to convert the abool column to an integer type. View the results.


In [None]:
sdf.select(sdf.abool.cast('int')).show(3)

#### vi. Convert the group column to an integer data type and view the results. What happens?


In [None]:
sdf.select(sdf.group.cast('int')).show(3)

#### vii. Convert the n column to an integer data type and view the results. What happens?


In [None]:
sdf.select(sdf.n, sdf.n.cast('int').alias('n_as_int')).show(7)

#### viii. Convert the abool column to a string data type and view the results. What happens?


In [None]:
sdf.select(sdf.abool, sdf.abool.cast('string').alias('abool_as_string')).show(10)

### 4. Built-in Functions

#### ii. Import the necessary functions from pyspark.sql.functions

In [None]:
import pyspark.sql.functions as F

#### iii. Find the highest n value.

In [None]:
sdf.select(F.max(sdf.n)).show()

#### iv. Find the lowest n value.

In [None]:
sdf.select(F.min(sdf.n)).show()

#### v. Find the average n value.

In [None]:
sdf.select(F.mean(sdf.n)).show()

#### vi. Use concat to change the group column to say, e.g. "Group: x" or "Group: y"

In [None]:
sdf.select(F.concat(F.lit('Group: '), sdf.group)).show(5)

#### vii. Use concat to combine the n and group columns to produce results that look like this: "x: -1.432" or "z: 2.352"

In [None]:
# Build labels such as "x: -1.432": the group, a colon, then n formatted to
# three decimal places to match the example output in the exercise prompt
# (a plain concat would emit the full-precision double instead).
sdf.select(
    F.concat(sdf.group, F.lit(': '), F.format_number(sdf.n, 3)).alias('group_and_n')
).show(5, truncate=False)

### 5. When / Otherwise

#### ii. Use when and .otherwise to create a column that contains the text "It is true" when abool is true and "It is false" when abool is false.

In [None]:
# Label each row according to abool. Both branches use the lowercase wording
# the exercise asks for ("It is true" / "It is false"); the false branch
# previously read 'It is False'.
sdf.select(
    sdf.abool,
    F.when(sdf.abool, F.lit('It is true'))
     .otherwise(F.lit('It is false'))
     .alias('What is it?'),
).show(10)

#### iii. Create a column that contains 0 if n is less than 0, otherwise, the original n value.

In [None]:
# Replace negative n values with zero and keep every other value unchanged.
# The zero literal is a float so both branches share n's double type, and the
# result is aliased so the header is a readable name rather than the
# auto-generated CASE WHEN expression text.
sdf.select(
    sdf.n,
    F.when(sdf.n < 0, 0.0)
     .otherwise(sdf.n)
     .alias('n_floored_at_zero'),
).show(3)

### 6. Filter / Where

#### ii. Use .filter or .where to select just the rows where the group is y and view the results.

In [None]:
sdf.filter(sdf.group == 'y').show(3)

#### iii. Select just the rows where the abool column is false and view the results.

In [98]:
# Keep only the rows whose abool flag is false (negate the boolean column).
sdf.where(~sdf.abool).show(n=3)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 3 rows



#### iv. Find the rows where the group column is not y.

In [99]:
sdf.filter(sdf.group != 'y').show(3)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 3 rows



#### v. Find the rows where n is positive.

In [100]:
sdf.filter(sdf.n > 0).show(3)

+-----+-----+-------------------+--------------------+-------------------+
|group|abool|                  n|                  n2|                 n3|
+-----+-----+-------------------+--------------------+-------------------+
|    x|false|  0.753766378659703|  -0.753766378659703| 0.5681637535977627|
|    y|false|0.45181233874578974|-0.45181233874578974|0.20413438944294027|
|    z|false| 1.3451017084510097| -1.3451017084510097| 1.8092986060778251|
+-----+-----+-------------------+--------------------+-------------------+
only showing top 3 rows



#### vi. Find the rows where abool is true and the group column is z.

In [101]:
# Keep rows that are flagged true and also belong to group z. The boolean
# column is used directly as the first condition.
sdf.where(sdf.abool & (sdf.group == 'z')).show()

+-----+-----+------------------+-------------------+------------------+
|group|abool|                 n|                 n2|                n3|
+-----+-----+------------------+-------------------+------------------+
|    z| true|1.4786857374358966|-1.4786857374358966|2.1865115100963415|
+-----+-----+------------------+-------------------+------------------+



#### vii. Find the rows where abool is true or the group column is z.

In [102]:
sdf.filter((sdf.abool) | (sdf.group == 'z')).show(3)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 3 rows



#### viii. Find the rows where abool is false and n is less than 1.

In [103]:
sdf.filter((~ sdf.abool) & (sdf.n < 1)).show(3)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 3 rows



#### ix. Find the rows where abool is false or n is less than 1.

In [104]:
sdf.filter((~ sdf.abool) | (sdf.n < 1)).show(3)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 3 rows



### 7. Sorting

#### ii. Sort by the n value.

In [105]:
sdf.sort('n').show(3)

+-----+-----+-------------------+------------------+------------------+
|group|abool|                  n|                n2|                n3|
+-----+-----+-------------------+------------------+------------------+
|    y|false| -1.261605945319069| 1.261605945319069| 1.591649561264422|
|    y| true|-1.0453771305385342|1.0453771305385342|1.0928133450529796|
|    x|false|-0.7889890249515489|0.7889890249515489|0.6225036814939958|
+-----+-----+-------------------+------------------+------------------+
only showing top 3 rows



#### iii. Sort by the group value, both ascending and descending.

In [106]:
sdf.sort('group').show(10) # ascending

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    x|false| 0.31735092273633597|-0.31735092273633597| 0.10071160816160388|
|    x|false| -0.7889890249515489|  0.7889890249515489|  0.6225036814939958|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    x|false|  0.8612113741693206| -0.8612113741693206|  0.7416850309986095|
|    x|false|  0.6062886568962988| -0.6062886568962988|   0.367585935481118|
|    x| true|-0.02677164998644...|0.026771649986440726|7.167212429964917E-4|
|    y| true|  0.5628467852810314| -0.5628467852810314| 0.31679650370119145|
|    y|false|  -1.261605945319069|   1.261605945319069|   1.591649561264422|
|    y|false|  0.9137407048596775| -0.9137407048596775|  0.8349220757174602|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|

In [107]:
# Descending sort on group, expressed through the column's own .desc().
sdf.orderBy(sdf.group.desc()).show(10)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    z|false|  1.3501878997225267| -1.3501878997225267|  1.8230073645571279|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
|    z| true|  1.4786857374358966| -1.4786857374358966|  2.1865115100963415|
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    z|false| 0.12730328020698067|-0.12730328020698067|0.016206125151457036|
|    y| true|-0.24332625188556253| 0.24332625188556253| 0.05920766485667622|
|    y| true|  0.5628467852810314| -0.5628467852810314| 0.31679650370119145|
|    y|false|  0.9137407048596775| -0.9137407048596775|  0.8349220757174602|
|    y| true| -1.0453771305385342|  1.0453771305385342|  1.0928133450529796|

#### iv. Sort by the group value first, then, within each group, sort by n value.

In [108]:
sdf.sort('group', 'n').show()

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    x|false| -0.7889890249515489|  0.7889890249515489|  0.6225036814939958|
|    x| true|-0.02677164998644...|0.026771649986440726|7.167212429964917E-4|
|    x|false| 0.31735092273633597|-0.31735092273633597| 0.10071160816160388|
|    x|false|  0.6062886568962988| -0.6062886568962988|   0.367585935481118|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    x|false|  0.8612113741693206| -0.8612113741693206|  0.7416850309986095|
|    y|false|  -1.261605945319069|   1.261605945319069|   1.591649561264422|
|    y| true| -1.0453771305385342|  1.0453771305385342|  1.0928133450529796|
|    y| true|-0.24332625188556253| 0.24332625188556253| 0.05920766485667622|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|

#### v. Sort by abool, group, and n. Does it matter in what order you specify the columns when sorting?

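Yes, it matters. Spark sorts by the first column listed, and each later column only breaks ties among rows that are equal on the columns before it. Compare the two orderings below: sorting by abool first puts every false row ahead of every true row, while sorting by group first keeps each group together and orders by abool only within a group.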

In [109]:
sdf.sort('abool', 'group', 'n').show()

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    x|false| -0.7889890249515489|  0.7889890249515489|  0.6225036814939958|
|    x|false| 0.31735092273633597|-0.31735092273633597| 0.10071160816160388|
|    x|false|  0.6062886568962988| -0.6062886568962988|   0.367585935481118|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    x|false|  0.8612113741693206| -0.8612113741693206|  0.7416850309986095|
|    y|false|  -1.261605945319069|   1.261605945319069|   1.591649561264422|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|
|    y|false|  0.9137407048596775| -0.9137407048596775|  0.8349220757174602|
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|

In [110]:
sdf.sort('group', 'abool', 'n').show()

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    x|false| -0.7889890249515489|  0.7889890249515489|  0.6225036814939958|
|    x|false| 0.31735092273633597|-0.31735092273633597| 0.10071160816160388|
|    x|false|  0.6062886568962988| -0.6062886568962988|   0.367585935481118|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    x|false|  0.8612113741693206| -0.8612113741693206|  0.7416850309986095|
|    x| true|-0.02677164998644...|0.026771649986440726|7.167212429964917E-4|
|    y|false|  -1.261605945319069|   1.261605945319069|   1.591649561264422|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|
|    y|false|  0.9137407048596775| -0.9137407048596775|  0.8349220757174602|

### 8. Aggregating

#### i. What is the average n value for each group in the group column?

In [111]:
sdf.groupby('group').agg(F.mean('n')).show()

+-----+------------------+
|group|            avg(n)|
+-----+------------------+
|    x|0.2871427762539448|
|    z| 0.590730814237962|
|    y| 0.257601419602374|
+-----+------------------+



#### ii. What is the maximum n value for each group in the group column?

In [112]:
sdf.groupby('group').agg(F.max('n')).show()

+-----+------------------+
|group|            max(n)|
+-----+------------------+
|    x|0.8612113741693206|
|    z|1.4786857374358966|
|    y|2.1503829673811126|
+-----+------------------+



#### iii. What is the minimum n value by abool?

In [113]:
sdf.groupby('abool').agg(F.min('n')).show()

+-----+-------------------+
|abool|             min(n)|
+-----+-------------------+
|false| -1.261605945319069|
| true|-1.0453771305385342|
+-----+-------------------+



#### iv. What is the average n value for each unique combination of the group and abool column?

In [114]:
# Average n for each distinct (group, abool) pair.
sdf.groupBy('group', 'abool').agg(F.avg('n')).show()

+-----+-----+--------------------+
|group|abool|              avg(n)|
+-----+-----+--------------------+
|    z|false| 0.41313982959837514|
|    x|false|  0.3499256615020219|
|    y|false| 0.15907124664523611|
|    y| true| 0.35613159255951177|
|    z| true|  1.4786857374358966|
|    x| true|-0.02677164998644...|
+-----+-----+--------------------+



### 9. Spark SQL

#### ii. Turn your dataframe into a table that can be queried with spark SQL. Name the table my_df. Answer the rest of the questions in this section with a spark sql query (spark.sql) against my_df. After each step, view the first 7 records from the dataframe.

In [115]:
sdf.createOrReplaceTempView('my_df')

#### iii. What happens if you make a SQL syntax error in your query?

In [116]:
# A malformed query fails as soon as spark.sql is called (no .show() is
# needed to trigger it). Catch the parse error and print it so the
# demonstration does not abort a Restart & Run All.
from pyspark.sql.utils import ParseException

try:
    spark.sql('''SELECT n n2 n3 FROM my_df''')
except ParseException as err:
    print(f'ParseException: {err}')

ParseException: 
mismatched input 'n3' expecting {<EOF>, ';'}(line 1, pos 12)

== SQL ==
SELECT n n2 n3 FROM my_df
------------^^^


#### iv. Write a query that shows all of the columns from your dataframe.

In [117]:
spark.sql('''SELECT * FROM my_df''').show(7)

+-----+-----+--------------------+--------------------+--------------------+
|group|abool|                   n|                  n2|                  n3|
+-----+-----+--------------------+--------------------+--------------------+
|    z|false|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|    x|false|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|    z|false|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
|    y|false| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|    z|false|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
|    y|false|  0.5323378882945463| -0.5323378882945463|  0.2833836273138969|
|    z|false|  1.3501878997225267| -1.3501878997225267|  1.8230073645571279|
+-----+-----+--------------------+--------------------+--------------------+
only showing top 7 rows



#### v. Write a query that shows just the n and abool columns from the dataframe.

In [118]:
spark.sql('''SELECT n, abool FROM my_df''').show(7)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
|  0.5323378882945463|false|
|  1.3501878997225267|false|
+--------------------+-----+
only showing top 7 rows



#### vi. Write a query that shows just the n and group columns. Rename the group column to g.

In [119]:
# `group` is also a SQL keyword (as in GROUP BY), so quote it with backticks
# to make clear it is the column, and spell out AS so the rename to g is
# explicit rather than an implicit alias.
spark.sql('''SELECT n, `group` AS g FROM my_df''').show(7)

+--------------------+---+
|                   n|  g|
+--------------------+---+
|  -0.712390662050588|  z|
|   0.753766378659703|  x|
|-0.04450307833805...|  z|
| 0.45181233874578974|  y|
|  1.3451017084510097|  z|
|  0.5323378882945463|  y|
|  1.3501878997225267|  z|
+--------------------+---+
only showing top 7 rows



#### vii. Write a query that selects n, and creates two new columns: n2, the original n values halved, and n3: the original n values minus 1.

In [120]:
# Project n alongside two values derived from it in SQL: n halved and n minus
# 1. The aliases n2 and n3 name these new expressions in the result; they are
# not the n2/n3 columns that my_df already carries from the earlier cells.
spark.sql('''SELECT n, 
                    n / 2 as n2,
                    n - 1 as n3
                FROM my_df'''
         ).show(7)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|  -0.356195331025294|  -1.712390662050588|
|   0.753766378659703|  0.3768831893298515|-0.24623362134029703|
|-0.04450307833805...|-0.02225153916902...| -1.0445030783380536|
| 0.45181233874578974| 0.22590616937289487| -0.5481876612542103|
|  1.3451017084510097|  0.6725508542255049| 0.34510170845100974|
|  0.5323378882945463| 0.26616894414727316| -0.4676621117054537|
|  1.3501878997225267|  0.6750939498612634| 0.35018789972252673|
+--------------------+--------------------+--------------------+
only showing top 7 rows

