# Spark API Minilesson

In [1]:
import pandas as pd
import numpy as np

np.random.seed(13)

import pyspark

#create the spark session 
spark = pyspark.sql.SparkSession.builder.getOrCreate()

### Spark Dataframe Basics

**1. Use the starter code above to create a pandas dataframe.**

In [2]:
pandas_dataframe = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

**2.  Convert the pandas dataframe to a spark dataframe. From this point forward, do all of your work with the spark dataframe, not the pandas dataframe.**

In [3]:
#convert any pandas dataframe into a spark dataframe with a simple method call
df = spark.createDataFrame(pandas_dataframe)

**3.  Show the first 3 rows of the dataframe.**

In [4]:
# use .show to view first three rows
df.show(3)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



**4. Show the first 7 rows of the dataframe.**

In [5]:
# use .show to view first seven rows
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



**5. View a summary of the data using .describe.**

In [6]:
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885217| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



**6. Use `.select` to create a new dataframe with just the `n` and `abool` columns. View the first 5 rows of this dataframe.**

In [7]:
# create a new dataframe with two columns: 'n' and 'abool'
df.select(df.n, df.abool).show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



**7. Use `.select` to create a new dataframe with just the `group` and `abool` columns. View the first 5 rows of this dataframe.**

In [8]:
# create a new dataframe with two columns: 'group' and 'abool'
df.select(df.group, df.abool).show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



**8. Use `.select` to create a new dataframe with the `group` column and the `abool` column renamed to `a_boolean_value`. Show the first 3 rows of this dataframe.**

In [9]:
# create a new dataframe with two columns: 'group' and 'abool', but rename abool
df.select(df.group, df.abool.alias("a_boolean_value")).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



**9. Use `.select` to create a new dataframe with the `group` column and the `n` column renamed to `a_numeric_value`. Show the first 6 rows of this dataframe.**


In [10]:
# create a new dataframe with two columns: 'group' and 'n', but rename n
df.select(df.group, df.n.alias("a_numeric_value")).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



### Column Manipulation

**10. Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named `df`**

In [14]:
#create the spark session 
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [15]:
#create data frame 
df = pd.DataFrame({
    "n": np.random.randn(20),
    "group": np.random.choice(list("xyz"), 20),
    "abool": np.random.choice([True, False], 20),
})

In [18]:
from pydataset import data

#convert to spark dataframe:
df = spark.createDataFrame(df)

**11. Use `.select` to add 4 to the `n` column. Show the results.**

In [22]:
df.select(df.n + 1).show(5)

+------------------+
|           (n + 1)|
+------------------+
|0.2790003265646185|
|1.5622617060581323|
|0.6851419238674241|
| 0.570102918131196|
|1.8149954430712238|
+------------------+
only showing top 5 rows



**12. Subtract 5 from the `n` column and view the results.**

In [23]:
df.select(df.n - 5).show(5)

+------------------+
|           (n - 5)|
+------------------+
|-5.720999673435381|
|-4.437738293941868|
|-5.314858076132576|
|-5.429897081868804|
|-4.185004556928776|
+------------------+
only showing top 5 rows



**13. Multiply the `n` column by 2. View the results along with the original numbers.**

In [24]:
df.select(df.n, df.n * 2).show(5)

+--------------------+-------------------+
|                   n|            (n * 2)|
+--------------------+-------------------+
| -0.7209996734353815| -1.441999346870763|
|  0.5622617060581323| 1.1245234121162646|
| -0.3148580761325759|-0.6297161522651518|
|-0.42989708186880404|-0.8597941637376081|
|  0.8149954430712237| 1.6299908861424475|
+--------------------+-------------------+
only showing top 5 rows



**14. Add a new column named `n2` that is the `n` value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original `n` value as well as `n2`.**

In [29]:
df.n2 = df.select(df.n, df.n * (-1)).show(4)

+--------------------+-------------------+
|                   n|           (n * -1)|
+--------------------+-------------------+
| -0.7209996734353815| 0.7209996734353815|
|  0.5622617060581323|-0.5622617060581323|
| -0.3148580761325759| 0.3148580761325759|
|-0.42989708186880404|0.42989708186880404|
+--------------------+-------------------+
only showing top 4 rows



**15. Add a new column named `n3` that is the n value squared. Show the first 5 rows of your dataframe. You should see both `n`, `n2`, and `n3`.**

In [36]:
df.n3 = df.select(df.n, df.n * (-1), df.n**2).show(5)

+--------------------+-------------------+-------------------+
|                   n|           (n * -1)|        POWER(n, 2)|
+--------------------+-------------------+-------------------+
| -0.7209996734353815| 0.7209996734353815| 0.5198405290939268|
|  0.5622617060581323|-0.5622617060581323|0.31613822609940156|
| -0.3148580761325759| 0.3148580761325759|0.09913560810590695|
|-0.42989708186880404|0.42989708186880404| 0.1848115009993132|
|  0.8149954430712237|-0.8149954430712237| 0.6642175722268603|
+--------------------+-------------------+-------------------+
only showing top 5 rows



**16. What happens when you run the code below?**

        ```python
        df.group + df.abool
        ```

In [34]:
pd
df.group + df.abool

Column<'(group + abool)'>

**17. What happens when you run the code below? What is the difference between this and the previous code sample?**

        ```python
        df.select(df.group + df.abool)
        ```

In [35]:
pd
df.select(df.group + df.abool)

AnalysisException: cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;
'Project [(cast(group#196 as double) + abool#197) AS (group + abool)#294]
+- LogicalRDD [n#195, group#196, abool#197], false


**18. Try adding various other columns together. What are the results of combining the different data types?**