In [0]:
# Install Java, Spark, and Findspark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.3.2/spark-2.3.2-bin-hadoop2.7.tgz
!tar xf spark-2.3.2-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.3.2-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()


In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("filtering").getOrCreate()

In [3]:
# Load in data
from pyspark import SparkFiles
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/demographics.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("demographics.csv"), sep=",", header=True)
df.show()

+---+--------------------+---+------------+---------+--------+------------------+---------------+------+-------------+
| id|                name|age|height_meter|weight_kg|children|        occupation|academic_degree|salary|     location|
+---+--------------------+---+------------+---------+--------+------------------+---------------+------+-------------+
|  0|       Darlena Avila| 58|        1.87|       53|       1|     Choreographer|            PhD|    68| South Dakota|
|  1|            Yan Boyd| 65|         1.8|       40|       0|         Cellarman|       Bachelor|    73|     Delaware|
|  2|         Joette Lane| 32|         1.8|       73|       1|Veterinary Surgeon|         Master|    69| South Dakota|
|  3|        Jazmine Hunt| 61|        1.79|       89|       0|            Hawker|            PhD|    88|    Louisiana|
|  4|      Remedios Gomez| 23|        1.64|       51|       2|     Choreographer|       Bachelor|    83|West Virginia|
|  5|        Myung Brewer| 20|        1.68|     

In [4]:
# What occupation had the highest salary?
df.orderBy(df["Salary"].desc()).select("occupation", "Salary").limit(1).show()

+-----------------+------+
|       occupation|Salary|
+-----------------+------+
|Medical Physicist|    90|
+-----------------+------+



In [5]:
# What occupation had the lowest salary?
df.orderBy(df["Salary"]).select("occupation", "Salary").limit(1).show()

+--------------+------+
|    occupation|Salary|
+--------------+------+
|Window Dresser|    65|
+--------------+------+



In [6]:
# What is the mean salary of this dataset?
from pyspark.sql.functions import mean
df.select(mean("Salary")).show()

+-----------+
|avg(Salary)|
+-----------+
|     77.738|
+-----------+



In [7]:
# What is the max and min of the Salary column?
from pyspark.sql.functions import max, min
df.select(max("Salary"), min("Salary")).show()

+-----------+-----------+
|max(Salary)|min(Salary)|
+-----------+-----------+
|         90|         65|
+-----------+-----------+



In [8]:
# Show all of the occupations where salaries were above 80k
from pyspark.sql.functions import count
df.filter("Salary > 80").select("occupation").head(10)

[Row(occupation='Hawker'),
 Row(occupation='Choreographer'),
 Row(occupation='Millwright'),
 Row(occupation='Medical Physicist'),
 Row(occupation='Scientist'),
 Row(occupation='Claims Adjustor'),
 Row(occupation='Planning Technician'),
 Row(occupation='Booking Clerk'),
 Row(occupation='Sub-Postmaster'),
 Row(occupation='Shelf Filler')]

In [9]:
df.describe()

DataFrame[summary: string, id: string, name: string, age: string, height_meter: string, weight_kg: string, children: string, occupation: string, academic_degree: string, salary: string, location: string]