In [0]:
# Install Java, Spark, and *Findspark*
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/spark-2.3.2/spark-2.3.2-bin-hadoop2.7.tgz
!tar xf spark-2.3.2-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os

# Point the process environment at the JDK and at the unpacked Spark distribution.
# The SPARK_HOME directory name matches the tarball extracted by the install cell;
# the /content prefix assumes that cell ran with /content as its working directory
# (NOTE(review): true on Colab -- confirm for other hosts).
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-2.3.2-bin-hadoop2.7",
    }
)

In [0]:
#### Start a Spark session
import findspark

# Keep this call ahead of the pyspark import below: presumably findspark.init()
# is what makes the Spark install under SPARK_HOME importable -- verify before reordering.
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse, if one already exists) a session for the "demographics" app.
session_builder = SparkSession.builder.appName("demographics")
spark = session_builder.getOrCreate()

In [4]:
from pyspark import SparkFiles

# Register the remote CSV with the Spark context, then read the copy that
# SparkFiles resolves for it. No schema or inferSchema is supplied, so the
# columns come back as strings (see the printSchema output further down).
url = "https://s3.amazonaws.com/dataviz-curriculum/day_1/demographics.csv"
spark.sparkContext.addFile(url)

local_csv_path = SparkFiles.get("demographics.csv")
csv_reader = spark.read.options(sep=",", header=True)
df = csv_reader.csv(local_csv_path)

# show() prints the first rows as an ASCII table.
df.show()

+---+--------------------+---+------------+---------+--------+------------------+---------------+------+-------------+
| id|                name|age|height_meter|weight_kg|children|        occupation|academic_degree|salary|     location|
+---+--------------------+---+------------+---------+--------+------------------+---------------+------+-------------+
|  0|       Darlena Avila| 58|        1.87|       53|       1|     Choreographer|            PhD|    68| South Dakota|
|  1|            Yan Boyd| 65|         1.8|       40|       0|         Cellarman|       Bachelor|    73|     Delaware|
|  2|         Joette Lane| 32|         1.8|       73|       1|Veterinary Surgeon|         Master|    69| South Dakota|
|  3|        Jazmine Hunt| 61|        1.79|       89|       0|            Hawker|            PhD|    88|    Louisiana|
|  4|      Remedios Gomez| 23|        1.64|       51|       2|     Choreographer|       Bachelor|    83|West Virginia|
|  5|        Myung Brewer| 20|        1.68|     

In [5]:
### Print the column names
# Bare last expression: the notebook displays the resulting Python list of
# column-name strings, in the same order as the CSV header.
df.columns

['id',
 'name',
 'age',
 'height_meter',
 'weight_kg',
 'children',
 'occupation',
 'academic_degree',
 'salary',
 'location']

In [6]:
# Print out the first 10 rows
# head(10) returns a Python list of Row objects (not a DataFrame), which the
# notebook displays via its repr. Every field value is a string because the
# CSV was read without a schema. Use df.show(10) for a tabular view instead.
df.head(10)

[Row(id='0', name='Darlena Avila', age='58', height_meter='1.87', weight_kg='53', children='1', occupation='Choreographer', academic_degree='PhD', salary='68', location='South Dakota'),
 Row(id='1', name='Yan Boyd', age='65', height_meter='1.8', weight_kg='40', children='0', occupation='Cellarman', academic_degree='Bachelor', salary='73', location='Delaware'),
 Row(id='2', name='Joette Lane', age='32', height_meter='1.8', weight_kg='73', children='1', occupation='Veterinary Surgeon', academic_degree='Master', salary='69', location='South Dakota'),
 Row(id='3', name='Jazmine Hunt', age='61', height_meter='1.79', weight_kg='89', children='0', occupation='Hawker', academic_degree='PhD', salary='88', location='Louisiana'),
 Row(id='4', name='Remedios Gomez', age='23', height_meter='1.64', weight_kg='51', children='2', occupation='Choreographer', academic_degree='Bachelor', salary='83', location='West Virginia'),
 Row(id='5', name='Myung Brewer', age='20', height_meter='1.68', weight_kg='60

In [7]:
# Summary statistics (count, mean, stddev, min, max) for age, height_meter and weight_kg.
# NOTE(review): these columns are strings (see the printSchema output below), so
# min/max may be compared as text rather than numerically -- confirm, or cast first.
measurement_columns = ["age", "height_meter", "weight_kg"]
measurements = df.select(measurement_columns)
measurements.describe().show()

+-------+------------------+------------------+------------------+
|summary|               age|      height_meter|         weight_kg|
+-------+------------------+------------------+------------------+
|  count|              1000|              1000|              1000|
|   mean|            42.933|1.7519499999999995|            64.011|
| stddev|14.255445581556843|0.1436897499623555|15.005733939099779|
|    min|                18|               1.5|                38|
|    max|                67|                 2|                90|
+-------+------------------+------------------+------------------+



In [8]:
# Print the schema to see the types
# Every column is reported as `string`: the CSV was loaded with header=True but
# without a schema or inferSchema, so no numeric types were assigned.
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: string (nullable = true)
 |-- height_meter: string (nullable = true)
 |-- weight_kg: string (nullable = true)
 |-- children: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- academic_degree: string (nullable = true)
 |-- salary: string (nullable = true)
 |-- location: string (nullable = true)



In [9]:
# Rename the `salary` column to `Salary (1k)` and show only this new column.
# The source column is spelled in lowercase (`salary`, see df.columns above), so it
# is referenced by its exact name. Asking for 'Salary' only works while Spark resolves
# column names case-insensitively (the default); with case-sensitive resolution
# withColumnRenamed would silently leave the frame unchanged and the select below
# would fail.
df = df.withColumnRenamed("salary", "Salary (1k)")
df.select("Salary (1k)").show()

+-----------+
|Salary (1k)|
+-----------+
|         68|
|         73|
|         69|
|         88|
|         83|
|         65|
|         72|
|         65|
|         87|
|         72|
|         73|
|         90|
|         78|
|         69|
|         75|
|         77|
|         76|
|         90|
|         79|
|         77|
+-----------+
only showing top 20 rows



In [10]:
# Add a `Salary` column holding `Salary (1k)` * 1000, then show `Salary` next to
# `Salary (1k)` for comparison.
# `Salary (1k)` is still a string column (the schema was never cast), and the
# multiplication yields floating-point values such as 68000.0.
salary_in_thousands = df["Salary (1k)"]
df = df.withColumn("Salary", salary_in_thousands * 1000)
df.select("Salary", "Salary (1k)").show()

+-------+-----------+
| Salary|Salary (1k)|
+-------+-----------+
|68000.0|         68|
|73000.0|         73|
|69000.0|         69|
|88000.0|         88|
|83000.0|         83|
|65000.0|         65|
|72000.0|         72|
|65000.0|         65|
|87000.0|         87|
|72000.0|         72|
|73000.0|         73|
|90000.0|         90|
|78000.0|         78|
|69000.0|         69|
|75000.0|         75|
|77000.0|         77|
|76000.0|         76|
|90000.0|         90|
|79000.0|         79|
|77000.0|         77|
+-------+-----------+
only showing top 20 rows

