## Set up

In [None]:
import os

# Find the latest version of spark 2.0 from http://www-us.apache.org/dist/spark/ and update the line below if necessary
spark_version = 'spark-2.4.7'
os.environ['SPARK_VERSION'] = spark_version

# Install dependencies: Spark, hadoop, Java, and Findspark
!apt-get update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to security.u0% [Connecting to archive.ubuntu.com (91.189.88.142)] [Connecting to security.u0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)                                                                               Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (91.189.88.142)                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [697 B]
Hit:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:6 htt

In [None]:
# PySpark on Google Colab needs to know where Java and the unpacked Spark
# distribution live; both are passed through environment variables.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": f"/content/{spark_version}-bin-hadoop2.7",
})

In [None]:
# Start a local Spark session that uses every available core ("local[*]").
import findspark

# findspark.init() locates the Spark install (via SPARK_HOME) and makes pyspark
# importable, so it has to run before the pyspark import below.
findspark.init()

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

## Import data

In [None]:
# Upload both CSV files (heroes_information.csv and super_hero_powers.csv)
# Data retrieved from Kaggle https://www.kaggle.com/claudiodavi/superhero-set
from google.colab import files

# files.upload() returns a dict mapping each uploaded file name to its raw
# bytes. Left as the cell's last expression, that dict is echoed in full and
# dumps the contents of both CSVs into the notebook output. Keep the result in
# a variable and display only the file names instead.
uploaded = files.upload()
sorted(uploaded)

Saving heroes_information.csv to heroes_information.csv
Saving super_hero_powers.csv to super_hero_powers.csv


{'heroes_information.csv': b",name,Gender,Eye color,Race,Hair color,Height,Publisher,Skin color,Alignment,Weight\r\n0,A-Bomb,Male,yellow,Human,No Hair,203.0,Marvel Comics,-,good,441.0\r\n1,Abe Sapien,Male,blue,Icthyo Sapien,No Hair,191.0,Dark Horse Comics,blue,good,65.0\r\n2,Abin Sur,Male,blue,Ungaran,No Hair,185.0,DC Comics,red,good,90.0\r\n3,Abomination,Male,green,Human / Radiation,No Hair,203.0,Marvel Comics,-,bad,441.0\r\n4,Abraxas,Male,blue,Cosmic Entity,Black,-99.0,Marvel Comics,-,bad,-99.0\r\n5,Absorbing Man,Male,blue,Human,No Hair,193.0,Marvel Comics,-,bad,122.0\r\n6,Adam Monroe,Male,blue,-,Blond,-99.0,NBC - Heroes,-,good,-99.0\r\n7,Adam Strange,Male,blue,Human,Blond,185.0,DC Comics,-,good,88.0\r\n8,Agent 13,Female,blue,-,Blond,173.0,Marvel Comics,-,good,61.0\r\n9,Agent Bob,Male,brown,Human,Brown,178.0,Marvel Comics,-,good,81.0\r\n10,Agent Zero,Male,-,-,-,191.0,Marvel Comics,-,good,104.0\r\n11,Air-Walker,Male,blue,-,White,188.0,Marvel Comics,-,bad,108.0\r\n12,Ajax,Male,brown,Cy

In [None]:
# Read each uploaded CSV into its own Spark DataFrame. Both files have a
# header row, and column types are inferred from the data.
heroes = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("heroes_information.csv")
)

superpowers = (
    spark.read.format("csv")
    .option("inferSchema", "true")
    .option("header", "true")
    .load("super_hero_powers.csv")
)

In [None]:
# Register each DataFrame as a temporary view so it can be queried by name
# through spark.sql(...).
for view_name, frame in {"heroes": heroes, "superpowers": superpowers}.items():
    frame.createOrReplaceTempView(view_name)

## Compare DataFrame API to SQL API
Here we fetch the same results using both the DataFrame API and the SQL API.

In [None]:
# Name, eye color and alignment of every hero whose alignment is 'bad' or
# 'neutral' and whose superpowers row has invisibility set to True.
# The inner join drops heroes that have no matching row in superpowers.
#
# NOTE(review): despite the variable name `sqlFemales`, this query has no
# gender filter. Confirm whether the name or the WHERE clause reflects the
# intent. The name is kept as-is in case later cells reference it.
invisible_non_good_sql = """
SELECT heroes.name, heroes.`Eye color`, heroes.alignment
FROM heroes
JOIN superpowers ON heroes.name = superpowers.hero_names
WHERE heroes.alignment IN ('bad', 'neutral')
  AND superpowers.invisibility = True
"""

sqlFemales = spark.sql(invisible_non_good_sql)
sqlFemales.show()

+---------------+---------+---------+
|           name|Eye color|alignment|
+---------------+---------+---------+
|          Amazo|      red|      bad|
|Living Tribunal|     blue|  neutral|
|  One-Above-All|        -|  neutral|
+---------------+---------+---------+



In [None]:
# Top 10 publishers ranked by number of female heroes.
# Only heroes with a matching row in superpowers are counted, because the
# join is an inner join. The count column is unaliased, so it is displayed
# as `count(1)`.
females_by_publisher_sql = """
SELECT publisher, count(1)
FROM heroes
JOIN superpowers ON heroes.name = superpowers.hero_names
WHERE heroes.gender = 'Female'
GROUP BY heroes.publisher
ORDER BY count(1) DESC
LIMIT 10
"""

sqlFemalesByPublisher = spark.sql(females_by_publisher_sql)
sqlFemalesByPublisher.show()

+-----------------+--------+
|        publisher|count(1)|
+-----------------+--------+
|    Marvel Comics|      99|
|        DC Comics|      54|
|     NBC - Heroes|       6|
|Dark Horse Comics|       5|
|             null|       2|
|      ABC Studios|       2|
|             SyFy|       2|
|     Image Comics|       2|
|    HarperCollins|       1|
|        Star Trek|       1|
+-----------------+--------+



In [None]:
# Heroes that have no entry in the superpowers table: a LEFT JOIN keeps every
# heroes row, and the IS NULL filter retains only those with no match.
heroes_without_powers_sql = """
SELECT heroes.*
FROM heroes
LEFT JOIN superpowers ON heroes.name = superpowers.hero_names
WHERE hero_names IS NULL
"""

sqlQuery = spark.sql(heroes_without_powers_sql)
sqlQuery.show()

+---+-----------------+------+---------+-----+----------+------+-------------+----------+---------+------+
|_c0|             name|Gender|Eye color| Race|Hair color|Height|    Publisher|Skin color|Alignment|Weight|
+---+-----------------+------+---------+-----+----------+------+-------------+----------+---------+------+
|  8|         Agent 13|Female|     blue|    -|     Blond| 173.0|Marvel Comics|         -|     good|  61.0|
| 16|Alfred Pennyworth|  Male|     blue|Human|     Black| 178.0|    DC Comics|         -|     good|  72.0|
| 45|          Arsenal|  Male|        -|Human|         -| -99.0|    DC Comics|         -|     good| -99.0|
| 64|      Batgirl III|Female|        -|    -|         -| -99.0|    DC Comics|         -|     good| -99.0|
| 66|        Batgirl V|Female|        -|    -|         -| -99.0|    DC Comics|         -|     good| -99.0|
| 76|           Beetle|  Male|        -|    -|         -| -99.0|Marvel Comics|         -|      bad| -99.0|
|100|    Black Goliath|  Male|       

In [None]:
# Look up the full superpowers row for a single hero.
sqlQuery = spark.sql("""
SELECT * FROM superpowers WHERE hero_names = 'Magneto'
""")

# superpowers has a very large number of columns, so the default horizontal
# table is far too wide to read. vertical=True prints one "column | value"
# line per field for each row instead.
sqlQuery.show(vertical=True)

+----------+-------+-------------------+------------------+---------------------+---------------+----------+-------+-----------------+------+------------+--------------------+------------+--------------+------------------+-----------------+---------+------------+--------------+-----------+---------+------------+-------------+-----------+-------------+---------------+-------+-------------+-------------+---------+--------+-----------+----------+----------------------+-------------------+--------------+----------------------+-----------+-------------+---------------+-----------+------------+-----+------------+-----+------------+------------------+-----------+-------------+--------------------+-----------+----------+---------------+-------+-----------------+--------------------+------------+----------+---------+---------------+--------+---------------+-----------------+------------+---------------+------------+-------+---------------+-----------------+--------------+----------+------------