# DataFrames Basics

## Prerequisites

Install Spark and Java in VM

In [1]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [2]:
ls -l # check the .tgz is there

total 267680
drwxr-xr-x 1 root root      4096 Dec  7 14:41 sample_data/
-rw-r--r-- 1 root root 274099817 Oct 15 10:53 spark-3.3.1-bin-hadoop2.tgz


In [3]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [4]:
!pip install -q findspark

In [5]:

!pip install py4j

# For maps
!pip install folium
!pip install plotly

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
[K     |████████████████████████████████| 200 kB 6.8 MB/s 
[?25hInstalling collected packages: py4j
Successfully installed py4j-0.10.9.7
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Define the environment

In [6]:
import os

# Environment variables telling the tooling where Java and Spark live,
# plus the arguments PySpark passes to spark-submit.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.5.0-bin-hadoop3",
    "PYSPARK_SUBMIT_ARGS": "--master local[*] pyspark-shell",
})

Start Spark Session

---

In [7]:
import findspark

# Called without arguments, findspark locates the installation through the
# SPARK_HOME environment variable, so the Spark location is defined in one
# place and does not depend on the notebook's current working directory.
findspark.init()

from pyspark.sql import SparkSession

# create (or reuse) the session, running locally with the "local[*]" master
spark = (
    SparkSession.builder
    .appName("DataFrames Basics")
    .master("local[*]")
    .getOrCreate()
)

# last expression of the cell: displays the running Spark version
spark.version

'3.3.1'

In [8]:
spark

In [9]:
# For Pandas conversion optimization (Arrow-based transfer between Spark and pandas).
# The older key "spark.sql.execution.arrow.enabled" is deprecated since Spark 3.0;
# the PySpark-specific key below is its replacement.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [10]:
# Import sql functions
from pyspark.sql.functions import *

Download datasets

In [11]:
!mkdir -p dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/cars.json -P /dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/movies.json -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/bank.csv -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/vehicles.csv -P /dataset
!ls /dataset

In [12]:
ls -l /dataset

total 1784
-rw-r--r-- 1 root root  461474 Dec  9 17:01 bank.csv
-rw-r--r-- 1 root root   74910 Dec  9 17:00 cars.json
-rw-r--r-- 1 root root 1274347 Dec  9 17:01 movies.json
-rw-r--r-- 1 root root    4370 Dec  9 17:01 vehicles.csv


## Examples

In [14]:
from pyspark.sql.types import Row
from pyspark.sql.functions import *

In [15]:
bankText = spark.sparkContext.textFile("/dataset/bank.csv")


def _unquote(value):
    """Return the field with every double-quote character removed."""
    return value.replace("\"", "")


def _to_bank_row(fields):
    """Build a Row from split fields 0 (as int), 1, 2, 3 and 5; field 4 is skipped."""
    return Row(
        int(fields[0]),
        _unquote(fields[1]),
        _unquote(fields[2]),
        _unquote(fields[3]),
        _unquote(fields[5]),
    )


# Steps: split each line on ';', drop the header row, map the remaining
# lines to Rows, then create the DataFrame with explicit column names.
bank = (
    bankText
    .map(lambda lineaCsv: lineaCsv.split(";"))
    .filter(lambda s: s[0] != "\"age\"")
    .map(_to_bank_row)
    .toDF(["age", "job", "marital", "education", "balance"])
    .withColumn("age", col("age").cast("int"))
)

bank.show(3)

+---+----------+-------+---------+-------+
|age|       job|marital|education|balance|
+---+----------+-------+---------+-------+
| 30|unemployed|married|  primary|   1787|
| 33|  services|married|secondary|   4789|
| 35|management| single| tertiary|   1350|
+---+----------+-------+---------+-------+
only showing top 3 rows



Read directly from JSON file to a DF

In [16]:
# The JSON reader infers the schema on its own (this costs one extra pass over the
# data) unless an explicit schema is supplied with .schema(...). "inferSchema" is a
# CSV reader option and has no effect on JSON, so it is not set here.
carsDF = spark.read.json("/dataset/cars.json")

Read directly from csv

In [17]:
# Read the ';'-delimited file, using its first line as the header
bankDF = (
    spark.read
    .options(header="true", delimiter=";")
    .csv("/dataset/bank.csv")
)
bankDF.show(3)

+---+----------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
|age|       job|marital|education|default|balance|housing|loan| contact|day|month|duration|campaign|pdays|previous|poutcome|  y|
+---+----------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
| 30|unemployed|married|  primary|     no|   1787|     no|  no|cellular| 19|  oct|      79|       1|   -1|       0| unknown| no|
| 33|  services|married|secondary|     no|   4789|    yes| yes|cellular| 11|  may|     220|       1|  339|       4| failure| no|
| 35|management| single| tertiary|     no|   1350|    yes|  no|cellular| 16|  apr|     185|       1|  330|       1| failure| no|
+---+----------+-------+---------+-------+-------+-------+----+--------+---+-----+--------+--------+-----+--------+--------+---+
only showing top 3 rows



Show a DF and print its schema

In [None]:
carsDF.show(2)
carsDF.printSchema()

+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|Acceleration|Cylinders|Displacement|Horsepower|Miles_per_Gallon|                Name|Origin|Weight_in_lbs|      Year|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
|        12.0|        8|       307.0|       130|            18.0|chevrolet chevell...|   USA|         3504|1970-01-01|
|        11.5|        8|       350.0|       165|            15.0|   buick skylark 320|   USA|         3693|1970-01-01|
+------------+---------+------------+----------+----------------+--------------------+------+-------------+----------+
only showing top 2 rows

root
 |-- Acceleration: double (nullable = true)
 |-- Cylinders: long (nullable = true)
 |-- Displacement: double (nullable = true)
 |-- Horsepower: long (nullable = true)
 |-- Miles_per_Gallon: double (nullable = true)
 |-- Name: string (nullable = true)
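 |-- Origin: string (nullable = true)
 |-- Weight_in_lbs: long (nullable = true)
 |-- Year: string (nullable = true)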

Get Rows

In [18]:
carsDF.take(2)

[Row(Acceleration=12.0, Cylinders=8, Displacement=307.0, Horsepower=130, Miles_per_Gallon=18.0, Name='chevrolet chevelle malibu', Origin='USA', Weight_in_lbs=3504, Year='1970-01-01'),
 Row(Acceleration=11.5, Cylinders=8, Displacement=350.0, Horsepower=165, Miles_per_Gallon=15.0, Name='buick skylark 320', Origin='USA', Weight_in_lbs=3693, Year='1970-01-01')]

Count

In [19]:
carsDF.count()

406

Schema

In [None]:
# obtain a schema
carsSchema = carsDF.schema
print(type(carsSchema))
print(carsSchema)

<class 'pyspark.sql.types.StructType'>
StructType([StructField('Acceleration', DoubleType(), True), StructField('Cylinders', LongType(), True), StructField('Displacement', DoubleType(), True), StructField('Horsepower', LongType(), True), StructField('Miles_per_Gallon', DoubleType(), True), StructField('Name', StringType(), True), StructField('Origin', StringType(), True), StructField('Weight_in_lbs', LongType(), True), StructField('Year', StringType(), True)])


Custom Schemas

In [20]:
# Two sample records as plain tuples, distributed as an RDD
example = spark.sparkContext.parallelize(
    [
        ("chevrolet chevelle malibu", 18, "1970-01-01", "USA"),
        ("buick skylark 320", 15, "1970-01-01", "USA"),
    ]
)

In [None]:
# No schema or column names are passed, so the column types are inferred from
# the tuples and the columns get positional names (_1, _2, ...).
exampleDF = spark.createDataFrame(example)
exampleDF.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: long (nullable = true)
 |-- _3: string (nullable = true)
 |-- _4: string (nullable = true)



With column names

In [None]:
# column names for the example records, in tuple order
names = ["name", "weight", "date", "country"]

In [None]:
example2DF = example.toDF(names)
example2DF.printSchema()

root
 |-- name: string (nullable = true)
 |-- weight: long (nullable = true)
 |-- date: string (nullable = true)
 |-- country: string (nullable = true)



In [21]:
# importing sql types
from pyspark.sql.types import *

In [None]:
# custom schema
# custom schema: every column is declared as a nullable string
customSchema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in ('name', 'weight', 'date', 'country')
])

In [None]:
example3DF = spark.createDataFrame(example, customSchema)
example3DF.printSchema()

root
 |-- name: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- date: string (nullable = true)
 |-- country: string (nullable = true)



In [None]:
example3DF.show(2, False)

+-------------------------+------+----------+-------+
|name                     |weight|date      |country|
+-------------------------+------+----------+-------+
|chevrolet chevelle malibu|18    |1970-01-01|USA    |
|buick skylark 320        |15    |1970-01-01|USA    |
+-------------------------+------+----------+-------+



In [None]:
# A schema can also be written as a DDL (Data Definition Language) string;
# the adjacent literals below are concatenated into one comma-separated string.
customSchema2 = (
    "`name` STRING NOT NULL, "
    "`weight` INT, "
    "`date` STRING, "
    "`country` STRING"
)

In [None]:
example4DF = spark.createDataFrame(example, customSchema2)
example4DF.printSchema()

root
 |-- name: string (nullable = false)
 |-- weight: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- country: string (nullable = true)



In [None]:
# Compare the Python type of "weight" in the first collected row of each DF:
# example2DF inferred the column from the tuples' ints, while example3DF
# declared it as StringType in its custom schema.
print(type(example2DF.collect()[0]["weight"]))
print(type(example3DF.collect()[0]["weight"]))

<class 'int'>
<class 'str'>


## Exercises
1) Create a manual DF describing smartphones
  - maker
  - model
  - screen dimension
  - camera megapixels
  
2) Read another file from the /dataset folder, e.g. movies.json
  - print its schema
  - count the number of rows, call count()

3) Take a look at vehicles.csv. Read the file into a DF, but this time with your own schema

Exercise 1

Exercise 2

Exercise 3