# Big Data Fundamentals with PySpark

## Installation

In [2]:
!apt-get update # Update apt-get repository.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null # Install Java.
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz # Download Apache Sparks.
!tar xf spark-3.1.1-bin-hadoop3.2.tgz # Unzip the tgz file.
!pip install -q findspark # Install findspark. Adds PySpark to the System path during runtime.

# Set environment variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

!ls

# Initialize findspark
import findspark
findspark.init()

# Create a PySpark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Waiting for headers] [Connecting to ppa.laun                                                                                                    Get:2 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Hit:6 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease
Hit:7 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:8 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:9 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Fetched 229 kB in

## Creating a SparkSession

In [3]:
from pyspark.sql import SparkSession

# getOrCreate() returns the already-running session when there is one,
# otherwise it builds a new one with the "local[*]" master.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
spark

In [4]:
# Print the tables/views currently registered in the session catalog.
# Nothing has been registered at this point, so the list is empty.

print(spark.catalog.listTables())

[]


In [5]:
# Print the databases known to the session catalog.
print(spark.catalog.listDatabases())

[Database(name='default', description='default database', locationUri='file:/content/spark-warehouse')]


In [6]:
# Load the seaborn "flights" sample CSV from GitHub into a Spark DataFrame.
from pyspark import SparkFiles

url = "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/flights.csv"

# Register the remote file with Spark; SparkFiles.get() then resolves the
# local path of the fetched copy by file name.
spark.sparkContext.addFile(url)

flights = spark.read.csv(
    SparkFiles.get("flights.csv"),
    header=True,       # first line holds the column names
    inferSchema=True,  # let Spark infer column types
)

In [10]:
# Let's look at the data.
# Register the DataFrame as a temporary view named "flights" so it can be queried with SQL.

flights.createOrReplaceTempView("flights")

# FROM-first ordering of the clauses; the query keeps the first 10 rows.
query = "FROM flights SELECT * LIMIT 10"

flights10 = spark.sql(query)

flights10.show()


+----+---------+----------+
|year|    month|passengers|
+----+---------+----------+
|1949|  January|       112|
|1949| February|       118|
|1949|    March|       132|
|1949|    April|       129|
|1949|      May|       121|
|1949|     June|       135|
|1949|     July|       148|
|1949|   August|       148|
|1949|September|       136|
|1949|  October|       119|
+----+---------+----------+



In [12]:
# Count the rows of the "flights" view with SQL; the count is exposed as column N.
query = "SELECT count(*) as N FROM flights"

result = spark.sql(query)

result.show()

+---+
|  N|
+---+
|144|
+---+



In [13]:
# Print the catalog's tables/views again, now that the "flights" view has been registered.
print(spark.catalog.listTables())

[Table(name='flights', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


In [14]:
# Rebind `flights` to a DataFrame obtained from the catalog entry named "flights".
flights = spark.table("flights")

In [15]:
# Preview the DataFrame (show() prints the top 20 rows by default).
flights.show()

+----+---------+----------+
|year|    month|passengers|
+----+---------+----------+
|1949|  January|       112|
|1949| February|       118|
|1949|    March|       132|
|1949|    April|       129|
|1949|      May|       121|
|1949|     June|       135|
|1949|     July|       148|
|1949|   August|       148|
|1949|September|       136|
|1949|  October|       119|
|1949| November|       104|
|1949| December|       118|
|1950|  January|       115|
|1950| February|       126|
|1950|    March|       141|
|1950|    April|       135|
|1950|      May|       125|
|1950|     June|       149|
|1950|     July|       170|
|1950|   August|       170|
+----+---------+----------+
only showing top 20 rows



In [16]:
# Print each column's name, type and nullability.
flights.printSchema()

root
 |-- year: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- passengers: integer (nullable = true)



In [17]:
from pyspark.sql.functions import col

# Add a derived column: the passenger count scaled by 1.05.
# withColumn() returns a new DataFrame, so the result is rebound to `flights`.
flights = flights.withColumn(
    "new_passengers",
    col("passengers") * 1.05,
)

flights.show()

+----+---------+----------+------------------+
|year|    month|passengers|    new_passengers|
+----+---------+----------+------------------+
|1949|  January|       112|117.60000000000001|
|1949| February|       118|             123.9|
|1949|    March|       132|             138.6|
|1949|    April|       129|135.45000000000002|
|1949|      May|       121|127.05000000000001|
|1949|     June|       135|            141.75|
|1949|     July|       148|             155.4|
|1949|   August|       148|             155.4|
|1949|September|       136|             142.8|
|1949|  October|       119|            124.95|
|1949| November|       104|             109.2|
|1949| December|       118|             123.9|
|1950|  January|       115|            120.75|
|1950| February|       126|             132.3|
|1950|    March|       141|            148.05|
|1950|    April|       135|            141.75|
|1950|      May|       125|            131.25|
|1950|     June|       149|156.45000000000002|
|1950|     Ju

In [19]:
# Keep only the rows with more than 200 passengers.
# where() is an alias of filter(); the predicate is a SQL expression string.
filter_flights = flights.where("passengers > 200")

filter_flights.show()

+----+---------+----------+------------------+
|year|    month|passengers|    new_passengers|
+----+---------+----------+------------------+
|1952|     June|       218|             228.9|
|1952|     July|       230|             241.5|
|1952|   August|       242|254.10000000000002|
|1952|September|       209|219.45000000000002|
|1953|    March|       236|             247.8|
|1953|    April|       235|            246.75|
|1953|      May|       229|240.45000000000002|
|1953|     June|       243|            255.15|
|1953|     July|       264|             277.2|
|1953|   August|       272|             285.6|
|1953|September|       237|248.85000000000002|
|1953|  October|       211|            221.55|
|1953| December|       201|            211.05|
|1954|  January|       204|214.20000000000002|
|1954|    March|       235|            246.75|
|1954|    April|       227|238.35000000000002|
|1954|      May|       234|245.70000000000002|
|1954|     June|       264|             277.2|
|1954|     Ju

In [21]:
# Project two columns from the flights DataFrame.
select1 = flights.select("passengers", "month")

# Keep the rows for June or May.
# Chaining two equality filters on the same column ANDs them
# (month == "June" AND month == "May"), which no row can satisfy and always
# yields an empty DataFrame, so match either value with isin() instead.
select2 = flights.filter(flights.month.isin("June", "May"))


In [22]:
# Build a Column expression: passengers / (passengers + 10), aliased "avg_speed".
# NOTE(review): the name `avg_speed` does not match the expression (a ratio of
# passenger counts) — presumably carried over from another example; confirm.
avg_speed = (flights["passengers"] / (flights["passengers"] + 10)).alias("avg_speed")

In [24]:
# Display the Column object; its repr shows the expression and its alias.
avg_speed

Column<'(passengers / (passengers + 10)) AS `avg_speed`'>

In [27]:
# Select columns with selectExpr(); here the two arguments are plain column names.
speed2 = flights.selectExpr("month", "passengers")

In [28]:
# Preview the two selected columns.
speed2.show()

+---------+----------+
|    month|passengers|
+---------+----------+
|  January|       112|
| February|       118|
|    March|       132|
|    April|       129|
|      May|       121|
|     June|       135|
|     July|       148|
|   August|       148|
|September|       136|
|  October|       119|
| November|       104|
| December|       118|
|  January|       115|
| February|       126|
|    March|       141|
|    April|       135|
|      May|       125|
|     June|       149|
|     July|       170|
|   August|       170|
+---------+----------+
only showing top 20 rows



In [29]:
import pyspark.sql.functions as F

In [31]:
# Group the rows by month; this is a grouped object, not yet a DataFrame.
best_month = flights.groupBy("month")

# Average passenger count per month, across all years in the data.
best_month_mean = best_month.avg("passengers")

best_month_mean.show()

+---------+------------------+
|    month|   avg(passengers)|
+---------+------------------+
|     July| 351.3333333333333|
| November|232.83333333333334|
| February|             235.0|
|  January|            241.75|
|    March| 270.1666666666667|
|  October| 266.5833333333333|
|      May| 271.8333333333333|
|   August| 351.0833333333333|
|    April| 267.0833333333333|
|     June| 311.6666666666667|
| December| 261.8333333333333|
|September| 302.4166666666667|
+---------+------------------+



In [32]:
# Print the Spark version the session is running on.
print(spark.version)

3.1.1


In [44]:
# The integers 1..99 (the upper bound of range() is exclusive).
numb = range(1, 100)
# Materialise the lazy range into a list so every value is printed.
print([n for n in numb])

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99]


In [47]:
from pyspark.context import SparkContext

# Obtain the SparkContext: reuses the running one if it exists, otherwise creates one.
sc = SparkContext.getOrCreate()

In [48]:
# The integers 1..99 (the upper bound of range() is exclusive).
numb = range(1, 100)

# Distribute the local Python range as an RDD.
spark_data = sc.parallelize(numb)

In [53]:
# Plain-Python illustration of "map": add 2 to every element of a list.
items = [1, 2, 3, 4]

[x + 2 for x in items]

[3, 4, 5, 6]

In [52]:
# Two sentences distributed as an RDD of strings.
rdd = sc.parallelize(["hello world", "how are you"])

# flatMap splits each sentence on spaces and flattens the per-sentence word
# lists into a single RDD of words.
rdd_split = rdd.flatMap(lambda sentence: sentence.split(" "))

# collect() brings the words back to the driver as a Python list.
print(rdd_split.collect())

['hello', 'world', 'how', 'are', 'you']
