# 0. Importing Libraries

In [1]:
import findspark
import numpy as np
import pandas as pd
import pyspark as sp
import pyspark.sql.functions as F

from pyspark.sql import SparkSession

# 1. Find Spark

Add pyspark to `sys.path` at runtime using the `findspark` library.

In [2]:
# Make pyspark importable by adding it to sys.path at runtime
findspark.init()
# Display the Spark installation directory that findspark resolved
findspark.find()

'C:\\spark-3.4.1-bin-hadoop3'

# 2. Creating SparkSession

`SparkSession` is generally preferable to `SparkContext` as an entry point because it unifies all of Spark’s numerous contexts, so the developer no longer needs to create and manage separate contexts.

In [None]:
# Obtain the SparkSession: getOrCreate() hands back the session that is
# already active, or builds a new one when none exists yet
spark = (
    SparkSession.builder
    .getOrCreate()
)

# Print the session object to confirm it was created
print(spark)

# 3. Creating a test PySpark DataFrame

In [None]:
# Create a small pandas DataFrame (10 rows, 1 column) to experiment with.
# A seeded Generator makes the ten random values identical on every
# Restart & Run All, so every output derived from pd_temp is reproducible.
pd_temp = pd.DataFrame(np.random.default_rng(42).random(10))
pd_temp.head()

In [None]:
# Create spark_temp, a Spark DataFrame, from the pandas DataFrame pd_temp
spark_temp = spark.createDataFrame(pd_temp)
# Print the first rows of the new Spark DataFrame
spark_temp.show()

In [None]:
# Examine the tables in the catalog. spark_temp has not been registered as a
# view at this point, so it is not expected to appear in the listing.
print(spark.catalog.listTables())

In [None]:
# Add spark_temp to the catalog as a temporary view named "temp".
# createOrReplaceTempView replaces an existing view of the same name,
# so this cell can be re-run safely.
spark_temp.createOrReplaceTempView("temp")

# Examine the tables in the catalog again; "temp" should now be listed
print(spark.catalog.listTables())

# 4. Importing a CSV into a PySpark DataFrame

In [None]:
# Path to the airports CSV file (relative to the notebook's working directory)
file_path = "airports.csv"

# Read in the airports data; header=True takes the column names from the
# first row of the file
airports = spark.read.csv(file_path, header=True)

# Show the data
airports.show()

In [None]:
# Check what kind of object spark.read.csv returned
type(airports)

In [None]:
# List the catalog tables; airports has only been read into a DataFrame and
# has not been registered as a view yet
spark.catalog.listTables()

In [None]:
# Add airports to the catalog as a temporary view
# NOTE(review): the view name is spelled "Aiports_data" (missing an "r");
# queries against this view must use that spelling — confirm whether intended
airports.createOrReplaceTempView("Aiports_data")

# Examine the tables in the catalog again
print(spark.catalog.listTables())

# 5. General Example SQL

### 5.1 Importing csv

In [None]:
# Read the flights sample CSV; header=True takes the column names from the
# first row of the file
flights = spark.read.csv('flights_small.csv', header=True)

In [None]:
# Preview the first rows of the flights data
flights.show()

In [None]:
# Register the DataFrame in the catalog as the temporary view "flights"
flights.createOrReplaceTempView('flights')
# List the catalog tables to confirm the view was added
spark.catalog.listTables()

### 5.2 From table to dataframe

In [None]:
# Create the DataFrame flights_1 from the "flights" view in the catalog
flights_1 = spark.table('flights')

# Show the first rows. show() prints the table itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None").
flights_1.show()

### 5.3 Creating new columns

In [None]:
# Add duration_hrs: air_time divided by 60, rounded to 3 decimal places
# (presumably air_time is in minutes — TODO confirm against the data source).
# F.round is used instead of `from pyspark.sql.functions import round`, which
# would shadow Python's built-in round() for the rest of the notebook.
flights = flights.withColumn('duration_hrs', F.round(flights.air_time / 60, 3))

In [None]:
# Display the data, which now includes the duration_hrs column
flights.show()

### 5.4 Filtering Data I

In [None]:
# Filter flights with a SQL string (the condition is parsed by Spark)
long_flights1 = flights.filter('distance > 1000')
# Show five rows. show() prints the table and returns None, so wrapping it
# in print() would only add a stray "None" line.
long_flights1.show(5)

In [None]:
# Filter flights with a boolean Column expression instead of a SQL string
long_flights2 = flights.filter(flights.distance > 1000)

# Examine the data to check it matches long_flights1. show() prints the table
# and returns None, so it is called directly rather than inside print().
long_flights2.show(5)

### 5.5 Filtering Data II

In [None]:
# Select the first set of columns, referring to them by name (strings)
flights.select("tailnum","origin", "dest").show()

In [None]:
# Select the second set of columns, this time using column attributes of the
# DataFrame instead of strings
temp=flights.select(flights.origin, flights.dest, flights.carrier)
temp.show()

In [None]:
# Define first filter: a boolean Column expression (origin equals "SEA");
# nothing is filtered until it is passed to .filter()
filterA = flights.origin == "SEA"
# Display the Column expression itself
filterA

In [None]:
# Define second filter: a boolean Column expression (dest equals "PDX");
# nothing is filtered until it is passed to .filter()
filterB = flights.dest == "PDX"
# Display the Column expression itself
filterB

In [None]:
# Filter the data, first by filterA then by filterB; chaining the two filters
# keeps only the rows that satisfy both conditions
selected2 = temp.filter(filterA).filter(filterB)
selected2.show()

### 5.6 Filtering Data III

In [None]:
# Define avg_speed as a Column expression: distance divided by (air_time / 60),
# aliased to "avg_speed" so the resulting column gets a readable name.
# NOTE(review): units depend on the dataset (presumably distance per hour if
# air_time is in minutes) — confirm.
avg_speed = (flights.distance/(flights.air_time/60)).alias("avg_speed")
# Display the Column expression itself
avg_speed

In [None]:
# Select the origin, dest and tailnum columns together with the avg_speed
# Column expression
speed1 = flights.select("origin", "dest", "tailnum", avg_speed)
speed1.show()

In [None]:
# Create the same table using a SQL expression: selectExpr takes each column
# as a SQL string, so avg_speed is computed and aliased inline
speed2 = flights.selectExpr("origin", "dest", "tailnum", "distance/(air_time/60) as avg_speed")
speed2.show()

### 5.7 Aggregating

In [None]:
# Build a summary-statistics DataFrame for flights.
# NOTE(review): no .show() is called here, so by default the cell displays
# only the resulting DataFrame object rather than the statistics — confirm
# whether that is intended.
flights.describe()

In [None]:
# Cast distance to float so numeric aggregations can be applied to it
flights = flights.withColumn("distance", flights.distance.cast("float"))

In [None]:
# Cast air_time to float so numeric aggregations can be applied to it
flights = flights.withColumn("air_time", flights.air_time.cast("float"))

In [None]:
# Summary statistics restricted to the air_time and distance columns
flights.describe('air_time', 'distance').show()

In [None]:
# Shortest flight departing PDX, measured by distance.
# An empty groupBy() treats the whole filtered DataFrame as a single group.
(
    flights
    .filter(flights.origin == "PDX")
    .groupBy()
    .min("distance")
    .show()
)

In [None]:
# Longest flight departing SEA, measured by air_time.
# An empty groupBy() treats the whole filtered DataFrame as a single group.
(
    flights
    .filter(flights.origin == "SEA")
    .groupBy()
    .max("air_time")
    .show()
)

In [None]:
# Average duration (air_time) of Delta flights — carrier "DL" — that
# depart from SEA
(
    flights
    .filter(flights.carrier == "DL")
    .filter(flights.origin == "SEA")
    .groupBy()
    .avg('air_time')
    .show()
)

In [None]:
# Total hours in the air: derive duration_hrs as air_time / 60 on a
# temporary copy (flights itself is not reassigned), then sum that column
(
    flights
    .withColumn("duration_hrs", flights.air_time/60)
    .groupBy()
    .sum("duration_hrs")
    .show()
)

### 5.8 Grouping and Aggregating I

In [None]:
# Group by tailnum; the result is a grouped object that needs an aggregation
# before anything can be shown
by_plane = flights.groupBy("tailnum")

In [None]:
# Number of flights each plane made (row count per tailnum)
by_plane.count().show()

In [None]:
# Group by origin; the result is a grouped object that needs an aggregation
# before anything can be shown
by_origin = flights.groupBy("origin")

In [None]:
# Average duration (air_time) of flights per origin airport (PDX and SEA)
by_origin.avg("air_time").show()

### 5.9 Grouping and Aggregating II

In [None]:
# Display the flights DataFrame object to inspect its columns before casting
flights

In [None]:
# Cast dep_delay to float so numeric aggregations can be applied to it
flights = flights.withColumn("dep_delay", flights.dep_delay.cast("float"))
# Preview the first five rows
flights.show(5)

### 5.9 Grouping and Aggregating II (continued)

In [None]:
# Import pyspark.sql.functions as F
import pyspark.sql.functions as F

In [None]:
# Group by month and dest (one group per month/destination pair)
by_month_dest = flights.groupBy("month", "dest")

# Average departure delay by month and destination
by_month_dest.avg("dep_delay").show()

In [None]:
# Standard deviation of departure delay by month and destination; .agg()
# accepts aggregate functions from pyspark.sql.functions such as F.stddev
by_month_dest.agg(F.stddev("dep_delay")).show()

### 5.10 Joining

In [None]:
# Look at the airports data again before joining it to flights
airports.show()

In [None]:
# Rename the faa column to dest so that airports shares the join key used
# with flights in the join below
airports = airports.withColumnRenamed("faa", "dest")
airports.show()

In [None]:
# Join the DataFrames on the shared dest column; a left outer join keeps
# every row of flights even when no matching airport row exists
flights_with_airports = flights.join(airports, on="dest", how="leftouter")

In [None]:
# Preview the joined data
flights_with_airports.show()

In [None]:
# Examine the data again as a pandas DataFrame; limit(5) is applied before
# toPandas() so only five rows are converted
flights_with_airports.limit(5).toPandas()

https://github.com/ozlerhakan/datacamp/blob/master/Introduction%20to%20PySpark/introduction-to-pySpark.ipynb