# 0. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import pyspark as sp
import findspark

from pyspark.sql import SparkSession

# 1. Find Spark

`findspark.init()` adds PySpark to `sys.path` at runtime, and `findspark.find()` returns the Spark installation directory that was located.

In [2]:
# Make the local Spark installation's Python libraries importable by adding
# them to sys.path.
# NOTE(review): pyspark is already imported in the first cell, before this
# call runs — confirm whether init() should come before those imports.
findspark.init()
# Last expression of the cell: displays the Spark installation path found.
findspark.find()

'C:\\spark-3.4.1-bin-hadoop3'

# 2. Creating SparkSession

`SparkSession` is preferable to `SparkContext` in part because it unifies all of Spark's numerous contexts behind a single entry point, so the developer no longer needs to worry about creating separate contexts.

In [3]:
# Reuse the SparkSession that is already active, or start a new one if there
# is none yet.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Print the session object to confirm it is available.
print(spark)

<pyspark.sql.session.SparkSession object at 0x000001CC448F69A0>


# 3. Creating a test PySpark DataFrame

In [4]:
# Build a small single-column pandas DataFrame of uniform random numbers in
# [0, 1) to use as test input for Spark.
# The generator is seeded so that "Restart Kernel -> Run All" reproduces the
# same ten values on every run; change SEED to draw a different sample.
SEED = 42
pd_temp = pd.DataFrame(np.random.default_rng(SEED).random(10))

# Preview the first five rows (the single column keeps its default label 0).
pd_temp.head()

Unnamed: 0,0
0,0.390212
1,0.516217
2,0.825791
3,0.766156
4,0.413412


In [5]:
# Convert the pandas DataFrame into a Spark DataFrame.
spark_temp = spark.createDataFrame(data=pd_temp)

# Fetch the first five rows; the result is a list of Row objects.
spark_temp.head(n=5)

[Row(0=0.3902117246181368),
 Row(0=0.5162169001864194),
 Row(0=0.8257910132912478),
 Row(0=0.766156313750286),
 Row(0=0.4134116640919866)]

In [6]:
# List what is currently registered in the session catalog. spark_temp has not
# been registered as a view yet, so it does not show up here.
catalog_tables = spark.catalog.listTables()
print(catalog_tables)

[]


In [7]:
# Register spark_temp in the catalog as a temporary view named "temp"
# (an existing view with the same name is replaced).
spark_temp.createOrReplaceTempView("temp")

# List the catalog again to confirm the new view is now present.
catalog_tables = spark.catalog.listTables()
print(catalog_tables)

[Table(name='temp', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]


# 4. Importing a CSV into a PySpark DataFrame

In [9]:
# Path of the airports CSV file (relative to the working directory).
file_path = "airports.csv"

# Load the CSV, using its first line as the column names.
# NOTE: no schema or inferSchema option is supplied, so Spark reads every
# column as a string.
airports = spark.read.option("header", True).csv(file_path)

# Print the first rows of the DataFrame.
airports.show()

+---+--------------------+----------------+-----------------+----+---+---+
|faa|                name|             lat|              lon| alt| tz|dst|
+---+--------------------+----------------+-----------------+----+---+---+
|04G|   Lansdowne Airport|      41.1304722|      -80.6195833|1044| -5|  A|
|06A|Moton Field Munic...|      32.4605722|      -85.6800278| 264| -5|  A|
|06C| Schaumburg Regional|      41.9893408|      -88.1012428| 801| -6|  A|
|06N|     Randall Airport|       41.431912|      -74.3915611| 523| -5|  A|
|09J|Jekyll Island Air...|      31.0744722|      -81.4277778|  11| -4|  A|
|0A9|Elizabethton Muni...|      36.3712222|      -82.1734167|1593| -4|  A|
|0G6|Williams County A...|      41.4673056|      -84.5067778| 730| -5|  A|
|0G7|Finger Lakes Regi...|      42.8835647|      -76.7812318| 492| -5|  A|
|0P2|Shoestring Aviati...|      39.7948244|      -76.6471914|1000| -5|  U|
|0S9|Jefferson County ...|      48.0538086|     -122.8106436| 108| -8|  A|
|0W3|Harford County Ai...

In [10]:
# Check which class the CSV reader returned for `airports`.
type(airports)

pyspark.sql.dataframe.DataFrame

In [12]:
# List the catalog contents; `airports` has not been registered as a view at
# this point, so only previously registered views are expected here.
spark.catalog.listTables()

[Table(name='temp', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [13]:
# Register the airports DataFrame in the catalog as a temporary view.
# NOTE(review): the view name "Aiports_data" looks like a misspelling of
# "Airports_data"; it is kept as-is because later queries may reference it —
# confirm before renaming.
airports.createOrReplaceTempView("Aiports_data")

# List the catalog again to confirm the new view is now present.
catalog_tables = spark.catalog.listTables()
print(catalog_tables)

[Table(name='Aiports_data', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True), Table(name='temp', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]
