In [0]:

# Usage of spark object in PySpark shell
spark.version

Out[2]: '3.2.1'

In [0]:
# Create SparkSession from builder
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[1]").appName('SparkByExamples.com').getOrCreate()
spark

'''
master() – If you are running it on the cluster you need to use your master name as an argument to master(). usually, it would be either yarn or mesos depends on your cluster setup.

Use local[x] when running in Standalone mode. x should be an integer value and should be greater than 0; this represents how many partitions it should create when using RDD, DataFrame, and Dataset. Ideally, x value should be the number of CPU cores you have.
appName() – Used to set your application name.

getOrCreate() – This returns a SparkSession object if already exists, and creates a new one if not exist.
'''

In [0]:
# Create new SparkSession
spark2 = SparkSession.newSession
print(spark2)

<function SparkSession.newSession at 0x7fee6621c280>


In [0]:
# Get Existing SparkSession
spark3 = SparkSession.builder.getOrCreate
print(spark3)

<bound method SparkSession.Builder.getOrCreate of <pyspark.sql.session.SparkSession.Builder object at 0x7fee66376310>>


In [0]:
# Usage of config()
spark = SparkSession.builder \
      .master("local[1]") \
      .appName("SparkByExamples.com") \
      .config("spark.some.config.option", "config-value") \
      .getOrCreate()

In [0]:
# Enabling Hive to use in Spark -- enableHiveSupport()
spark = SparkSession.builder \
      .master("local[1]") \
      .appName("SparkByExamples.com") \
      .config("spark.sql.warehouse.dir", "<path>/spark-warehouse") \
      .enableHiveSupport() \
      .getOrCreate()

# Set Config
spark.conf.set("spark.executor.memory", "5g")
# Get a Spark Config
partions = spark.conf.get("spark.sql.shuffle.partitions")
print(partions)

In [0]:
# Create DataFrame
df = spark.createDataFrame(
    [("Scala", 25000), ("Spark", 35000), ("PHP", 21000)],["language","price"])
df.show()


+--------+-----+
|language|price|
+--------+-----+
|   Scala|25000|
|   Spark|35000|
|     PHP|21000|
+--------+-----+



In [0]:
# Spark SQL
df.createOrReplaceTempView("sample_table")
df2 = spark.sql("SELECT * FROM sample_table")
df2.show()

+--------+-----+
|language|price|
+--------+-----+
|   Scala|25000|
|   Spark|35000|
|     PHP|21000|
+--------+-----+



In [0]:
# Create Hive table & query it.  
spark.table("sample_table").write.saveAsTable("sample_hive_table")
df3 = spark.sql("SELECT * FROM sample_hive_table")
df3.show()

+--------+-----+
|language|price|
+--------+-----+
|   Scala|25000|
|   Spark|35000|
|     PHP|21000|
+--------+-----+



In [0]:
# Get metadata from the Catalog
# List databases
dbs = spark.catalog.listDatabases()
print(dbs)

[Database(name='default', description='Default Hive database', locationUri='dbfs:/user/hive/warehouse')]


In [0]:
# List Tables
tbls = spark.catalog.listTables()
print(tbls)


[Table(name='sample_hive_table', database='default', description=None, tableType='MANAGED', isTemporary=False), Table(name='sample_table', database=None, description=None, tableType='TEMPORARY', isTemporary=True)]


8. SparkSession Commonly Used Methods

version() – Returns the Spark version where your application is running, probably the Spark version your cluster is configured with.

createDataFrame() – This creates a DataFrame from a collection and an RDD

getActiveSession() – returns an active Spark session.

read() – Returns an instance of DataFrameReader class, this is used to read records from csv, parquet, avro, and more file formats into DataFrame.

readStream() – Returns an instance of DataStreamReader class, this is used to read streaming data. that can be used to read streaming data into DataFrame.

sparkContext() – Returns a SparkContext.

sql() – Returns a DataFrame after executing the SQL mentioned.

sqlContext() – Returns SQLContext.

stop() – Stop the current SparkContext.

table() – Returns a DataFrame of a table or view.

udf() – Creates a PySpark UDF to use it on DataFrame, Dataset, and SQL