In [1]:
# Reference: https://spark.apache.org/docs/latest/sql-getting-started.html

# Starting Point: SparkSession
# NOTE(review): a commented-out pkg_resources pin to pyspark==3.1.2 used to sit here.
# It was never active, so the session runs on whichever pyspark is installed.
from pyspark.sql import SparkSession

# Build the application's session via the builder's getOrCreate().
spark = (
    SparkSession.builder
    .appName("test python spark")
    .getOrCreate()
)

# Report the Spark version this session is running on.
print("spark.version=", spark.version)

spark.version= 3.2.1


In [2]:
# Creating DataFrames
# Load people.json into a DataFrame and print its rows.
people_json_path = "D:/workspace/code_challenge/datalake/people.json"
df = spark.read.json(people_json_path)
df.show()

+----+--------+
| age|    name|
+----+--------+
|null|    test|
|   5|     Ali|
|   5|Mohammad|
|  39|  Narges|
|  46|    Adel|
+----+--------+



In [3]:
## Untyped Dataset Operations (aka DataFrame Operations)

# Show the DataFrame's schema as a tree
df.printSchema()

# Project just the "name" column
df.select(df.name).show()

# Every person's name next to their age plus one
df.select(df.name, df.age + 1).show()

# Keep only the rows whose age is greater than 21
df.where(df.age > 21).show()

# Number of people for each distinct age
df.groupBy("age").count().show()


root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+--------+
|    name|
+--------+
|    test|
|     Ali|
|Mohammad|
|  Narges|
|    Adel|
+--------+

+--------+---------+
|    name|(age + 1)|
+--------+---------+
|    test|     null|
|     Ali|        6|
|Mohammad|        6|
|  Narges|       40|
|    Adel|       47|
+--------+---------+

+---+------+
|age|  name|
+---+------+
| 39|Narges|
| 46|  Adel|
+---+------+

+----+-----+
| age|count|
+----+-----+
|  39|    1|
|null|    1|
|   5|    2|
|  46|    1|
+----+-----+



In [4]:
## Running SQL Queries Programmatically

# Expose the DataFrame to SQL as the temporary view "people"
df.createOrReplaceTempView("people")

# Run the query through the session and display the resulting rows
people_query = "select * from people"
sql_df = spark.sql(people_query)
sql_df.show()

+----+--------+
| age|    name|
+----+--------+
|null|    test|
|   5|     Ali|
|   5|Mohammad|
|  39|  Narges|
|  46|    Adel|
+----+--------+



In [5]:
## Global Temporary View

# Publish the DataFrame as a global temporary view named "people"
df.createOrReplaceGlobalTempView("people")

# Global temporary views live in the system preserved database `global_temp`,
# so the view name is qualified with it in the query.
global_people_query = "select * from global_temp.people"
spark.sql(global_people_query).show()

# Global temporary views are cross-session: a new session sees the same view
other_session = spark.newSession()
other_session.sql(global_people_query).show()


+----+--------+
| age|    name|
+----+--------+
|null|    test|
|   5|     Ali|
|   5|Mohammad|
|  39|  Narges|
|  46|    Adel|
+----+--------+

+----+--------+
| age|    name|
+----+--------+
|null|    test|
|   5|     Ali|
|   5|Mohammad|
|  39|  Narges|
|  46|    Adel|
+----+--------+



In [6]:
## Creating Datasets

# Datasets are similar to RDDs, however, instead of using Java serialization or Kryo
# they use a specialized Encoder to serialize the objects for processing or transmitting
# over the network. While both encoders and standard serialization are responsible
# for turning an object into bytes, encoders are code generated dynamically
# and use a format that allows Spark to perform many operations like filtering, sorting
# and hashing without deserializing the bytes back into an object.
# NOTE: The typed Dataset API is available only in Scala and Java, so this section has no PySpark code.


In [7]:
## Interoperating with RDDs

## Inferring the Schema Using Reflection

from pyspark.sql import Row


def _fields_to_person(fields):
    """Build a ``Row(name, age)`` from the comma-split fields of one text line.

    Args:
        fields: list of strings produced by ``line.split(",")``; element 0 is
            the name and element 1 (optional) is the age text.

    Returns:
        A ``Row`` whose ``name`` is ``fields[0]`` unchanged and whose ``age`` is
        the integer value of the age text, or ``None`` when the age is missing,
        empty, or whitespace-only.

    Raises:
        ValueError: if the age text is non-blank but not a valid integer.
    """
    # Strip before testing for emptiness: int(" ") raises ValueError, so a
    # whitespace-only age must be treated the same as an empty one.
    age_text = fields[1].strip() if len(fields) > 1 else ""
    return Row(name=fields[0], age=int(age_text) if age_text else None)


sc = spark.sparkContext
# Load a text file and convert each line to a Row.
lines = sc.textFile("D:/workspace/code_challenge/datalake/people.txt")
parts = lines.map(lambda l: l.split(","))
people = parts.map(_fields_to_person)

# Infer the schema, and register the DataFrame as a table.
df_people = spark.createDataFrame(people)
# Troubleshooting (original author's note): if this fails with
#   org.apache.spark.SparkException: Python worker failed to connect back.
# set the environment variable PYSPARK_PYTHON=python.
# createOrReplaceTempView replaces any temp view already registered as "people".
df_people.createOrReplaceTempView("people")

# SQL can be run over DataFrames that have been registered as a table.
kids = spark.sql("select * from people where age<10 and age>3")

# The results of SQL queries are DataFrame objects.
# rdd returns the content as an :class:`pyspark.RDD` of :class:`Row`.
kidNames = kids.rdd.map(lambda k: "name:" + k.name).collect()
for name in kidNames:
    print(name)

name:mohammad
name:ali


In [10]:
## Programmatically Specifying the Schema

# Data types used to describe the schema
from pyspark.sql.types import StringType, StructType, StructField

sc = spark.sparkContext
# Read the text file as an RDD of lines and split every line on commas.
lines = sc.textFile("D:/workspace/code_challenge/datalake/people.txt")
parts = lines.map(lambda line: line.split(","))
# Turn each record into a (name, age) tuple; the age text is stripped of surrounding whitespace.
people = parts.map(lambda cols: (cols[0], cols[1].strip()))

# Column names, encoded as a space-separated string.
schemaString = "name age"

# One nullable string column per name in schemaString.
fields = [
    StructField(field_name, StringType(), nullable=True)
    for field_name in schemaString.split(" ")
]
schema = StructType(fields)

# Build a DataFrame from the tuple RDD using the explicit schema.
schemaPeople = spark.createDataFrame(people, schema)

# Register the DataFrame as the temporary view "people".
schemaPeople.createOrReplaceTempView("people")

# Query the registered view with SQL and display the names.
results = spark.sql("select name from people")

results.show()

+--------+
|    name|
+--------+
|    test|
|mohammad|
|     ali|
|  narges|
|    adel|
+--------+



In [None]:
## Scalar Functions

# Scalar functions are functions that return a single value per row, as opposed to aggregation functions, which return a value for a group of rows. Spark SQL supports a variety of Built-in Scalar Functions. It also supports User Defined Scalar Functions.

In [None]:
## Aggregate Functions

# Aggregate functions are functions that return a single value on a group of rows. The Built-in Aggregation Functions provide common aggregations such as count(), count_distinct(), avg(), max(), min(), etc. Users are not limited to the predefined aggregate functions and can create their own. For more details about user defined aggregate functions, please refer to the documentation of User Defined Aggregate Functions.