In [1]:
# Example of basic SparkSQL commands

from pyspark.sql import SparkSession

# Create (or reuse) the Spark session object.
# It is bound to the name `spark` because every later cell reads data through
# `spark.read` / `spark.createDataFrame`; binding it only to `sp` leaves `spark`
# undefined on a fresh kernel.
spark = SparkSession.builder.getOrCreate()
# Short alias kept for any code that still refers to the session as `sp`.
sp = spark

# CSV file

In [2]:
# Build a DataFrame from persons.csv: the first row supplies the column names
# and Spark is asked to infer each column's type.
df = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load('./databases/persons.csv')
)

# Print the column names and types of the loaded DataFrame.
df.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)



In [3]:
# Print the rows of the DataFrame loaded from persons.csv as a text table.
df.show()

+-------+----+
|   Name| Age|
+-------+----+
| Justin|  19|
|   Andy|  30|
|Michael|null|
+-------+----+



In [4]:
# How to rename columns
df2 = df.withColumnRenamed("nome", "eta")

In [5]:
df2.printSchema()
df2.show()

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)

+-------+----+
|   Name| Age|
+-------+----+
| Justin|  19|
|   Andy|  30|
|Michael|null|
+-------+----+



# JSON file
**NOTE:** for JSON files, Spark automatically infers the data types of the columns

In [17]:
# Example with an input JSON file - inline format
# (read with the JSON reader's default settings, i.e. without multiLine)
df = spark.read.format("json").load("./databases/persons.json")

# Inspect the inferred schema, then the rows.
df.printSchema()
df.show()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+---+-------+
|age|   name|
+---+-------+
| 30|   John|
| 13|Michael|
| 19|  Perdo|
+---+-------+



In [13]:
# One JSON object for each input file: multiLine=True is passed so the reader
# treats the whole file as a single (possibly multi-line) JSON document.
#
# The result gets its own name instead of rebinding `df`, so that `df` keeps
# referring to the persons.json DataFrame. The rename example that follows uses
# `df` and its displayed output lists the three persons.json rows; rebinding
# `df` here would make it show only the single p1.json row on a top-to-bottom run.
dfMultiLine = spark.read.load("./databases/p1.json", format="json", multiLine=True)

dfMultiLine.printSchema()
dfMultiLine.show()

root
 |-- age: long (nullable = true)
 |-- name: string (nullable = true)

+---+-------+
|age|   name|
+---+-------+
| 30|Michael|
+---+-------+



In [19]:
# Renaming a column: `existing` is the current column name, `new` is the
# name it gets in the returned DataFrame.
df2 = df.withColumnRenamed(existing="age", new="eta")

# Show the schema and the rows after the rename.
df2.printSchema()
df2.show()

root
 |-- eta: long (nullable = true)
 |-- name: string (nullable = true)

+---+-------+
|eta|   name|
+---+-------+
| 30|   John|
| 13|Michael|
| 19|  Perdo|
+---+-------+



### **Example - change column names and build an RDD**
Create a DF from persons_noheader.csv and rename its columns

In [20]:
# Load a CSV file that has no header row (types inferred), then give the two
# generated column names `_c0` and `_c1` meaningful names.
dfnoheader = (
    spark.read
    .format('csv')
    .options(header=False, inferSchema=True)
    .load("./databases/persons_noheader.csv")
    .withColumnRenamed("_c0", "name")
    .withColumnRenamed("_c1", "age")
)

# Inspect the renamed schema and the rows.
dfnoheader.printSchema()
dfnoheader.show()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)

+----------+---+
|      name|age|
+----------+---+
| Arcangelo| 23|
|  Leonardo| 24|
|Margherita| 10|
|  Veronica| 21|
|Alessandro| 61|
+----------+---+



Create an RDD from DF **persons_noheader.csv**

In [21]:
# Obtain the RDD view of the DataFrame through its `.rdd` attribute.
myRDD = dfnoheader.rdd

### **Example - select two of three columns**
Select only the name and age columns and add one year to each age

In [31]:
# Load the three-column persons file; the header row supplies the column
# names and Spark is asked to infer the column types.
dfPersons = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True)
    .load("./databases/persons_age_name_gender.csv")
)

# Rows first, then the schema.
dfPersons.show()
dfPersons.printSchema()

+----------+----+------+
|      Name| Age|Gender|
+----------+----+------+
| Arcangelo|  23|  Male|
|  Leonardo|  24|  Male|
|Margherita|  10|Female|
|  Veronica|  21|Female|
|Alessandro|  61|  Male|
|   Giorgio|null|  Male|
+----------+----+------+

root
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Gender: string (nullable = true)



In [29]:
# Project the DataFrame onto the two columns of interest.
selectedColumns = ["Name", "Age"]
dfNameAge = dfPersons.select(*selectedColumns)

dfNameAge.show()

+----------+----+
|      Name| Age|
+----------+----+
| Arcangelo|  23|
|  Leonardo|  24|
|Margherita|  10|
|  Veronica|  21|
|Alessandro|  61|
|   Giorgio|null|
+----------+----+



In [30]:
# Keep `Name` and derive `Newage` as Age + 1, written as a Column expression
# instead of a SQL string.
dfnew = dfNameAge.select(
    dfNameAge["Name"],
    (dfNameAge["Age"] + 1).alias("Newage"),
)

dfnew.show()

+----------+------+
|      Name|Newage|
+----------+------+
| Arcangelo|  24.0|
|  Leonardo|  25.0|
|Margherita|  11.0|
|  Veronica|  22.0|
|Alessandro|  62.0|
|   Giorgio|  null|
+----------+------+



In [32]:
# Keep only the rows that satisfy the SQL predicate (`where` is an alias of `filter`).
# NOTE(review): the predicate names `Age`, but the schema printed for the
# result lists only `Name` and `Newage`. The filter still runs, presumably
# because Spark resolves `Age` from the parent DataFrame. Confirm whether
# `Newage >= 30` was the intended condition.
df_filtered = dfnew.where("Age >= 30")

# Inspect the schema and the surviving rows.
df_filtered.printSchema()
df_filtered.show()

root
 |-- Name: string (nullable = true)
 |-- Newage: double (nullable = true)

+----------+------+
|      Name|Newage|
+----------+------+
|Alessandro|  62.0|
+----------+------+



### **Example - List of tuples**
Apply Joins

In [33]:
# Employees as (id, name, deptno) tuples.
listEmployee = [
    (105, 'Chloe', 5),
    (103, 'Paul', 3),
    (101, 'John', 1),
    (102, 'Lisa', 2),
    (104, 'Evan', 4),
    (106, 'Amy', 6),
]

# Departments as (deptno, deptName) tuples.
listDepartment = [
    (100, 'Other'),
    (3, 'Engineering'),
    (2, 'Sales'),
    (1, 'Marketing'),
]

In [35]:
# Turn the employee tuples into a DataFrame with the given column names.
employeeDF = spark.createDataFrame(
    data=listEmployee,
    schema=['id', 'name', 'deptno'],
)

In [36]:
# Turn the department tuples into a DataFrame with the given column names.
departmentDF = spark.createDataFrame(
    data=listDepartment,
    schema=['deptno', 'deptName'],
)

In [38]:
# Print the employee DataFrame as a text table.
employeeDF.show()

+---+-----+------+
| id| name|deptno|
+---+-----+------+
|105|Chloe|     5|
|103| Paul|     3|
|101| John|     1|
|102| Lisa|     2|
|104| Evan|     4|
|106|  Amy|     6|
+---+-----+------+



In [39]:
# Print the department DataFrame as a text table.
departmentDF.show()

+------+-----------+
|deptno|   deptName|
+------+-----------+
|   100|      Other|
|     3|Engineering|
|     2|      Sales|
|     1|  Marketing|
+------+-----------+



In [42]:
# Inner join
#
# "inner" is the default join type in Spark SQL: only rows whose join
# condition matches in both DataFrames are kept.
# The condition is an explicit column expression, so this is not a natural
# join: both `deptno` columns appear in the result.

resDF = employeeDF.join(
    departmentDF,
    on=employeeDF["deptno"] == departmentDF["deptno"],
    how="inner",
)

resDF.printSchema()
resDF.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- deptno: long (nullable = true)
 |-- deptno: long (nullable = true)
 |-- deptName: string (nullable = true)

+---+----+------+------+-----------+
| id|name|deptno|deptno|   deptName|
+---+----+------+------+-----------+
|101|John|     1|     1|  Marketing|
|103|Paul|     3|     3|Engineering|
|102|Lisa|     2|     2|      Sales|
+---+----+------+------+-----------+



In [44]:
# Left semi join
#
# Returns the rows of the left DataFrame that have a match in the right one.
# Only the left DataFrame's columns are kept.
# It is also referred to as a semi join.
#
# Accepted names: 'leftsemi', 'left_semi', 'semi'

resDF = employeeDF.join(
    departmentDF,
    on=employeeDF["deptno"] == departmentDF["deptno"],
    how='leftsemi',
)

resDF.printSchema()
resDF.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- deptno: long (nullable = true)

+---+----+------+
| id|name|deptno|
+---+----+------+
|101|John|     1|
|103|Paul|     3|
|102|Lisa|     2|
+---+----+------+



In [45]:
# Left outer join
#
# Returns every row of the left DataFrame together with the matching values
# from the right DataFrame; the right-side columns are NULL where there is
# no match.
# It is also referred to as a left join.
#
# Accepted names: 'leftouter', 'left', 'left_outer'

resDF = employeeDF.join(
    departmentDF,
    on=employeeDF["deptno"] == departmentDF["deptno"],
    how='leftouter',
)

resDF.printSchema()
resDF.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- deptno: long (nullable = true)
 |-- deptno: long (nullable = true)
 |-- deptName: string (nullable = true)

+---+-----+------+------+-----------+
| id| name|deptno|deptno|   deptName|
+---+-----+------+------+-----------+
|106|  Amy|     6|  null|       null|
|105|Chloe|     5|  null|       null|
|101| John|     1|     1|  Marketing|
|103| Paul|     3|     3|Engineering|
|102| Lisa|     2|     2|      Sales|
|104| Evan|     4|  null|       null|
+---+-----+------+------+-----------+



In [48]:
# Right outer join
#
# Returns every row of the RIGHT DataFrame together with the matching values
# from the left DataFrame; the left-side columns are NULL where there is no
# match.
# It is also referred to as a right join.
#
# Accepted names: 'rightouter', 'right', 'right_outer'

resDF = employeeDF.join(
    departmentDF,
    on=employeeDF["deptno"] == departmentDF["deptno"],
    how='right',
)

resDF.printSchema()
resDF.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- deptno: long (nullable = true)
 |-- deptno: long (nullable = true)
 |-- deptName: string (nullable = true)

+----+----+------+------+-----------+
|  id|name|deptno|deptno|   deptName|
+----+----+------+------+-----------+
| 101|John|     1|     1|  Marketing|
|null|null|  null|   100|      Other|
| 103|Paul|     3|     3|Engineering|
| 102|Lisa|     2|     2|      Sales|
+----+----+------+------+-----------+



In [49]:
# Full outer join
#
# Returns all rows from both DataFrames, with NULLs on whichever side has
# no match.
# It is also referred to as a full join.
#
# Accepted names: 'outer', 'full', 'fullouter', 'full_outer'

resDF = employeeDF.join(
    departmentDF,
    on=employeeDF["deptno"] == departmentDF["deptno"],
    how='outer',
)

resDF.printSchema()
resDF.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- deptno: long (nullable = true)
 |-- deptno: long (nullable = true)
 |-- deptName: string (nullable = true)

+----+-----+------+------+-----------+
|  id| name|deptno|deptno|   deptName|
+----+-----+------+------+-----------+
| 106|  Amy|     6|  null|       null|
| 105|Chloe|     5|  null|       null|
| 101| John|     1|     1|  Marketing|
|null| null|  null|   100|      Other|
| 103| Paul|     3|     3|Engineering|
| 102| Lisa|     2|     2|      Sales|
| 104| Evan|     4|  null|       null|
+----+-----+------+------+-----------+



In [50]:
# Left anti join (AKA the 'not in' operation)
#
# Returns the rows of the left DataFrame that have NO match in the right one.
# Only the left DataFrame's columns are kept.
# It is also referred to as an anti join.
#
# Accepted names: 'leftanti', 'left_anti', 'anti'

resDF = employeeDF.join(
    departmentDF,
    on=employeeDF["deptno"] == departmentDF["deptno"],
    how='leftanti',
)

resDF.printSchema()
resDF.show()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- deptno: long (nullable = true)

+---+-----+------+
| id| name|deptno|
+---+-----+------+
|106|  Amy|     6|
|105|Chloe|     5|
|104| Evan|     4|
+---+-----+------+

