In [5]:
%%bash

# Install Java 8 (headless JDK); -qq and the redirect to /dev/null keep apt's output out of the notebook
apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install PySpark (-q: quiet pip output)
pip install -q pyspark

In [6]:
import os
# Point Spark at the Java 8 JDK (presumably the install location of the
# openjdk-8 package installed in the setup cell -- verify if the base image changes).
os.environ['JAVA_HOME'] = '/usr/lib/jvm/java-8-openjdk-amd64'

from pyspark.sql import SparkSession

# The SparkSession is not created here; it is built in a later cell together with its configuration.

In [None]:
from os.path import abspath
warehouse_location = abspath('../content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/spark-warehouse')

# Class 01 - **Configuring and Scaling Spark**

In [None]:
!pip install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import findspark

findspark.init()

In [None]:
# Create (or reuse) the session, requesting Kryo serialization and 8 GB of driver memory.
spark = SparkSession.builder.config(
    'spark.serializer', 'org.apache.spark.serializer.KryoSerializer'
).config(
    'spark.driver.memory', '8g'
).getOrCreate()

In [None]:
spark.conf.get('spark.driver.memory')

'8g'

In [None]:
spark.conf.get('spark.serializer')

'org.apache.spark.serializer.KryoSerializer'

 - `spark.master` : The cluster manager (master URL) the application connects to, e.g. `local[*]`, `yarn` or `spark://host:port`.
 - `spark.driver.memory` : Amount of memory used by the driver of application
 - `spark.executor.memory` : Amount of memory used by the executor
 - `spark.serializer` : Class used to serialize objects during execution. It is recommended to use the value 'org.apache.spark.serializer.KryoSerializer' to gain processing speed: it can be up to 10x faster than the default Java serializer.
 - `spark.executor.heartbeatInterval` : Interval between executor messages to driver. Increasing this value prevents the application from suffering from timeouts.
 - `spark.sql.adaptive.enabled` : Enables Adaptive Query Execution (AQE), a feature that re-optimizes the execution plan during the run using statistics collected at runtime. Enabling this setting can optimize processing significantly.
 - `spark.sql.shuffle.partitions` : Number of partitions used in shuffle operations like joins and agg. 
 - `spark.sql.broadcastTimeout` : Timeout limit, in seconds, for the broadcast wait time in broadcast joins.

**Making Spark Scalable**

- `spark.dynamicAllocation.enabled` : Enable dynamic allocation
- `spark.dynamicAllocation.executorIdleTimeout` : configure the maximum idle time of an executor.  
- `spark.dynamicAllocation.initialExecutors` : Initial quantity of executors
- `spark.dynamicAllocation.maxExecutors` : Maximum quantity of executors
- `spark.dynamicAllocation.minExecutors` : Minimum quantity of executors

In [None]:
spark.conf.set('spark.sql.shuffle.partitions',100)

In [None]:
spark.conf.get('spark.sql.shuffle.partitions')

'100'



---



# Class 02 - **In-memory Data Persistence**

In [7]:
import pyspark.sql.functions as f

In [8]:
# Reuse the active session if there is one; otherwise start a session with default settings.
spark = SparkSession.builder.getOrCreate()

In [9]:
imdb_path = '/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_basics.csv'

In [10]:
# Reader settings: the file is tab-separated and its first row holds the column names.
options_dict = dict(sep='\t', header='True')

# No schema is supplied, so every column is loaded as a string.
df_titles = spark.read.format('csv').options(**options_dict).load(imdb_path)

df_titles.show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [14]:
# Take a random sample of roughly 10% of the titles. The fixed seed makes the sample,
# and every result derived from it, repeatable across notebook runs.
df_titles_sample = df_titles.sample(fraction=0.1, seed=42)

In [15]:
df_titles_sample.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000020|    short|      The Derby 1895|      The Derby 1895|      0|     1895|     \N|             1|Documentary,Short...|
|tt0000030|    short|  Rough Sea at Dover|  Rough Sea at Dover|      0|     1895|     \N|             1|   Documentary,Short|
|tt0000041|    short|   Bataille de neige|   Bataille de neige|      0|     1897|     \N|             1|Comedy,Documentar...|
|tt0000048|    short| The Boxing Kangaroo| The Boxing Kangaroo|      0|     1896|     \N|            \N|               Short|
|tt0000057|    short|Cortège de tzar a...|Cortège de tzar a...|      0|     1896|     \N|            \N|   Documentary

In [11]:
ratings_path = '/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_ratings.tsv'

In [12]:
# Reader settings for the ratings file: tab-separated with a header row.
options_dict = dict(sep='\t', header='True')

# Load the ratings and preview the first rows.
df_ratings = spark.read.format('csv').options(**options_dict).load(ratings_path)

df_ratings.show(5)

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1879|
|tt0000002|          5.9|     248|
|tt0000003|          6.5|    1652|
|tt0000004|          5.8|     161|
|tt0000005|          6.2|    2476|
+---------+-------------+--------+
only showing top 5 rows



In [16]:
# Columns that were read as text but hold whole numbers.
int_cols = ['startYear','endYear','runtimeMinutes','isAdult']
# Text columns to clean up.
str_cols = ['primaryTitle','originalTitle','titleType']

for col_name in int_cols:
    # Values that are not valid integers (such as the \N placeholder) become null.
    df_titles_sample = df_titles_sample.withColumn(col_name, f.col(col_name).cast('int'))

# Clean the strings: trim surrounding whitespace, then capitalize the first letter of each word.
for col_name in str_cols:
    df_titles_sample = df_titles_sample.withColumn(col_name, f.initcap(f.trim(f.col(col_name))))

In [17]:
df_titles_sample.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000020|    Short|      The Derby 1895|      The Derby 1895|      0|     1895|   null|             1|Documentary,Short...|
|tt0000030|    Short|  Rough Sea At Dover|  Rough Sea At Dover|      0|     1895|   null|             1|   Documentary,Short|
|tt0000041|    Short|   Bataille De Neige|   Bataille De Neige|      0|     1897|   null|             1|Comedy,Documentar...|
|tt0000048|    Short| The Boxing Kangaroo| The Boxing Kangaroo|      0|     1896|   null|          null|               Short|
|tt0000057|    Short|Cortège De Tzar A...|Cortège De Tzar A...|      0|     1896|   null|          null|   Documentary

In [18]:
df_titles_sample.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: integer (nullable = true)
 |-- startYear: integer (nullable = true)
 |-- endYear: integer (nullable = true)
 |-- runtimeMinutes: integer (nullable = true)
 |-- genres: string (nullable = true)



In [19]:
# Replace the \N placeholder with real nulls, split the comma-separated genres into an
# array, and attach the ratings. The left join keeps titles that have no rating.
df_join = (
    df_titles_sample
    .na.replace('\\N', None)
    .withColumn('genres', f.split('genres', ','))
    .join(df_ratings, on='tconst', how='left')
)

In [20]:
df_join.show()

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+-------------+--------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|averageRating|numVotes|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+-------------+--------+
| tt0000030|    Short|  Rough Sea At Dover|  Rough Sea At Dover|      0|     1895|   null|             1|[Documentary, Short]|          5.2|     800|
| tt0000041|    Short|   Bataille De Neige|   Bataille De Neige|      0|     1897|   null|             1|[Comedy, Document...|          6.8|    1730|
| tt0000048|    Short| The Boxing Kangaroo| The Boxing Kangaroo|      0|     1896|   null|          null|             [Short]|          4.9|     176|
| tt0000061|    Short|       Dancing Girls|       Dancing Girls|      0|     1896|   null|          

In [21]:
# One row per title type and one column per genre; each cell is the mean averageRating
# rounded to 2 decimals. Cells with no data are filled with 0, so 0.0 in the result
# means "no rated titles for this combination", not a rating of zero.
df_final = (
    df_join
    .withColumn('genres', f.explode('genres'))
    .groupBy('titleType')
    .pivot('genres')
    .agg(f.round(f.mean('averageRating'), 2))
    .na.fill(0)
)

In [22]:
df_final.show()

+------------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+-----+---------+--------+----+-------+
|   titleType|Action|Adult|Adventure|Animation|Biography|Comedy|Crime|Documentary|Drama|Family|Fantasy|Film-Noir|Game-Show|History|Horror|Music|Musical|Mystery|News|Reality-TV|Romance|Sci-Fi|Short|Sport|Talk-Show|Thriller| War|Western|
+------------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+-----+---------+--------+----+-------+
|   Tvepisode|  7.45| 6.49|     7.36|     7.28|     7.41|  7.35|  7.5|       7.51| 7.56|  7.33|    7.5|      0.0|     7.08|   7.65|  7.36| 7.09|    7.4|   7.53|6.88|      7.01|   7.43|  7.43| 7.04| 7.15|     6.93|    7.43| 7.8|    7.7|
|       Video|  5.98| 6.49|     5.86|     6.34|     7.29

In [23]:
df_final.explain('formatted')

== Physical Plan ==
AdaptiveSparkPlan (21)
+- HashAggregate (20)
   +- Exchange (19)
      +- HashAggregate (18)
         +- HashAggregate (17)
            +- Exchange (16)
               +- HashAggregate (15)
                  +- Project (14)
                     +- Generate (13)
                        +- Project (12)
                           +- SortMergeJoin LeftOuter (11)
                              :- Sort (6)
                              :  +- Exchange (5)
                              :     +- Project (4)
                              :        +- Filter (3)
                              :           +- Sample (2)
                              :              +- Scan csv  (1)
                              +- Sort (10)
                                 +- Exchange (9)
                                    +- Filter (8)
                                       +- Scan csv  (7)


(1) Scan csv 
Output [3]: [tconst#16, titleType#17, genres#24]
Batched: false
Location: InMemoryFileIndex 

In [None]:
%%time

df_final.show()

+------------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+-----+---------+--------+----+-------+
|   titleType|Action|Adult|Adventure|Animation|Biography|Comedy|Crime|Documentary|Drama|Family|Fantasy|Film-Noir|Game-Show|History|Horror|Music|Musical|Mystery|News|Reality-TV|Romance|Sci-Fi|Short|Sport|Talk-Show|Thriller| War|Western|
+------------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+-----+---------+--------+----+-------+
|     Tvmovie|  5.25|  6.5|     5.89|     6.78|     6.77|  6.47| 6.29|       7.08| 6.51|  6.29|   6.45|      0.0|      0.0|    7.0|  5.65| 7.38|   7.14|   6.03|6.97|      7.58|   6.07|  5.44|  0.0| 6.98|      0.0|    5.67|7.02|   6.27|
|    Tvseries|  6.95| 6.15|     6.95|     6.93|     7.19

In [None]:
# Mark df_final for caching. cache() is lazy, so count() is run as an action
# to actually compute the data and store it in memory.
df_final.cache()
df_final.count()

10

In [None]:
%%time

df_final.show()

+------------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+-----+---------+--------+----+-------+
|   titleType|Action|Adult|Adventure|Animation|Biography|Comedy|Crime|Documentary|Drama|Family|Fantasy|Film-Noir|Game-Show|History|Horror|Music|Musical|Mystery|News|Reality-TV|Romance|Sci-Fi|Short|Sport|Talk-Show|Thriller| War|Western|
+------------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+-----+---------+--------+----+-------+
|     Tvmovie|  5.25|  6.5|     5.89|     6.78|     6.77|  6.47| 6.29|       7.08| 6.51|  6.29|   6.45|      0.0|      0.0|    7.0|  5.65| 7.38|   7.14|   6.03|6.97|      7.58|   6.07|  5.44|  0.0| 6.98|      0.0|    5.67|7.02|   6.27|
|    Tvseries|  6.95| 6.15|     6.95|     6.93|     7.19

In [None]:
df_final.unpersist()

DataFrame[titleType: string, Action: double, Adult: double, Adventure: double, Animation: double, Biography: double, Comedy: double, Crime: double, Documentary: double, Drama: double, Family: double, Fantasy: double, Film-Noir: double, Game-Show: double, History: double, Horror: double, Music: double, Musical: double, Mystery: double, News: double, Reality-TV: double, Romance: double, Sci-Fi: double, Short: double, Sport: double, Talk-Show: double, Thriller: double, War: double, Western: double]

In [None]:
%%time

df_final.show()

+------------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+-----+---------+--------+----+-------+
|   titleType|Action|Adult|Adventure|Animation|Biography|Comedy|Crime|Documentary|Drama|Family|Fantasy|Film-Noir|Game-Show|History|Horror|Music|Musical|Mystery|News|Reality-TV|Romance|Sci-Fi|Short|Sport|Talk-Show|Thriller| War|Western|
+------------+------+-----+---------+---------+---------+------+-----+-----------+-----+------+-------+---------+---------+-------+------+-----+-------+-------+----+----------+-------+------+-----+-----+---------+--------+----+-------+
|     Tvmovie|  5.25|  6.5|     5.89|     6.78|     6.77|  6.47| 6.29|       7.08| 6.51|  6.29|   6.45|      0.0|      0.0|    7.0|  5.65| 7.38|   7.14|   6.03|6.97|      7.58|   6.07|  5.44|  0.0| 6.98|      0.0|    5.67|7.02|   6.27|
|    Tvseries|  6.95| 6.15|     6.95|     6.93|     7.19

In [None]:
%%time
df_join.count()

CPU times: user 78.5 ms, sys: 10.8 ms, total: 89.3 ms
Wall time: 14.2 s


831093

In [None]:
%%time
# Cache the joined data; the count() is the action that computes and stores it,
# so this timing includes the cost of populating the cache.
df_join.cache()
df_join.count()

CPU times: user 178 ms, sys: 23.2 ms, total: 201 ms
Wall time: 33 s


In [None]:
%%time
df_join.count()

CPU times: user 3.89 ms, sys: 81 µs, total: 3.97 ms
Wall time: 546 ms


831093

## **Removing persisted data**

In [None]:
spark

In [None]:
spark.catalog.clearCache()



---



# Class 03 - **Strategies to Partition Data**

In [None]:
imdb_path = '/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_basics.csv'

In [None]:
# Read the titles file again with the same reader settings (tab-separated, header row).
options_dict = dict(sep='\t', header='True')

df_titles = spark.read.format('csv').options(**options_dict).load(imdb_path)

df_titles.show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [None]:
df_ratings.show()

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1879|
|tt0000002|          5.9|     248|
|tt0000003|          6.5|    1652|
|tt0000004|          5.8|     161|
|tt0000005|          6.2|    2476|
|tt0000006|          5.2|     165|
|tt0000007|          5.4|     771|
|tt0000008|          5.4|    2017|
|tt0000009|          5.3|     193|
|tt0000010|          6.9|    6784|
|tt0000011|          5.3|     345|
|tt0000012|          7.4|   11652|
|tt0000013|          5.7|    1798|
|tt0000014|          7.1|    5222|
|tt0000015|          6.2|     994|
|tt0000016|          5.9|    1407|
|tt0000017|          4.6|     306|
|tt0000018|          5.3|     562|
|tt0000019|          5.2|      30|
|tt0000020|          4.8|     332|
+---------+-------------+--------+
only showing top 20 rows



## **Bucketing**

In [24]:
# Save the titles as a Parquet table split into 5 buckets by the join key `tconst`.
# mode('overwrite') makes the cell re-runnable: with the default save mode,
# saveAsTable raises an error once the table already exists.
df_titles.write.format('parquet').mode('overwrite').bucketBy(5,'tconst').saveAsTable('title_basics')

In [None]:
# Save the ratings as a Parquet table with the same bucketing (5 buckets on `tconst`)
# as the titles table. mode('overwrite') makes the cell re-runnable: with the default
# save mode, saveAsTable raises an error once the table already exists.
df_ratings.write.format('parquet').mode('overwrite').bucketBy(5,'tconst').saveAsTable('title_ratings')

In [None]:
df_titles_bucket = spark.sql('SELECT * FROM  title_basics')

In [None]:
df_ratings_bucket = spark.sql('SELECT * FROM title_ratings')

In [None]:
%%time 

df_titles.join(df_ratings,'tconst').count()

CPU times: user 114 ms, sys: 14.4 ms, total: 128 ms
Wall time: 17.2 s


1227178

In [None]:
%time

df_titles_bucket.join(df_ratings_bucket,'tconst').count()

CPU times: user 9 µs, sys: 1 µs, total: 10 µs
Wall time: 14.8 µs


1227178

## **Partitioning by Column**

In [None]:
df_titles.filter('titleType = "short"').explain("formatted")

== Physical Plan ==
* Filter (2)
+- Scan csv  (1)


(1) Scan csv 
Output [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Batched: false
Location: InMemoryFileIndex [file:/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_basics.csv]
PushedFilters: [IsNotNull(titleType), EqualTo(titleType,short)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genres:string>

(2) Filter [codegen id : 1]
Input [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Condition : (isnotnull(titleType#17) AND (titleType#17 = short))




In [None]:
# Write the titles as Parquet, one sub-directory per titleType value.
# - mode('overwrite') makes the cell re-runnable; the default mode fails when the
#   target path already exists.
# - The absolute path matches the location the data is read back from afterwards,
#   so the write no longer depends on the current working directory.
(
    df_titles
    .write
    .format('parquet')
    .mode('overwrite')
    .partitionBy('titleType')
    .save('/content/df_titles_partitioned')
)

In [None]:
df_titles_partitions = spark.read.parquet('/content/df_titles_partitioned')

In [None]:
df_titles_partitions.filter('titleType = "short"').explain("formatted")

== Physical Plan ==
* ColumnarToRow (2)
+- Scan parquet  (1)


(1) Scan parquet 
Output [9]: [tconst#318, primaryTitle#319, originalTitle#320, isAdult#321, startYear#322, endYear#323, runtimeMinutes#324, genres#325, titleType#326]
Batched: true
Location: InMemoryFileIndex [file:/content/df_titles_partitioned]
PartitionFilters: [isnotnull(titleType#326), (titleType#326 = short)]
ReadSchema: struct<tconst:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genres:string>

(2) ColumnarToRow [codegen id : 1]
Input [9]: [tconst#318, primaryTitle#319, originalTitle#320, isAdult#321, startYear#322, endYear#323, runtimeMinutes#324, genres#325, titleType#326]




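The filter operation does not appear as a separate step in the execution plan: the condition is listed under `PartitionFilters` in the Parquet scan, so only the files of the matching partition are read. In the previous plan, for the CSV data frame without column partitions, a separate `Filter` step was necessary.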



---



# Class 04 - **Repartitioning  Dataframes**

In [None]:
df_titles.rdd.getNumPartitions()

6

In [None]:
df_titles.repartition(12).rdd.getNumPartitions()

12

In [None]:
df_titles.coalesce(12).rdd.getNumPartitions()

6

The `coalesce` method cannot be used to increase the number of partitions: asking for 12 leaves the data frame with its original 6.

In [None]:
df_titles.repartition(12).explain('formatted')

== Physical Plan ==
AdaptiveSparkPlan (3)
+- Exchange (2)
   +- Scan csv  (1)


(1) Scan csv 
Output [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Batched: false
Location: InMemoryFileIndex [file:/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_basics.csv]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genres:string>

(2) Exchange
Input [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Arguments: RoundRobinPartitioning(12), REPARTITION_BY_NUM, [id=#645]

(3) AdaptiveSparkPlan
Output [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Arguments: isFinalPlan=false




In [None]:
df_titles.coalesce(6).explain('formatted')

== Physical Plan ==
Coalesce (2)
+- Scan csv  (1)


(1) Scan csv 
Output [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Batched: false
Location: InMemoryFileIndex [file:/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_basics.csv]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genres:string>

(2) Coalesce
Input [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Arguments: 6




The function `coalesce` does not shuffle the data: observe that the execution plan does not contain an `Exchange` step. When the goal is to reduce the number of partitions, using `coalesce` instead of `repartition` therefore performs better.



---



# Class 05 - Determining which JOIN to use

- **Broadcast Hash Join (BHJ) :** The strategy is to send a complete copy of the smaller DataFrame to each of the executors, so that the larger DataFrame does not have to be shuffled. Spark chooses this method automatically when one side is smaller than `spark.sql.autoBroadcastJoinThreshold`, or when a broadcast hint is given.
- **Sort Merge Join (SMJ) :** It is Spark's default algorithm. In this case data is sent between the executors via shuffle and then sorted so that the data is partitioned correctly and in the same order. 
- **Shuffle Hash Join (SHJ) :** It is an algorithm that also uses shuffle, but it compensates this operation by using a hash map that excludes the need to sort the data. The only condition is that one of the DataFrames is significantly smaller than the other, but not by as much as the BHJ.

In [None]:
df_titles.show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [None]:
df_ratings.show()

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|          5.7|    1879|
|tt0000002|          5.9|     248|
|tt0000003|          6.5|    1652|
|tt0000004|          5.8|     161|
|tt0000005|          6.2|    2476|
|tt0000006|          5.2|     165|
|tt0000007|          5.4|     771|
|tt0000008|          5.4|    2017|
|tt0000009|          5.3|     193|
|tt0000010|          6.9|    6784|
|tt0000011|          5.3|     345|
|tt0000012|          7.4|   11652|
|tt0000013|          5.7|    1798|
|tt0000014|          7.1|    5222|
|tt0000015|          6.2|     994|
|tt0000016|          5.9|    1407|
|tt0000017|          4.6|     306|
|tt0000018|          5.3|     562|
|tt0000019|          5.2|      30|
|tt0000020|          4.8|     332|
+---------+-------------+--------+
only showing top 20 rows



In [None]:
df_titles.join(df_ratings.hint('merge'), 'tconst').explain('formatted')

== Physical Plan ==
AdaptiveSparkPlan (11)
+- Project (10)
   +- SortMergeJoin Inner (9)
      :- Sort (4)
      :  +- Exchange (3)
      :     +- Filter (2)
      :        +- Scan csv  (1)
      +- Sort (8)
         +- Exchange (7)
            +- Filter (6)
               +- Scan csv  (5)


(1) Scan csv 
Output [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Batched: false
Location: InMemoryFileIndex [file:/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_basics.csv]
PushedFilters: [IsNotNull(tconst)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genres:string>

(2) Filter
Input [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Condition : isnotnull(tconst#16)

(3) Exchange
Input [9]: [tcons

The method exchanges data between the executors, sorts it, and finally performs the merge. This strategy is very expensive.

In [None]:
df_titles.join(df_ratings.hint('shuffle_hash'), 'tconst').explain('formatted')

== Physical Plan ==
AdaptiveSparkPlan (9)
+- Project (8)
   +- ShuffledHashJoin Inner BuildRight (7)
      :- Exchange (3)
      :  +- Filter (2)
      :     +- Scan csv  (1)
      +- Exchange (6)
         +- Filter (5)
            +- Scan csv  (4)


(1) Scan csv 
Output [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Batched: false
Location: InMemoryFileIndex [file:/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_basics.csv]
PushedFilters: [IsNotNull(tconst)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genres:string>

(2) Filter
Input [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Condition : isnotnull(tconst#16)

(3) Exchange
Input [9]: [tconst#16, titleType#17, primaryTitle#18, origi

Observe that the sort operation does not appear in this strategy. For this reason, this algorithm can be more efficient than SMJ.

In [None]:
df_titles.join(df_ratings.hint('broadcast'), 'tconst').explain('formatted')

== Physical Plan ==
AdaptiveSparkPlan (8)
+- Project (7)
   +- BroadcastHashJoin Inner BuildRight (6)
      :- Filter (2)
      :  +- Scan csv  (1)
      +- BroadcastExchange (5)
         +- Filter (4)
            +- Scan csv  (3)


(1) Scan csv 
Output [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Batched: false
Location: InMemoryFileIndex [file:/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_basics.csv]
PushedFilters: [IsNotNull(tconst)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genres:string>

(2) Filter
Input [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Condition : isnotnull(tconst#16)

(3) Scan csv 
Output [3]: [tconst#114, averageRating#115, numVotes#116]
Batched: false
Loc

This is the most efficient execution plan. Operations like sort and shuffle `Exchange` do not appear in this case; the only data movement is the `BroadcastExchange` of the smaller DataFrame.

In [None]:
df_titles.join(f.broadcast(df_ratings), 'tconst').explain('formatted')

== Physical Plan ==
AdaptiveSparkPlan (8)
+- Project (7)
   +- BroadcastHashJoin Inner BuildRight (6)
      :- Filter (2)
      :  +- Scan csv  (1)
      +- BroadcastExchange (5)
         +- Filter (4)
            +- Scan csv  (3)


(1) Scan csv 
Output [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Batched: false
Location: InMemoryFileIndex [file:/content/drive/MyDrive/igti_bootcamps/eng_dados_cloud/mod3/title_basics.csv]
PushedFilters: [IsNotNull(tconst)]
ReadSchema: struct<tconst:string,titleType:string,primaryTitle:string,originalTitle:string,isAdult:string,startYear:string,endYear:string,runtimeMinutes:string,genres:string>

(2) Filter
Input [9]: [tconst#16, titleType#17, primaryTitle#18, originalTitle#19, isAdult#20, startYear#21, endYear#22, runtimeMinutes#23, genres#24]
Condition : isnotnull(tconst#16)

(3) Scan csv 
Output [3]: [tconst#114, averageRating#115, numVotes#116]
Batched: false
Loc

In [2]:
import time
import numpy as np

### **Sort Merge Join**

In [None]:
# Run the sort-merge join 100 times and report the mean and standard deviation of the
# elapsed time in seconds. time.perf_counter() is used instead of time.time() because
# it is monotonic and intended for measuring intervals, so system clock adjustments
# cannot distort a sample.
# NOTE: at roughly 25 s per run, this cell takes on the order of 40 minutes.
times = []
for i in range(100) :
  start = time.perf_counter()
  df_titles.join(df_ratings.hint('merge'),'tconst').count()
  end = time.perf_counter()
  times.append(end - start)

print('Average :', np.mean(times), '\n', 'STD :', np.std(times))

Average : 24.74466092348099 
 STD : 1.147800849570479


### **Shuffle Hash Join**

In [None]:
# Run the shuffle-hash join 100 times and report the mean and standard deviation of the
# elapsed time in seconds. time.perf_counter() is used instead of time.time() because
# it is monotonic and intended for measuring intervals, so system clock adjustments
# cannot distort a sample.
# NOTE: at roughly 20 s per run, this cell takes on the order of 30 minutes.
times = []
for i in range(100) :
  start = time.perf_counter()
  df_titles.join(df_ratings.hint('shuffle_hash'),'tconst').count()
  end = time.perf_counter()
  times.append(end - start)

print('Average :', np.mean(times), '\n', 'STD :', np.std(times))

Average : 20.06209766149521 
 STD : 1.2726082816190447


### **Broadcast Join**

In [25]:
# Run the broadcast join 100 times and report the mean and standard deviation of the
# elapsed time in seconds. time.perf_counter() is used instead of time.time() because
# it is monotonic and intended for measuring intervals, so system clock adjustments
# cannot distort a sample.
# NOTE: at roughly 17 s per run, this cell takes on the order of 30 minutes.
times = []
for i in range(100) :
  start = time.perf_counter()
  df_titles.join(df_ratings.hint('broadcast'),'tconst').count()
  end = time.perf_counter()
  times.append(end - start)

print('Average :', np.mean(times), '\n', 'STD :', np.std(times))

Average : 17.349860653877258 
 STD : 0.5105059343897671
