#### Spark Join Transformation

#### Inner Join

In [0]:
%run "../includes/configuration"

In [0]:
# Read the movies dataset from the silver folder and keep only rows whose
# year_release_date is 2007.
movie_df = (
    spark.read
    .parquet(f"{silver_folder_path}/movies")
    .where("year_release_date = 2007")
)

In [0]:
# Read the movie <-> production-country link dataset from the silver folder.
production_country_df = (
    spark.read
    .parquet(f"{silver_folder_path}/productions_countries")
)

In [0]:
# Read the countries dataset from the silver folder.
country_df = (
    spark.read
    .parquet(f"{silver_folder_path}/countries")
)

In [0]:
# Inspect the filtered movies before joining.
display(movie_df)

In [0]:
# Inspect the two DataFrames that the movies will be joined against.
display(production_country_df)
display(country_df)

In [0]:
# Inner join: keep only movies that have a matching row in production_country_df.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        on=movie_df.movie_id == production_country_df.movie_id,
        how="inner",
    )
    .select(
        movie_df.movie_id,
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Inner join: attach the country name to every movie/country pair that has a
# matching row in country_df.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        on=movie_production_country_df.country_id == country_df.country_id,
        how="inner",
    )
    .select(
        movie_production_country_df.movie_id,
        movie_production_country_df.title,
        movie_production_country_df.budget,
        country_df.country_id,
        country_df.country_name,
    )
)

In [0]:
# Show the inner-join result; every column is displayed as-is.
display(movie_country_df)

### Outer Join

##### Left Outer Join

In [0]:
# Left outer join: keep every movie; country_id is NULL when the movie has no
# matching row in production_country_df.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        on=movie_df.movie_id == production_country_df.movie_id,
        how="left",
    )
    .select(
        movie_df.movie_id,
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Left outer join: keep every row of movie_production_country_df; country_name
# is NULL when there is no matching row in country_df.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        on=movie_production_country_df.country_id == country_df.country_id,
        how="left",
    )
    .select(
        movie_production_country_df.movie_id,
        movie_production_country_df.title,
        movie_production_country_df.budget,
        movie_production_country_df.country_id,
        country_df.country_name,
    )
)

In [0]:
# Show the 8 movies that are not associated with any country.
display(
    movie_country_df.where("country_id IS NULL")
)

##### Right Outer Join

In [0]:
# Right outer join: keep every row of production_country_df; the movie columns
# are NULL when there is no matching row in movie_df.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        on=movie_df.movie_id == production_country_df.movie_id,
        how="right",
    )
    .select(
        movie_df.movie_id,
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Right outer join: keep every country, including those with no matching row
# on the left side (their movie columns are NULL).
# country_id is taken from country_df, the preserved side of the join: the
# left-side country_id is NULL for exactly the countries that have no match,
# which would hide the key of the rows this join is meant to surface.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        on=movie_production_country_df.country_id == country_df.country_id,
        how="right",
    )
    .select(
        movie_production_country_df.movie_id,
        movie_production_country_df.title,
        movie_production_country_df.budget,
        country_df.country_id,
        country_df.country_name,
    )
)

In [0]:
# Show the 86 countries that are not associated with any movie.
display(
    movie_country_df
    .where("movie_id IS NULL")
    .select("country_id", "country_name")
    .distinct()
)

##### Full Outer Join

In [0]:
# Full outer join: keep unmatched rows from both movie_df and
# production_country_df; columns from the missing side are NULL.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        on=movie_df.movie_id == production_country_df.movie_id,
        how="full",
    )
    .select(
        movie_df.movie_id,
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Full outer join: keep unmatched rows from both sides; columns from the
# missing side are NULL.
# NOTE(review): country_id is read from the left side, so rows that exist only
# in country_df show a NULL country_id next to their country_name - confirm
# this is the intended presentation.
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        on=movie_production_country_df.country_id == country_df.country_id,
        how="full",
    )
    .select(
        movie_production_country_df.movie_id,
        movie_production_country_df.title,
        movie_production_country_df.budget,
        movie_production_country_df.country_id,
        country_df.country_name,
    )
)

In [0]:
# Show all movies and countries, regardless of whether a movie is associated
# with the country.
display(
    movie_country_df.dropDuplicates()
)

#### Semi Join - Retorna solamente las filas del lado izquierdo que tienen correspondencia en el lado derecho (solo con las columnas del lado izquierdo), muy parecido a un EXISTS.

In [0]:
# Left outer join: keep every movie; country_id is NULL when the movie has no
# matching row in production_country_df.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        on=movie_df.movie_id == production_country_df.movie_id,
        how="left",
    )
    .select(
        movie_df.movie_id,
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Semi join: keep the left-side rows whose country_id has a match in country_df.
# Only left-side columns are available in the result, so country_df.country_name
# cannot be selected here (adding it raises an error).
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        on=movie_production_country_df.country_id == country_df.country_id,
        how="semi",
    )
    .select(
        movie_production_country_df.movie_id,
        movie_production_country_df.title,
        movie_production_country_df.budget,
        movie_production_country_df.country_id,
    )
)

In [0]:
# Show the rows kept by the semi join.
display(movie_country_df)

#### Anti Join - Retorna solamente las filas del lado izquierdo que no tienen correspondencia en el lado derecho (solo con las columnas del lado izquierdo), muy parecido a un NOT EXISTS.

In [0]:
# Left outer join: keep every movie; country_id is NULL when the movie has no
# matching row in production_country_df.
movie_production_country_df = (
    movie_df
    .join(
        production_country_df,
        on=movie_df.movie_id == production_country_df.movie_id,
        how="left",
    )
    .select(
        movie_df.movie_id,
        movie_df.title,
        movie_df.budget,
        production_country_df.country_id,
    )
)

In [0]:
# Anti join: keep the left-side rows whose country_id has NO match in country_df.
# Only left-side columns are available in the result, so country_df.country_name
# cannot be selected here (adding it raises an error).
movie_country_df = (
    movie_production_country_df
    .join(
        country_df,
        on=movie_production_country_df.country_id == country_df.country_id,
        how="anti",
    )
    .select(
        movie_production_country_df.movie_id,
        movie_production_country_df.title,
        movie_production_country_df.budget,
        movie_production_country_df.country_id,
    )
)

In [0]:
# Show the rows kept by the anti join.
display(movie_country_df)

In [0]:
#### Cross Join - Generates a Cartesian product of the two tables

In [0]:
# Cross join: pair every row of movie_df with every row of country_df.
movie_country_df = movie_df.crossJoin(country_df)

In [0]:
# Count each DataFrame exactly once: every count() call triggers a Spark job,
# so the results are stored and reused instead of being recomputed.
cross_join_count = movie_country_df.count()
movie_count = movie_df.count()
country_count = country_df.count()

display(cross_join_count)
display(movie_count)
display(country_count)

# A Cartesian product contains one row per (movie, country) pair, so its size
# must equal the product of the two input sizes.
expected_cross_join_count = movie_count * country_count
assert cross_join_count == expected_cross_join_count, (
    f"cross join has {cross_join_count} rows, expected {expected_cross_join_count}"
)
expected_cross_join_count