<a href="https://colab.research.google.com/github/adilsonalbino/ferramentasetl/blob/main/10_Fazendo_JOINS_com_SQL_e_PYSPARK.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 10 - Fazendo JOINS com SQL e PySpark

by Adilson Albino

---



In [None]:
#Instalando pyspark no ambiente
!pip install pyspark

#Importando as bibliotecas
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

#Criando uma sparksession
spark = SparkSession.builder\
.appName("Spark Engine")\
.getOrCreate()

In [28]:
# Locations of the two CSV inputs on Google Drive.
# NOTE(review): these paths assume Drive is already mounted at /content/drive;
# no cell visible here performs the mount - TODO confirm.
# The directory name ("PySaprk e SQL") is kept exactly as it exists on disk.
file_path_modelo = "/content/drive/MyDrive/Colab Notebooks/PySaprk e SQL/DATASET_CARROS/modelo_carro.csv"
# The stray trailing "/" after the file name was removed so both paths follow
# the same form and the path no longer looks like a directory.
file_path_marcas = "/content/drive/MyDrive/Colab Notebooks/PySaprk e SQL/DATASET_CARROS/marca_carro.csv"

In [46]:
# Read both CSV files with the same reader settings: the first line is the
# header and Spark infers each column's type from the data.
csv_options = {"header": True, "inferSchema": True}
df_carros = spark.read.csv(file_path_modelo, **csv_options)
df_marcas = spark.read.csv(file_path_marcas, **csv_options)

# Preview the first five rows of each DataFrame (cars first, then brands).
for preview_df in (df_carros, df_marcas):
    preview_df.show(5)

+--------+------------+---------+---------+
|id_carro|modelo_carro|    preco|cod_marca|
+--------+------------+---------+---------+
|       1|      Avalon|$78401.95|       54|
|       2|         RDX|$95987.38|        1|
|       3|        Golf|$61274.55|       55|
|       4|          EX|$84981.12|       23|
|       5|      Escort|$77466.89|       17|
+--------+------------+---------+---------+
only showing top 5 rows

+------------+---------+
| marca_carro|cod_marca|
+------------+---------+
|       Acura|        1|
|Aston Martin|        2|
|        Audi|        3|
|      Austin|        4|
|         BMW|        5|
+------------+---------+
only showing top 5 rows



# Realizando JOINS com SQL

In [None]:
# Register each DataFrame as a temporary view so the spark.sql queries below
# can refer to it by name ("carros" and "marcas").
for view_name, source_df in (("carros", df_carros), ("marcas", df_marcas)):
    source_df.createOrReplaceTempView(view_name)

In [None]:
# Implicit join: both views are listed in FROM and matched on cod_marca in the
# WHERE clause, so only cars that have a matching brand are returned.
where_join_query = """
  SELECT
    a.id_carro,
    a.modelo_carro,
    a.preco,
    b.cod_marca,
    b.marca_carro
  FROM carros a, marcas b
  WHERE a.cod_marca = b.cod_marca
"""
df_carros_marcas_sql = spark.sql(where_join_query)
df_carros_marcas_sql.show(5)

+--------+------------+---------+---------+-----------+
|id_carro|modelo_carro|    preco|cod_marca|marca_carro|
+--------+------------+---------+---------+-----------+
|       1|      Avalon|$78401.95|       54|     Toyota|
|       2|         RDX|$95987.38|        1|      Acura|
|       3|        Golf|$61274.55|       55| Volkswagen|
|       4|          EX|$84981.12|       23|   Infiniti|
|       5|      Escort|$77466.89|       17|       Ford|
+--------+------------+---------+---------+-----------+
only showing top 5 rows



In [None]:
# Explicit INNER JOIN: same matching rule as the WHERE-based query, written
# with JOIN ... ON so the join condition is separate from any row filtering.
inner_join_query = """
  SELECT
    a.id_carro,
    a.modelo_carro,
    a.preco,
    b.cod_marca,
    b.marca_carro
  FROM carros a
  INNER JOIN marcas b
  ON a.cod_marca = b.cod_marca
"""
df_carros_marcas_sql = spark.sql(inner_join_query)
df_carros_marcas_sql.show(5)


+--------+------------+---------+---------+-----------+
|id_carro|modelo_carro|    preco|cod_marca|marca_carro|
+--------+------------+---------+---------+-----------+
|       1|      Avalon|$78401.95|       54|     Toyota|
|       2|         RDX|$95987.38|        1|      Acura|
|       3|        Golf|$61274.55|       55| Volkswagen|
|       4|          EX|$84981.12|       23|   Infiniti|
|       5|      Escort|$77466.89|       17|       Ford|
+--------+------------+---------+---------+-----------+
only showing top 5 rows



In [None]:
# Bring every column from carros (a.*) plus a single column, marca_carro,
# from marcas.
all_car_columns_query = """
  SELECT
    a.*,
    b.marca_carro
  FROM carros a
  INNER JOIN marcas b
  ON a.cod_marca = b.cod_marca
"""
df_carros_marcas_sql = spark.sql(all_car_columns_query)
df_carros_marcas_sql.show(5)

+--------+------------+---------+---------+-----------+
|id_carro|modelo_carro|    preco|cod_marca|marca_carro|
+--------+------------+---------+---------+-----------+
|       1|      Avalon|$78401.95|       54|     Toyota|
|       2|         RDX|$95987.38|        1|      Acura|
|       3|        Golf|$61274.55|       55| Volkswagen|
|       4|          EX|$84981.12|       23|   Infiniti|
|       5|      Escort|$77466.89|       17|       Ford|
+--------+------------+---------+---------+-----------+
only showing top 5 rows



In [None]:
# RIGHT JOIN: marcas is the left table and carros the right one, so every row
# of carros is kept; the b.* columns are NULL for a car with no matching brand.
right_join_query = """
  SELECT
    a.id_carro,
    a.modelo_carro,
    a.preco,
    b.cod_marca,
    b.marca_carro
  FROM marcas b
  RIGHT JOIN carros a
  ON a.cod_marca = b.cod_marca
"""
df_carros_marcas_sql = spark.sql(right_join_query)
df_carros_marcas_sql.show(5)

+--------+------------+---------+---------+-----------+
|id_carro|modelo_carro|    preco|cod_marca|marca_carro|
+--------+------------+---------+---------+-----------+
|       1|      Avalon|$78401.95|       54|     Toyota|
|       2|         RDX|$95987.38|        1|      Acura|
|       3|        Golf|$61274.55|       55| Volkswagen|
|       4|          EX|$84981.12|       23|   Infiniti|
|       5|      Escort|$77466.89|       17|       Ford|
+--------+------------+---------+---------+-----------+
only showing top 5 rows



In [None]:
# LEFT JOIN: marcas is the left table, so every brand is kept; the a.* columns
# are NULL for a brand that has no cars.
left_join_query = """
  SELECT
    a.id_carro,
    a.modelo_carro,
    a.preco,
    b.cod_marca,
    b.marca_carro
  FROM marcas b
  LEFT JOIN carros a
  ON a.cod_marca = b.cod_marca
"""
df_carros_marcas_sql = spark.sql(left_join_query)
df_carros_marcas_sql.show(5)

+--------+------------+---------+---------+-----------+
|id_carro|modelo_carro|    preco|cod_marca|marca_carro|
+--------+------------+---------+---------+-----------+
|     997|          CL|$81133.01|        1|      Acura|
|     852|     Integra|$54434.81|        1|      Acura|
|     646|         SLX|$76518.37|        1|      Acura|
|     638|      Legend|$47400.94|        1|      Acura|
|     601|         NSX|$63519.05|        1|      Acura|
+--------+------------+---------+---------+-----------+
only showing top 5 rows



# Realizando JOINS com PySpark

In [None]:
# Inner join with the DataFrame API: keep only cars whose cod_marca exists in
# the brands DataFrame. Because the join uses a column expression, the result
# carries cod_marca twice (once from each side).
same_brand = df_carros["cod_marca"] == df_marcas["cod_marca"]
df_innerjoin_spark = df_carros.join(df_marcas, on=same_brand, how="inner")
df_innerjoin_spark.show(5)

+--------+------------+---------+---------+-----------+---------+
|id_carro|modelo_carro|    preco|cod_marca|marca_carro|cod_marca|
+--------+------------+---------+---------+-----------+---------+
|       1|      Avalon|$78401.95|       54|     Toyota|       54|
|       2|         RDX|$95987.38|        1|      Acura|        1|
|       3|        Golf|$61274.55|       55| Volkswagen|       55|
|       4|          EX|$84981.12|       23|   Infiniti|       23|
|       5|      Escort|$77466.89|       17|       Ford|       17|
+--------+------------+---------+---------+-----------+---------+
only showing top 5 rows



In [None]:
# Left join: every row of df_carros (the left side) is kept; the brand columns
# are null for a car with no matching cod_marca.
same_brand = df_carros["cod_marca"] == df_marcas["cod_marca"]
df_Leftjoin_spark = df_carros.join(df_marcas, on=same_brand, how="left")
df_Leftjoin_spark.show(5)

+--------+------------+---------+---------+-----------+---------+
|id_carro|modelo_carro|    preco|cod_marca|marca_carro|cod_marca|
+--------+------------+---------+---------+-----------+---------+
|       1|      Avalon|$78401.95|       54|     Toyota|       54|
|       2|         RDX|$95987.38|        1|      Acura|        1|
|       3|        Golf|$61274.55|       55| Volkswagen|       55|
|       4|          EX|$84981.12|       23|   Infiniti|       23|
|       5|      Escort|$77466.89|       17|       Ford|       17|
+--------+------------+---------+---------+-----------+---------+
only showing top 5 rows



In [None]:
# Right join: every row of df_marcas (the right side) is kept; the car columns
# are null for a brand that has no cars.
same_brand = df_carros["cod_marca"] == df_marcas["cod_marca"]
df_rightjoin_spark = df_carros.join(df_marcas, on=same_brand, how="right")
df_rightjoin_spark.show(5)

+--------+------------+---------+---------+-----------+---------+
|id_carro|modelo_carro|    preco|cod_marca|marca_carro|cod_marca|
+--------+------------+---------+---------+-----------+---------+
|     997|          CL|$81133.01|        1|      Acura|        1|
|     852|     Integra|$54434.81|        1|      Acura|        1|
|     646|         SLX|$76518.37|        1|      Acura|        1|
|     638|      Legend|$47400.94|        1|      Acura|        1|
|     601|         NSX|$63519.05|        1|      Acura|        1|
+--------+------------+---------+---------+-----------+---------+
only showing top 5 rows



In [None]:
# How to keep only some columns after a join:
# inner-join cars to brands, then project the columns explicitly.
same_brand = df_carros["cod_marca"] == df_marcas["cod_marca"]
cars_with_brands = df_carros.join(df_marcas, on=same_brand, how="inner")
df_innerjoin_spark = cars_with_brands.select(
    df_carros["*"],              # every column from the cars DataFrame
    df_marcas["marca_carro"],    # only the brand name from the brands DataFrame
)
df_innerjoin_spark.show(5)

+--------+------------+---------+---------+-----------+
|id_carro|modelo_carro|    preco|cod_marca|marca_carro|
+--------+------------+---------+---------+-----------+
|       1|      Avalon|$78401.95|       54|     Toyota|
|       2|         RDX|$95987.38|        1|      Acura|
|       3|        Golf|$61274.55|       55| Volkswagen|
|       4|          EX|$84981.12|       23|   Infiniti|
|       5|      Escort|$77466.89|       17|       Ford|
+--------+------------+---------+---------+-----------+
only showing top 5 rows

