# ETL Sistema Bancário - Projeto Integrador Grupo 05

## Configurando o ambiente:

In [1]:
!pip install pyspark
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar -xvzf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark



In [None]:
import os

import findspark

# Export the Java and Spark locations in one step; both must be present in the
# environment before findspark.init() is called.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/home/azureuser/spark-3.3.2-bin-hadoop3",
    }
)

# Let findspark prepare this kernel for PySpark using the variables set above.
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name

# Create (or reuse) a Spark session running in local mode.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("Read CSV")
    .getOrCreate()
)

## Importando os dados:

### Clientes:

In [9]:
# Column layout applied to every client CSV; declared once so the two reads
# below cannot drift apart.
clientes_schema = "id int, nome string, email string, data_cadastro timestamp, telefone string"

# Read every CSV in the clientes folder as headerless data.
clientes_todos = (
    spark.read
    .option("sep", ";")
    .schema(clientes_schema)
    .option("header", "false")
    .csv("dados/clientes/*.csv")
)

# clients-001.csv is treated as the only file carrying a header row, so its
# rows are dropped here and re-read with header=true below. Matching the exact
# file-name suffix avoids the unanchored regex (with unescaped dots) that
# rlike("clients-001.csv") used.
clientes_sem_header = clientes_todos.filter(
    ~input_file_name().endswith("/clients-001.csv")
)
clientes_com_header = (
    spark.read
    .option("sep", ";")
    .schema(clientes_schema)
    .option("header", "true")
    .csv("dados/clientes/clients-001.csv")
)

# Combine both parts; union() is positional, which is safe here because both
# reads share the same explicit schema.
clientes = clientes_com_header.union(clientes_sem_header)

# Print the row count explicitly: a bare count() in the middle of a cell runs
# a full Spark job but its result is never displayed.
print("clientes:", clientes.count())
clientes.show()

+---+--------------------+--------------------+-------------------+----------------+
| id|                nome|               email|      data_cadastro|        telefone|
+---+--------------------+--------------------+-------------------+----------------+
|641|Priscila Felix do...|priscila-felix-do...|2021-03-28 18:46:57|+55(30)2227-2428|
| 94|             idelmon|idelmon_94@gmail.com|2019-09-19 12:33:19|+55(29)3027-2026|
|584|Liliane soares da...|liliane-soares-da...|2021-02-10 19:15:30|+55(21)2024-2520|
|580|Fagner jose dos s...|fagner-jose-dos-s...|2021-02-07 01:47:04|+55(24)2624-2029|
| 21|               Cildo|  cildo_21@gmail.com|2019-07-30 11:40:10|+55(21)2222-2422|
|582|Nielton da Silva ...|nielton-da-silva-...|2021-02-09 00:11:22|+55(27)2028-2828|
|586|Armando Teles da ...|armando-teles-da-...|2021-02-12 15:20:14|+55(27)2720-2230|
|151|            Fabricio|fabricio_151@gmai...|2019-10-14 21:16:27|+55(20)2121-2326|
| 83|       Flavio junior|flavio-junior_83@...|2019-09-11 15:24:0

### Transaction-in:

In [None]:
# Column layout applied to every transaction-in CSV; declared once so the two
# reads below cannot drift apart.
transaction_in_schema = "id int, cliente_id int, valor double, data timestamp"

# Read every CSV in the transaction-in folder as headerless data.
transaction_in_todos = (
    spark.read
    .option("sep", ";")
    .schema(transaction_in_schema)
    .option("header", "false")
    .csv("/home/azureuser/transaction-in/*.csv")
)

# transaction-in-001.csv is treated as the only file carrying a header row, so
# its rows are dropped here and re-read with header=true below. Matching the
# exact file-name suffix avoids the unanchored regex (with unescaped dots)
# that rlike("transaction-in-001.csv") used.
transaction_in_sem_header = transaction_in_todos.filter(
    ~input_file_name().endswith("/transaction-in-001.csv")
)
transaction_in_com_header = (
    spark.read
    .option("sep", ";")
    .schema(transaction_in_schema)
    .option("header", "true")
    .csv("/home/azureuser/transaction-in/transaction-in-001.csv")
)

# Combine both parts; union() is positional, which is safe here because both
# reads share the same explicit schema.
transaction_in = transaction_in_com_header.union(transaction_in_sem_header)

# Print the row count explicitly: a bare count() in the middle of a cell runs
# a full Spark job but its result is never displayed.
print("transaction_in:", transaction_in.count())
transaction_in.show()

### Transaction-out:

In [None]:
# Column layout applied to every transaction-out CSV; declared once so the two
# reads below cannot drift apart.
transaction_out_schema = "id int, cliente_id int, valor double, data timestamp"

# Read every CSV in the transaction-out folder as headerless data.
transaction_out_todos = (
    spark.read
    .option("sep", ";")
    .schema(transaction_out_schema)
    .option("header", "false")
    .csv("/home/azureuser/transaction-out/*.csv")
)

# transaction-out-001.csv is treated as the only file carrying a header row,
# so its rows are dropped here and re-read with header=true below. Matching
# the exact file-name suffix avoids the unanchored regex (with unescaped dots)
# that rlike("transaction-out-001.csv") used.
transaction_out_sem_header = transaction_out_todos.filter(
    ~input_file_name().endswith("/transaction-out-001.csv")
)
transaction_out_com_header = (
    spark.read
    .option("sep", ";")
    .schema(transaction_out_schema)
    .option("header", "true")
    .csv("/home/azureuser/transaction-out/transaction-out-001.csv")
)

# Combine both parts; union() is positional, which is safe here because both
# reads share the same explicit schema.
transaction_out = transaction_out_com_header.union(transaction_out_sem_header)

# Print the row count explicitly: a bare count() in the middle of a cell runs
# a full Spark job but its result is never displayed.
print("transaction_out:", transaction_out.count())
transaction_out.show()