<a href="https://colab.research.google.com/github/adilsonalbino/SQL-e-PYSPARK/blob/main/09_Trabalhando_com_datas_no_sql_e_pyspark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 09 - Trabalhando com datas no SQL e PySpark

by Adilson Albino

In [None]:

#Instalando pyspark no ambiente
!pip install pyspark

#Importando as bibliotecas
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

#Create (or reuse) the Spark session used by the cells below.
#The session time zone is pinned to UTC so that the timestamps rendered by
#show() do not depend on the time zone of the machine running the notebook.
spark = (
    SparkSession.builder
    .appName("Spark Engine")
    .config("spark.sql.session.timeZone", "UTC")
    .getOrCreate()
)


In [None]:
#Build the test datasets; every value is still a plain string at this point
iso_samples = [
    "2021-07-05T10:00:00.000+0000",
    "2020-12-05T00:09:00.000+0000",
    "2017-02-23T16:23:00.000-0000",
]
day_first_samples = [
    "05/07/2021 10:41",
    "05/12/2020 14:50",
    "23/02/2017 23:22",
]

#toDF names the single string column "datas" in both DataFrames
df_datas_1 = spark.createDataFrame(iso_samples, "string").toDF("datas")
df_datas_2 = spark.createDataFrame(day_first_samples, "string").toDF("datas")

df_datas_1.show()
df_datas_2.show()


+--------------------+
|               datas|
+--------------------+
|2021-07-05T10:00:...|
|2020-12-05T00:09:...|
|2017-02-23T16:23:...|
+--------------------+

+----------------+
|           datas|
+----------------+
|05/07/2021 10:41|
|05/12/2020 14:50|
|23/02/2017 23:22|
+----------------+



# Utilizando SQL

In [None]:
#Register each DataFrame as a temporary view so spark.sql can query it by name
for view_name, frame in (("df_datas_1", df_datas_1), ("df_datas_2", df_datas_2)):
    frame.createOrReplaceTempView(view_name)

In [None]:
#Double-check the column types of both source DataFrames
for frame in (df_datas_1, df_datas_2):
    frame.printSchema()

root
 |-- datas: string (nullable = true)

root
 |-- datas: string (nullable = true)



In [None]:
#Convert the string column four ways: CAST to DATE / TIMESTAMP and the
#TO_DATE / TO_TIMESTAMP functions called without an explicit pattern
query_datas_1 = """
    SELECT
      datas,
      CAST(datas AS DATE) AS date,
      CAST(datas AS TIMESTAMP) AS timestamp,
      TO_DATE(datas) AS to_date,
      TO_TIMESTAMP(datas) AS to_timestamp
    FROM df_datas_1
"""
df_datas_1_sql = spark.sql(query_datas_1)

#truncate=False prints the full content of every cell
df_datas_1_sql.show(truncate=False)

+----------------------------+----------+-------------------+----------+-------------------+
|datas                       |date      |timestamp          |to_date   |to_timestamp       |
+----------------------------+----------+-------------------+----------+-------------------+
|2021-07-05T10:00:00.000+0000|2021-07-05|2021-07-05 10:00:00|2021-07-05|2021-07-05 10:00:00|
|2020-12-05T00:09:00.000+0000|2020-12-05|2020-12-05 00:09:00|2020-12-05|2020-12-05 00:09:00|
|2017-02-23T16:23:00.000-0000|2017-02-23|2017-02-23 16:23:00|2017-02-23|2017-02-23 16:23:00|
+----------------------------+----------+-------------------+----------+-------------------+



In [None]:
#Checking the data types again.
#spark.sql returned a new DataFrame, so the source df_datas_1 is still
#string-typed; the converted columns live on df_datas_1_sql, whose schema
#is printed right after it.
df_datas_1.printSchema()
df_datas_1_sql.printSchema()


root
 |-- datas: string (nullable = true)



In [None]:
#Convert the day-first date-and-time strings to a timestamp by passing the
#explicit pattern dd/MM/yyyy HH:mm (MM = month, mm = minutes).
#The pattern is a SQL string literal, hence the single quotes.
df_datas_2_sql = spark.sql("""
    SELECT
      datas,
      TO_TIMESTAMP(datas, 'dd/MM/yyyy HH:mm') AS timestamp
    FROM df_datas_2
""")

df_datas_2_sql.show(truncate=False)

+----------------+-------------------+
|datas           |TIMESATAMP         |
+----------------+-------------------+
|05/07/2021 10:41|2021-07-05 10:41:00|
|05/12/2020 14:50|2020-12-05 14:50:00|
|23/02/2017 23:22|2017-02-23 23:22:00|
+----------------+-------------------+



In [None]:
#Checking the data types again, this time on the DataFrame returned by the query
df_datas_2_sql.printSchema()

root
 |-- datas: string (nullable = true)
 |-- TIMESATAMP: timestamp (nullable = true)



# Utilizando PySpark

In [None]:
#DataFrame API version: add a date and a timestamp column parsed from the
#string column, with no explicit pattern
df_datas_1_spark = (
    df_datas_1
    .withColumn("date", to_date("datas"))
    .withColumn("timestamp", to_timestamp("datas"))
)
df_datas_1_spark.show(truncate=False)

+----------------------------+----------+-------------------+
|datas                       |date      |timestamp          |
+----------------------------+----------+-------------------+
|2021-07-05T10:00:00.000+0000|2021-07-05|2021-07-05 10:00:00|
|2020-12-05T00:09:00.000+0000|2020-12-05|2020-12-05 00:09:00|
|2017-02-23T16:23:00.000-0000|2017-02-23|2017-02-23 16:23:00|
+----------------------------+----------+-------------------+



In [None]:
#Print the schema to check the types of the columns added with withColumn
df_datas_1_spark.printSchema()

root
 |-- datas: string (nullable = true)
 |-- date: date (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [None]:
#These strings are day-first, so the same explicit pattern is handed to both
#to_date and to_timestamp
day_first_pattern = "dd/MM/yyyy HH:mm"
df_datas_2_spark = (
    df_datas_2
    .withColumn("date", to_date("datas", day_first_pattern))
    .withColumn("timestamp", to_timestamp("datas", day_first_pattern))
)
df_datas_2_spark.show(truncate=False)

+----------------+----------+-------------------+
|datas           |date      |timestamp          |
+----------------+----------+-------------------+
|05/07/2021 10:41|2021-07-05|2021-07-05 10:41:00|
|05/12/2020 14:50|2020-12-05|2020-12-05 14:50:00|
|23/02/2017 23:22|2017-02-23|2017-02-23 23:22:00|
+----------------+----------+-------------------+



In [None]:
#Print the schema to check the types of the columns added with withColumn
df_datas_2_spark.printSchema()

root
 |-- datas: string (nullable = true)
 |-- date: date (nullable = true)
 |-- timestamp: timestamp (nullable = true)

