### **Funciones de fecha y hora**

In [1]:
import os

# Point this process at the local Java, Python and Hadoop installations.
# These are machine-specific Windows paths; adjust them to your own setup.
os.environ['JAVA_HOME'] = "C:/Program Files/Java/jdk-11"
os.environ['PYSPARK_PYTHON'] = "C:/Users/usr/anaconda3/envs/pyspark_env/python.exe"
os.environ['PYSPARK_DRIVER_PYTHON'] = "C:/Users/usr/anaconda3/envs/pyspark_env/python.exe"
os.environ['HADOOP_HOME'] = "C:/hadoop-3.4.0"
os.environ['HADOOP_COMMON_LIB_NATIVE_DIR'] = "C:/hadoop-3.4.0/lib/native"

# Append Hadoop's bin directory to PATH only if it is not already there, so
# re-running this cell does not keep growing PATH with duplicate entries.
# The pathsep-wrapped containment test matches whole entries only.
_hadoop_bin = "C:/hadoop-3.4.0/bin"
_current_path = os.environ.get('PATH', '')  # tolerate an unset PATH
if (os.pathsep + _hadoop_bin + os.pathsep) not in (os.pathsep + _current_path + os.pathsep):
    os.environ['PATH'] = _current_path + os.pathsep + _hadoop_bin

import findspark
# findspark.init() is called before the pyspark import below.
# NOTE(review): presumably this is what makes pyspark importable here — confirm.
findspark.init()

from pyspark.sql import SparkSession

# Reuse the active SparkSession if one exists; otherwise build a new one
# with default settings.
spark = SparkSession.builder.getOrCreate()

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
# The sample is stored as two part files; read each one, then stack the rows.
part_paths = [
    './data/convertir/part-00000-6b84f509-ffb8-43d4-820e-866fb80c0d08-c000.snappy.parquet',
    './data/convertir/part-00001-6b84f509-ffb8-43d4-820e-866fb80c0d08-c000.snappy.parquet',
]
data_part1, data_part2 = [spark.read.parquet(path) for path in part_paths]

# union() matches columns by position, which is fine for two parts of one dataset.
data = data_part1.union(data_part2)

In [3]:
# Inspect the column types: all four columns, including the date-like ones,
# are loaded as plain strings.
data.printSchema()

root
 |-- date: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- date_str: string (nullable = true)
 |-- ts_str: string (nullable = true)



In [4]:
# truncate=False prints each value in full instead of shortening long strings.
data.show(truncate=False)

+----------+-----------------------+----------+----------------+
|date      |timestamp              |date_str  |ts_str          |
+----------+-----------------------+----------+----------------+
|2021-01-01|2021-01-01 20:10:50.723|01-01-2021|18-08-2021 46:58|
+----------+-----------------------+----------+----------------+



- Vamos a convertir de formato string a formato date y timestamp

In [5]:
from pyspark.sql.functions import col, to_date, to_timestamp

In [6]:
# Build the four conversions as named expressions, then project them together.
parsed_columns = [
    # No pattern given: the 'date' and 'timestamp' strings are parsed with the defaults.
    to_date(col('date')).alias('date1'),
    to_timestamp(col('timestamp')).alias('ts1'),
    # Day-first strings need an explicit pattern.
    to_date(col('date_str'), 'dd-MM-yyyy').alias('date2'),
    # The pattern has no hour field: 'mm:ss' reads the trailing "46:58" as
    # minutes and seconds, so the hour of the result is 00.
    to_timestamp(col('ts_str'), 'dd-MM-yyyy mm:ss').alias('ts2'),
]
data1 = data.select(*parsed_columns)

In [7]:
# Display the converted values without truncating the timestamp columns.
data1.show(truncate=False)

+----------+-----------------------+----------+-------------------+
|date1     |ts1                    |date2     |ts2                |
+----------+-----------------------+----------+-------------------+
|2021-01-01|2021-01-01 20:10:50.723|2021-01-01|2021-08-18 00:46:58|
+----------+-----------------------+----------+-------------------+



In [8]:
# Confirm the conversion: the columns are now date / timestamp instead of string.
data1.printSchema()

root
 |-- date1: date (nullable = true)
 |-- ts1: timestamp (nullable = true)
 |-- date2: date (nullable = true)
 |-- ts2: timestamp (nullable = true)



- Ahora vamos a darle formato a una fecha.

In [9]:
from pyspark.sql.functions import date_format

In [10]:
# Render the parsed date back to text using a day-month-year pattern.
formatted_date = date_format(col('date1'), 'dd-MM-yyyy')
data1.select(formatted_date).show()

+------------------------------+
|date_format(date1, dd-MM-yyyy)|
+------------------------------+
|                    01-01-2021|
+------------------------------+



- Cálculos con fechas y horas.

In [11]:
# Load the sample used for the date arithmetic examples.
df = spark.read.parquet('./data/calculo/calculo.parquet')

In [12]:
# Preview the rows (show() prints at most 20 rows by default).
df.show()

+------+-------------+------------+-------------------+
|nombre|fecha_ingreso|fecha_salida|       baja_sistema|
+------+-------------+------------+-------------------+
|  Jose|   2021-01-01|  2021-11-14|2021-10-14 15:35:59|
|Mayara|   2021-02-06|  2021-11-25|2021-11-25 10:35:55|
+------+-------------+------------+-------------------+



In [13]:
from pyspark.sql.functions import datediff, months_between, last_day

In [14]:
# Name the two date columns once; both difference functions take (end, start).
salida, ingreso = col('fecha_salida'), col('fecha_ingreso')

df.select(
    col('nombre'),
    datediff(salida, ingreso).alias('dias'),          # whole days between the dates
    months_between(salida, ingreso).alias('meses'),   # fractional months
    last_day(salida).alias('ultimo_dia_mes'),         # last day of the exit month
).show()

+------+----+-----------+--------------+
|nombre|dias|      meses|ultimo_dia_mes|
+------+----+-----------+--------------+
|  Jose| 317|10.41935484|    2021-11-30|
|Mayara| 292| 9.61290323|    2021-11-30|
+------+----+-----------+--------------+



- Sumar y restar fechas.

In [15]:
from pyspark.sql.functions import date_add, date_sub

In [16]:
# Shift the start date forward by 14 days and back by 1 day.
ingreso = col('fecha_ingreso')
shifted_columns = [
    col('nombre'),
    ingreso,
    date_add(ingreso, 14).alias('mas_14_dias'),
    date_sub(ingreso, 1).alias('menos_1_dia'),
]
df.select(*shifted_columns).show()

+------+-------------+-----------+-----------+
|nombre|fecha_ingreso|mas_14_dias|menos_1_dia|
+------+-------------+-----------+-----------+
|  Jose|   2021-01-01| 2021-01-15| 2020-12-31|
|Mayara|   2021-02-06| 2021-02-20| 2021-02-05|
+------+-------------+-----------+-----------+



- Extraer valores específicos de una columna date o timestamp.

In [17]:
from pyspark.sql.functions import year, month, dayofmonth, dayofyear, hour, minute, second

In [18]:
# Apply each extractor to the same timestamp column. The results are left
# unaliased, so Spark names them after the call, e.g. year(baja_sistema).
baja = col('baja_sistema')
extractors = (year, month, dayofmonth, dayofyear, hour, minute, second)

df.select(baja, *[extract(baja) for extract in extractors]).show()

+-------------------+------------------+-------------------+------------------------+-----------------------+------------------+--------------------+--------------------+
|       baja_sistema|year(baja_sistema)|month(baja_sistema)|dayofmonth(baja_sistema)|dayofyear(baja_sistema)|hour(baja_sistema)|minute(baja_sistema)|second(baja_sistema)|
+-------------------+------------------+-------------------+------------------------+-----------------------+------------------+--------------------+--------------------+
|2021-10-14 15:35:59|              2021|                 10|                      14|                    287|                15|                  35|                  59|
|2021-11-25 10:35:55|              2021|                 11|                      25|                    329|                10|                  35|                  55|
+-------------------+------------------+-------------------+------------------------+-----------------------+------------------+--------------------+--------------------+

### **Funciones para trabajo con strings**

In [19]:
# NOTE: rebinds `data`, which until now held the date/time sample.
data = spark.read.parquet('./data/data/data.parquet')

- Transformaciones string.

In [20]:
# Preview the single 'nombre' value; it carries a space on each side.
data.show()

+-------+
| nombre|
+-------+
| Spark |
+-------+



- Tiene espacios en blanco al principio y al final.

In [21]:
from pyspark.sql.functions import ltrim, rtrim, trim

In [22]:
# Compare the three trimming functions on the same padded value.
nombre = col('nombre')
trim_variants = [
    ltrim(nombre).alias('ltrim'),  # strips leading spaces only
    rtrim(nombre).alias('rtrim'),  # strips trailing spaces only
    trim(nombre).alias('trim'),    # strips both sides
]
data.select(*trim_variants).show()

+------+------+-----+
| ltrim| rtrim| trim|
+------+------+-----+
|Spark | Spark|Spark|
+------+------+-----+



- Rellenar string.

In [23]:
from pyspark.sql.functions import col, lpad, rpad

In [24]:
# Strip the surrounding blanks first so the padding is applied to the bare word.
trimmed = data.select(trim(col('nombre')).alias('trim'))

# Pad to a total width of 8 characters: '-' on the left, '=' on the right.
trimmed.select(
    lpad(col('trim'), 8, '-').alias('lpad'),
    rpad(col('trim'), 8, '=').alias('rpad'),
).show()

+--------+--------+
|    lpad|    rpad|
+--------+--------+
|---Spark|Spark===|
+--------+--------+



In [25]:
# Single-row DataFrame with one word per column (subject, verb, adjective).
df1 = spark.createDataFrame([('Spark', 'es', 'maravilloso')], ['sujeto', 'verbo', 'adjetivo'])

In [26]:
# Preview the three word columns.
df1.show()

+------+-----+-----------+
|sujeto|verbo|   adjetivo|
+------+-----+-----------+
| Spark|   es|maravilloso|
+------+-----+-----------+



- Concatenación, mayúsculas, minúsculas y reverso de string.

In [27]:
from pyspark.sql.functions import concat_ws, lower, upper, initcap, reverse

In [28]:
# Step 1: join the three words into one sentence, separated by single spaces.
frases = df1.select(
    concat_ws(' ', col('sujeto'), col('verbo'), col('adjetivo')).alias('frase')
)

# Step 2: derive each variant from the sentence; (output name, function) pairs
# are kept in display order.
variants = [
    ('minuscula', lower),
    ('mayuscula', upper),
    ('initcap', initcap),
    ('reversa', reverse),
]
frases.select(
    col('frase'),
    *[transform(col('frase')).alias(name) for name, transform in variants]
).show()

+--------------------+--------------------+--------------------+--------------------+--------------------+
|               frase|           minuscula|           mayuscula|             initcap|             reversa|
+--------------------+--------------------+--------------------+--------------------+--------------------+
|Spark es maravilloso|spark es maravilloso|SPARK ES MARAVILLOSO|Spark Es Maravilloso|osollivaram se krapS|
+--------------------+--------------------+--------------------+--------------------+--------------------+



In [29]:
from pyspark.sql.functions import regexp_replace

In [30]:
# Single-row, single-column DataFrame; note the sentence starts with a space.
df2 = spark.createDataFrame([(' voy a casa por mis llaves',)], ['frase'])

In [31]:
# truncate=False keeps the whole sentence visible.
df2.show(truncate=False)

+--------------------------+
|frase                     |
+--------------------------+
| voy a casa por mis llaves|
+--------------------------+



In [32]:
# The pattern is a regex alternation: every match of "voy" or "por" becomes "ir".
nueva_frase = regexp_replace(col('frase'), 'voy|por', 'ir').alias('nueva_frase')
df2.select(nueva_frase).show(truncate=False)

+------------------------+
|nueva_frase             |
+------------------------+
| ir a casa ir mis llaves|
+------------------------+



### **Funciones para el trabajo con colecciones**

In [33]:
# NOTE: rebinds `data` again, now to the array sample (one specific part file).
data = spark.read.parquet('./data/data_collections/parquet/part-00000-96f39196-ef97-4a14-926e-b24a86c2e32d-c000.snappy.parquet')

- Arrays.

In [34]:
# truncate=False so the full 'tareas' array is printed.
data.show(truncate=False)

+-----+--------------------------------------------+
|dia  |tareas                                      |
+-----+--------------------------------------------+
|lunes|[hacer la tarea, buscar agua, lavar el auto]|
+-----+--------------------------------------------+



In [35]:
# 'tareas' is an array of strings, not a plain string column.
data.printSchema()

root
 |-- dia: string (nullable = true)
 |-- tareas: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [36]:
from pyspark.sql.functions import col, size, sort_array, array_contains

In [37]:
# Three array helpers applied to the same column.
tareas = col('tareas')
array_summary = [
    size(tareas).alias('tamaño'),                                   # number of elements
    sort_array(tareas).alias('array_ordenado'),                     # ascending order
    array_contains(tareas, 'buscar agua').alias('buscar_agua'),     # membership test
]
data.select(*array_summary).show(truncate=False)

+------+--------------------------------------------+-----------+
|tamaño|array_ordenado                              |buscar_agua|
+------+--------------------------------------------+-----------+
|3     |[buscar agua, hacer la tarea, lavar el auto]|true       |
+------+--------------------------------------------+-----------+



In [38]:
from pyspark.sql.functions import explode

In [39]:
# explode() emits one row per array element; 'dia' is repeated on each row.
tarea = explode(col('tareas')).alias('tareas')
data.select(col('dia'), tarea).show(truncate=False)

+-----+--------------+
|dia  |tareas        |
+-----+--------------+
|lunes|hacer la tarea|
|lunes|buscar agua   |
|lunes|lavar el auto |
+-----+--------------+



- Formato JSON

In [40]:
# Load the sample whose single column stores a JSON document as plain text.
json_df_str = spark.read.parquet('./data/data_collections/JSON/part-00000-9f0e2b16-0b50-4212-9b83-654e1b8dd137-c000.snappy.parquet')

In [41]:
# truncate=False so the whole JSON string is visible.
json_df_str.show(truncate=False)

+---------------------------------------------------------------------------+
|tareas_str                                                                 |
+---------------------------------------------------------------------------+
|{"dia": "lunes","tareas": ["hacer la tarea","buscar agua","lavar el auto"]}|
+---------------------------------------------------------------------------+



In [42]:
# The JSON is still an opaque string column at this point.
json_df_str.printSchema()

root
 |-- tareas_str: string (nullable = true)



- Convertimos el string JSON en una estructura de Spark (struct); para ello debemos describir su esquema.

In [43]:
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

In [44]:
# Schema of the JSON payload: a day name plus an array of task strings.
# Both fields are nullable.
schema_json = (
    StructType()
    .add('dia', StringType(), True)
    .add('tareas', ArrayType(StringType()), True)
)

In [45]:
from pyspark.sql.functions import from_json, to_json

In [46]:
# Parse the JSON text into a struct column that follows schema_json.
parsed_json = from_json(col('tareas_str'), schema_json).alias('por_hacer')
json_df = json_df_str.select(parsed_json)

In [47]:
# 'por_hacer' is now a struct with typed fields instead of a string.
json_df.printSchema()

root
 |-- por_hacer: struct (nullable = true)
 |    |-- dia: string (nullable = true)
 |    |-- tareas: array (nullable = true)
 |    |    |-- element: string (containsNull = true)



In [48]:
# Drill into the struct: a field by name, then an array element by position.
por_hacer = col('por_hacer')
tareas = por_hacer.getItem('tareas')

json_df.select(
    por_hacer.getItem('dia'),
    tareas,
    tareas.getItem(0).alias('primer_tarea'),  # index 0 is the first task
).show(truncate=False)

+-------------+--------------------------------------------+--------------+
|por_hacer.dia|por_hacer.tareas                            |primer_tarea  |
+-------------+--------------------------------------------+--------------+
|lunes        |[hacer la tarea, buscar agua, lavar el auto]|hacer la tarea|
+-------------+--------------------------------------------+--------------+



In [49]:
# Serialize the struct back into a JSON string.
as_json = to_json(col('por_hacer'))
json_df.select(as_json).show(truncate=False)

+-------------------------------------------------------------------------+
|to_json(por_hacer)                                                       |
+-------------------------------------------------------------------------+
|{"dia":"lunes","tareas":["hacer la tarea","buscar agua","lavar el auto"]}|
+-------------------------------------------------------------------------+



### **Funciones when, coalesce y lit**

In [50]:
# NOTE: rebinds `data` once more, this time to the nombre/pago sample.
data = spark.read.parquet('./data/data/when_coalesce_lit.parquet')

In [51]:
# Preview the rows; one of them has a null 'nombre'.
data.show()

+------+----+
|nombre|pago|
+------+----+
|  Jose|   1|
| Julia|   2|
| Katia|   1|
|  null|   3|
|  Raul|   3|
+------+----+



In [52]:
from pyspark.sql.functions import col, when, lit, coalesce

In [53]:
# Map the numeric payment code to a label; any code other than 1 or 2 falls
# through to the otherwise() default.
estado_pago = (
    when(col('pago') == 1, 'pagado')
    .when(col('pago') == 2, 'sin pagar')
    .otherwise('sin iniciar')
    .alias('pago')
)
data.select(col('nombre'), estado_pago).show()

+------+-----------+
|nombre|       pago|
+------+-----------+
|  Jose|     pagado|
| Julia|  sin pagar|
| Katia|     pagado|
|  null|sin iniciar|
|  Raul|sin iniciar|
+------+-----------+



- La función coalesce toma una o más columnas y devuelve, para cada fila, el primer valor que no es nulo.

In [None]:
# Replace a null name with a fixed placeholder; lit() wraps the constant as a column.
nombre_o_defecto = coalesce(col('nombre'), lit('sin nombre')).alias('nombre')
data.select(nombre_o_defecto).show()

+----------+
|    nombre|
+----------+
|      Jose|
|     Julia|
|     Katia|
|sin nombre|
|      Raul|
+----------+



### **Funciones definidas por el usuario (UDF)**