In [1]:
import os

os.environ['JAVA_HOME'] = "C:/Program Files/Java/jdk-11"
os.environ['PYSPARK_PYTHON'] = "C:/Users/usr/anaconda3/envs/pyspark_env/python.exe"
os.environ['PYSPARK_DRIVER_PYTHON'] = "C:/Users/usr/anaconda3/envs/pyspark_env/python.exe"
os.environ['HADOOP_HOME'] = "C:/hadoop-3.4.0"
os.environ['HADOOP_COMMON_LIB_NATIVE_DIR'] = "C:/hadoop-3.4.0/lib/native"
os.environ['PATH'] += os.pathsep + "C:/hadoop-3.4.0/bin"

import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

sc = spark.sparkContext

### **Crear un DataFrame a partir de un RDD**

In [2]:
rdd = sc.parallelize([item for item in range(10)]).map(lambda x: (x, x ** 2)) # Queremos que nos devuelva tuplas.

In [3]:
rdd.collect()

[(0, 0),
 (1, 1),
 (2, 4),
 (3, 9),
 (4, 16),
 (5, 25),
 (6, 36),
 (7, 49),
 (8, 64),
 (9, 81)]

In [4]:
df = rdd.toDF(['numero', 'cuadrado']) # Le damos una lista con los nombres de columnas que queremos en el DF.

In [5]:
df.printSchema()

root
 |-- numero: long (nullable = true)
 |-- cuadrado: long (nullable = true)



- long es el tipo de dato.

- nullable es si acepta nulos o no.

In [6]:
df.show()

+------+--------+
|numero|cuadrado|
+------+--------+
|     0|       0|
|     1|       1|
|     2|       4|
|     3|       9|
|     4|      16|
|     5|      25|
|     6|      36|
|     7|      49|
|     8|      64|
|     9|      81|
+------+--------+



- Crear un DF a partir de un RDD con schema.

In [7]:
rdd1 = sc.parallelize([(1, 'Jose', 35.5), (2, 'Teresa', 54.3), (3, 'Katia', 12.7)])

- 2 formas para crear el schema.

In [8]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType

- Vía uno.

In [9]:
esquema1 = StructType(
    [
        StructField('id', IntegerType(), True), # Nombre de la columna, tipo de dato, si admite nulos o no.
        StructField('nombre', StringType(), True),
        StructField('saldo', DoubleType(), True)
    ]
)

Vía dos.

In [10]:
esquema2 = "`id` INT, `nombre` STRING, `saldo` DOUBLE"

In [11]:
df1 = spark.createDataFrame(rdd1, schema = esquema1)

In [12]:
df1.printSchema()

root
 |-- id: integer (nullable = true)
 |-- nombre: string (nullable = true)
 |-- saldo: double (nullable = true)



In [13]:
df1.show() # Muestra por default las primeras 20 filas del DF.

+---+------+-----+
| id|nombre|saldo|
+---+------+-----+
|  1|  Jose| 35.5|
|  2|Teresa| 54.3|
|  3| Katia| 12.7|
+---+------+-----+



In [14]:
df2 = spark.createDataFrame(rdd1, schema = esquema2)

In [15]:
df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- nombre: string (nullable = true)
 |-- saldo: double (nullable = true)



In [16]:
df2.show()

+---+------+-----+
| id|nombre|saldo|
+---+------+-----+
|  1|  Jose| 35.5|
|  2|Teresa| 54.3|
|  3| Katia| 12.7|
+---+------+-----+



### **Crear un DF a partir de fuentes de datos**

- format() no es opcional, option() y schema() si.

- Crear un DF a partir de un **archivo** de **texto**.

In [17]:
df3 = spark.read.text('./data/data/dataTXT.txt')

In [18]:
df3.show()

+--------------------+
|               value|
+--------------------+
|Estamos en el cur...|
|En este capítulo ...|
|En esta sección e...|
|y en este ejemplo...|
+--------------------+



In [19]:
df3.show(truncate = False) # Para que muestre todo el texto.

+-----------------------------------------------------------------------+
|value                                                                  |
+-----------------------------------------------------------------------+
|Estamos en el curso de pyspark                                         |
|En este capítulo estamos estudiando el API SQL de Saprk                |
|En esta sección estamos creado dataframes a partir de fuentes de datos,|
|y en este ejemplo creamos un dataframe a partir de un texto plano      |
+-----------------------------------------------------------------------+



- Crear un DF a partir de un **CSV**.

In [20]:
df4 = spark.read.csv('./data/data/dataCSV.csv')

In [21]:
df4.show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+--------------------+--------------------+
|        _c0|          _c1|                 _c2|                 _c3|        _c4|                 _c5|                 _c6|    _c7|   _c8|     _c9|         _c10|                _c11|             _c12|            _c13|                _c14|                _c15|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+--------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|vid

- El nombre de las columnas lo ha generado pero en realidad el nombre de las columnas debería ser la primera fila.

In [22]:
df4 = spark.read.option('header', 'true').csv('./data/data/dataCSV.csv') # Le decimos que el título del campo es la primera fila.

In [23]:
df4.show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           Fal

- Leer archivo de texto con un **delimitador diferente**.

In [24]:
df5 = spark.read.option('header', 'true').option('delimiter', '|').csv('./data/data/dataTab.txt')

In [25]:
df5.show()

+----+----+----------+-----+
|pais|edad|     fecha|color|
+----+----+----------+-----+
|  MX|  23|2021-02-21| rojo|
|  CA|  56|2021-06-10| azul|
|  US|  32|2020-06-02|verde|
+----+----+----------+-----+



- El delimiter es que en este caso el archivo de texto no estaban los datos delimitados por comas sino por '|'

- Crear un DF a partir de un **JSON** proporcionando un **schema**.

In [26]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType

In [27]:
json_schema = StructType(
    [
        StructField('color', StringType(), True),
        StructField('edad', IntegerType(), True),
        StructField('fecha', DateType(), True),
        StructField('pais', StringType(), True)
    ]
)

In [28]:
df6 = spark.read.schema(json_schema).json('./data/data/dataJSON.json')

In [29]:
df6.show()

+-----+----+----------+----+
|color|edad|     fecha|pais|
+-----+----+----------+----+
| rojo|null|2021-02-21|  MX|
| azul|null|2021-06-10|  CA|
|verde|null|2020-06-02|  US|
+-----+----+----------+----+



In [30]:
df6.printSchema()

root
 |-- color: string (nullable = true)
 |-- edad: integer (nullable = true)
 |-- fecha: date (nullable = true)
 |-- pais: string (nullable = true)



- Crear un DF a partir de un archivo **parquet**.

In [31]:
df7 = spark.read.parquet('./data/data/dataPARQUET.parquet')

In [32]:
df7.show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           Fal

- Tenemos otra opción para la lectura.

In [33]:
df8 = spark.read.format('parquet').load('./data/data/dataPARQUET.parquet')

In [34]:
df8.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



### **Trabajo con columnas**

- Al igual que en las operaciones con RDD, las operaciones estructuradas (con DF y SQL) tienen 2 categorías: **transformación** y **acción**.

- Los **DF** son **inmutables** y sus operaciones de transformación siempre devuelven un **DF nuevo**.

In [35]:
df9 = spark.read.parquet('./data/data/dataPARQUET.parquet')

In [36]:
df9.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



- **Primera** alternativa para **referirnos** a las columnas.

In [37]:
df9.select('title').show()

+--------------------+
|               title|
+--------------------+
|WE WANT TO TALK A...|
|The Trump Preside...|
|Racist Superman |...|
|Nickelback Lyrics...|
|I Dare You: GOING...|
|2 Weeks with iPho...|
|Roy Moore & Jeff ...|
|5 Ice Cream Gadge...|
|The Greatest Show...|
|Why the rise of t...|
|Dion Lewis' 103-Y...|
|(SPOILERS) 'Shiva...|
|Marshmello - Bloc...|
|Which Countries A...|
|SHOPPING FOR NEW ...|
|    The New SpotMini|
|One Change That W...|
|How does your bod...|
|HomeMade Electric...|
|Founding An Inbre...|
+--------------------+
only showing top 20 rows



- **Segunda** alternativa.

In [38]:
from pyspark.sql.functions import col

In [39]:
df9.select(col('title')).show()

+--------------------+
|               title|
+--------------------+
|WE WANT TO TALK A...|
|The Trump Preside...|
|Racist Superman |...|
|Nickelback Lyrics...|
|I Dare You: GOING...|
|2 Weeks with iPho...|
|Roy Moore & Jeff ...|
|5 Ice Cream Gadge...|
|The Greatest Show...|
|Why the rise of t...|
|Dion Lewis' 103-Y...|
|(SPOILERS) 'Shiva...|
|Marshmello - Bloc...|
|Which Countries A...|
|SHOPPING FOR NEW ...|
|    The New SpotMini|
|One Change That W...|
|How does your bod...|
|HomeMade Electric...|
|Founding An Inbre...|
+--------------------+
only showing top 20 rows



### **Transformaciones: funciones select() y selectExpr()**

- **select()**

In [40]:
df10 = spark.read.parquet('./data/data/dataPARQUET.parquet')

In [41]:
df10.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [42]:
from pyspark.sql.functions import col

In [43]:
df10.select(col('video_id')).show()

+-----------+
|   video_id|
+-----------+
|2kyS6SvSYSE|
|1ZAPwfrtAFY|
|5qpjK5DgCt4|
|puqaWrEC7tY|
|d380meD0W0M|
|gHZ1Qz0KiKM|
|39idVpFF7NQ|
|nc99ccSXST0|
|jr9QtXwC9vc|
|TUmyygCMMGA|
|9wRQljFNDW8|
|VifQlJit6A0|
|5E4ZBSInqUU|
|GgVmn66oK_A|
|TaTleo4cOs8|
|kgaO45SyaO4|
|ZAQs-ctOqXQ|
|YVfyYrEmzgM|
|eNSN6qet1kE|
|B5HORANmzHw|
+-----------+
only showing top 20 rows



In [44]:
df10.select('video_id', 'trending_date').show() # La otra forma.

+-----------+-------------+
|   video_id|trending_date|
+-----------+-------------+
|2kyS6SvSYSE|     17.14.11|
|1ZAPwfrtAFY|     17.14.11|
|5qpjK5DgCt4|     17.14.11|
|puqaWrEC7tY|     17.14.11|
|d380meD0W0M|     17.14.11|
|gHZ1Qz0KiKM|     17.14.11|
|39idVpFF7NQ|     17.14.11|
|nc99ccSXST0|     17.14.11|
|jr9QtXwC9vc|     17.14.11|
|TUmyygCMMGA|     17.14.11|
|9wRQljFNDW8|     17.14.11|
|VifQlJit6A0|     17.14.11|
|5E4ZBSInqUU|     17.14.11|
|GgVmn66oK_A|     17.14.11|
|TaTleo4cOs8|     17.14.11|
|kgaO45SyaO4|     17.14.11|
|ZAQs-ctOqXQ|     17.14.11|
|YVfyYrEmzgM|     17.14.11|
|eNSN6qet1kE|     17.14.11|
|B5HORANmzHw|     17.14.11|
+-----------+-------------+
only showing top 20 rows



- select() presenta la desventaja de que no puedo construir expresiones dentro. Por ejemplo esto dará error:

In [45]:
# df10.select(
#     'likes',
#     'dislikes',
#     ('likes' - 'dislikes') # Una nueva columna que sea la resta de likes y dislikes.
# )

- Tenemos que hacerlo con 'col'

In [46]:
df10.select(
    col('likes'),
    col('dislikes'),
    (col('likes') - col('dislikes')).alias('aceptacion')
).show()

+------+--------+----------+
| likes|dislikes|aceptacion|
+------+--------+----------+
| 57527|    2966|   54561.0|
| 97185|    6146|   91039.0|
|146033|    5339|  140694.0|
| 10172|     666|    9506.0|
|132235|    1989|  130246.0|
|  9763|     511|    9252.0|
| 15993|    2445|   13548.0|
| 23663|     778|   22885.0|
|  3543|     119|    3424.0|
| 12654|    1363|   11291.0|
|   655|      25|     630.0|
|  1576|     303|    1273.0|
|114188|    1333|  112855.0|
|  7848|    1171|    6677.0|
|  7473|     246|    7227.0|
|  9419|      52|    9367.0|
|  8011|     638|    7373.0|
|  5398|      53|    5345.0|
| 11963|      36|   11927.0|
|  8421|     191|    8230.0|
+------+--------+----------+
only showing top 20 rows



- **selectExpr**

In [47]:
df10.selectExpr('likes', 'dislikes', '(likes - dislikes) as aceptacion').show() # La otra forma.

+------+--------+----------+
| likes|dislikes|aceptacion|
+------+--------+----------+
| 57527|    2966|   54561.0|
| 97185|    6146|   91039.0|
|146033|    5339|  140694.0|
| 10172|     666|    9506.0|
|132235|    1989|  130246.0|
|  9763|     511|    9252.0|
| 15993|    2445|   13548.0|
| 23663|     778|   22885.0|
|  3543|     119|    3424.0|
| 12654|    1363|   11291.0|
|   655|      25|     630.0|
|  1576|     303|    1273.0|
|114188|    1333|  112855.0|
|  7848|    1171|    6677.0|
|  7473|     246|    7227.0|
|  9419|      52|    9367.0|
|  8011|     638|    7373.0|
|  5398|      53|    5345.0|
| 11963|      36|   11927.0|
|  8421|     191|    8230.0|
+------+--------+----------+
only showing top 20 rows



In [48]:
df10.selectExpr('count(distinct (video_id)) as videos').show()

+------+
|videos|
+------+
|  6837|
+------+



### **Transformaciones: funciones filter() y where()**

In [49]:
df11 = spark.read.parquet('./data/data/dataPARQUET.parquet')

- **filter()**

In [50]:
from pyspark.sql.functions import col

In [51]:
df11.show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...|        CaseyNeistat|         22|2017-11-13T17:13:...|     SHANtell martin| 748374| 57527|    2966|        15954|https://i.ytimg.c...|            False|           Fal

In [52]:
df11.filter(col('video_id') == '2kyS6SvSYSE').show() # Nos trae las filas que coinciden con ese valor de video_id.

+-----------+-------------+--------------------+-------------+-----------+--------------------+---------------+-------+-----+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|channel_title|category_id|        publish_time|           tags|  views|likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+-------------+-----------+--------------------+---------------+-------+-----+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|2kyS6SvSYSE|     17.14.11|WE WANT TO TALK A...| CaseyNeistat|         22|2017-11-13T17:13:...|SHANtell martin| 748374|57527|    2966|        15954|https://i.ytimg.c...|            False|           False|                 False|SHANTELL'S CHANNE...|
|2ky

- **where()**

- Es muy similar a filter()

In [53]:
df12 = spark.read.parquet('./data/data/dataPARQUET.parquet').where(col('trending_date') != '17.14.11')

In [54]:
df12.show()

+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|            video_id|       trending_date|               title|       channel_title|         category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|\nCook with confi...|             recipes|              videos| and restaurant g...| dining destinations| and hosting idea...|            

In [55]:
df13 = spark.read.parquet('./data/data/dataPARQUET.parquet').where(col('likes') > 5000)

In [56]:
df13.filter((col('trending_date') != '17.14.11') & (col('likes') > 7000)).show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|YvfYK0EEhK4|     17.15.11|Brent Pella - Why...|         Brent Pella|         23|2017-11-14T15:32:...|"spirit airlines"...| 462490| 14132|     795|          666|https://i.ytimg.c...|            False|           Fal

- También podríamos haberlo hecho de esta otra forma.

In [57]:
df13.filter(col('trending_date') != '17.14.11').filter(col('likes') > 7000).show()

+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|   video_id|trending_date|               title|       channel_title|category_id|        publish_time|                tags|  views| likes|dislikes|comment_count|      thumbnail_link|comments_disabled|ratings_disabled|video_error_or_removed|         description|
+-----------+-------------+--------------------+--------------------+-----------+--------------------+--------------------+-------+------+--------+-------------+--------------------+-----------------+----------------+----------------------+--------------------+
|YvfYK0EEhK4|     17.15.11|Brent Pella - Why...|         Brent Pella|         23|2017-11-14T15:32:...|"spirit airlines"...| 462490| 14132|     795|          666|https://i.ytimg.c...|            False|           Fal

### **Transformaciones: funciones distinct() y dropDuplicates()**

- Tienen la misma función pero dropDuplicates() nos permite definir la lógica.

In [58]:
df14 = spark.read.parquet('./data/data/dataPARQUET.parquet')

- **distinct()**

In [59]:
df_sin_duplicados = df14.distinct() # Elimina los duplicados.

In [60]:
print('El conteo del dataframe original es {}'.format(df14.count()))
print('El conteo del dataframe sin duplicados es {}'.format(df_sin_duplicados.count()))

El conteo del dataframe original es 48137
El conteo del dataframe sin duplicados es 41497


- **dropDuplicates()**

- También podríamos llamarlo sin llamar ningún parámetro, sería lo mismo que distinct()

In [61]:
dataframe = spark.createDataFrame([(1, 'azul', 567), (2, 'rojo', 487), (1, 'azul', 345), (2, 'verde', 783)]).toDF('id', 'color', 'importe')

In [62]:
dataframe.show()

+---+-----+-------+
| id|color|importe|
+---+-----+-------+
|  1| azul|    567|
|  2| rojo|    487|
|  1| azul|    345|
|  2|verde|    783|
+---+-----+-------+



In [63]:
dataframe.dropDuplicates(['id', 'color']).show() # Aquellas filas en estas columnas que tienen valores iguales.

+---+-----+-------+
| id|color|importe|
+---+-----+-------+
|  1| azul|    567|
|  2| rojo|    487|
|  2|verde|    783|
+---+-----+-------+



### **Transformaciones: funciones sort(), orderBy() y limit()**

In [64]:
from pyspark.sql.functions import col

In [65]:
df15 = spark.read.parquet('./data/data/dataPARQUET.parquet')

In [66]:
df15 = df15.select(col('likes'), col('views'), col('video_id'), col('dislikes')).dropDuplicates(['video_id'])

In [67]:
df15.show()

+-----+-------+-----------+--------+
|likes|  views|   video_id|dislikes|
+-----+-------+-----------+--------+
| 6531| 475965|-0CMnp02rNY|     172|
| 4429| 563746|-0NYY8cqdiQ|      54|
|32752|1566807|-1Hm41N0dUs|     393|
| 5214| 129360|-1yT-K3c6YI|     108|
|  438|  67429|-2RVw2_QyxQ|      23|
|19339|1012527|-2aVkGcI7ZA|     633|
| 1444|  84744|-2b4qSoMnKE|     199|
|10350| 703371|-2wRFv-mScQ|     260|
|73480| 545655|-35jibKqbEo|     727|
|    2|   2863|-37nIo_tLnk|       0|
| 4028| 385104|-39ysKKpE7I|     343|
| 6468| 230360|-3h4Xt9No9o|     177|
|10384| 249601|-3nEHRN6IPg|     370|
|38776| 296237|-4s2MeUgduo|     466|
|71090| 390631|-5aaJJQFvOg|     635|
|21224| 744363|-66xHRJSPxs|     534|
|17882| 363370|-7AZX5Xtiks|     416|
|36960| 908989|-7UzyXO-mzk|     434|
|17120|1815030|-7_ATlZ-zMc|     633|
|  760| 252542|-8ZHXaGILlc|     100|
+-----+-------+-----------+--------+
only showing top 20 rows



- **sort()**

In [68]:
df15.sort('likes').show()

+-----+-----+--------------------+--------+
|likes|views|            video_id|dislikes|
+-----+-----+--------------------+--------+
| null| null|Awesome Games Pla...|    null|
| null| null|Filmed by Lucas F...|    null|
| null| null|    Beautiful Thing:|    null|
| null| null|Bon Appétit Test ...|    null|
| null| null|Filmed at the Bee...|    null|
| null| null|Britton Lane: htt...|    null|
| null| null|Allie Sherlock: h...|    null|
| null| null|Browse thousands ...|    null|
| null| null|   ABOUT BON APPÉTIT|    null|
| null| null|Catch Terry Crews...|    null|
| null| null|    ABOUT EPICURIOUS|    null|
| null| null|Check Out My WEBS...|    null|
| null| null|    ABOUT TEEN VOGUE|    null|
| null| null|Check out the Dam...|    null|
| null| null|         ABOUT VOGUE|    null|
| null| null|          City Song:|    null|
| null| null|Filmed at the Wal...|    null|
| null| null|            Clearly:|    null|
| null| null|Black Panther is ...|    null|
| null| null|Cook with confide..

In [69]:
from pyspark.sql.functions import desc

In [70]:
df15.sort(desc('likes')).show()

+-----+-------+-----------+--------+
|likes|  views|   video_id|dislikes|
+-----+-------+-----------+--------+
|99990|2079137|2v4-L4PkV9U|    2844|
|99973|2465294|DSRSgMp5X1w|   17299|
|99952|3313449|LdhQzXHYLZ4|    5142|
| 9991|  98513|eBnXbImHX-g|      91|
| 9988|1162843|kz1xzBYppW8|    2555|
|99851|1053828|vRf3azp1pak|    1226|
| 9984| 206669|Lydh_saD9EQ|      88|
| 9984| 254807|Ps7GzIV2KP0|     294|
|  998|  71308|Hkx5fveyjIs|      74|
|  998|  54348|Pr6zjrF0Djg|      75|
|  998|  82087|hX643KbiI4s|      93|
|99761|1454233|h5CLO2n6OxQ|     692|
|  997|  27234|nb42DxagyOE|      13|
| 9969| 273905|c47kn_Y4y8A|     127|
| 9946| 242329|QXcbVHFE2bo|     148|
| 9939| 235293|1iGBHh1q0Kg|     232|
| 9926| 467558|hHFuZVGpBC0|     342|
|99254|1552618|0v-6AylRH68|    5195|
| 9925| 166235|flLc6LmAG6c|      50|
| 9921| 594536|e9NOwaiXqqA|     323|
+-----+-------+-----------+--------+
only showing top 20 rows



- **orderBy()**

In [71]:
df15.orderBy(col('views')).show()

+-----+-----+--------------------+--------+
|likes|views|            video_id|dislikes|
+-----+-----+--------------------+--------+
| null| null|Catch Terry Crews...|    null|
| null| null|ABOUT ARCHITECTUR...|    null|
| null| null|Check Out My WEBS...|    null|
| null| null|    ABOUT EPICURIOUS|    null|
| null| null|Check out the Dam...|    null|
| null| null|         ABOUT WIRED|    null|
| null| null|          City Song:|    null|
| null| null|   ABOUT BON APPÉTIT|    null|
| null| null|            Clearly:|    null|
| null| null|Browse thousands ...|    null|
| null| null|Cook with confide...|    null|
| null| null|   ABOUT VANITY FAIR|    null|
| null| null|Directed by Lucas...|    null|
| null| null|Britton Lane: htt...|    null|
| null| null|Arts and entertai...|    null|
| null| null|Awesome Games Pla...|    null|
| null| null|    Beautiful Thing:|    null|
| null| null|  Expert travel tips|    null|
| null| null|Black Panther is ...|    null|
| null| null|             Fashio

In [72]:
df15.orderBy(col('views').desc()).show()

+------+-------+-----------+--------+
| likes|  views|   video_id|dislikes|
+------+-------+-----------+--------+
|126363| 999910|gw82GrEt370|    1034|
| 78088| 999867|cyhU06cXfeU|     690|
| 58552| 998908|QIN5_tJRiyY|    1080|
|151348|9988608|fAIX12F6958|   10274|
| 70972| 998362|LC3fWTXZXxE|    1608|
|  4727|  99796|kOsl3cmK3zg|     152|
|   120|   9977|1L_fPteZOYQ|      11|
|   299|  99674|Yzx_tSlifIw|      95|
|119634| 996318|__1SjDrSMik|    1143|
|  3959|  99619|9ymjcSvEyhk|     158|
|   969|  99577|qLVZiKDNxtw|     197|
|  2136|  99566|pWZnhJay0Y8|     419|
|  6487|  99535|BfbIoUMdKZ0|     116|
|  3761|  99510|bhqH6tTr_Lk|     615|
|   258| 994747|i7FgneNlM14|     398|
| 21094| 994662|-CS84oCtjvc|     714|
|107029| 993913|EWf7P3okX9s|     472|
|  9507| 993593|vYb4_ARPNfo|    3647|
|  7388| 993435|z1FfOwjlqxU|     919|
|   111|   9932|b3KFfgoDzw8|       9|
+------+-------+-----------+--------+
only showing top 20 rows



In [73]:
d16 = spark.createDataFrame([(1, 'azul', 568), (2, 'rojo', 235), (1, 'azul', 456), (2, 'azul', 783)]).toDF('id', 'color', 'importe')

In [74]:
d16.show()

+---+-----+-------+
| id|color|importe|
+---+-----+-------+
|  1| azul|    568|
|  2| rojo|    235|
|  1| azul|    456|
|  2| azul|    783|
+---+-----+-------+



In [75]:
d16.orderBy(col('color').desc(), col('importe')).show()

+---+-----+-------+
| id|color|importe|
+---+-----+-------+
|  2| rojo|    235|
|  1| azul|    456|
|  1| azul|    568|
|  2| azul|    783|
+---+-----+-------+



- **limit()**

In [76]:
top_10 = df15.orderBy(col('views').desc()).limit(10)

In [77]:
top_10.show()

+------+-------+-----------+--------+
| likes|  views|   video_id|dislikes|
+------+-------+-----------+--------+
|126363| 999910|gw82GrEt370|    1034|
| 78088| 999867|cyhU06cXfeU|     690|
| 58552| 998908|QIN5_tJRiyY|    1080|
|151348|9988608|fAIX12F6958|   10274|
| 70972| 998362|LC3fWTXZXxE|    1608|
|  4727|  99796|kOsl3cmK3zg|     152|
|   120|   9977|1L_fPteZOYQ|      11|
|   299|  99674|Yzx_tSlifIw|      95|
|119634| 996318|__1SjDrSMik|    1143|
|  3959|  99619|9ymjcSvEyhk|     158|
+------+-------+-----------+--------+



### **Transformaciones: funciones withColumn() y withColumnRenamed()**

- **withColumn()**

In [78]:
df17 = spark.read.parquet('./data/data/dataPARQUET.parquet')

In [79]:
from pyspark.sql.functions import col

In [80]:
df_valoracion = df17.withColumn('valoracion', col('likes') - col('dislikes'))

In [81]:
df_valoracion.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)
 |-- valoracion: double (nullable = true)



In [82]:
df_valoracion1 = (df17.withColumn('valoracion', (col('likes') - col('dislikes')))
                  .withColumn('res:div', col('valoracion') % 10)
)

In [83]:
df_valoracion1.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)
 |-- valoracion: double (nullable = true)
 |-- res:div: double (nullable = true)



In [84]:
df_valoracion1.select('likes', 'dislikes', 'valoracion', 'res:div').show()

+------+--------+----------+-------+
| likes|dislikes|valoracion|res:div|
+------+--------+----------+-------+
| 57527|    2966|   54561.0|    1.0|
| 97185|    6146|   91039.0|    9.0|
|146033|    5339|  140694.0|    4.0|
| 10172|     666|    9506.0|    6.0|
|132235|    1989|  130246.0|    6.0|
|  9763|     511|    9252.0|    2.0|
| 15993|    2445|   13548.0|    8.0|
| 23663|     778|   22885.0|    5.0|
|  3543|     119|    3424.0|    4.0|
| 12654|    1363|   11291.0|    1.0|
|   655|      25|     630.0|    0.0|
|  1576|     303|    1273.0|    3.0|
|114188|    1333|  112855.0|    5.0|
|  7848|    1171|    6677.0|    7.0|
|  7473|     246|    7227.0|    7.0|
|  9419|      52|    9367.0|    7.0|
|  8011|     638|    7373.0|    3.0|
|  5398|      53|    5345.0|    5.0|
| 11963|      36|   11927.0|    7.0|
|  8421|     191|    8230.0|    0.0|
+------+--------+----------+-------+
only showing top 20 rows



- **withColumnRenamed()**

In [85]:
df_renombrado = df17.withColumnRenamed('video_id', 'id')

In [86]:
df_renombrado.printSchema()

root
 |-- id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



- Spark no arrojará error si el nombre de la columna original no existe.

In [87]:
df_error = df17.withColumnRenamed('nombre_que_no_existe', 'otro_nombre')

In [88]:
df_error.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



### **Transformaciones: funciones drop(), sample() y randomSplit()**

In [89]:
df18 = spark.read.parquet('./data/data/dataPARQUET.parquet')

- **drop()**

- Si no existe la columna en el schema, se ignorará.

In [90]:
df18.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [91]:
df_util = df18.drop('comments_disabled')

In [92]:
df_util.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [93]:
df_util = df18.drop('comments_disabled', 'ratings_disabled', 'thumbnail_link')

In [94]:
df_util.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [95]:
df_util = df18.drop('comments_disabled', 'ratings_disabled', 'thumbnail_link', 'cafe')

In [96]:
df_util.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



- **sample()**

In [97]:
df_muestra = df18.sample(0.8) # El 80% aproximadamente de las filas de DF original.

In [98]:
num_filas = df18.count()
num_filas_muestra = df_muestra.count()

print('El 80% de las filas del dataframe original es {}'.format(num_filas - (num_filas * 0.2)))
print('El número de filas del dataframe muestra es {}'.format(num_filas_muestra))

El 80% de las filas del dataframe original es 38509.6
El número de filas del dataframe muestra es 38530


In [99]:
df_muestra = df18.sample(fraction=0.8, seed=1234)

- con seed tomaremos posteriormente la misma muestra si lo deseamos, especificando la misma seed.

In [100]:
df_muestra = df18.sample(withReplacement=True, fraction=0.8, seed=1234)

- withReplacement permite seleccionar más de una vez la misma fila.

- **randomSplit()**

- Devuelve uno o más DF dependiendo de la cantidad de pesos que le especifiquemos.

In [101]:
train, test = df18.randomSplit([0.8, 0.2], seed=1234) # Si no llega a 1 se normalizará automáticamente para llegar a 1.

### **Trabajo con datos incorrectos o faltantes**

In [102]:
df19 = spark.read.parquet('./data/data/dataPARQUET.parquet')

In [103]:
df19.count()

48137

In [104]:
df19.na.drop().count()

40390

- Otra alternativa.

In [105]:
df19.na.drop('any').count()

40390

Otra alternativa.

In [106]:
df19.dropna().count()

40390

In [107]:
df19.na.drop(subset=['views']).count() # Elimina las filas que tengan valores nulos en la columna 'views'.

41061

In [108]:
df19.na.drop(subset=['views', 'dislikes']).count()

41035

In [109]:
from pyspark.sql.functions import col

In [110]:
df19.orderBy(col('views')).select(col('views'), col('likes'), col('dislikes')).show()

+-----+-----+--------+
|views|likes|dislikes|
+-----+-----+--------+
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
+-----+-----+--------+
only showing top 20 rows



- Imputar un valor a los nulos.

In [117]:
df19.fillna(0).orderBy(col('views')).select(col('views'), col('likes'), col('dislikes')).show()

+-----+-----+--------+
|views|likes|dislikes|
+-----+-----+--------+
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
+-----+-----+--------+
only showing top 20 rows



- Rellenar solo ciertas columnas.

In [118]:
df19.fillna(0, subset = ['likes', 'dislikes']).orderBy(col('views')).select(col('views'), col('likes'), col('dislikes')).show()

+-----+-----+--------+
|views|likes|dislikes|
+-----+-----+--------+
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
| null| null|    null|
+-----+-----+--------+
only showing top 20 rows



### **Acciones sobre un DF en Spark SQL**

- Igual que en el RDD, desencadenarán todas las transformaciones acumuladas en el DAG.

In [119]:
df20 = spark.read.parquet('./data/data/dataPARQUET.parquet')