# Ejercicios Dataframes 2
Realiza los siguientes ejercicios. Soluciona cada uno de ellos empleando la DataFrame API y Spark SQL
1. Inicializa la variable spark (0)

In [1]:
#DataFrame API
import string
import sys
from pyspark import sql

spark = sql.SparkSession.builder \
    .master("spark://spark-master:7077") \
    .appName("04-dfej2") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "hdfs:///spark/logs/history") \
    .config("spark.history.fs.logDirectory", "hdfs:///spark/logs/history") \
    .getOrCreate()

sc = spark.sparkContext

#SQL



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


2. Crea un dataframe llamado mi_df con los datos del archivo *data/retail-data/all/online-retail-dataset.csv* (0)

In [2]:
#DataFrame API
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType
# Definir el esquema

schema = StructType([
    StructField("InvoiceNo", StringType(), True),
    StructField("StockCode", StringType(), True),
    StructField("Description", StringType(), True),
    StructField("Quantity", IntegerType(), True),
    StructField("InvoiceDate", StringType(), True),  # ou TimestampType se se parsea
    StructField("UnitPrice", DoubleType(), True),
    StructField("CustomerID", IntegerType(), True),
    StructField("Country", StringType(), True)
])

# Leer el archivo CSV con el esquema especificado

mi_df = spark.read.csv("hdfs:///user/jovyan/data/retail-data/all/online-retail-dataset.csv", header=True, schema=schema)
#SQL
mi_df.createOrReplaceTempView("retail")



3. Cuenta el número de celdas totales

In [5]:
from pyspark.sql.functions import count
#DataFrame API
mi_df.select(count("*")).show()

#SQL
spark.sql("Select count(*) from retail").show()


+--------+
|count(1)|
+--------+
|  541909|
+--------+

+--------+
|count(1)|
+--------+
|  541909|
+--------+



4. Cuenta el número de "invoiceNo" distintos

In [17]:
from pyspark.sql.functions import countDistinct
#DataFrame API
mi_df.select(countDistinct("InvoiceNo")).show()

#SQL
spark.sql("select count(distinct InvoiceNo) from retail").show()


                                                                                

+-------------------------+
|count(DISTINCT InvoiceNo)|
+-------------------------+
|                    25900|
+-------------------------+

+-------------------------+
|count(DISTINCT InvoiceNo)|
+-------------------------+
|                    25900|
+-------------------------+



5. Obtén el número de factura más bajo y el más alto

In [20]:
#DataFrame API
from pyspark.sql.functions import min, max
mi_df.select(
    min("InvoiceNo").alias("factura_min"),
    max("InvoiceNo").alias("factura_max")
).show()
#SQL
spark.sql("""
    SELECT 
        MIN(InvoiceNo) AS factura_min,
        MAX(InvoiceNo) AS factura_max
    FROM retail
""").show()


+-----------+-----------+
|factura_min|factura_max|
+-----------+-----------+
|     536365|    C581569|
+-----------+-----------+

+-----------+-----------+
|factura_min|factura_max|
+-----------+-----------+
|     536365|    C581569|
+-----------+-----------+



6. Obtén la suma de todos los importes unitarios de los productos vendidos en el Reino Unido

In [18]:
#DataFrame API
from pyspark.sql.functions import sum

mi_df.filter("Country == 'United Kingdom'") \
  .select(sum("UnitPrice").alias("suma")) \
  .show()

#SQL
spark.sql("Select sum (UnitPrice) as suma FROM retail where Country = 'United Kingdom' ").show()


+-----------------+
|             suma|
+-----------------+
|2245715.473997657|
+-----------------+

+-----------------+
|             suma|
+-----------------+
|2245715.473997657|
+-----------------+



7. Obtén la media de todos los importes unitarios de los productos vendidos en el Reino Unido

In [17]:
#DataFrame API
from pyspark.sql.functions import avg

mi_df.filter("Country == 'United Kingdom'") \
  .select(avg("UnitPrice").alias("media")) \
  .show()

#SQL
spark.sql("Select avg (UnitPrice) as media FROM retail where Country = 'United Kingdom' ").show()


                                                                                

+-----------------+
|            media|
+-----------------+
|4.532422174138221|
+-----------------+

+-----------------+
|            media|
+-----------------+
|4.532422174138221|
+-----------------+



8. Obtén la el número total de productos vendidos (quantity) agrupado por países (muestra los 10 primeros)

In [26]:
#DataFrame API

mi_df.groupBy("Country").sum("Quantity").show()
#SQL
spark.sql("SELECT Country, sum(Quantity) FROM retail GROUP BY Country").show()


+---------------+-------------+
|        Country|sum(Quantity)|
+---------------+-------------+
|         Sweden|        35637|
|      Singapore|         5234|
|        Germany|       117448|
|            RSA|          352|
|         France|       110480|
|         Greece|         1556|
|        Belgium|        23152|
|        Finland|        10666|
|          Malta|          944|
|    Unspecified|         3300|
|          Italy|         7999|
|           EIRE|       142637|
|         Norway|        19247|
|          Spain|        26824|
|        Denmark|         8188|
|      Hong Kong|         4769|
|         Israel|         4353|
|        Iceland|         2458|
|Channel Islands|         9479|
|            USA|         1034|
+---------------+-------------+
only showing top 20 rows

+---------------+-------------+
|        Country|sum(Quantity)|
+---------------+-------------+
|         Sweden|        35637|
|      Singapore|         5234|
|        Germany|       117448|
|            R

9. Obtén la media de los precios unitarios de los productos vendidos agrupada por países

In [28]:
#DataFrame API
mi_df.groupBy("Country").avg("UnitPrice").show()

#SQL

spark.sql("SELECT Country, avg(UnitPrice) FROM retail GROUP BY Country").show()

                                                                                

+---------------+------------------+
|        Country|    avg(UnitPrice)|
+---------------+------------------+
|         Sweden| 3.910887445887447|
|      Singapore|109.64580786026204|
|        Germany| 3.966929963138558|
|            RSA| 4.277586206896552|
|         France| 5.028864087881328|
|         Greece| 4.885547945205478|
|        Belgium| 3.644335427742861|
|        Finland|  5.44870503597123|
|          Malta| 5.244173228346455|
|    Unspecified| 2.699573991031391|
|          Italy| 4.831120797011214|
|           EIRE| 5.911077354807337|
|         Norway| 6.012025782688754|
|          Spain| 4.987544413738618|
|        Denmark|3.2569408740359873|
|      Hong Kong| 42.50520833333331|
|         Israel| 3.633131313131315|
|        Iceland|2.6440109890109893|
|Channel Islands| 4.932124010554092|
|            USA|2.2164261168384876|
+---------------+------------------+
only showing top 20 rows

+---------------+------------------+
|        Country|    avg(UnitPrice)|
+-----------

10. Obtén el importe total (quantity * unit price) agrupado por número de factura (invoiceNo)

In [34]:
from pyspark.sql.functions import expr
#DataFrame API
mi_df.withColumn("total", expr("quantity * UnitPrice")).groupBy("InvoiceNo").sum("total").show()
#SQL
spark.sql("SELECT InvoiceNo, sum( quantity*UnitPrice)  FROM retail GROUP BY InvoiceNo").show()


+---------+------------------+
|InvoiceNo|        sum(total)|
+---------+------------------+
|   563020| 605.1400000000001|
|   565747|315.65000000000003|
|   566248|            140.96|
|   566431|            303.36|
|   567163|305.06999999999994|
|   567695|               0.0|
|   567879| 534.2799999999997|
|   568222|            185.85|
|   568711|            112.32|
|   569020|1238.5400000000004|
|   569560|318.44000000000005|
|   569823|219.13999999999996|
|   570234|328.53000000000003|
|   570264|               0.0|
|   570281| 676.3199999999999|
|   570592|           2299.61|
|   571010|               0.0|
|   571906|14.850000000000001|
|   572049| 414.2100000000001|
|   572458|            498.74|
+---------+------------------+
only showing top 20 rows

+---------+---------------------------+
|InvoiceNo|sum((quantity * UnitPrice))|
+---------+---------------------------+
|   563020|          605.1400000000001|
|   565747|         315.65000000000003|
|   566248|                   

11. Crea un DataFrame para cada archivo de notas (notas_fisica, notas_ingles y notas_matemáticas). Realiza un Join que genere un DataFrame mostrando las tres notas para cada alumno. (sólo mostrar los alumnos que tengan nota en las 3 asignaturas)

In [52]:
#DataFrame API
from pyspark.sql.functions import col
notas_fisica = spark.read.option("header",False).option("inferSchema",True).csv("hdfs:/user/jovyan/data/notas/notas_fisica.txt").toDF("alumno","nota_fisica")
notas_ingles = spark.read.option("header",False).option("inferSchema",True).csv("hdfs:/user/jovyan/data/notas/notas_ingles.txt").toDF("alumno","nota_ingles")
notas_matematicas = spark.read.option("header",False).option("inferSchema",True).csv("hdfs:/user/jovyan/data/notas/notas_mates.txt").toDF("alumno","nota_matematicas")


notas_join = notas_fisica.join(notas_matematicas, "alumno", "inner").join(notas_ingles, "alumno", "inner")
notas_join.show()
#.join(notas_ingles)
#notas_join.show()
#SQL
notas_fisica.createOrReplaceTempView("fisica")
notas_matematicas.createOrReplaceTempView("matematicas")
notas_ingles.createOrReplaceTempView("ingles")
spark.sql("SELECT fisica.alumno, nota_fisica, nota_matematicas, nota_ingles FROM fisica, matematicas, ingles WHERE fisica.alumno = matematicas.alumno and matematicas.alumno = ingles.alumno").show()



+---------+-----------+----------------+-----------+
|   alumno|nota_fisica|nota_matematicas|nota_ingles|
+---------+-----------+----------------+-----------+
|    Angel|          9|             6.0|          4|
|    Maria|          3|             2.0|          6|
|    Ramon|          7|             4.5|          8|
|    Jorge|          5|            10.0|          5|
|   Susana|          9|             9.0|          2|
|   Anabel|          2|             8.0|          7|
|    Rocio|          5|             6.0|          4|
|   Carlos|          4|             4.0|          8|
|    Rocio|          7|             6.0|          4|
|   Triana|          3|             3.0|          4|
|   Andres|          4|             4.0|          6|
| Fernando|          9|             5.0|          7|
| Leonardo|          6|             1.0|          4|
|    Oscar|          5|             7.0|          3|
|   Isabel|          8|             8.0|          7|
|Jose Juan|          3|             5.0|      

12. Crea un DataFrame para cada archivo de notas (notas_fisica, notas_ingles y notas_matemáticas). Realiza un Join que genere un DataFrame mostrando las tres notas para cada alumno. Si a un alumno le falta alguna de las notas aparecerá el valor NULL

In [56]:
#DataFrame API
notas_outer_join = notas_fisica.join(notas_matematicas, "alumno", "full_outer").join(notas_ingles, "alumno", "full_outer")
notas_outer_join.show()

#SQL
spark.sql("SELECT fisica.alumno, nota_fisica, nota_matematicas, nota_ingles FROM fisica FULL OUTER JOIN matematicas ON fisica.alumno = matematicas.alumno FULL OUTER JOIN ingles ON coalesce(fisica.alumno, matematicas.alumno) = ingles.alumno").show()



+---------+-----------+----------------+-----------+
|   alumno|nota_fisica|nota_matematicas|nota_ingles|
+---------+-----------+----------------+-----------+
|Alejandro|          3|             5.0|          7|
|   Anabel|          2|             8.0|          7|
|   Andres|          4|             4.0|          6|
|    Angel|          9|             6.0|          4|
|   Carlos|          4|             4.0|          8|
| Fernando|          9|             5.0|          7|
|   Isabel|          8|             8.0|          7|
|    Jorge|          5|            10.0|          5|
|Jose Juan|          3|             5.0|          3|
| Leonardo|          6|             1.0|          4|
|    Maria|          3|             2.0|          6|
|  Nicolas|          7|             2.0|          5|
|    Oscar|          5|             7.0|          3|
|    Pedro|          2|             5.0|       NULL|
|    Ramon|          7|             4.5|          8|
|    Rocio|          5|             6.0|      