In [29]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("s8a-dataframes-actividades").getOrCreate()

# Load the people dataset from a JSON file (one record per person).
# The JSON reader infers column types from the data on its own; "inferSchema"
# is a CSV-reader option and is not a documented JSON option, so it is omitted.
df = spark.read.json("../nombres.json")

# Print the inferred column names and types as a quick sanity check.
df.printSchema()

root
 |-- Ciudad: string (nullable = true)
 |-- Edad: long (nullable = true)
 |-- Nombre: string (nullable = true)



In [30]:
# Display the loaded rows (show() prints at most the first 20 by default).
df.show()

+--------+----+------+
|  Ciudad|Edad|Nombre|
+--------+----+------+
|   Elche|  45| Aitor|
|Alicante|  23| Maria|
|   Elche|  19| Laura|
|    Aspe|  45| Sonia|
|   Elche|null| Pedro|
+--------+----+------+



In [31]:
# Boolean flag: is the person older than 30?
# Rows whose Edad is null get a null flag rather than false.
df = df.withColumn("mayor30", df["Edad"] > 30)
df.show()

+--------+----+------+-------+
|  Ciudad|Edad|Nombre|mayor30|
+--------+----+------+-------+
|   Elche|  45| Aitor|   true|
|Alicante|  23| Maria|  false|
|   Elche|  19| Laura|  false|
|    Aspe|  45| Sonia|   true|
|   Elche|null| Pedro|   null|
+--------+----+------+-------+



In [32]:
# Years remaining until retirement; 67 is presumably the retirement age
# (TODO confirm). A null Edad propagates as a null result.
df = df.withColumn("FaltanJubilacion", 67 - df["Edad"])
df.show()

+--------+----+------+-------+----------------+
|  Ciudad|Edad|Nombre|mayor30|FaltanJubilacion|
+--------+----+------+-------+----------------+
|   Elche|  45| Aitor|   true|              22|
|Alicante|  23| Maria|  false|              44|
|   Elche|  19| Laura|  false|              48|
|    Aspe|  45| Sonia|   true|              22|
|   Elche|null| Pedro|   null|            null|
+--------+----+------+-------+----------------+



In [33]:
from pyspark.sql.functions import lit
# Add a constant column: lit() wraps the Python string as a Column so every
# row receives the same placeholder value.
df = df.withColumn("apellidos", lit("XXX"))
df.show()

+--------+----+------+-------+----------------+---------+
|  Ciudad|Edad|Nombre|mayor30|FaltanJubilacion|apellidos|
+--------+----+------+-------+----------------+---------+
|   Elche|  45| Aitor|   true|              22|      XXX|
|Alicante|  23| Maria|  false|              44|      XXX|
|   Elche|  19| Laura|  false|              48|      XXX|
|    Aspe|  45| Sonia|   true|              22|      XXX|
|   Elche|null| Pedro|   null|            null|      XXX|
+--------+----+------+-------+----------------+---------+



In [34]:
# Remove the two helper columns added in the previous steps.
df = df.drop("mayor30").drop("apellidos")
df.show()

+--------+----+------+----------------+
|  Ciudad|Edad|Nombre|FaltanJubilacion|
+--------+----+------+----------------+
|   Elche|  45| Aitor|              22|
|Alicante|  23| Maria|              44|
|   Elche|  19| Laura|              48|
|    Aspe|  45| Sonia|              22|
|   Elche|null| Pedro|            null|
+--------+----+------+----------------+



In [35]:
from pyspark.sql.functions import year, current_date
# Approximate birth year as (current year - age). The current year is taken
# at execution time, so the values shown depend on when the notebook is run,
# and the estimate can be off by one for people whose birthday has not yet
# occurred this year.
df = df.withColumn("AnyoNac", year(current_date()) - df["Edad"])
df.show()

+--------+----+------+----------------+-------+
|  Ciudad|Edad|Nombre|FaltanJubilacion|AnyoNac|
+--------+----+------+----------------+-------+
|   Elche|  45| Aitor|              22|   1977|
|Alicante|  23| Maria|              44|   1999|
|   Elche|  19| Laura|              48|   2003|
|    Aspe|  45| Sonia|              22|   1977|
|   Elche|null| Pedro|            null|   null|
+--------+----+------+----------------+-------+



In [36]:
from pyspark.sql.functions import monotonically_increasing_id
# Attach a generated row identifier. monotonically_increasing_id() guarantees
# unique, increasing values, but NOT consecutive ones: the partition index is
# encoded in the upper bits, so gaps appear once the data spans more than one
# partition.
df = df.withColumn("Id", monotonically_increasing_id())
df.show()

+--------+----+------+----------------+-------+---+
|  Ciudad|Edad|Nombre|FaltanJubilacion|AnyoNac| Id|
+--------+----+------+----------------+-------+---+
|   Elche|  45| Aitor|              22|   1977|  0|
|Alicante|  23| Maria|              44|   1999|  1|
|   Elche|  19| Laura|              48|   2003|  2|
|    Aspe|  45| Sonia|              22|   1977|  3|
|   Elche|null| Pedro|            null|   null|  4|
+--------+----+------+----------------+-------+---+



In [37]:
# Reorder the columns for presentation: identifier first, city last.
df = df.select(
    [
        "Id",
        "Nombre",
        "Edad",
        "AnyoNac",
        "FaltanJubilacion",
        "Ciudad",
    ]
)
df.show()

+---+------+----+-------+----------------+--------+
| Id|Nombre|Edad|AnyoNac|FaltanJubilacion|  Ciudad|
+---+------+----+-------+----------------+--------+
|  0| Aitor|  45|   1977|              22|   Elche|
|  1| Maria|  23|   1999|              44|Alicante|
|  2| Laura|  19|   2003|              48|   Elche|
|  3| Sonia|  45|   1977|              22|    Aspe|
|  4| Pedro|null|   null|            null|   Elche|
+---+------+----+-------+----------------+--------+

