In [7]:
from pyspark.sql import SparkSession

# Obtain the Spark session for this notebook (created if none is running yet).
spark = (
    SparkSession.builder
    .appName("s8a-dataframes-actividades")
    .getOrCreate()
)

# First pass over the file: tab-separated, column types inferred by Spark.
# No header option is set, so columns get the default names (_c0, _c1, ...).
reader = spark.read.option("sep", "\t").option("inferSchema", "true")
df = reader.csv("../movies.tsv")

df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: integer (nullable = true)



In [8]:
# Preview the first 10 rows of the DataFrame.
df.show(n=10)

+-----------------+--------------------+----+
|              _c0|                 _c1| _c2|
+-----------------+--------------------+----+
|McClure, Marc (I)|       Freaky Friday|2003|
|McClure, Marc (I)|        Coach Carter|2005|
|McClure, Marc (I)|         Superman II|1980|
|McClure, Marc (I)|           Apollo 13|1995|
|McClure, Marc (I)|            Superman|1978|
|McClure, Marc (I)|  Back to the Future|1985|
|McClure, Marc (I)|Back to the Futur...|1990|
|Cooper, Chris (I)|  Me, Myself & Irene|2000|
|Cooper, Chris (I)|         October Sky|1999|
|Cooper, Chris (I)|              Capote|2005|
+-----------------+--------------------+----+
only showing top 10 rows



In [45]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Explicit schema for movies.tsv: performer, film title, release year.
# NOTE: the fields are declared non-nullable here, but the schema printed
# after the CSV read below reports every column as nullable = true, so the
# False flags are not reflected in the loaded DataFrame.
esquema = (
    StructType()
    .add("interprete", StringType(), nullable=False)
    .add("pelicula", StringType(), nullable=False)
    .add("anyo", IntegerType(), nullable=False)
)

In [46]:
# Read the tab-separated file applying the explicit schema `esquema`.
# The separator is a tab ("\t"), not ";", and no header option is set,
# so the first line of the file is treated as data.
df = spark.read.option("sep", "\t").schema(esquema).csv("../movies.tsv")

df.printSchema()

root
 |-- interprete: string (nullable = true)
 |-- pelicula: string (nullable = true)
 |-- anyo: integer (nullable = true)



In [47]:
# Preview the first 10 rows of the DataFrame.
df.show(n=10)

+-----------------+--------------------+----+
|       interprete|            pelicula|anyo|
+-----------------+--------------------+----+
|McClure, Marc (I)|       Freaky Friday|2003|
|McClure, Marc (I)|        Coach Carter|2005|
|McClure, Marc (I)|         Superman II|1980|
|McClure, Marc (I)|           Apollo 13|1995|
|McClure, Marc (I)|            Superman|1978|
|McClure, Marc (I)|  Back to the Future|1985|
|McClure, Marc (I)|Back to the Futur...|1990|
|Cooper, Chris (I)|  Me, Myself & Irene|2000|
|Cooper, Chris (I)|         October Sky|1999|
|Cooper, Chris (I)|              Capote|2005|
+-----------------+--------------------+----+
only showing top 10 rows



In [48]:
# Basic statistics (count, mean, stddev, min, max) for every column.
# NOTE(review): the mean/stddev reported for `pelicula` presumably come from
# titles that parse as numbers — confirm before relying on them.
summary_df = df.describe()
summary_df.show()


[Stage 71:>                                                         (0 + 1) / 1]

+-------+------------------+--------------------+------------------+
|summary|        interprete|            pelicula|              anyo|
+-------+------------------+--------------------+------------------+
|  count|             31393|               31393|             31393|
|   mean|              null|  312.61538461538464|2002.7964514382188|
| stddev|              null|   485.7043414390151| 6.377135379933117|
|    min|   Aaron, Caroline|'Crocodile' Dunde...|              1961|
|    max|von Sydow, Max (I)|                 xXx|              2012|
+-------+------------------+--------------------+------------------+




                                                                                

In [49]:
from pyspark.sql.functions import count_distinct
# Number of different film titles in the dataset.
df.agg(count_distinct("pelicula")).show()

+------------------------+
|count(DISTINCT pelicula)|
+------------------------+
|                    1409|
+------------------------+



In [51]:
# Performers of "Superman" and of "Superman II", one single-column
# DataFrame (interprete) per film.
dfS1 = df.filter(df["pelicula"] == "Superman").select("interprete")
dfS2 = df.filter(df["pelicula"] == "Superman II").select("interprete")

In [52]:
# Performers that appear in both Superman and Superman II:
# keep only the rows of dfS1 that are also present in dfS2.
dfS1_2 = dfS1.intersect(dfS2)
dfS1_2.show()

+------------------+
|        interprete|
+------------------+
|  O'Halloran, Jack|
|   Tucker, Burnell|
|  Hollis, John (I)|
|       Beatty, Ned|
|    Stamp, Terence|
|Ratzenberger, John|
|     Hackman, Gene|
|    Fielder, Harry|
|  Perrine, Valerie|
| McClure, Marc (I)|
|   Donner, Richard|
+------------------+



In [59]:
# `count` must be imported explicitly: the original cell only imported `col`
# and relied on a name left over in the kernel, which fails on a fresh run.
from pyspark.sql.functions import col, count

# Performers with at least 20 rows (film credits) in the dataset.
# show() prints only the first 20 matching rows by default.
(
    df.groupBy("interprete")
    .agg(count("pelicula").alias("cantidad"))
    .filter(col("cantidad") >= 20)
    .show()
)

+-------------------+--------+
|         interprete|cantidad|
+-------------------+--------+
|Williams, Robin (I)|      22|
|  Murphy, Eddie (I)|      21|
|      Harnell, Jess|      31|
|   Wilson, Owen (I)|      23|
|        Damon, Matt|      27|
|     Travolta, John|      20|
|       Stiller, Ben|      21|
|      Diaz, Cameron|      21|
|      Willis, Bruce|      27|
|   Tatasciore, Fred|      38|
|      Simmons, J.K.|      20|
|Freeman, Morgan (I)|      22|
|         Hanks, Tom|      25|
|    Hoffman, Dustin|      21|
|   Farmer, Bill (I)|      20|
|        Cruise, Tom|      23|
| Jackson, Samuel L.|      32|
|      Welker, Frank|      38|
|    Newman, Laraine|      20|
|    Clooney, George|      20|
+-------------------+--------+
only showing top 20 rows



In [60]:
from pyspark.sql.functions import col

# Number of rows (film credits) recorded for one specific performer.
credits_per_performer = df.groupBy("interprete").count()
credits_per_performer.filter(col("interprete") == "Murphy, Eddie (I)").show()

+-----------------+-----+
|       interprete|count|
+-----------------+-----+
|Murphy, Eddie (I)|   21|
+-----------------+-----+



In [66]:
# Rows per film, restricted to entries whose year is earlier than 1980.
before_1980 = df.where(df["anyo"] < 1980)
before_1980.groupBy("pelicula").count().show()

+--------------------+-----+
|            pelicula|count|
+--------------------+-----+
|Close Encounters ...|    9|
|       The Godfather|   12|
|One Hundred and O...|    2|
|        The Exorcist|    5|
|      Apocalypse Now|   13|
|     The Jungle Book|    2|
|                Jaws|    5|
|   Kramer vs. Kramer|    6|
|            Superman|   24|
|              Jaws 2|    1|
|           Star Wars|   25|
|      Operación Ogro|    1|
|Saturday Night Fever|    6|
|              Grease|    5|
|           Moonraker|   11|
|               Alien|    6|
+--------------------+-----+



In [70]:
from pyspark.sql.functions import col

# The five years with the most rows in the dataset, largest first.
rows_per_year = df.groupBy("anyo").count()
rows_per_year.orderBy(col("count").desc()).show(5)

+----+-----+
|anyo|count|
+----+-----+
|2006| 2078|
|2004| 2005|
|2007| 1986|
|2005| 1960|
|2011| 1926|
+----+-----+
only showing top 5 rows

