In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

spark = SparkSession.builder.appName("s8a-dataframes-actividades").getOrCreate()

esquema = StructType([
    StructField("interprete", StringType(), False),
    StructField("pelicula", StringType(), False),
    StructField("anyo", IntegerType(), False)
])

dfm = spark.read.option("sep", "\t").schema(esquema).csv("../movies.tsv")
    
dfm.printSchema()


root
 |-- interprete: string (nullable = true)
 |-- pelicula: string (nullable = true)
 |-- anyo: integer (nullable = true)



In [8]:
dfm.show(10)

+-----------------+--------------------+----+
|       interprete|            pelicula|anyo|
+-----------------+--------------------+----+
|McClure, Marc (I)|       Freaky Friday|2003|
|McClure, Marc (I)|        Coach Carter|2005|
|McClure, Marc (I)|         Superman II|1980|
|McClure, Marc (I)|           Apollo 13|1995|
|McClure, Marc (I)|            Superman|1978|
|McClure, Marc (I)|  Back to the Future|1985|
|McClure, Marc (I)|Back to the Futur...|1990|
|Cooper, Chris (I)|  Me, Myself & Irene|2000|
|Cooper, Chris (I)|         October Sky|1999|
|Cooper, Chris (I)|              Capote|2005|
+-----------------+--------------------+----+
only showing top 10 rows



In [9]:
esquema = StructType([
    StructField("calificacion", DoubleType(), False),
    StructField("pelicula", StringType(), False),
    StructField("anyo", IntegerType(), False)
])

dfr = spark.read.option("sep", "\t").schema(esquema).csv("../movie-ratings.tsv")
    
dfr.printSchema()



root
 |-- calificacion: double (nullable = true)
 |-- pelicula: string (nullable = true)
 |-- anyo: integer (nullable = true)



In [10]:
dfr.show(10)

+------------+--------------------+----+
|calificacion|            pelicula|anyo|
+------------+--------------------+----+
|      1.6339|'Crocodile' Dunde...|1988|
|      7.6177|                  10|1979|
|      1.2864|10 Things I Hate ...|1999|
|      0.3243|           10,000 BC|2008|
|      0.3376|      101 Dalmatians|1996|
|      0.5218|      102 Dalmatians|2000|
|     12.8205|                1066|2012|
|      0.6829|                  12|2007|
|      7.4061|           12 Rounds|2009|
|      2.3677|           127 Hours|2010|
+------------+--------------------+----+
only showing top 10 rows



In [14]:
df = dfm.join(dfr, ["pelicula", "anyo"])

In [16]:
df.count()

                                                                                

31392

In [51]:
# Muestra para cada año, la película con mayor puntuación (año, título de la película, puntuación)
from pyspark.sql.functions import max 
df.groupBy("anyo").agg(max("calificacion").alias("calificacion"), collect_list("interprete")).join(dfr, ["anyo", "calificacion"]).orderBy("anyo").show()

+----+------------+------------------------+--------------------+
|anyo|calificacion|collect_list(interprete)|            pelicula|
+----+------------+------------------------+--------------------+
|1961|      0.6726|    [Wickes, Mary, Wr...|One Hundred and O...|
|1967|      1.3485|    [Howard, Clint, W...|     The Jungle Book|
|1972|      0.5099|    [Brando, Marlon, ...|       The Godfather|
|1973|      0.6581|    [Burstyn, Ellen, ...|        The Exorcist|
|1975|       0.701|    [Grossman, Ted (I...|                Jaws|
|1977|      1.2184|    [Balaban, Bob, Pr...|Saturday Night Fever|
|1978|      1.9793|    [McClure, Marc (I...|              Jaws 2|
|1979|      1.9906|    [Brando, Marlon, ...|      Apocalypse Now|
|1980|      0.8739|    [McClure, Marc (I...|         Superman II|
|1981|      2.1052|    [Balaban, Bob, Ra...|   Absence of Malice|
|1982|      1.2501|    [Kogan, Milt, Lov...|         First Blood|
|1983|      1.4011|    [Johnson, Mike (V...|               Yentl|
|1984|    

In [50]:
from pyspark.sql.functions import collect_list, first 
# first no funciona pq coge la primera película del anyo ... si agrupasemos por anyo y max calificación sí que funcionaría
df.groupBy("anyo").agg(max("calificacion").alias("calificacion"), first("pelicula"), collect_list("interprete")).show()

+----+------------+--------------------+------------------------+
|anyo|calificacion|     first(pelicula)|collect_list(interprete)|
+----+------------+--------------------+------------------------+
|1990|      2.0055|Back to the Futur...|    [McClure, Marc (I...|
|1975|       0.701|                Jaws|    [Grossman, Ted (I...|
|1977|      1.2184|Close Encounters ...|    [Balaban, Bob, Pr...|
|2003|     14.1829|       Freaky Friday|    [McClure, Marc (I...|
|2007|      2.0822|             Beowulf|    [Jolie, Angelina,...|
|2006|      2.0503|   The Good Shepherd|    [Jolie, Angelina,...|
|1978|      1.9793|            Superman|    [McClure, Marc (I...|
|1961|      0.6726|One Hundred and O...|    [Wickes, Mary, Wr...|
|1997|      2.0878|  As Good as It Gets|    [Knight, Shirley ...|
|1988|      1.7632|Who Framed Roger ...|    [Lander, David L....|
|1994|      2.0346|D2: The Mighty Ducks|    [Butters, Mike, M...|
|1973|      0.6581|        The Exorcist|    [Burstyn, Ellen, ...|
|1979|    

In [40]:
df.where((df.anyo == 1990) & (df.calificacion == 2.0055)).select("pelicula").distinct().show()

+-----------------+
|         pelicula|
+-----------------+
|Presumed Innocent|
+-----------------+



In [44]:
df.where((df.anyo == 1990) & (df.pelicula == "Back to the Future Part III")).show()

+--------------------+----+--------------------+------------+
|            pelicula|anyo|          interprete|calificacion|
+--------------------+----+--------------------+------------+
|Back to the Futur...|1990|   McClure, Marc (I)|       1.678|
|Back to the Futur...|1990|            Flea (I)|       1.678|
|Back to the Futur...|1990|   Steenburgen, Mary|       1.678|
|Back to the Futur...|1990|  Sperber, Wendie Jo|       1.678|
|Back to the Futur...|1990|   Thompson, Lea (I)|       1.678|
|Back to the Futur...|1990|   Logan, Ricky Dean|       1.678|
|Back to the Futur...|1990|  McKinney, Bill (I)|       1.678|
|Back to the Futur...|1990|     Shue, Elisabeth|       1.678|
|Back to the Futur...|1990|  Wynne, Christopher|       1.678|
|Back to the Futur...|1990| Fox, Michael J. (I)|       1.678|
|Back to the Futur...|1990|       Tolkan, James|       1.678|
|Back to the Futur...|1990|        Cundey, Dean|       1.678|
|Back to the Futur...|1990|         Cohen, J.J.|       1.678|
|Back to

In [92]:
# Averigua las tres parejas de intérpretes han trabajado juntos en más ocasiones.
# La salida debe tener tres columnas: `interprete1`, `interprete2` y `cantidad`. (necesitas utilizar un *self-join*)


from pyspark.sql.functions import col, count, least, greatest

# le indicamos alias a los campos para eliminar ambigüedades
df4 = (df.alias("a")).join((df.alias("b")), (col("a.anyo") == col("b.anyo")) & (col("a.pelicula") == col("b.pelicula")), "inner")
df4 = df4.filter(col("a.interprete") != col("b.interprete"))
df4 = df4.groupBy(least(col("a.interprete"), col("b.interprete")).alias("interprete1"), greatest(col("a.interprete"), col("b.interprete")).alias("interprete2")).agg(count("*").alias("cantidad")).orderBy("cantidad", ascending=False)
df4.show()


[Stage 210:>                                                        (0 + 1) / 1]

+-----------------+------------------+--------+
|      interprete1|       interprete2|cantidad|
+-----------------+------------------+--------+
| Lynn, Sherry (I)|   McGowan, Mickie|      46|
|  Bergen, Bob (I)|  Lynn, Sherry (I)|      38|
|  Bergen, Bob (I)|   McGowan, Mickie|      38|
|  Angel, Jack (I)|  Lynn, Sherry (I)|      34|
|  Angel, Jack (I)|   McGowan, Mickie|      34|
| Lynn, Sherry (I)|       Rabson, Jan|      32|
|  McGowan, Mickie|       Rabson, Jan|      32|
|Darling, Jennifer|   McGowan, Mickie|      30|
|Sandler, Adam (I)|Schneider, Rob (I)|      28|
|  Bergen, Bob (I)|     Harnell, Jess|      28|
|  Bergen, Bob (I)|       Rabson, Jan|      28|
|Darling, Jennifer|  Lynn, Sherry (I)|      28|
| Farmer, Bill (I)|   McGowan, Mickie|      28|
|    Harnell, Jess|   McGowan, Mickie|      28|
|  Bergen, Bob (I)|   Bumpass, Rodger|      26|
|    Harnell, Jess|  Lynn, Sherry (I)|      26|
|  Angel, Jack (I)|   Bergen, Bob (I)|      26|
| Farmer, Bill (I)|  Lynn, Sherry (I)|  


                                                                                

In [96]:
# Averigua las tres parejas de intérpretes han trabajado juntos en más ocasiones.
# La salida debe tener tres columnas: `interprete1`, `interprete2` y `cantidad`. (necesitas utilizar un *self-join*)


from pyspark.sql.functions import col, count, least, greatest

# le indicamos alias a los campos para eliminar ambigüedades
df4 = (df.alias("a")).join((df.alias("b")), (col("a.anyo") == col("b.anyo")) & (col("a.pelicula") == col("b.pelicula")), "inner")
df4 = df4.filter(col("a.interprete") != col("b.interprete"))
df4 = df4.groupBy(col("a.interprete").alias("interprete1"), col("b.interprete").alias("interprete2")).agg(count("*").alias("cantidad")).orderBy("cantidad", ascending=False)
df4.show()


[Stage 246:>                                                        (0 + 1) / 1]

+------------------+-----------------+--------+
|       interprete1|      interprete2|cantidad|
+------------------+-----------------+--------+
|   McGowan, Mickie| Lynn, Sherry (I)|      23|
|  Lynn, Sherry (I)|  McGowan, Mickie|      23|
|  Lynn, Sherry (I)|  Bergen, Bob (I)|      19|
|   Bergen, Bob (I)| Lynn, Sherry (I)|      19|
|   Bergen, Bob (I)|  McGowan, Mickie|      19|
|   McGowan, Mickie|  Bergen, Bob (I)|      19|
|  Lynn, Sherry (I)|  Angel, Jack (I)|      17|
|   Angel, Jack (I)| Lynn, Sherry (I)|      17|
|   Angel, Jack (I)|  McGowan, Mickie|      17|
|   McGowan, Mickie|  Angel, Jack (I)|      17|
|       Rabson, Jan| Lynn, Sherry (I)|      16|
|       Rabson, Jan|  McGowan, Mickie|      16|
|   McGowan, Mickie|      Rabson, Jan|      16|
|  Lynn, Sherry (I)|      Rabson, Jan|      16|
| Darling, Jennifer|  McGowan, Mickie|      15|
|   McGowan, Mickie|Darling, Jennifer|      15|
|Schneider, Rob (I)|Sandler, Adam (I)|      14|
|     Harnell, Jess|  Bergen, Bob (I)|  


[Stage 249:>                                                        (0 + 4) / 4]

                                                                                