# Analisi di 28 milioni di recensioni di film
# Procuriamoci il Dataset

In [3]:
!wget http://files.grouplens.org/datasets/movielens/ml-latest.zip

--2020-03-14 17:17:04--  http://files.grouplens.org/datasets/movielens/ml-latest.zip
Risoluzione di files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connessione a files.grouplens.org (files.grouplens.org)|128.101.65.152|:80... connesso.
Richiesta HTTP inviata, in attesa di risposta... 200 OK
Lunghezza: 277113433 (264M) [application/zip]
Salvataggio in: "ml-latest.zip.1"


2020-03-14 17:18:43 (2,66 MB/s) - "ml-latest.zip.1" salvato [277113433/277113433]



# Inizializziamo Spark

In [4]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook: getOrCreate() hands back an
# already running session if there is one, otherwise it starts a new one
# registered under the application name "movie_reviews".
spark = (
    SparkSession.builder
    .appName("movie_reviews")
    .getOrCreate()
)

# Importiamo il Dataset in un Dataframe

In [6]:
# First, naive read of the ratings file using only the CSV format and no other
# options. As the output below shows, the header line ends up as a data row and
# the columns get the generic names _c0.._c3.
df = spark.read.format("csv").load("ml-latest/ratings.csv")
df.show()

+------+-------+------+----------+
|   _c0|    _c1|   _c2|       _c3|
+------+-------+------+----------+
|userId|movieId|rating| timestamp|
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
|     1|   1590|   2.5|1256677236|
|     1|   1591|   1.5|1256677475|
|     1|   2134|   4.5|1256677464|
|     1|   2478|   4.0|1256677239|
|     1|   2840|   3.0|1256677500|
|     1|   2986|   2.5|1256677496|
|     1|   3020|   4.0|1256677260|
|     1|   3424|   4.5|1256677444|
|     1|   3698|   3.5|1256677243|
|     1|   3826|   2.0|1256677210|
|     1|   3893|   3.5|1256677486|
|     2|    170|   3.5|1192913581|
|     2|    849|   3.5|1192913537|
|     2|   1186|   3.5|1192913611|
+------+-------+------+----------+
only showing top 20 rows



In [7]:
# Inspect the column types of the naive read: every column comes back as string.
df.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)
 |-- _c3: string (nullable = true)



In [9]:
# Read the ratings again, this time treating the first line as the header and
# letting Spark infer each column's type from the data.
df = spark.read.csv("ml-latest/ratings.csv", header=True, inferSchema=True)
df.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
|     1|   1590|   2.5|1256677236|
|     1|   1591|   1.5|1256677475|
|     1|   2134|   4.5|1256677464|
|     1|   2478|   4.0|1256677239|
|     1|   2840|   3.0|1256677500|
|     1|   2986|   2.5|1256677496|
|     1|   3020|   4.0|1256677260|
|     1|   3424|   4.5|1256677444|
|     1|   3698|   3.5|1256677243|
|     1|   3826|   2.0|1256677210|
|     1|   3893|   3.5|1256677486|
|     2|    170|   3.5|1192913581|
|     2|    849|   3.5|1192913537|
|     2|   1186|   3.5|1192913611|
|     2|   1235|   3.0|1192913585|
+------+-------+------+----------+
only showing top 20 rows



In [10]:
# Inspect the inferred types: ids and timestamp are integers, rating is a double.
df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



# Correggiamo lo schema

In [13]:
from pyspark.sql.types import *

# Hand-written schema for ratings.csv: the two ids are kept as strings, the
# rating is a float and the raw timestamp an integer. Every column is nullable.
data_schema = [
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ('userID', StringType()),
        ('movieID', StringType()),
        ('rating', FloatType()),
        ('timestamp', IntegerType()),
    )
]

schema = StructType(fields=data_schema)

In [15]:
# Load the ratings with the explicit schema defined above instead of inferring
# one: the header line is still skipped, and type inference is switched off.
df = spark.read.csv(
    "ml-latest/ratings.csv",
    schema=schema,
    header=True,
    inferSchema=False,
)

df.show()

+------+-------+------+----------+
|userID|movieID|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|1256677221|
|     1|    481|   3.5|1256677456|
|     1|   1091|   1.5|1256677471|
|     1|   1257|   4.5|1256677460|
|     1|   1449|   4.5|1256677264|
|     1|   1590|   2.5|1256677236|
|     1|   1591|   1.5|1256677475|
|     1|   2134|   4.5|1256677464|
|     1|   2478|   4.0|1256677239|
|     1|   2840|   3.0|1256677500|
|     1|   2986|   2.5|1256677496|
|     1|   3020|   4.0|1256677260|
|     1|   3424|   4.5|1256677444|
|     1|   3698|   3.5|1256677243|
|     1|   3826|   2.0|1256677210|
|     1|   3893|   3.5|1256677486|
|     2|    170|   3.5|1192913581|
|     2|    849|   3.5|1192913537|
|     2|   1186|   3.5|1192913611|
|     2|   1235|   3.0|1192913585|
+------+-------+------+----------+
only showing top 20 rows



In [16]:
# Confirm the explicit schema was applied: string ids, float rating, integer timestamp.
df.printSchema()

root
 |-- userID: string (nullable = true)
 |-- movieID: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: integer (nullable = true)



In [20]:
from pyspark.sql.functions import  from_unixtime , to_date

# Preview only (df itself is not reassigned): render the epoch-seconds
# timestamp as a calendar date by formatting it with from_unixtime and then
# truncating it to a date with to_date.
(
    df
    .withColumn('timestamp', to_date(from_unixtime(df["timestamp"])))
    .show()
)

+------+-------+------+----------+
|userID|movieID|rating| timestamp|
+------+-------+------+----------+
|     1|    307|   3.5|2009-10-27|
|     1|    481|   3.5|2009-10-27|
|     1|   1091|   1.5|2009-10-27|
|     1|   1257|   4.5|2009-10-27|
|     1|   1449|   4.5|2009-10-27|
|     1|   1590|   2.5|2009-10-27|
|     1|   1591|   1.5|2009-10-27|
|     1|   2134|   4.5|2009-10-27|
|     1|   2478|   4.0|2009-10-27|
|     1|   2840|   3.0|2009-10-27|
|     1|   2986|   2.5|2009-10-27|
|     1|   3020|   4.0|2009-10-27|
|     1|   3424|   4.5|2009-10-27|
|     1|   3698|   3.5|2009-10-27|
|     1|   3826|   2.0|2009-10-27|
|     1|   3893|   3.5|2009-10-27|
|     2|    170|   3.5|2007-10-20|
|     2|    849|   3.5|2007-10-20|
|     2|   1186|   3.5|2007-10-20|
|     2|   1235|   3.0|2007-10-20|
+------+-------+------+----------+
only showing top 20 rows



In [21]:
from pyspark.sql.functions import to_utc_timestamp

# Turn the epoch-seconds integer column into a real timestamp column.
#
# The previous version called to_utc_timestamp(..., "yyyy-MM-dd hh:mm:ss"), but
# the second argument of to_utc_timestamp is a time-zone ID, not a format
# pattern. NOTE(review): older Spark releases presumably treated the unknown
# zone as GMT, making the call a no-op, while stricter versions are expected to
# reject the invalid zone ID - confirm against the target Spark version.
#
# Casting an integral column to "timestamp" interprets the value as seconds
# since the Unix epoch, so it yields the same instants without the detour
# through a formatted string. Values are displayed in the session time zone,
# as before. Re-running the cell is harmless: casting a timestamp column to
# timestamp leaves it unchanged.
df = df.withColumn('timestamp', df["timestamp"].cast("timestamp"))
df.show(5)

+------+-------+------+-------------------+
|userID|movieID|rating|          timestamp|
+------+-------+------+-------------------+
|     1|    307|   3.5|2009-10-27 22:00:21|
|     1|    481|   3.5|2009-10-27 22:04:16|
|     1|   1091|   1.5|2009-10-27 22:04:31|
|     1|   1257|   4.5|2009-10-27 22:04:20|
|     1|   1449|   4.5|2009-10-27 22:01:04|
+------+-------+------+-------------------+
only showing top 5 rows



In [22]:
# Verify that the timestamp column now has the timestamp type rather than integer.
df.printSchema()

root
 |-- userID: string (nullable = true)
 |-- movieID: string (nullable = true)
 |-- rating: float (nullable = true)
 |-- timestamp: timestamp (nullable = true)

