In [1]:
import os

from psycopg2.extras import execute_values
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) a local Spark session that runs on every available core.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('spark')
    .getOrCreate()
)

In [3]:
# Load the title catalogue (tab-separated, '\N' marks missing values) and keep
# only the ids of non-adult feature films.
movies = spark.read.csv('../../data/title.basics.tsv', sep='\t', header=True, nullValue='\\N')
# Filter BEFORE projecting: the predicates need titleType/isAdult, which no
# longer exist in the frame once it has been narrowed down to tconst.
movies = (
    movies
    .filter(movies.titleType == 'movie')
    .filter(movies.isAdult == 0)
    .select('tconst')
)

In [5]:
# Load the ratings file (tab-separated, with a header row; '\N' marks nulls).
ratings = spark.read.csv(
    '../../data/title.ratings.tsv',
    sep='\t',
    header=True,
    nullValue='\\N',
)

In [7]:
# Inspect the column names and types Spark assigned to the ratings file.
ratings.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: string (nullable = true)
 |-- numVotes: string (nullable = true)



In [9]:
# Preview the first five rows without truncating cell contents.
ratings.show(n=5, truncate=False)

+---------+-------------+--------+
|tconst   |averageRating|numVotes|
+---------+-------------+--------+
|tt0000001|5.7          |1953    |
|tt0000002|5.8          |264     |
|tt0000003|6.5          |1787    |
|tt0000004|5.6          |179     |
|tt0000005|6.2          |2587    |
+---------+-------------+--------+
only showing top 5 rows



Limpeza dos Dados

In [12]:
# Give the ratings columns numeric types and rename the key to movie_id so it
# can be told apart from movies.tconst in the join below.
# NOTE(review): 'float' is a 32-bit type; values such as 5.7 may come back from
# collect() as 5.699999809265137 and be stored that way in the numeric column.
# Consider 'double' or a decimal type - confirm before changing.
ratings = (
    ratings
    .withColumn('averageRating', ratings['averageRating'].cast('float'))
    .withColumn('numVotes', ratings['numVotes'].cast('int'))
    .withColumnRenamed('tconst', 'movie_id')
)


ratings.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- averageRating: float (nullable = true)
 |-- numVotes: integer (nullable = true)



In [13]:
# Keep only the ratings that belong to the selected movies (inner join on the
# title id) and project the three columns that get stored.
movies_ratings = (
    movies
    .join(ratings, on=(movies.tconst == ratings.movie_id), how='inner')
    .select('tconst', 'averageRating', 'numVotes')
)

movies_ratings.show(n=10, truncate=False)

+---------+-------------+--------+
|tconst   |averageRating|numVotes|
+---------+-------------+--------+
|tt0000630|2.8          |26      |
|tt0000675|4.2          |20      |
|tt0000862|4.4          |17      |
|tt0000941|4.5          |24      |
|tt0001112|3.8          |43      |
|tt0001348|4.0          |12      |
|tt0001531|4.6          |15      |
|tt0001790|6.2          |51      |
|tt0001812|5.5          |14      |
|tt0001911|3.6          |24      |
+---------+-------------+--------+
only showing top 10 rows



In [14]:
# Drop the names of the two input frames; only the joined frame is used below.
del movies, ratings

In [15]:
# Check the column types of the joined frame.
movies_ratings.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: float (nullable = true)
 |-- numVotes: integer (nullable = true)



Armazenamento dos Dados

In [16]:
import psycopg2

In [17]:
# Connection settings for the PostgreSQL database that receives the ratings.
# Each value can be overridden through the standard libpq environment variable
# so real credentials never have to be committed with the notebook; the
# fallbacks are the local development values.
DATABASE_CONFIG = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
    'database': os.environ.get('PGDATABASE', 'sistema-de-recomendacao'),
    'user': os.environ.get('PGUSER', 'flask'),
    'password': os.environ.get('PGPASSWORD', 'password'),
}

In [18]:
# Open the PostgreSQL connection described by DATABASE_CONFIG and a cursor for
# the statements that follow. psycopg2 starts a transaction implicitly, so
# nothing is persisted until conn.commit() is called.
conn = psycopg2.connect(**DATABASE_CONFIG)
cur = conn.cursor()

In [19]:
# Rebuild the target table from scratch on every run. A plain string is used
# (no f-prefix): nothing is interpolated, and literal braces in future SQL
# would otherwise be interpreted as placeholders.
# The unquoted identifiers averageRating/numVotes are folded to lowercase by
# PostgreSQL, which is why the unquoted DataFrame column names match later on.
cur.execute('''
            DROP TABLE IF EXISTS ratings_movie;
            
            CREATE TABLE ratings_movie(
                id INT GENERATED ALWAYS AS IDENTITY,
                tconst varchar (50) NOT NULL, 
                averageRating numeric NOT NULL,
                numVotes integer NOT NULL,
                PRIMARY KEY (id)
            );''')

In [20]:
# Pull every joined row to the driver as a plain tuple, and build one '%s'
# placeholder per row for the INSERT statement.
data = list(map(tuple, movies_ratings.collect()))
template = ','.join('%s' for _ in data)

In [21]:
# Comma-separated column list for the INSERT statement, taken from the
# DataFrame so it follows the same order as the values in each collected row.
columns = ','.join(movies_ratings.columns)

In [22]:
# Insert the collected rows in pages. execute_values expands the single %s
# into a multi-row VALUES list for each page, so the statement size stays
# bounded no matter how many rows there are, and an empty `data` list is a
# no-op instead of producing an invalid "VALUES" clause.
query = f'INSERT INTO ratings_movie ({columns}) VALUES %s'
execute_values(cur, query, data, page_size=1000)

In [23]:
# Persist the table creation and the inserted rows, then release the cursor
# and the connection so the database session is not left open.
conn.commit()
cur.close()
conn.close()

In [24]:
# Shut down the Spark session at the end of the notebook.
spark.stop()