In [1]:
import os

from psycopg2.extras import execute_values
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('spark')
    .getOrCreate()
)

In [3]:
# Read the tab-separated title.basics file; the literal \N marker is read as null.
movies_raw = spark.read.csv('data/title.basics.tsv', sep='\t', header=True, nullValue='\\N')

# Keep the ids of non-adult titles of type 'movie'.
# The filters run BEFORE the projection so they reference columns that still
# exist in the frame, instead of relying on Spark re-resolving columns that
# select('tconst') has already dropped. Reading into `movies_raw` also keeps
# this cell re-runnable, since the raw frame is never overwritten.
movies = (
    movies_raw
    .filter(movies_raw.titleType == 'movie')
    .filter(movies_raw.isAdult == 0)
    .select('tconst')
)

In [4]:
# Read the tab-separated title.akas file; the literal \N marker is read as null.
akas = spark.read.csv(
    'data/title.akas.tsv',
    sep='\t',
    header=True,
    nullValue='\\N',
)

In [5]:
# Print the column names and types Spark assigned to the akas frame.
akas.printSchema()

root
 |-- titleId: string (nullable = true)
 |-- ordering: string (nullable = true)
 |-- title: string (nullable = true)
 |-- region: string (nullable = true)
 |-- language: string (nullable = true)
 |-- types: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- isOriginalTitle: string (nullable = true)



In [6]:
# Preview the first five rows without truncating long cell values.
akas.show(n=5, truncate=False)

+---------+--------+-------------------------+------+--------+-----------+-------------+---------------+
|titleId  |ordering|title                    |region|language|types      |attributes   |isOriginalTitle|
+---------+--------+-------------------------+------+--------+-----------+-------------+---------------+
|tt0000001|1       |Карменсіта               |UA    |null    |imdbDisplay|null         |0              |
|tt0000001|2       |Carmencita               |DE    |null    |null       |literal title|0              |
|tt0000001|3       |Carmencita - spanyol tánc|HU    |null    |imdbDisplay|null         |0              |
|tt0000001|4       |Καρμενσίτα               |GR    |null    |imdbDisplay|null         |0              |
|tt0000001|5       |Карменсита               |RU    |null    |imdbDisplay|null         |0              |
+---------+--------+-------------------------+------+--------+-----------+-------------+---------------+
only showing top 5 rows



Limpeza dos Dados

In [7]:
# Attach every akas row to the selected movies (inner join on the title id)
# and keep only the columns that are stored later in the notebook.
movies_akas = (
    movies
    .join(akas, on=movies.tconst == akas.titleId, how='inner')
    .select('tconst', 'title', 'region', 'language', 'isOriginalTitle')
)

# Preview the first ten joined rows without truncating long cell values.
movies_akas.show(n=10, truncate=False)

+---------+------------------------+------+--------+---------------+
|tconst   |title                   |region|language|isOriginalTitle|
+---------+------------------------+------+--------+---------------+
|tt0000862|Faldgruben              |null  |null    |1              |
|tt0000862|Faldgruben              |DK    |null    |0              |
|tt0000941|Locura de amor          |null  |null    |1              |
|tt0000941|Love Crazy              |XWW   |en      |0              |
|tt0000941|Locura de amor          |ES    |null    |0              |
|tt0001051|El sueño milagroso      |ES    |null    |0              |
|tt0001051|El sueño milagroso      |null  |null    |1              |
|tt0001051|Magical Dream           |XWW   |en      |0              |
|tt0001059|O Terremoto de Benavente|null  |null    |1              |
|tt0001059|O Terremoto de Benavente|PT    |null    |0              |
+---------+------------------------+------+--------+---------------+
only showing top 10 rows



In [8]:
# Remove the two input frames from the notebook namespace; only the joined
# movies_akas frame is used from here on.
del movies, akas

In [9]:
# Print the column names and types of the joined frame.
movies_akas.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- title: string (nullable = true)
 |-- region: string (nullable = true)
 |-- language: string (nullable = true)
 |-- isOriginalTitle: string (nullable = true)



Armazenamento dos Dados

In [11]:
import psycopg2

In [12]:
# PostgreSQL connection settings, passed as keyword arguments to psycopg2.connect.
# Each value is read from the matching libpq-style environment variable so that
# credentials do not have to live in the notebook; the fallbacks are the previous
# hardcoded values and are meant for local development only.
DATABASE_CONFIG = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
    'database': os.environ.get('PGDATABASE', 'sistema-de-recomendacao'),
    'user': os.environ.get('PGUSER', 'flask'),
    'password': os.environ.get('PGPASSWORD', 'password'),
}

In [18]:
# Open a PostgreSQL connection with the settings from DATABASE_CONFIG and
# create the cursor used by the following cells to run SQL statements.
conn = psycopg2.connect(**DATABASE_CONFIG)
cur = conn.cursor()

In [19]:
# Recreate the destination table from scratch: any existing akas_movie table is
# dropped first, so re-running this cell discards previously stored rows.
# NOTE(review): tconst and title are NOT NULL, while the TSV reader maps \N to
# null -- confirm the joined data never has a null title before inserting.
create_akas_movie_sql = '''
            DROP TABLE IF EXISTS akas_movie;
            
            CREATE TABLE akas_movie(
                id INT GENERATED ALWAYS AS IDENTITY,
                tconst varchar (50) NOT NULL, 
                title text NOT NULL,
                region varchar (50), 
                language varchar (50),
                isOriginalTitle integer, 
                PRIMARY KEY (id)
            );'''
cur.execute(create_akas_movie_sql)

In [15]:
# Collect the whole joined frame to the driver and convert each Row into a
# plain tuple so it can be passed as a query argument.
data = list(map(tuple, movies_akas.collect()))
# One '%s' placeholder per collected row, separated by commas.
template = ','.join('%s' for _ in data)

In [16]:
# Comma-separated column names, in DataFrame order, for the INSERT column list.
columns = ','.join(movies_akas.columns)

In [20]:
# Bulk-insert the collected rows into akas_movie.
# execute_values expands the single %s into pages of row tuples, so the rows are
# sent in bounded batches instead of one statement containing every row, and an
# empty `data` list results in no statement at all (the previous hand-built
# "VALUES " with zero placeholders was invalid SQL).
# `columns` comes from the DataFrame's own column names, not from user input,
# so interpolating it into the statement is safe here.
query = f'INSERT INTO akas_movie ({columns}) VALUES %s'
execute_values(cur, query, data, page_size=1000)

In [21]:
# Persist the table creation and the inserted rows, then release the database
# resources; the cursor and connection are closed even if the commit fails so
# the notebook does not leave an open session on the server.
try:
    conn.commit()
finally:
    cur.close()
    conn.close()

In [22]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()