In [269]:
import os

from psycopg2.extras import execute_values
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import array_contains, lit, split, when

In [268]:
# Start (or reuse) a Spark session running in local mode under the app name 'spark'.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName('spark')
    .getOrCreate()
)

UsageError: Line magic function `%%capture` not found.


In [242]:
# Load the title.basics TSV: tab-separated, first line is the header, and the
# two-character marker \N is read as null.
df = spark.read.csv(
    'data/title.basics.tsv',
    sep='\t',
    header=True,
    nullValue='\\N',
)

In [243]:
# Print the column names and types of the freshly loaded frame.
df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [244]:
# Preview the first five rows of the raw data.
df.show(5)

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|   null|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|   null|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|   null|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|   null|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|   null|             1|        Comedy

Limpeza dos Dados

In [245]:
# Keep only rows whose titleType is 'movie' and whose isAdult flag is 0, then
# project the columns used downstream.
# The predicates run BEFORE the select because titleType and isAdult are not
# among the projected columns; filtering after the projection only works
# through Spark's resolution of attributes missing from the child plan.
# NOTE(review): isAdult is loaded as a string and compared with the integer 0,
# which relies on Spark's implicit cast.
df = (
    df
    .filter(df.titleType == 'movie')
    .filter(df.isAdult == 0)
    .select('tconst', 'primaryTitle', 'startYear', 'runtimeMinutes', 'genres')
)

In [246]:
# Preview the filtered, projected frame.
df.show(5)

+---------+--------------------+---------+--------------+--------------------+
|   tconst|        primaryTitle|startYear|runtimeMinutes|              genres|
+---------+--------------------+---------+--------------+--------------------+
|tt0000009|          Miss Jerry|     1894|            45|             Romance|
|tt0000147|The Corbett-Fitzs...|     1897|           100|Documentary,News,...|
|tt0000502|            Bohemios|     1905|           100|                null|
|tt0000574|The Story of the ...|     1906|            70|Action,Adventure,...|
|tt0000591|    The Prodigal Son|     1907|            90|               Drama|
+---------+--------------------+---------+--------------+--------------------+
only showing top 5 rows



In [247]:
# Confirm which columns remain after the projection (all still strings).
df.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [248]:
# Split the cleaned frame in two: the movie attributes (everything except
# genres) and a tconst/genres pair that is handled separately.
movies = df.drop('genres')
genres = df.select(['tconst', 'genres'])

In [249]:
# startYear and runtimeMinutes were loaded as strings; replace them with
# integer versions of the same columns.
movies = (
    movies
    .withColumn('startYear', df['startYear'].cast('int'))
    .withColumn('runtimeMinutes', df['runtimeMinutes'].cast('int'))
)

In [250]:
# Verify that startYear and runtimeMinutes are now integers.
movies.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- startYear: integer (nullable = true)
 |-- runtimeMinutes: integer (nullable = true)



In [251]:
# Show the columns of the genres frame before the flag columns are added.
genres.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- genres: string (nullable = true)



In [252]:
def get_unique_genres(list_genres: list[Row]) -> list[str]:
    """Return every distinct genre name found in ``list_genres``.

    Each element is subscripted with the key ``'genres'``; the value is
    expected to be either ``None`` or a comma-separated string of genre names.

    Args:
        list_genres: Rows (or any objects supporting ``obj['genres']``).

    Returns:
        Genre names in order of first appearance, without duplicates.
        Elements whose ``'genres'`` value is ``None`` are skipped, and so are
        empty tokens (from an empty string or a doubled comma).
    """
    # A dict preserves insertion order and gives constant-time membership
    # checks, unlike scanning a list for every genre.
    seen: dict[str, None] = {}

    for row in list_genres:
        raw_genres = row['genres']

        if raw_genres is None:
            continue

        for genre in raw_genres.split(','):
            # An empty token is not a genre and must not become a flag column.
            if genre:
                seen.setdefault(genre, None)

    return list(seen)

In [253]:
# Collect only the distinct values of the genres column to the driver, then
# break them down into individual genre names.
list_genres = (
    genres
    .select('genres')
    .distinct()
    .collect()
)
list_unique_genres = get_unique_genres(list_genres)

In [254]:
# Add one 0/1 flag column per genre, named is<Genre>.
# The comma-separated string is split into an array so a genre matches only a
# whole token. A substring test would also set the flag when the genre name is
# merely part of a longer one (e.g. "Music" inside "Musical").
genre_tokens = split(df.genres, ',')

for genre in list_unique_genres:

    genres = genres.withColumn(
        f'is{genre}',
        # A null genres value gives a null condition, which falls through to otherwise(0).
        when(array_contains(genre_tokens, genre), lit(1)).otherwise(lit(0)),
    )

Armazenamento dos Dados

In [255]:
import psycopg2

In [256]:
# Connection settings passed to psycopg2.connect().
# Each value can be overridden through the matching libpq environment variable
# so real credentials never have to be written into the notebook; the
# fallbacks are the previous hardcoded (local development) values.
DATABASE_CONFIG = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'port': os.environ.get('PGPORT', '5432'),
    'database': os.environ.get('PGDATABASE', 'sistema-de-recomendacao'),
    'user': os.environ.get('PGUSER', 'flask'),
    'password': os.environ.get('PGPASSWORD', 'password'),
}

In [257]:
# Open the database connection from DATABASE_CONFIG and create the cursor
# used by the statements that follow.
conn = psycopg2.connect(**DATABASE_CONFIG)
cur = conn.cursor()

In [258]:
# SQL column definitions for the genre flag columns (the ones named is<Genre>).
# startswith() selects exactly that prefix; a substring test would also pick up
# any other column whose name merely contains "is". Hyphens are stripped from
# the name, matching what the INSERT column list does.
columns_genres = [
    f'{column.replace("-", "")} integer NOT NULL'
    for column in genres.columns
    if column.startswith('is')
]

In [259]:
# Recreate both target tables from scratch: drop them if they exist, then
# create `movie` (keyed by tconst) and `genres_movie` (identity id, tconst,
# the raw genres string, and one integer NOT NULL column per entry of
# columns_genres).
# NOTE(review): the genre column definitions are interpolated straight into
# the SQL text, and an empty columns_genres would leave a dangling comma.
cur.execute(f'''
            DROP TABLE IF EXISTS movie;
            DROP TABLE IF EXISTS genres_movie;
            
            CREATE TABLE movie(
                tconst varchar (50) NOT NULL, 
                primaryTitle text NOT NULL, 
                startYear integer, 
                runtimeMinutes integer,
                PRIMARY KEY (tconst)
            );
        
            CREATE TABLE genres_movie(
                id INT GENERATED ALWAYS AS IDENTITY,
                tconst varchar (50) NOT NULL,
                genres varchar(255),
                {', '.join(columns_genres)},
                PRIMARY KEY(id)
            );''')

In [260]:
# Bring every movie row to the driver as a plain tuple, and build one '%s'
# placeholder per row for a multi-row INSERT.
data = list(map(tuple, movies.collect()))
template = ','.join('%s' for _ in data)

In [261]:
# Comma-separated column list for the INSERT, in the DataFrame's column order.
columns = ','.join(movies.columns)

In [262]:
# Insert all movie rows.
# execute_values expands the single %s into pages of row tuples, so the rows
# are sent in batches instead of as one statement carrying a placeholder per
# row. It also issues no statement when `data` is empty, where a hand-built
# "VALUES" with nothing after it would be invalid SQL.
query = f'INSERT INTO movie ({columns}) VALUES %s'
execute_values(cur, query, data, page_size=1000)

In [263]:
# Bring every genres row to the driver as a plain tuple, and build one '%s'
# placeholder per row for a multi-row INSERT.
data = list(map(tuple, genres.collect()))
template = ','.join('%s' for _ in data)

In [264]:
# Column list for the genres_movie INSERT; hyphens are stripped so the names
# match the ones used when the table was created.
columns = ','.join(
    str(name).replace('-', '')
    for name in genres.columns
)

In [265]:
# Insert all genre rows.
# execute_values expands the single %s into pages of row tuples, so the rows
# are sent in batches instead of as one statement carrying a placeholder per
# row. It also issues no statement when `data` is empty, where a hand-built
# "VALUES" with nothing after it would be invalid SQL.
query = f'INSERT INTO genres_movie ({columns}) VALUES %s'
execute_values(cur, query, data, page_size=1000)

In [266]:
# Persist the table creation and both inserts, then release the cursor and the
# connection so the session does not stay open on the database server.
conn.commit()
cur.close()
conn.close()

In [None]:
# Shut down the Spark session now that the data has been exported.
spark.stop()