In [None]:
import os
import pandas as pd

# Switch the working directory to the project checkout before importing from
# `src`. The relative "data/sample/..." paths used later in this notebook are
# resolved against this directory as well.
# NOTE(review): presumably the chdir is what makes the `src` package importable
# from the notebook kernel — confirm.
os.chdir(os.path.expanduser("~/work/MLOps"))
from src.utils.s3 import s3  # NOTE(review): not referenced in the cells visible here
from src.utils.db import PostgreSQLDatabase

In [None]:
# The helper methods are optional: the object yielded by PostgreSQLDatabase()
# exposes a `cursor`, so a raw SQL statement can be executed on it directly.
with PostgreSQLDatabase() as db:
    db.cursor.execute(
        "SELECT * FROM movies WHERE movie_id = 'tt0029583';"
    )
    # Fetch every row produced by the statement above and show them as-is.
    query_results = db.cursor.fetchall()
    print(query_results)

# Summaries

In [None]:
# Number of rows currently stored in the `movies` table.
with PostgreSQLDatabase() as db:
    nb_movies = len(db.query_data("movies"))

# The count is computed outside the f-string: nesting double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
print(f"{nb_movies} movies")

In [None]:
# Raw reviews
# One connection and one read of each table serve both the grand total and the
# per-movie counts; the per-movie counts are then tallied in memory instead of
# issuing one extra query per movie.
with PostgreSQLDatabase() as db:
    movie_rows = db.query_data("movies")
    review_rows = db.query_data("reviews_raw")

# Computed outside any nested-quote f-string so the cell also parses on
# Python versions before 3.12.
print(f"{len(review_rows)} total reviews")

# Tally reviews per movie. Column 0 of a `reviews_raw` row is the movie_id and
# column 1 the review_id (same column order this notebook uses when it loads
# the table into a DataFrame).
reviews_per_movie = {}
for review in review_rows:
    reviews_per_movie[review[0]] = reviews_per_movie.get(review[0], 0) + 1

# Deduplicate (movie_id, title) pairs as before, but iterate them sorted by
# movie_id so the printed order is reproducible from run to run.
movie_pairs = sorted(
    {(movie[0], movie[1]) for movie in movie_rows},
    key=lambda pair: pair[0],
)
for movie_id, title in movie_pairs:
    # Movies without any review are absent from the tally and count as 0.
    nb_reviews = reviews_per_movie.get(movie_id, 0)
    print(f"{nb_reviews} review for {title}")

In [None]:
# Analyzed reviews
# Each table is read once over a single connection. Membership is then checked
# in memory instead of running one `reviews_sentiments` query per review.
with PostgreSQLDatabase() as db:
    movie_rows = db.query_data("movies")
    review_rows = db.query_data("reviews_raw")
    sentiment_rows = db.query_data("reviews_sentiments")

# Computed outside any nested-quote f-string so the cell also parses on
# Python versions before 3.12.
print(f"{len(sentiment_rows)} total reviews")

# review_id is column 0 of a `reviews_sentiments` row (same column order this
# notebook uses when it loads the table into a DataFrame).
analyzed_ids = {sentiment[0] for sentiment in sentiment_rows}

# Group the distinct review ids of every movie. In a `reviews_raw` row,
# column 0 is the movie_id and column 1 the review_id.
review_ids_per_movie = {}
for review in review_rows:
    review_ids_per_movie.setdefault(review[0], set()).add(review[1])

# Deduplicate (movie_id, title) pairs as before, but iterate them sorted by
# movie_id so the printed order is reproducible from run to run.
movie_pairs = sorted(
    {(movie[0], movie[1]) for movie in movie_rows},
    key=lambda pair: pair[0],
)
for movie_id, title in movie_pairs:
    # A review counts as analyzed when its id also appears in reviews_sentiments.
    nb_analyzed = len(review_ids_per_movie.get(movie_id, set()) & analyzed_ids)
    print(f"{nb_analyzed} analyzed review for {title}")

# Get tables

In [None]:
# Load the whole `movies` table into a labelled DataFrame.
with PostgreSQLDatabase() as db:
    movie_rows = db.query_data('movies')

# Column name -> SQL type. Only the keys (in this order) are used below to
# label the DataFrame columns; the values are kept as a schema reminder.
column_names = {
    'movie_id': 'VARCHAR(10) PRIMARY KEY',
    'title': 'VARCHAR(250)',
    'release_date': 'DATE',
    'nb_reviews': 'INTEGER',
    'scrapping_timestamp': 'TIMESTAMP'
}

# The labels are passed to the constructor rather than assigned to `.columns`
# afterwards: an empty query result would otherwise produce a zero-column
# frame and the assignment would fail with a length-mismatch ValueError.
movies = pd.DataFrame(movie_rows, columns=list(column_names))
movies.style.hide(axis="index")

In [None]:
# Load the whole `reviews_raw` table into a labelled DataFrame.
with PostgreSQLDatabase() as db:
    review_rows = db.query_data('reviews_raw')

# Column name -> SQL type. Only the keys (in this order) are used below to
# label the DataFrame columns; the values are kept as a schema reminder.
column_names = {
    'movie_id': 'VARCHAR(10) REFERENCES movies(movie_id) ON DELETE CASCADE',
    'review_id': 'VARCHAR(10) PRIMARY KEY',
    'author': 'VARCHAR(150)',
    'title': 'VARCHAR(500)',
    'text': 'TEXT',
    'rating': 'INTEGER',
    'date': 'DATE',
    'upvotes': 'INTEGER',
    'downvotes': 'INTEGER',
    'last_update': 'TIMESTAMP',
    'to_process': 'INTEGER'
}

# The labels are passed to the constructor rather than assigned to `.columns`
# afterwards: an empty query result would otherwise produce a zero-column
# frame and the assignment would fail with a length-mismatch ValueError.
reviews_raw = pd.DataFrame(review_rows, columns=list(column_names))
# Only the first rows are displayed; `reviews_raw` itself keeps every row.
reviews_raw.head().style.hide(axis="index")

In [None]:
# Load the whole `reviews_sentiments` table into a labelled DataFrame.
with PostgreSQLDatabase() as db:
    sentiment_rows = db.query_data('reviews_sentiments')

# Column name -> SQL type. Only the keys (in this order) are used below to
# label the DataFrame columns; the values are kept as a schema reminder.
column_names = {
    'review_id': 'VARCHAR(10) PRIMARY KEY REFERENCES reviews_raw(review_id) ON DELETE CASCADE',
    'author': 'VARCHAR(150)',
    'story': 'INTEGER',
    'acting': 'INTEGER',
    'visuals': 'INTEGER',
    'sounds': 'INTEGER',
    'values': 'INTEGER',
    'overall': 'INTEGER'
}

# The labels are passed to the constructor rather than assigned to `.columns`
# afterwards: an empty query result would otherwise produce a zero-column
# frame and the assignment would fail with a length-mismatch ValueError.
reviews_sentiments = pd.DataFrame(sentiment_rows, columns=list(column_names))
# Only the first rows are displayed; `reviews_sentiments` itself keeps every row.
reviews_sentiments.head().style.hide(axis="index")

In [None]:
# Generate samples
# Writes the three frames built in the cells above as CSV files (no index
# column) under data/sample, relative to the working directory set by the
# os.chdir call in the first cell.
# NOTE(review): the complete frames are exported here, not the `.head()`
# previews displayed above — confirm that this is the intended "sample".
sample_dir = "data/sample"

# to_csv does not create missing parent directories; make sure the target
# exists so the export also works on a checkout without it.
os.makedirs(sample_dir, exist_ok=True)

for table_name, frame in (
    ("movies", movies),
    ("reviews_raw", reviews_raw),
    ("reviews_sentiments", reviews_sentiments),
):
    frame.to_csv(f"{sample_dir}/{table_name}.csv", index=False)

# Get values

In [None]:
# Collect the first field of every `movies` row (the movie_id, going by the
# column order used elsewhere in this notebook) and print the resulting list.
with PostgreSQLDatabase() as db:
    movies_id = [
        row[0]
        for row in db.query_data('movies')
    ]
print(movies_id)

In [None]:
# Look up the `movies` rows whose movie_id equals the identifier below.
movie_id = 'tt0029583'
with PostgreSQLDatabase() as db:
    # The identifier is interpolated into the textual filter handed to
    # query_data through its `condition` argument.
    metadata = db.query_data(
        "movies",
        condition=f"movie_id = '{movie_id}'",
    )
print(metadata)

# Remove entries

In [None]:
# Calls remove_data on the `movies` table with column "movie_id" and value
# "tt0097874".
# NOTE(review): presumably this deletes the matching movie row, and the
# ON DELETE CASCADE clauses listed in this notebook's schema dicts suggest its
# reviews go with it — confirm before running, the cell is destructive.
with PostgreSQLDatabase() as db:
    db.remove_data(
        "movies",
        "movie_id",
        "tt0097874",
        movie_id=None,
    )

In [None]:
# NOTE(review): this cell repeats the preceding cell's remove_data call
# verbatim (same table, column and value). Presumably a second run finds
# nothing left to delete — confirm against remove_data and keep only one of
# the two cells if so.
with PostgreSQLDatabase() as db:
    db.remove_data("movies", "movie_id", "tt0097874", movie_id=None)