In [None]:
import s3fs
import os
import pandas as pd
import re

from datetime import datetime
from dotenv import load_dotenv
from src.scrapping import IMDb
from src.utils.db import PostgreSQLDatabase

In [None]:
# Instantiate the project's PostgreSQL helper and open the connection that the
# query, backup and close cells below rely on.
db = PostgreSQLDatabase()
db.connect()

In [None]:
# Identifier of the movie to process: it is passed to the scraper calls and used
# to filter the database queries further down.
# NOTE(review): presumably the numeric part of an IMDb title id, kept as a string
# so that leading zeros survive — confirm against src.scrapping.
movie_id = '0089885'
# Alternative ids kept for manual runs (uncomment one to switch movie):
# movie_id = '0489244'
# movie_id = '0101414'
# movie_id = '6208148'

# Scraping

Record the timestamp before launching the scraping and pass it as an argument to the scraping functions.

Reviews needing analysis will have the same timestamp as the movie.

Alternatively, simply select the reviews whose timestamp is later than (>) that of the movie.

In [None]:
# Create the IMDb scraper used by the next cells; it is released explicitly with
# scrapper.close() once scraping is finished.
scrapper = IMDb()

In [None]:
# Scrape, for `movie_id`: the movie's title and release date, its number of
# reviews, and the reviews themselves.
# NOTE(review): reviews_df is presumably a pandas DataFrame, and these calls may
# also write to the database — confirm in src.scrapping.
movie_title, release_date = scrapper.get_movie(movie_id)
total_reviews = scrapper.get_number_of_reviews(movie_id)
reviews_df = scrapper.get_reviews(movie_id)

In [None]:
# Spot-check the vote scraper on a single hard-coded review id.
# NOTE(review): the id is assumed to belong to an existing IMDb review — confirm.
review_id = '10392322'
upvotes, downvotes = scrapper.get_votes(review_id)

In [None]:
# Release the scraper now that every scraping call has been made.
# NOTE(review): presumably shuts down an underlying browser/session — confirm in src.scrapping.
scrapper.close()

# Sentiment analysis

In [None]:
# Load the stored movie row and its raw reviews from the database.
# The id is wrapped in quotes inside the SQL condition: interpolated bare, the
# string '0089885' would reach PostgreSQL as the integer literal 0089885, which
# drops the leading zeros and fails outright against a text column. A quoted
# literal is coerced to the column's type, so it works for text and integer
# columns alike.
# NOTE(review): assumes query_data splices `condition` verbatim into a WHERE
# clause — confirm in src.utils.db. Switch to a parameterized query if movie_id
# ever comes from user input.
movie_filter = f"movie_id = '{movie_id}'"
movie = db.query_data('movies', condition=movie_filter)
reviews = db.query_data('reviews_raw', condition=movie_filter)

In [None]:
# Report how many stored reviews were fetched for this movie.
# NOTE(review): movie[0][1] is the second field of the first `movies` row —
# presumably the title; confirm against the table schema. The lookup would fail
# if the `movies` query returned no row.
print(f"[INFO] Found {len(reviews)} reviews to analyse for {movie[0][1]}")

In [None]:
# Placeholder loop: the per-review sentiment analysis is not wired in yet.
for review in reviews:
    # TODO: run the sentiment analysis for this review, e.g.
    #   sentiment-analysis.py --review_id=review[1]
    # NOTE(review): review[1] is presumably the review id column — confirm.
    pass  # explicit no-op: a loop body made only of comments is a syntax error

# Backup

In [None]:
# S3-compatible object storage: the endpoint host is read from the
# AWS_S3_ENDPOINT environment variable and always reached over HTTPS.
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs=dict(endpoint_url=S3_ENDPOINT_URL))

# Remote folder (bucket + prefix) that receives the uploaded backups.
bucket_name = 'maeldieudonne'
destination = f'{bucket_name}/diffusion/'

In [None]:
# Ask the database helper to back up each table of interest, one after another.
tables_to_backup = ('movies', 'reviews_raw', 'reviews_sentiments')
for table in tables_to_backup:
    db.backup_table(table)

In [None]:
def get_latest_local_backup(table_name, backup_dir="data/backups"):
    """Return the path of the most recent local backup file for a table.

    Backup files are matched by name prefix: every entry of ``backup_dir``
    whose name starts with ``table_name`` is a candidate, and the one with the
    greatest ``os.path.getctime`` value wins.

    Parameters
    ----------
    table_name : str
        Prefix identifying the table's backup files.
    backup_dir : str, optional
        Directory holding the local backups. Defaults to ``"data/backups"``.

    Returns
    -------
    str or None
        ``backup_dir`` joined with the newest matching file name, or ``None``
        (after printing an info message) when the directory does not exist or
        holds no matching file.
    """
    try:
        candidates = [
            os.path.join(backup_dir, name)
            for name in os.listdir(backup_dir)
            if name.startswith(table_name)
        ]
    except FileNotFoundError:
        # No backup directory yet: treat it like "no backup" instead of crashing.
        candidates = []

    if not candidates:
        print(f"[INFO] No local backup found for {table_name}")
        return None

    # NOTE: getctime is the metadata-change time on Unix and the creation time
    # on Windows; for freshly written backups both identify the newest file.
    return max(candidates, key=os.path.getctime)

In [None]:
# Upload the newest local backup of each table to S3, then delete the local copy.
for table in ['movies', 'reviews_raw', 'reviews_sentiments']:
    file_path = get_latest_local_backup(table)

    if file_path is None:
        # Nothing to upload for this table (the helper already logged it).
        continue

    try:
        fs.put(file_path, destination, content_type="parquet", encoding="utf-8")
    except Exception as e:
        print(f"[ERROR] Failed uploading {file_path} to {destination}: {e}")
        continue

    print(f"[INFO] Successfully uploaded {file_path} to {destination}")

    # Clean-up is handled separately so that a failed local delete is not
    # misreported as a failed upload.
    try:
        os.remove(file_path)
    except OSError as e:
        print(f"[ERROR] Uploaded {file_path} but could not delete the local copy: {e}")

In [None]:
# Close the database connection opened at the top of the notebook.
db.close_connection()