In [None]:
import s3fs
import os
import pandas as pd
import re
import tqdm

from datetime import datetime
from dotenv import load_dotenv
from src.scrapping import IMDb
from src.utils.db import PostgreSQLDatabase

In [None]:
# Identifier of the movie to process: it is passed to every scrapper call below
# and written to the database through db.upsert_movie_data().
# Keep exactly one assignment uncommented to choose the movie.
# movie_id = '0089885'
movie_id = '0489244'
# movie_id = '0101414'
# movie_id = '0029583'  # Snow White (1937)
# movie_id = '6208148'  # Snow White (2025)

# Scraping
## Retrieve data

In [None]:
# Scrapper instance shared by all retrieval cells; it is closed with scrapper.close()
# once the data has been collected.
scrapper = IMDb()

In [None]:
# Timestamp taken before the scraping starts (naive local time, formatted as
# YYYYMMDD_HHMMSS); it is stored with the movie record further down.
movie_scrap_time = datetime.now().strftime("%Y%m%d_%H%M%S")
# Movie metadata, then the review count that is handed to get_reviews() in the next cell.
movie_title, release_date = scrapper.get_movie(movie_id)
total_reviews = scrapper.get_number_of_reviews(movie_id)

In [None]:
# Fetch the reviews of the selected movie. The result is used as a pandas DataFrame
# by the following cells, which read its 'text', 'review_id', 'upvotes' and
# 'downvotes' columns.
reviews_df = scrapper.get_reviews(movie_id, total_reviews)

In [None]:
# Check for empty reviews
# A review counts as empty when its text is missing (NaN/None) or contains only
# whitespace. This is the same criterion the spoiler-retrieval loop applies when it
# decides which reviews to re-fetch, so the number reported here matches the number
# of reviews that loop will try to repair.
review_text = reviews_df['text']
is_empty = review_text.isna() | review_text.astype(str).str.strip().eq('')
empty_reviews = int(is_empty.sum())
if empty_reviews > 0:
    print(f"[WARNING] Text is missing for {empty_reviews} reviews.")

In [None]:
# Get the text hidden behind spoiler markup
# Only reviews whose text is missing or blank are fetched again, one request per review.
for row_label, review in reviews_df.iterrows():
    body = review['text']
    has_text = not pd.isnull(body) and body.strip() != ''
    if has_text:
        continue  # nothing to repair for this review
    # Overwrite the empty text with whatever the spoiler lookup returns
    reviews_df.at[row_label, 'text'] = scrapper.get_spoiler(review['review_id'])

In [None]:
# Check for empty reviews
# Same criterion as the spoiler-retrieval loop: a text that is missing (NaN/None) or
# whitespace-only is still considered empty. Counting only NaN would report success
# while blank texts remain.
review_text = reviews_df['text']
is_empty = review_text.isna() | review_text.astype(str).str.strip().eq('')
empty_reviews = int(is_empty.sum())
if empty_reviews > 0:
    print(f"[WARNING] Text is still missing for {empty_reviews} reviews.")
else:
    print("[INFO] Text is present for all reviews.")

In [None]:
# Get exact vote counts for values >999
# Rounded counts are recognised by a trailing 'K' in either vote column.
upvotes_rounded = reviews_df['upvotes'].astype(str).str.endswith('K')
downvotes_rounded = reviews_df['downvotes'].astype(str).str.endswith('K')
mask = upvotes_rounded | downvotes_rounded
print(f"[INFO] Found {len(reviews_df[mask])} reviews with rounded votes")

# Re-fetch both counters for every flagged review and write them back in place
for row_label, flagged_review in reviews_df[mask].iterrows():
    exact_upvotes, exact_downvotes = scrapper.get_votes(flagged_review['review_id'])
    reviews_df.loc[row_label, 'upvotes'] = exact_upvotes
    reviews_df.loc[row_label, 'downvotes'] = exact_downvotes

# With the rounded values replaced, both columns are cast to integers
for vote_column in ('upvotes', 'downvotes'):
    reviews_df[vote_column] = reviews_df[vote_column].astype(int)

In [None]:
# All scraping is done at this point; close the scrapper before moving on to the database.
scrapper.close()

In [None]:
# reviews_df.to_csv('reviews_df', index=False)

## Update the tables

In [None]:
# Database handle used by every cell below (upserts, queries, backups);
# it is released in the last cell with db.close_connection().
db = PostgreSQLDatabase()
db.connect()

In [None]:
# Movies
# One record for the scraped movie, wrapped in a list so the data is stored
# as a list of tuples
movie_record = (movie_id, movie_title, release_date, total_reviews, movie_scrap_time)
movie_data = [movie_record]
db.upsert_movie_data(movie_data)

In [None]:
# reviews_df = pd.read_csv('reviews_df')

In [None]:
# Reviews
# Create a variable to identify reviews needing sentiment analysis
reviews_df['to_process'] = 1


def _str_or_none(value):
    """Return str(value), or None when value is missing (None/NaN/NaT).

    Calling str() directly on a missing value yields the literal text 'nan' or
    'None', which would then be stored as real content instead of NULL.
    """
    return None if pd.isna(value) else str(value)


# Store the data as a list of tuples (one tuple per review, in column order).
# A plain comprehension is used so that a frame without rows yields an empty list.
reviews_list = [
    (
        int(row['movie_id']), int(row['review_id']),
        _str_or_none(row['author']), _str_or_none(row['title']),
        _str_or_none(row['text']), row['rating'],
        _str_or_none(row['date']), row['upvotes'],
        row['downvotes'], _str_or_none(row['scrapping_timestamp']), row['to_process'],
    )
    for _, row in reviews_df.iterrows()
]

# Replace NaN with None to avoid errors with postgreSQL
# (covers the fields kept as-is above, e.g. a missing rating)
reviews_list = [tuple(None if pd.isna(x) else x for x in row) for row in reviews_list]

db.upsert_review_data(reviews_list)

# Sentiment analysis
Do not forget to reset `to_process` to 0 for each review once its sentiment analysis is done.

In [None]:
# Reviews flagged for sentiment analysis
reviews_to_process = db.query_data('reviews_raw', condition='to_process = 1')
# Distinct values of the first field of each row (assumed to be the movie id, as the
# original column-0 lookup implied — TODO confirm against the reviews_raw schema).
# Counting with a set also works when no review is flagged, whereas building a
# DataFrame from an empty result and reading column 0 raises a KeyError.
movies_to_process = {review[0] for review in reviews_to_process}
print(f"[INFO] Found {len(reviews_to_process)} reviews to analyse for {len(movies_to_process)} movies")

In [None]:
# NOTE(review): this import sits mid-notebook instead of in the imports cell at the
# top — confirm whether deferring it is deliberate before moving it.
from src.analysis import Sentimentalization
# Analyzer whose get_sentiment() is called on reviews in the cells below.
analyzer = Sentimentalization()

In [None]:
# Test
# Run the analysis on a single review (the second row returned by the query) and
# store the result, keyed by the review id found at position 1 of the row.
sample_review = reviews_to_process[1]
GPT_results = analyzer.get_sentiment(sample_review)
data = [(sample_review[1], *GPT_results)]
db.insert_data('reviews_sentiments', data)

In [None]:
# Problems with empty reviews
for review in tqdm.tqdm(reviews_to_process, desc="Processing Reviews", unit="review"):
    print(review)
    print(review[3])
    print(review[4])
    # Fall back to an empty string when the title or the text is None, so that the
    # concatenation below cannot raise a TypeError. The previous form,
    # `review[3] if not None else ""`, never took the fallback: `not None` is always
    # True, so the value itself was never tested.
    title = review[3] if review[3] is not None else ""
    text = review[4] if review[4] is not None else ""
    print(title + "\n\n" + text)
    review_id = review[1]
    #GPT_results = analyzer.get_sentiment(review)
    #data = [(review_id, *GPT_results)]
    #db.insert_data('reviews_sentiments', data)

# set to_process=0

# Backup

In [None]:
# S3 endpoint taken from the environment (a missing AWS_S3_ENDPOINT raises KeyError)
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Remote folder that receives the backup files
bucket_name = 'maeldieudonne'
destination = f"{bucket_name}/diffusion/"

In [None]:
# Ask the database helper to back up each of the three tables in turn
tables_to_backup = ('movies', 'reviews_raw', 'reviews_sentiments')
for table in tables_to_backup:
    db.backup_table(table)

In [None]:
def get_latest_local_backup(table_name, backup_dir="data/backups"):
    """Return the path of the most recent local backup file for a table.

    Files are matched by name prefix: every entry of ``backup_dir`` whose name
    starts with ``table_name`` is a candidate, so a table whose name is a prefix
    of another table's name would also match that other table's files. The
    candidate with the greatest ``os.path.getctime`` value is selected.

    Args:
        table_name: Name of the table; backup file names must start with it.
        backup_dir: Directory holding the backup files. Defaults to
            "data/backups", the location used previously.

    Returns:
        The path of the selected file (``backup_dir`` joined with the file
        name), or None when the directory does not exist or holds no matching
        file. An [INFO] message is printed in both of those cases.
    """
    # A missing directory simply means that no backup has been written yet;
    # report it the same way as an empty directory instead of raising.
    if not os.path.isdir(backup_dir):
        print(f"[INFO] No local backup found for {table_name}")
        return None

    candidates = [
        os.path.join(backup_dir, file_name)
        for file_name in os.listdir(backup_dir)
        if file_name.startswith(table_name)
    ]

    if not candidates:
        print(f"[INFO] No local backup found for {table_name}")
        return None

    return max(candidates, key=os.path.getctime)

In [None]:
# Upload the newest local backup of each table, then delete the local copy
for table in ['movies', 'reviews_raw', 'reviews_sentiments']:
    file_path = get_latest_local_backup(table)
    if file_path is None:
        continue  # no local backup for this table

    try:
        fs.put(file_path, destination, content_type="parquet", encoding="utf-8")
        # The local file is removed only after the upload call returned without error
        os.remove(file_path)
        print(f"[INFO] Successfully uploaded {file_path} to {destination}")
    except Exception as e:
        # Best effort: report the failure and carry on with the next table
        print(f"[ERROR] Failed uploading {file_path} to {destination}: {e}")

In [None]:
# Last step of the notebook: release the database connection opened with db.connect().
db.close_connection()