***Restore is broken because the structure of the tables has changed.***

In [None]:
import os
import pandas as pd
import re
import s3fs

from datetime import datetime

In [None]:
# Switch the working directory to the project checkout before importing from
# `src` (presumably so that the package below resolves from there -- confirm)
project_root = os.path.expanduser("~/work/MLOps")
os.chdir(project_root)
from src.utils.db import PostgreSQLDatabase

In [None]:
# Connect to database
# Instantiate the project's PostgreSQLDatabase helper (imported from
# src.utils.db) with its default arguments, then call its connect() method.
# The resulting `db` handle is the one used by the following cells.
db = PostgreSQLDatabase()
db.connect()

In [None]:
# Build the S3 filesystem client; the endpoint host is read from the
# AWS_S3_ENDPOINT environment variable (a KeyError is raised if it is unset)
S3_ENDPOINT_URL = f"https://{os.environ['AWS_S3_ENDPOINT']}"
fs = s3fs.S3FileSystem(client_kwargs={'endpoint_url': S3_ENDPOINT_URL})

# Backups are looked up under the "diffusion/" prefix of this bucket
bucket_name = 'maeldieudonne'
destination = f"{bucket_name}/diffusion/"

In [None]:
# Clean start: remove any table that already exists. Referencing tables are
# listed before the tables they reference (reverse order of dependency).
for table in ('reviews_sentiments', 'reviews_raw', 'movies'):
    if not db.table_exists(table):
        # Nothing to drop for this table
        continue
    db.drop_table(table)

In [None]:
# Table definitions, keyed by table name and mapping each column to its SQL
# type/constraints. Insertion order matters: a referenced table must be
# created before the table whose foreign key points at it.
TABLE_SCHEMAS = {
    'movies': {
        'movie_id': 'INTEGER PRIMARY KEY',
        'title': 'VARCHAR(250)',
        'release_date': 'DATE',
        'scrapping_timestamp': 'TIMESTAMP',
    },
    'reviews_raw': {
        'movie_id': 'INTEGER REFERENCES movies(movie_id) ON DELETE CASCADE',
        'review_id': 'INTEGER PRIMARY KEY',
        'author': 'VARCHAR(150)',
        'title': 'VARCHAR(500)',
        'text': 'TEXT',
        'rating': 'INTEGER',
        'date': 'DATE',
        'upvotes': 'INTEGER',
        'downvotes': 'INTEGER',
        'scrapping_timestamp': 'TIMESTAMP',
    },
    'reviews_sentiments': {
        'review_id': 'INTEGER PRIMARY KEY REFERENCES reviews_raw(review_id) ON DELETE CASCADE',
        'story': 'INTEGER',
        'acting': 'INTEGER',
        'visuals': 'INTEGER',
        'sounds': 'INTEGER',
        'values': 'INTEGER',
        'overall': 'INTEGER',
    },
}

# Create the tables one by one, in the order they are declared above
for table_name, columns in TABLE_SCHEMAS.items():
    db.create_table(table_name, columns)

In [None]:
# Get latest backup or sample data for a given table
def extract_timestamp(file_name):
    """Parse the ``YYYYMMDD_HHMMSS`` stamp embedded in a file name.

    The first run of eight digits, an underscore and six digits found in
    ``file_name`` is interpreted as a date and time.

    Args:
        file_name: File name or path to inspect.

    Returns:
        The corresponding naive ``datetime``, or ``None`` when the name
        contains no such stamp.

    Raises:
        ValueError: If the digits match the pattern but do not form a valid
            date/time (raised by ``datetime.strptime``).
    """
    found = re.search(r'(\d{8}_\d{6})', file_name)
    if found is None:
        return None
    return datetime.strptime(found.group(1), '%Y%m%d_%H%M%S')
    
def load_latest_backup(table_name):
    """Return the most recent backup of ``table_name`` as a DataFrame.

    Candidate backups are the files listed under the module-level
    ``destination`` prefix (through ``fs``) whose path starts with
    ``destination + table_name`` and whose name carries a timestamp that
    ``extract_timestamp`` can parse. The newest candidate is read as Parquet.
    When there is no candidate, the local sample
    ``data/sample/<table_name>.csv`` is loaded instead.

    Args:
        table_name: Name of the table to restore; also used as the file-name
            prefix that selects its backups.

    Returns:
        The loaded ``pandas.DataFrame``, or ``None`` when neither a distant
        backup nor a readable local sample is available.
    """
    prefix = f"{destination}{table_name}"

    # Look for a backup in S3
    try:
        listing = fs.listdir(destination)
    except FileNotFoundError:
        # If the backend reports the backup prefix as missing, treat it like
        # an empty folder so that the local sample can still be used.
        listing = []

    # Only keep files with a parseable timestamp: a file without one cannot be
    # ranked by max() and would make the strftime() call below fail on None.
    # NOTE(review): this is a plain prefix match, so a table whose name is a
    # prefix of another table's name would also pick up that table's backups.
    backup_files = [
        entry['name']
        for entry in listing
        if entry['name'].startswith(prefix)
        and extract_timestamp(entry['name']) is not None
    ]

    if backup_files:
        file_path = max(backup_files, key=extract_timestamp)
        timestamp = extract_timestamp(file_path).strftime('%Y-%m-%d %H:%M:%S')
        with fs.open(f's3://{file_path}', 'rb') as f:
            backup = pd.read_parquet(f)
        print(f"Loading distant backup for {table_name}: {timestamp}")
        return backup

    # Look for sample data locally
    try:
        backup = pd.read_csv(f"data/sample/{table_name}.csv")
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers the
        # pandas parsing errors (empty or malformed CSV). Anything else, such
        # as KeyboardInterrupt, is deliberately left to propagate.
        print(f"No distant or local backup found for {table_name}")
        return None
    print(f"Loading sample data for {table_name}")
    return backup

In [None]:
def _to_plain_row(row):
    """Return ``row`` as a tuple in which every string is a plain ``str``."""
    return tuple(str(cell) if isinstance(cell, str) else cell for cell in row)


# Restore each table from its latest backup or sample, referenced tables first
# NOTE(review): missing values are inserted as they come out of the DataFrame
# (no NaN -> None conversion is applied) -- confirm this is what insert_data
# expects.
for table in ('movies', 'reviews_raw', 'reviews_sentiments'):
    backup_df = load_latest_backup(table)
    if backup_df is None:
        # No backup and no sample for this table: leave it empty
        continue
    backup_data = list(map(_to_plain_row, backup_df.itertuples(index=False, name=None)))
    db.insert_data(table, backup_data)

In [None]:
# Last step of the notebook: call the database helper's close_connection()
# once the restore cells above have run
db.close_connection()