In [1]:
import hashlib
import os
import re
import shutil
import uuid
from pathlib import Path

import psycopg2
from dotenv import load_dotenv

In [2]:
# Load variables from a .env file into the process environment
load_dotenv()

# PostgreSQL connection settings; each one is None when its variable is unset
POSTGRES_HOST = os.environ.get('POSTGRES_HOST')
POSTGRES_PORT = os.environ.get('POSTGRES_PORT')
POSTGRES_DB = os.environ.get('POSTGRES_DB')
POSTGRES_USER = os.environ.get('POSTGRES_USER')
POSTGRES_PASSWORD = os.environ.get('POSTGRES_PASSWORD')

In [3]:
def normalize_path(path):
    """Return `path` as a string that uses forward slashes as separators.

    The value is parsed with the platform's `Path` flavour and rendered via
    `as_posix()`. Note that backslashes are only treated as separators by the
    Windows path flavour.
    """
    posix_form = Path(path).as_posix()
    return posix_form

In [4]:
def get_postgres_connection():
    """Open a psycopg2 connection from the module-level POSTGRES_* settings.

    Returns:
        The open connection on success. If anything raises, the error is
        printed and None is returned instead.
    """
    try:
        connection = psycopg2.connect(**{
            'dbname': POSTGRES_DB,
            'user': POSTGRES_USER,
            'password': POSTGRES_PASSWORD,
            'host': POSTGRES_HOST,
            'port': POSTGRES_PORT,
        })
        print("✅ Connected to PostgreSQL database.")
        return connection
    except Exception as e:
        print(f"❌ Error connecting to PostgreSQL: {e}")
        return None

In [5]:
def generate_file_hash(file_path, chunk_size=1024 * 1024):
    """Return the SHA-256 hex digest of a file's content.

    The file is read in fixed-size chunks so that large files are never
    loaded into memory in one piece.

    Args:
        file_path: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration (must be >= 1).

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        ValueError: If `chunk_size` is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        # read(0) would end the loop immediately and read(-1) would slurp the file
        raise ValueError("chunk_size must be a positive integer")

    digest = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # iter() with a sentinel stops at the first empty read (end of file)
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

In [6]:
def generate_safe_filename(original_file):
    """
    Build a storage filename: '<uuid4>_<sanitized stem><original suffix>'.

    Every character of the stem that is neither a word character nor '-' is
    replaced by '-', the result is lower-cased and then cut to 50 characters.
    The suffix is appended unchanged.
    Example: 'John Doe Resume.pdf' -> 'f47ac10b-...d479_john-doe-resume.pdf'
    """
    source = Path(original_file)
    cleaned_stem = re.sub(r'[^\w-]', '-', source.stem).lower()
    short_stem = cleaned_stem[:50]
    prefix = str(uuid.uuid4())
    return "{}_{}{}".format(prefix, short_stem, source.suffix)

In [7]:
def is_resume_already_processed(conn, original_path, file_hash):
    """Check whether a resume is already recorded, by source path or content hash.

    Args:
        conn: Open database connection (DB-API style: cursor()/rollback()).
        original_path: Source path to look up in `resumes.source_path`.
        file_hash: Content hash to look up in `resumes.content_hash`.

    Returns:
        True if a matching row exists. False if none exists or if the lookup
        itself failed (the error is printed).
    """
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute("""
            SELECT 1 FROM resumes
            WHERE source_path = %s OR content_hash = %s
            LIMIT 1
        """, (original_path, file_hash))
        return cursor.fetchone() is not None
    except Exception as e:
        print(f"❌ Error checking for duplicates: {e}")
        # A failed statement leaves the transaction aborted; roll back so the
        # connection can still be used for later statements.
        try:
            conn.rollback()
        except Exception as rollback_error:
            print(f"❌ Error rolling back after failed duplicate check: {rollback_error}")
        return False
    finally:
        # Always release the cursor, whether the lookup succeeded or not
        if cursor is not None:
            try:
                cursor.close()
            except Exception as close_error:
                print(f"❌ Error closing cursor: {close_error}")

In [8]:
def process_resume_file(conn, original_path, storage_base_path):
    """Copy one resume into storage unless it is already recorded.

    The file is hashed, checked against the database by path and hash, and
    if it is new, copied into `storage_base_path` under a UUID-prefixed name.
    No database row is written here.

    Args:
        conn: Open database connection used for the duplicate check.
        original_path: Path of the resume file to ingest.
        storage_base_path: Directory that receives the copy (created if missing).

    Returns:
        A dict with 'uuid', 'source_path', 'storage_path' and 'content_hash'
        for a newly stored file, or None if the file was skipped as a
        duplicate or an error occurred (the error is printed).
    """
    new_path = None
    try:
        # Normalize all paths
        original_path = normalize_path(original_path)
        storage_base_path = normalize_path(storage_base_path)

        # Generate file content hash
        file_hash = generate_file_hash(original_path)

        # Check for existing resume
        if is_resume_already_processed(conn, original_path, file_hash):
            print(f"⏩ Skipping duplicate: {original_path}")
            return None

        # Create storage directory
        os.makedirs(storage_base_path, exist_ok=True)

        # Generate and save file
        new_filename = generate_safe_filename(original_path)
        new_path = normalize_path(os.path.join(storage_base_path, new_filename))

        # Stream the copy so large files are not held in memory in one piece
        with open(original_path, 'rb') as src, open(new_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)

        return {
            # The UUID contains no underscore, so it is everything before the first one
            'uuid': new_filename.split('_', 1)[0],
            'source_path': original_path,
            'storage_path': new_path,
            'content_hash': file_hash
        }
    except Exception as e:
        print(f"❌ Error processing {original_path}: {e}")
        # Do not leave a partially written copy behind in storage
        if new_path is not None and os.path.exists(new_path):
            try:
                os.remove(new_path)
            except OSError as cleanup_error:
                print(f"❌ Error removing partial copy {new_path}: {cleanup_error}")
        return None

In [9]:
def save_resume_to_db(conn, resume_data):
    """Insert one resume's metadata row and commit it.

    Args:
        conn: Open database connection (DB-API style: cursor()/commit()/rollback()).
        resume_data: Mapping with the keys 'uuid', 'source_path',
            'storage_path' and 'content_hash'.

    Returns:
        True if the row was inserted and committed. False if anything failed,
        in which case the error is printed and the transaction is rolled back.
    """
    cursor = None
    try:
        cursor = conn.cursor()
        cursor.execute("""
            INSERT INTO resumes (uuid, source_path, storage_path, content_hash)
            VALUES (%s, %s, %s, %s)
        """, (
            resume_data['uuid'],
            resume_data['source_path'],
            resume_data['storage_path'],
            resume_data['content_hash']
        ))
        conn.commit()
        return True
    except Exception as e:
        print(f"❌ Database error: {e}")
        # Keep the bool contract even if the rollback itself fails
        try:
            conn.rollback()
        except Exception as rollback_error:
            print(f"❌ Error rolling back failed insert: {rollback_error}")
        return False
    finally:
        # Always release the cursor, whether the insert succeeded or not
        if cursor is not None:
            try:
                cursor.close()
            except Exception as close_error:
                print(f"❌ Error closing cursor: {close_error}")

In [10]:
def _discard_stored_copy(stored_path):
    """Delete a stored file whose database row could not be saved."""
    try:
        os.remove(stored_path)
        print(f"🧹 Removed stored copy without database record: {stored_path}")
    except OSError as cleanup_error:
        print(f"⚠️  Could not remove stored copy {stored_path}: {cleanup_error}")


def process_resumes_from_folder(folder_path, storage_path):
    """Ingest every file under `folder_path` into storage and the database.

    Walks the folder recursively. Each file is passed to
    `process_resume_file`, and for each newly stored file a row is written
    with `save_resume_to_db`. If writing the row fails, the copy just placed
    in storage is removed again, so storage and database stay consistent.

    Args:
        folder_path: Root folder to scan for resume files.
        storage_path: Directory that receives the stored copies.

    Returns:
        False if no database connection could be opened. Otherwise True once
        the walk has finished, regardless of how many files were saved.
    """
    conn = get_postgres_connection()
    if not conn:
        return False

    try:
        processed_count = 0
        folder_path = normalize_path(folder_path)
        storage_path = normalize_path(storage_path)

        for root, _, files in os.walk(folder_path):
            for file in files:
                original_path = normalize_path(os.path.join(root, file))
                print(f"🔍 Processing: {original_path}")

                resume_data = process_resume_file(conn, original_path, storage_path)
                if not resume_data:
                    # Duplicate or processing error; already reported by the callee
                    continue

                if save_resume_to_db(conn, resume_data):
                    print(f"✅ Saved UUID: {resume_data['uuid']}")
                    processed_count += 1
                else:
                    # Without a row the copy would be orphaned and duplicated on the next run
                    _discard_stored_copy(resume_data['storage_path'])

        print(f"\n🎉 Successfully processed {processed_count} resumes")
        return True
    finally:
        conn.close()
        print("🔒 Database connection closed")

In [11]:
# Helper functions to extract information when needed
def get_original_filename(source_path):
    """Return the final path component (file name) of `source_path`."""
    source = Path(source_path)
    return source.name

def get_processed_filename(storage_path):
    """Return the final path component (file name) of `storage_path`."""
    stored = Path(storage_path)
    return stored.name

In [12]:
# Source folder comes from RESUME_SOURCE_FOLDER, with a local default as fallback
source_folder = os.environ.get(
    'RESUME_SOURCE_FOLDER',
    'C:/Users/LENOVO/Desktop/ENSA/GI3/PFE/Datasets/postgres_resumes',
)
storage_folder = '../../resumes/'

print(f"📂 Source folder: {source_folder}")
print(f"💾 Storage folder: {storage_folder}")

# Report on the source folder before the ingestion cell runs
if os.path.exists(source_folder):
    print(f"✅ Source folder found with {len(os.listdir(source_folder))} items")
else:
    print(f"⚠️  Warning: Source folder does not exist: {source_folder}")
    if 'resume_upload_' in source_folder:
        # Paths containing 'resume_upload_' are treated as temporary folders and recreated
        print("🔧 Creating temporary source folder...")
        os.makedirs(source_folder, exist_ok=True)
    else:
        print("❌ Source folder not found. Please check the path.")

📂 Source folder: C:\Users\LENOVO\AppData\Local\Temp\resume_upload_2i688tou
💾 Storage folder: ../../resumes/
✅ Source folder found with 1 items


In [13]:
# Run the ingestion over the configured folders; the returned bool (False only
# when no database connection could be opened) is shown as the cell output.
process_resumes_from_folder(source_folder, storage_folder)

✅ Connected to PostgreSQL database.
🔍 Processing: C:/Users/LENOVO/AppData/Local/Temp/resume_upload_2i688tou/Resume.pdf
⏩ Skipping duplicate: C:/Users/LENOVO/AppData/Local/Temp/resume_upload_2i688tou/Resume.pdf

🎉 Successfully processed 0 resumes
🔒 Database connection closed


True