# In [None]:
import pandas as pd
from gspread_dataframe import get_as_dataframe
import gspread
import os
import shutil
import subprocess
import re

# ---------------------------
# Directories
# ---------------------------
# Folder that is scanned for university CSV files awaiting a spider run.
INPUT_DIR = "Downloaded_Universities"
# Sub-folder of INPUT_DIR that CSVs are moved into after a successful crawl.
PROCESSED_DIR = os.path.join(INPUT_DIR, "Processed_Universities")
# Runs as soon as this module/cell executes; creates missing parent folders
# too and is a no-op when the directory already exists.
os.makedirs(PROCESSED_DIR, exist_ok=True)

# ---------------------------
# Google Sheet Config
# ---------------------------
# Path of the service-account credentials file passed to gspread.
SERVICE_ACCOUNT_FILE = "service_account.json"
# Key of the spreadsheet opened for status tracking.
SHEET_ID = "1eYz8Nvr3BToRrmReXNLR8zQrk4X8tsdKZO_Fj9mNThc"
# Title of the worksheet (tab) inside that spreadsheet that is read and updated.
SHEET_NAME = "Scrapper Running"

# ---------------------------
# Helper functions
# ---------------------------


def safe_filename(name: str) -> str:
    """Normalise *name* into a lowercase, file-name-safe string.

    The characters ``. ' < > : " / \\ | ? *`` are removed, surrounding
    whitespace is trimmed and the result is lower-cased. Any falsy input
    (``None`` or an empty string) yields ``""``.
    """
    if name:
        without_specials = re.sub(r'[.\'<>:"/\\|?*]', '', name)
        trimmed = without_specials.strip()
        return trimmed.lower()
    return ""


def update_sheet_status(csv_file_name, status):
    """Set the 'Scraping?' status of every sheet row matching a CSV file.

    The CSV's base name (extension dropped) and each 'University Name' cell
    are both normalised with ``safe_filename`` before being compared, so the
    match ignores case, surrounding whitespace and file-unsafe punctuation.
    Only the in-memory ``df_sheet`` is modified; nothing is sent to the
    Google Sheet from here.

    Args:
        csv_file_name: File name of the university CSV, e.g. ``"Foo.csv"``.
        status: Value stored in the 'Scraping?' column of each matching row.
    """
    uni_name_from_csv = safe_filename(os.path.splitext(csv_file_name)[0])

    for i, uni_name in df_sheet['University Name'].items():
        # Blank cells are typically loaded as NaN, which is truthy and would
        # stringify to "nan" -- skip them explicitly so empty rows are never
        # compared (and can never match a file called "nan.csv").
        if pd.isna(uni_name) or not uni_name:
            continue
        if safe_filename(str(uni_name)) == uni_name_from_csv:
            df_sheet.at[i, 'Scraping?'] = status


# ---------------------------
# Authenticate Google Sheet
# ---------------------------
# Authenticate with the service-account credentials, open the spreadsheet by
# key and select the tracking worksheet.
gc = gspread.service_account(filename=SERVICE_ACCOUNT_FILE)
sh = gc.open_by_key(SHEET_ID)
ws = sh.worksheet(SHEET_NAME)
# Load the worksheet into a DataFrame, using its first row as column headers.
df_sheet = get_as_dataframe(ws, evaluate_formulas=True, header=0)

# Ensure 'Scraping?' column exists
if 'Scraping?' not in df_sheet.columns:
    df_sheet['Scraping?'] = "Pending"

# A 'Scraping?' column that exists but is entirely blank is presumably parsed
# as float64 NaN; storing string statuses into a float column is an
# incompatible-dtype assignment (deprecated in pandas 2.1 and slated to raise).
# Casting to object once up front keeps later status writes valid.
df_sheet['Scraping?'] = df_sheet['Scraping?'].astype(object)

# ---------------------------
# Spider runner
# ---------------------------


def _push_statuses_to_sheet():
    """Write the header row and all rows of ``df_sheet`` to the worksheet."""
    header = df_sheet.columns.values.tolist()
    # NaN is replaced by "" so blank cells are written back as empty strings.
    rows = df_sheet.fillna("").values.tolist()
    ws.update([header] + rows)


def run_spider_on_csvs():
    """Run the ``courses`` Scrapy spider once for every CSV in ``INPUT_DIR``.

    For each ``*.csv`` file (case-insensitive, processed in sorted order):

    1. The matching sheet row is marked "Processing" and the sheet is pushed.
    2. ``python -m scrapy crawl courses -a csv_file=<path>`` is executed.
    3. On success the CSV is moved to ``PROCESSED_DIR`` and the row is marked
       "Processed"; on failure the row is marked "Failed" and the CSV stays
       where it is.

    After the last file the statuses are pushed to the sheet once more.
    A failure of one CSV never aborts the remaining ones.
    """
    # sorted() gives a deterministic order; os.listdir order is arbitrary.
    csv_files = sorted(
        f for f in os.listdir(INPUT_DIR) if f.lower().endswith(".csv")
    )

    for csv_file in csv_files:
        csv_path = os.path.join(INPUT_DIR, csv_file)
        print(f"🚀 Running spider for: {csv_file}")

        # Mark as processing and publish it before the crawl starts; this push
        # also carries the final status of the previously handled CSV.
        update_sheet_status(csv_file, "Processing")
        _push_statuses_to_sheet()

        try:
            subprocess.run(
                ["python", "-m", "scrapy", "crawl", "courses",
                 "-a", f"csv_file={csv_path}"],
                check=True,
            )

            # Move processed CSV
            shutil.move(csv_path, os.path.join(PROCESSED_DIR, csv_file))
        except (subprocess.CalledProcessError, OSError) as e:
            # CalledProcessError: the spider exited with a non-zero status.
            # OSError: the interpreter could not be launched or the file could
            # not be moved. Either way record the failure and carry on instead
            # of aborting the batch with the row stuck on "Processing".
            print(f"❌ Spider failed for {csv_file}: {e}")
            update_sheet_status(csv_file, "Failed")
        else:
            print(f"✅ Finished and moved: {csv_file}")

            # Mark as processed
            update_sheet_status(csv_file, "Processed")

    # Push final status to Google Sheet
    _push_statuses_to_sheet()
    print("✅ Sheet updated with Scraping? statuses")


# ---------------------------
# Run all CSVs
# ---------------------------
# Entry point: process every pending CSV when this file is executed directly
# rather than imported. Note that the Google Sheet authentication and the
# directory creation above are module-level statements and therefore run
# regardless of this guard.
if __name__ == "__main__":
    run_spider_on_csvs()

# Output: 🚀 Running spider for: Aberystwyth University.csv
