In [None]:
# ===============================
# INSTALL LIBRARIES
# ===============================
# Notebook shell command: installs the third-party packages imported below
# (googletrans is pinned to the 4.0.0rc1 release candidate).
!pip install googletrans==4.0.0rc1 python-docx requests beautifulsoup4

from google.colab import drive
import os
import time
import requests
from bs4 import BeautifulSoup
from googletrans import Translator
from docx import Document
from datetime import datetime

# ===============================
# MOUNT GOOGLE DRIVE
# ===============================
# Make Google Drive available under /content/drive (requires google.colab).
drive.mount('/content/drive')

# Folder on Drive that receives the generated .docx files; created if missing.
BASE_DIR = "/content/drive/MyDrive/CPS_Artiklar_Pashto"
os.makedirs(BASE_DIR, exist_ok=True)
# Log of processed article URLs, one "url<TAB>timestamp" entry per line.
LOG_FILE = os.path.join(BASE_DIR, "oversatta_logg.txt")

# ===============================
# HELPER FUNCTIONS
# ===============================
def load_log():
    """Return the set of article URLs already recorded in ``LOG_FILE``.

    Only the first tab-separated field of each non-blank line is kept; any
    further fields (such as the timestamp) are ignored. If the log file does
    not exist yet, an empty set is returned.
    """
    seen = set()
    if os.path.exists(LOG_FILE):
        with open(LOG_FILE, "r", encoding="utf-8") as f:
            for raw_line in f:
                entry = raw_line.strip()
                if entry:
                    # Text before the first tab is the URL.
                    seen.add(entry.partition("\t")[0])
    return seen

def update_log(url):
    """Append ``url`` and the current local time to ``LOG_FILE``.

    The entry is written as one line: the URL, a tab, then the timestamp
    formatted as ``YYYY-MM-DD HH:MM:SS``.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    entry = f"{url}\t{stamp}\n"
    with open(LOG_FILE, "a", encoding="utf-8") as log:
        log.write(entry)

def fetch_soup(url):
    """Download ``url`` and return it parsed as a ``BeautifulSoup`` tree.

    The request uses a 15 second timeout and HTTP error statuses are treated
    as failures. Any exception raised while downloading or parsing is
    reported on stdout and turned into a ``None`` return value, so callers
    only have to check for ``None``.
    """
    try:
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        parsed = BeautifulSoup(response.text, "html.parser")
    except Exception as err:
        print(f"❌ Fel vid hämtning av {url}: {err}")
        parsed = None
    return parsed

def get_all_article_links(max_pages=None):
    """Collect the URLs of all articles listed on cpsglobal.org.

    Walks the paginated listing (``/articles``, ``/articles?page=1``, ...)
    and gathers every link inside ``div.view-content`` whose ``href`` starts
    with ``/articles/``.

    Crawling stops as soon as one of the following happens:

    * a listing page cannot be fetched,
    * a page yields no URL that has not been seen already (this covers both
      an empty page and a server that keeps returning the same non-empty
      listing for out-of-range page numbers, which would otherwise make the
      loop run forever),
    * ``max_pages`` pages have been read.

    Args:
        max_pages: Optional upper bound on the number of listing pages to
            request. ``None`` (the default) means no explicit limit.

    Returns:
        A sorted list of absolute article URLs without duplicates.
    """
    site_root = "https://www.cpsglobal.org"
    base = f"{site_root}/articles"
    links = set()
    page = 0
    while max_pages is None or page < max_pages:
        url = base if page == 0 else f"{base}?page={page}"
        soup = fetch_soup(url)
        if not soup:
            break
        page_links = {
            site_root + a["href"]
            for a in soup.select("div.view-content a")
            if a.get("href", "").startswith("/articles/")
        }
        new_links = page_links - links
        if not new_links:
            # Nothing new on this page: end of the listing (or a repeat).
            break
        links |= new_links
        page += 1
        # Small pause between listing requests to go easy on the server.
        time.sleep(0.5)
    return sorted(links)

def extract_title_and_paragraphs(article_url: str):
    """Fetch an article page and return its title and cleaned paragraphs.

    The title is the text of the first ``<h1>`` (``"Artikel"`` when there is
    none or it is empty). Paragraph text is taken from the ``<article>``
    element, else ``<main>``; if that yields nothing, from every ``<p>`` on
    the page. Paragraphs are then dropped when they contain a boilerplate
    snippet (case-insensitive substring match) or when they are shorter than
    20 characters and do not end with a full stop. A leading paragraph that
    merely repeats the title is removed.

    NOTE(review): because the boilerplate test is a plain substring match,
    a snippet such as "Share" also removes paragraphs containing words like
    "shared" -- confirm that this is intended.

    Args:
        article_url: Absolute URL of the article page.

    Returns:
        A ``(title, paragraphs)`` tuple; ``("", [])`` when the page could
        not be fetched.
    """
    def paragraph_texts(root):
        # Non-empty text of every <p> element below ``root``.
        texts = (p.get_text(" ", strip=True) for p in root.find_all("p"))
        return [text for text in texts if text]

    soup = fetch_soup(article_url)
    if not soup:
        return "", []

    heading = soup.find("h1")
    if heading:
        title = heading.get_text(strip=True)
    else:
        title = "Artikel"

    # Prefer the main content container; fall back to the whole page.
    container = soup.find("article") or soup.find("main")
    candidates = paragraph_texts(container) if container else []
    if not candidates:
        candidates = paragraph_texts(soup)

    blacklist_snips = ["Subscribe","Stay informed","newsletter","Donate","About Us","FOLLOW US","Share","©","Powered by","CPS shares spiritual wisdom"]
    lowered_snips = [snip.lower() for snip in blacklist_snips]

    clean = []
    for text in candidates:
        lowered = text.lower()
        is_boilerplate = any(snip in lowered for snip in lowered_snips)
        # Very short fragments are kept only if they look like a sentence.
        substantial = len(text) >= 20 or text.endswith(".")
        if substantial and not is_boilerplate:
            clean.append(text)

    # Avoid repeating the title as the first body paragraph.
    if clean and title and clean[0].strip().lower() == title.strip().lower():
        del clean[0]
    return title or "Artikel", clean

# Single googletrans client, used by oversatt_och_spara for every paragraph.
translator = Translator()

def oversatt_och_spara(url, artikelnummer):
    """Translate one article from English to Pashto and save it as .docx.

    The article is fetched with ``extract_title_and_paragraphs``; each
    paragraph is translated separately with a 2 second pause between
    requests. A paragraph whose translation fails is kept in its original
    English form and the failure is reported. If *no* paragraph could be
    translated, nothing is saved and the URL is not logged, so the article
    is retried on the next run instead of being recorded as translated.

    Args:
        url: Absolute URL of the article.
        artikelnummer: Sequence number used as the zero-padded file prefix.

    Returns:
        The path of the saved document, or ``None`` when the article had no
        text, nothing could be translated, or an unexpected error occurred.
    """
    try:
        title, paragraphs = extract_title_and_paragraphs(url)
        if not paragraphs:
            print(f"⚠️ Ingen text hittad i {url}")
            return None

        translated = []
        failed = 0
        for para in paragraphs:
            try:
                ps = translator.translate(para, src="en", dest="ps").text
            except Exception as e:
                # Best effort: fall back to the source text, but say so
                # instead of hiding the failure.
                failed += 1
                print(f"⚠️ Översättning misslyckades, behåller originaltext: {e}")
                ps = para
            translated.append(ps)
            # Throttle requests to the translation service.
            time.sleep(2)

        if failed == len(paragraphs):
            # Saving and logging would mark an untranslated article as done.
            print(f"❌ Ingen del av {url} kunde översättas – hoppar över")
            return None
        if failed:
            print(f"⚠️ {failed} av {len(paragraphs)} stycken kunde inte översättas i {url}")

        doc = Document()
        doc.add_heading(title, 0)
        for text in translated:
            doc.add_paragraph(text)

        # Keep only alphanumerics in the file name; cap its length.
        safe_title = "".join(c if c.isalnum() else "_" for c in title)[:50]
        filename = os.path.join(BASE_DIR, f"{artikelnummer:04d}_{safe_title}.docx")
        doc.save(filename)
        update_log(url)
        print(f"📝 Sparad: {filename}")
        return filename
    except Exception as e:
        print(f"❌ Fel i oversatt_och_spara: {e}")
        return None

# Discover all articles and work out which ones still need translating.
alla = get_all_article_links()
done = load_log()
kvar = [u for u in alla if u not in done]

print(f"Totalt hittade: {len(alla)} artiklar")
print(f"📌 Redan översatta: {len(done)}")
print(f"🧮 Återstår: {len(kvar)}")

# Maximum number of articles to process per run.
MAX = 2
# Continue numbering after the articles already logged, so the numeric file
# prefix does not restart at 0001 (and collide) on every run. The counter
# only advances when an article was actually saved.
next_number = len(done) + 1
for url in kvar[:MAX]:
    if oversatt_och_spara(url, next_number):
        next_number += 1
    # Pause between articles to avoid hammering the remote services.
    time.sleep(5)

print("🚀 Klart!")
