<a href="https://colab.research.google.com/github/bkristensen/Colabs/blob/main/Teo/Hent_synonymer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installere moduler

In [3]:
!pip install gspread oauth2client pyppeteer nest_asyncio




# Importér moduler

In [4]:
import gspread
import sqlite3

import asyncio
import nest_asyncio

import pyppeteer

import google.auth

# from google.colab import auth

# from pyppeteer import launch
# from pyppeteer import chromium_downloader

# Config

In [18]:
conf = {
  "drive_dir":      "/content/drive",
  "data_dir":       "/content/drive/MyDrive/Data",
  "database_dir":   "/content/drive/MyDrive/Data/TagSynonyms.db"
}

print(conf)

{'drive_dir': '/content/drive', 'data_dir': '/content/drive/MyDrive/Data', 'database_dir': '/content/drive/MyDrive/Data/TagSynonyms.db'}


# Init

In [7]:
nest_asyncio.apply()
await pyppeteer.chromium_downloader.download_chromium()

[INFO] Starting Chromium download.
INFO:pyppeteer.chromium_downloader:Starting Chromium download.
100%|██████████| 183M/183M [00:04<00:00, 37.8Mb/s]
[INFO] Beginning extraction
INFO:pyppeteer.chromium_downloader:Beginning extraction
[INFO] Chromium extracted to: /root/.local/share/pyppeteer/local-chromium/1181205
INFO:pyppeteer.chromium_downloader:Chromium extracted to: /root/.local/share/pyppeteer/local-chromium/1181205


TypeError: object NoneType can't be used in 'await' expression

### Test Pypeteer browser

In [5]:

async def test_browser():
    browser = await pyppeteer.launch(headless=True, args=["--no-sandbox"])
    page = await browser.newPage()
    await page.goto("https://example.com")
    content = await page.content()
    await browser.close()
    print("Browser virker!")

await test_browser()

[INFO] Starting Chromium download.
INFO:pyppeteer.chromium_downloader:Starting Chromium download.
100%|██████████| 183M/183M [00:04<00:00, 38.0Mb/s]
[INFO] Beginning extraction
INFO:pyppeteer.chromium_downloader:Beginning extraction
[INFO] Chromium extracted to: /root/.local/share/pyppeteer/local-chromium/1181205
INFO:pyppeteer.chromium_downloader:Chromium extracted to: /root/.local/share/pyppeteer/local-chromium/1181205


Browser virker!


### Opret adgang til Google drev

In [19]:
# Godkend adgang til Google sheets
google.colab.auth.authenticate_user()

# Brug google.auth i stedet for oauth2client
creds, _ = google.auth.default()
gc = gspread.authorize(creds)

google.colab.drive.mount(conf["drive_dir"])

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Hent Pyppeter browser pakke

# SQLite handlinger

### Opret SQLite database

In [21]:
# Opret forbindelse til SQLite-database
conn = sqlite3.connect(conf['database_dir'])
cursor = conn.cursor()

# Opret tabeller
cursor.executescript("""
CREATE TABLE IF NOT EXISTS Tags (
    Id INTEGER PRIMARY KEY AUTOINCREMENT,
    Tag STRING(100) UNIQUE NOT NULL,
    TagId INTEGER
);

CREATE TABLE IF NOT EXISTS Synonyms (
    Id INTEGER PRIMARY KEY AUTOINCREMENT,
    Synonym STRING(100) UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS TagSynonyms (
    TId INTEGER NOT NULL,
    SId INTEGER NOT NULL,
    PRIMARY KEY (UId, SId),
    FOREIGN KEY (UId) REFERENCES Tags(UId),
    FOREIGN KEY (SId) REFERENCES Synonyms(Id)
);
""")

conn.commit()


OperationalError: table "TagSynonyms" has more than one primary key

### Gem ord og synonym

In [None]:
def gem_ord_med_synonymer(ordet, synonymer):
    # Indsæt ord (hvis det ikke allerede findes)
    cursor.execute("INSERT OR IGNORE INTO Tags (Tag) VALUES (?)", (ordet,))
    cursor.execute("SELECT TagId FROM Tags WHERE Text = ?", (ordet,))
    tag_id = cursor.fetchone()[0]

    for synonym in synonymer:
        cursor.execute("INSERT OR IGNORE INTO Synonyms (Text) VALUES (?)", (synonym,))
        cursor.execute("SELECT SynonymId FROM Synonyms WHERE Text = ?", (synonym,))
        synonym_id = cursor.fetchone()[0]

        # Link kun hvis ikke allerede linket
        cursor.execute("""
            INSERT OR IGNORE INTO WordSynonyms (TagId, SynonymId)
            VALUES (?, ?)
        """, (tag_id, synonym_id))

    conn.commit()


In [None]:

async def hent_synonymer(ordet):
    url = f"https://synonymet.dk/ord/{ordet}"
    browser = await pyppeteer.launch(headless=True, args=["--no-sandbox"])
    page = await browser.newPage()
    await page.goto(url)

    try:
        await page.waitForSelector(".wordcloud-span", timeout=3000)
        elementer = await page.querySelectorAll(".wordcloud-span")
        synonymer = []
        for el in elementer:
            tekst = await page.evaluate('(el) => el.textContent', el)
            if tekst.strip():
                synonymer.append(tekst.strip())
    except:
        synonymer = []

    await browser.close()
    return synonymer

In [None]:
# Åbn arket og vælg det første ark
sheet = gc.open("Synonymer").sheet1

# Hent alle ord fra kolonne A, start fra række 2
ordliste = sheet.col_values(1)[1:]

In [None]:
resultater = []
for i, ordet in enumerate(ordliste):
    kol = 2
    print(f"Henter synonymer for: {ordet}")
    syns = asyncio.get_event_loop().run_until_complete(hent_synonymer(ordet))
    if syns:
      for syn in syns:
        kol += 1
        sheet.update_cell(i + 2, kol, syn)

    tekst = ", ".join(syns) if syns else "Ingen synonymer fundet"

    resultater.append(tekst)

    # Skriv i kolonne B (række i + 2 pga. overskrift)
    sheet.update_cell(i + 2, 2, tekst)