<a href="https://colab.research.google.com/github/bkristensen/Colabs/blob/main/Teo/Hent_synonymer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installere moduler

In [None]:
!pip install gspread oauth2client pyppeteer nest_asyncio




# Importér moduler

In [None]:
import gspread
import sqlite3

import asyncio
import nest_asyncio

import pyppeteer

import google.auth

nest_asyncio.apply()


# Funktioner

In [39]:
# Filesystem locations shared by the cells below.
#   drive_dir    - mount point handed to the Google Drive mount call
#   data_dir     - data folder on the mounted drive
#   database_dir - path passed to sqlite3.connect(), i.e. the database file
conf = dict(
  drive_dir="/content/drive",
  data_dir="/content/drive/MyDrive/Data",
  database_dir="/content/drive/MyDrive/Data/Synonyms.db",
)

# NOTE(review): not referenced by any code visible in this notebook section.
resultater = list()

async def test_browser():
  """Smoke-test the headless browser by loading https://example.com.

  Launches Chromium through pyppeteer, opens a page, navigates to the test
  URL and reads its HTML. Prints a confirmation when every step succeeded.

  Raises:
    Whatever pyppeteer raises when launching, navigating or reading fails.
    The browser is closed in that case as well.
  """
  browser = await pyppeteer.launch(headless=True, args=["--no-sandbox"])
  try:
    page = await browser.newPage()
    await page.goto("https://example.com")
    # Only the fact that the HTML can be read matters; the value is discarded.
    await page.content()
  finally:
    # Always shut the browser down so a failed check leaves no process behind.
    await browser.close()
  print("Browser virker!")

async def initGoogleAccess():
  """Authenticate the Colab user and mount Google Drive at conf["drive_dir"].

  The function contains no awaits; it stays a coroutine so existing
  `await initGoogleAccess()` calls keep working.
  """
  # Import the Colab helpers explicitly: `import google.auth` alone does not
  # guarantee that the `google.colab.auth` / `google.colab.drive` submodules
  # are loaded.
  from google.colab import auth, drive

  # Grant this runtime access to the user's Google account (needed for Sheets).
  auth.authenticate_user()
  drive.mount(conf["drive_dir"])

async def hent_synonymer(ordet: str) -> list[str]:
  """Scrape the synonyms of `ordet` from synonymet.dk.

  Opens the word's page in a fresh headless browser, waits up to 3 seconds
  for elements with class "wordcloud-span" and collects their non-empty,
  stripped text content.

  Args:
    ordet: The word to look up.

  Returns:
    The synonyms found, in page order. Empty when the selector never appears
    or reading the elements fails (a message is printed in that case).

  Raises:
    Whatever pyppeteer raises when launching the browser, opening the page or
    navigating fails. The browser is closed before the error propagates.
  """
  from urllib.parse import quote

  synonymer: list = []
  browser = await pyppeteer.launch(headless=True, args=["--no-sandbox"])
  try:
    page = await browser.newPage()
    # Percent-encode the word so spaces, non-ASCII letters and characters
    # such as "/", "?" or "#" cannot alter the structure of the URL.
    await page.goto(f"https://synonymet.dk/ord/{quote(ordet, safe='')}")
    try:
      await page.waitForSelector(".wordcloud-span", timeout=3000)
      elementer = await page.querySelectorAll(".wordcloud-span")
      for el in elementer:
        tekst = (await page.evaluate('(el) => el.textContent', el)).strip()
        if tekst:
          synonymer.append(tekst)
    except Exception as e:
      # Best effort: a word without synonyms is reported, not treated as fatal.
      print(f"Kunne ikke finde synonymer for {ordet}: {e}")
  finally:
    # Close the browser on every path, including failed navigation.
    await browser.close()
  return synonymer

async def initDatabase():
  """Create the SQLite tables in conf['database_dir'] if they are missing.

  Tables:
    Tags        - one row per tag (unique text).
    Synonyms    - one row per synonym (unique text).
    TagSynonyms - links a tag (TId) to a synonym (SId).
    TagIds      - links a tag (TId) to a TagId value.

  Existing tables are left untouched (CREATE TABLE IF NOT EXISTS). The
  function contains no awaits; it stays a coroutine so existing
  `await initDatabase()` calls keep working.
  """
  conn = sqlite3.connect(conf['database_dir'])
  try:
    # The text columns are declared TEXT: in SQLite a declared type of
    # "STRING" gets NUMERIC affinity, which would silently convert
    # number-like words to numbers when they are stored.
    conn.executescript("""
CREATE TABLE IF NOT EXISTS Tags (
    Id INTEGER PRIMARY KEY AUTOINCREMENT,
    Tag TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Synonyms (
    Id INTEGER PRIMARY KEY AUTOINCREMENT,
    Synonym TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS TagSynonyms (
    TId INTEGER NOT NULL,
    SId INTEGER NOT NULL,
    PRIMARY KEY (TId, SId),
    FOREIGN KEY (TId) REFERENCES Tags(Id),
    FOREIGN KEY (SId) REFERENCES Synonyms(Id)
);

CREATE TABLE IF NOT EXISTS TagIds (
    TId INTEGER NOT NULL,
    TagId INTEGER NOT NULL,
    PRIMARY KEY (TId, TagId),
    FOREIGN KEY (TId) REFERENCES Tags(Id)
);
""")
    conn.commit()
  finally:
    # Release the database file even when creating the schema fails.
    conn.close()

def loadSheetData():
  """Read the id and tag columns from the "Synonymer" Google spreadsheet.

  Authorizes gspread with the default Google credentials, opens the
  spreadsheet named "Synonymer" and selects the worksheet at index 2.

  Returns:
    A tuple (ids, tags): the values of column 1 and of column 2, each with
    its first entry dropped (presumably the header row).
  """
  # NOTE(review): index 2 is presumably the third tab (gspread appears to
  # count worksheets from 0) - confirm this is the intended sheet.
  credentials = google.auth.default()[0]
  client = gspread.authorize(credentials)
  worksheet = client.open("Synonymer").get_worksheet(2)

  # NOTE(review): the two columns are fetched independently; confirm they
  # always come back with the same length, since callers pair them by index.
  id_column = worksheet.col_values(1)
  tag_column = worksheet.col_values(2)
  return id_column[1:], tag_column[1:]

def gem_ord_med_synonymer(conn: sqlite3.Connection, ordet: str, tagid: int | None):
  # Indsæt ord (hvis det ikke allerede findes)
  new_tag: bool = False
  id: int | None = None
  fdata: list | None = None

  cursor: sqlite3.Cursor = conn.cursor()
  cursor.execute("SELECT Id FROM Tags WHERE Tag = ?", (ordet,))
  fdata = cursor.fetchone()

  if not fdata:
    new_tag = True
    cursor.execute("INSERT OR IGNORE INTO Tags (Tag) VALUES (?)", (ordet,))
    cursor.execute("SELECT Id FROM Tags WHERE Tag = ?", (ordet,))
    id = cursor.fetchone()[0]
  else:
    id = fdata[0]

  if tagid:
    cursor.execute("INSERT OR IGNORE INTO TagIds (TId, TagId) VALUES (?, ?)", (id, tagid))

  if new_tag:
    print(f"Henter synonymer for: {ordet}")
    syns: list = asyncio.get_event_loop().run_until_complete(hent_synonymer(ordet))
    if syns:
      for syn in syns:
        synonym = f"{syn}".capitalize()
        cursor.execute("INSERT OR IGNORE INTO Synonyms (Synonym) VALUES (?)", (synonym,))
        cursor.execute("SELECT Id FROM Synonyms WHERE Synonym = ?", (synonym,))
        synonym_id = cursor.fetchone()[0]

        # Link kun hvis ikke allerede linket
        cursor.execute("INSERT OR IGNORE INTO TagSynonyms (TId, SId) VALUES (?, ?)", (id, synonym_id))

  conn.commit()
  cursor.close()



# Init

In [None]:
# Smoke test: confirm the headless browser can be launched and can load a page.
await test_browser()

[INFO] Starting Chromium download.
INFO:pyppeteer.chromium_downloader:Starting Chromium download.
100%|██████████| 183M/183M [00:01<00:00, 131Mb/s]
[INFO] Beginning extraction
INFO:pyppeteer.chromium_downloader:Beginning extraction
[INFO] Chromium extracted to: /root/.local/share/pyppeteer/local-chromium/1181205
INFO:pyppeteer.chromium_downloader:Chromium extracted to: /root/.local/share/pyppeteer/local-chromium/1181205


Browser virker!


In [40]:
# Authenticate against Google and mount Drive at conf["drive_dir"].
await initGoogleAccess()

# Create the SQLite tables at conf["database_dir"] if they do not exist yet.
await initDatabase()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Gem ord og synonym

In [41]:
# Load the (id, tag) columns from the sheet and store every tag together with
# its synonyms. A failure for one word is reported and the loop continues.
ordliste_ids, ordliste_tags = loadSheetData()

conn = sqlite3.connect(conf['database_dir'])
try:
  print(f"Antal ord: {len(ordliste_tags)}")

  for i, ordet in enumerate(ordliste_tags):
    # The id column can be shorter than the tag column; store such a word
    # without an id link instead of failing on the index lookup.
    tagid = ordliste_ids[i] if i < len(ordliste_ids) else None
    try:
      gem_ord_med_synonymer(conn, ordet.capitalize(), tagid)
    except Exception as e:
      print(f"Fejl under tjek af {ordet} - Fejl: {e}")
finally:
  # Close the connection even when the cell is interrupted mid-run.
  conn.close()

Antal ord: 510
Tjekker ord: Blod
Tjekker ord: Lægebehandling
Tjekker ord: Dæmoner
Tjekker ord: Satan
Tjekker ord: Gamle
Tjekker ord: Gamle verden
Tjekker ord: Verden
Tjekker ord: Afgud
Tjekker ord: Billeddyrkelse
Tjekker ord: Idol
Tjekker ord: Velsignelser
Tjekker ord: Kundskab
Tjekker ord: Visdom
Tjekker ord: Kærlighed
Tjekker ord: Loyal
Tjekker ord: Loyal kærlighed
Tjekker ord: Skabelsen
Tjekker ord: Skaber
Tjekker ord: Skaberværket
Tjekker ord: Organisation
Tjekker ord: Teokratiske
Tjekker ord: Teokratiske organisation
Tjekker ord: Hellig
Tjekker ord: Ren
Tjekker ord: Åndelig
Tjekker ord: Åndelig ren
Tjekker ord: Leksikon
Tjekker ord: Wiki
Tjekker ord: Link
Tjekker ord: Byrde
Tjekker ord: Ufuldkommen
Tjekker ord: Helbred
Tjekker ord: Sygdom
Tjekker ord: Dårlig
Tjekker ord: Dårlig frugt
Tjekker ord: Frugt
Tjekker ord: Guds
Tjekker ord: Guds magt
Tjekker ord: Magt
Tjekker ord: Forskrifter
Tjekker ord: Guds
Tjekker ord: Guds krav
Tjekker ord: Krav
Tjekker ord: Love
Tjekker ord: Betydni