<a href="https://colab.research.google.com/github/bkristensen/Colabs/blob/main/Teo/Hent_synonymer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installér moduler

In [None]:
!pip install gspread oauth2client pyppeteer nest_asyncio




# Importér moduler

In [4]:
import gspread
import sqlite3

import asyncio
import nest_asyncio

import pyppeteer

import google.auth

# Patch asyncio so run_until_complete() can be called while a loop is already
# running (presumably the notebook kernel's own loop - the synonym lookup in
# gem_ord_med_synonymer relies on this).
nest_asyncio.apply()

# Paths used by the notebook. Note that "database_dir" is the SQLite file
# itself, not a directory.
conf = dict(
    drive_dir="/content/drive",
    data_dir="/content/drive/MyDrive/Data",
    database_dir="/content/drive/MyDrive/Data/Synonyms.db",
)

# Shared SQLite handles. Nothing is opened here; they start out empty.
conn = cursor = None

# Id and tag columns from the sheet, plus a results list that the visible
# code never fills.
ordliste_ids, ordliste_tags, resultater = [], [], []

# Funktioner

In [7]:
async def test_browser():
    """Smoke-test that pyppeteer can launch headless Chromium in this runtime.

    Opens https://example.com, reads the page HTML and prints a confirmation.
    Launch or navigation errors propagate to the caller; the browser is
    closed either way.
    """
    browser = await pyppeteer.launch(headless=True, args=["--no-sandbox"])
    try:
        page = await browser.newPage()
        await page.goto("https://example.com")
        # The HTML is not used; fetching it only proves the page is readable.
        await page.content()
    finally:
        # Without this a failed navigation leaves a Chromium process running.
        await browser.close()
    print("Browser virker!")

def getGoogleAccess():
  """Authenticate with Google, build the gspread client and mount Drive.

  The authorised client is stored in the module-level ``gc`` (which
  getSheetData reads) and is also returned.

  Returns:
    The client returned by ``gspread.authorize``.
  """
  global gc

  # Imported here because the file only does `import google.auth`, which
  # does not bring the google.colab helpers into scope.
  from google.colab import auth as colab_auth
  from google.colab import drive as colab_drive

  # Authorise access to Google Sheets.
  colab_auth.authenticate_user()

  # Use google.auth instead of oauth2client.
  creds, _ = google.auth.default()
  gc = gspread.authorize(creds)

  colab_drive.mount(conf["drive_dir"])
  return gc

def getSheetData():
  """Load the id and tag columns of the "Synonymer" spreadsheet.

  The values are stored in the module-level ``ordliste_ids`` and
  ``ordliste_tags`` lists (the save loop reads those), printed, and also
  returned.

  Returns:
    A ``(ordliste_ids, ordliste_tags)`` tuple.
  """
  # Without this the assignments below would only create locals and the
  # module-level lists would stay empty.
  global ordliste_ids, ordliste_tags

  # get_worksheet() is zero-based, so index 2 selects the third worksheet.
  # NOTE(review): the original comment said "first sheet" - confirm which
  # tab is intended.
  sheet = gc.open("Synonymer").get_worksheet(2)

  # Columns A and B, skipping the header row.
  ordliste_ids = sheet.col_values(1)[1:]
  ordliste_tags = sheet.col_values(2)[1:]
  print(ordliste_ids)
  print(ordliste_tags)
  return ordliste_ids, ordliste_tags

def getDbConnection():
  """Return the shared SQLite connection, opening it when needed.

  The connection is cached in the module-level ``conn``. A cached
  connection that has been closed is replaced by a new one.

  Returns:
    An open ``sqlite3.Connection`` to ``conf['database_dir']``.
  """
  # Needed because the function assigns to conn; without it the first read
  # raises UnboundLocalError.
  global conn

  if conn is not None:
    try:
      # Cheap probe: a closed connection raises ProgrammingError.
      conn.execute("SELECT 1")
    except sqlite3.ProgrammingError:
      conn = None

  if conn is None:
    conn = sqlite3.connect(conf['database_dir'])
  return conn

def getDbCursor():
  """Return the shared SQLite cursor, creating it when needed.

  The cursor is cached in the module-level ``cursor``. A cached cursor that
  can no longer be used (it or its connection was closed) is replaced by a
  new one from getDbConnection().

  Returns:
    A usable ``sqlite3.Cursor``.
  """
  # Needed because the function assigns to cursor; without it the first
  # read raises UnboundLocalError.
  global cursor

  if cursor is not None:
    try:
      # Cheap probe: a closed cursor or connection raises ProgrammingError.
      cursor.execute("SELECT 1").fetchall()
    except sqlite3.ProgrammingError:
      cursor = None

  if cursor is None:
    cursor = getDbConnection().cursor()
  return cursor

def dbCommit():
  """Commit pending changes, then close and forget the shared DB handles.

  Does nothing when no connection has been opened. After the call the
  module-level ``conn`` and ``cursor`` are ``None`` so that closed handles
  are never handed out again.
  """
  global conn, cursor

  if conn is None:
    return

  try:
    conn.commit()
  finally:
    # Release the handles even if the commit fails.
    if cursor is not None:
      cursor.close()
    conn.close()
    conn = None
    cursor = None

def createDatabase():
  """Create the synonym tables if they do not exist yet.

  Tables:
    Tags        - one row per tag word.
    Synonyms    - one row per synonym word.
    TagSynonyms - link table between Tags and Synonyms.
    TagIds      - extra ids linked to a tag.

  The text columns are declared TEXT. The previous ``STRING(100)``
  declaration gives a column NUMERIC affinity in SQLite, so a word that
  looks like a number would be stored as a number. Tables that already
  exist are left untouched by ``IF NOT EXISTS``.

  Finishes by calling dbCommit(), which commits and closes the shared
  connection.
  """
  # Create tables.
  cursor = getDbCursor()
  cursor.executescript("""
CREATE TABLE IF NOT EXISTS Tags (
    Id INTEGER PRIMARY KEY AUTOINCREMENT,
    Tag TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS Synonyms (
    Id INTEGER PRIMARY KEY AUTOINCREMENT,
    Synonym TEXT UNIQUE NOT NULL
);

CREATE TABLE IF NOT EXISTS TagSynonyms (
    TId INTEGER NOT NULL,
    SId INTEGER NOT NULL,
    PRIMARY KEY (TId, SId),
    FOREIGN KEY (TId) REFERENCES Tags(Id),
    FOREIGN KEY (SId) REFERENCES Synonyms(Id)
);

CREATE TABLE IF NOT EXISTS TagIds (
    TId INTEGER NOT NULL,
    TagId INTEGER NOT NULL,
    PRIMARY KEY (TId, TagId),
    FOREIGN KEY (TId) REFERENCES Tags(Id)
);
""")
  dbCommit()

def gem_ord_med_synonymer(ordet, tagid):
    """Store a tag word, its optional extra id and, for new words, its synonyms.

    Args:
        ordet: The tag word; matched against ``Tags.Tag``.
        tagid: Id to link to the tag in ``TagIds``; skipped when falsy.

    Synonyms are fetched with hent_synonymer() only when the word was not
    already in ``Tags``. Each synonym is capitalised, inserted into
    ``Synonyms`` if missing and linked to the tag in ``TagSynonyms``.

    Everything is committed together on success. On failure the changes are
    rolled back, so a word is never left saved without its synonyms (it would
    otherwise count as "already known" and never be fetched again).

    Raises:
        Exception: Whatever the database or the synonym lookup raised,
            re-raised after the rollback.
    """
    db = getDbConnection()
    # A private cursor: closing it at the end must not invalidate the shared
    # cursor that later calls obtain from getDbCursor().
    cur = db.cursor()
    try:
        # Insert the word if it does not exist yet.
        cur.execute("SELECT Id FROM Tags WHERE Tag = ?", (ordet,))
        row = cur.fetchone()
        new_tag = row is None
        if new_tag:
            cur.execute("INSERT OR IGNORE INTO Tags (Tag) VALUES (?)", (ordet,))
            cur.execute("SELECT Id FROM Tags WHERE Tag = ?", (ordet,))
            row = cur.fetchone()
        tag_pk = row[0]

        if tagid:
            cur.execute("""
                INSERT OR IGNORE INTO TagIds (TId, TagId)
                VALUES (?, ?)
            """, (tag_pk, tagid))

        if new_tag:
            print(f"Henter synonymer for: {ordet}")
            syns = asyncio.get_event_loop().run_until_complete(hent_synonymer(ordet))
            for syn in syns or []:
                n = f"{syn}".capitalize()
                cur.execute("INSERT OR IGNORE INTO Synonyms (Synonym) VALUES (?)", (n,))
                cur.execute("SELECT Id FROM Synonyms WHERE Synonym = ?", (n,))
                synonym_id = cur.fetchone()[0]

                # Link only if not already linked.
                cur.execute("""
                    INSERT OR IGNORE INTO TagSynonyms (TId, SId)
                    VALUES (?, ?)
                """, (tag_pk, synonym_id))
        db.commit()
    except Exception:
        db.rollback()
        raise
    finally:
        cur.close()

async def hent_synonymer(ordet):
    """Scrape the synonyms of a word from synonymet.dk.

    Args:
        ordet: The word to look up; it is percent-encoded into the URL path.

    Returns:
        A list with the stripped, non-empty text of every ``.wordcloud-span``
        element. The list is empty when the elements do not appear within
        3 seconds or reading them fails.

    Raises:
        Exception: Errors from launching the browser or loading the page
            propagate; the browser is closed first.
    """
    # Local import so this block does not depend on the file's import cell.
    from urllib.parse import quote

    # Encode the word so characters such as "/", "?" or "#" cannot change
    # which page is requested.
    url = f"https://synonymet.dk/ord/{quote(ordet, safe='')}"
    browser = await pyppeteer.launch(headless=True, args=["--no-sandbox"])
    try:
        page = await browser.newPage()
        await page.goto(url)

        try:
            await page.waitForSelector(".wordcloud-span", timeout=3000)
            elementer = await page.querySelectorAll(".wordcloud-span")
            synonymer = []
            for el in elementer:
                tekst = await page.evaluate('(el) => el.textContent', el)
                if tekst.strip():
                    synonymer.append(tekst.strip())
        except Exception:
            # Best effort: a timeout or read error means "no synonyms".
            # Unlike a bare except, cancellation and Ctrl-C still propagate.
            synonymer = []
    finally:
        # Always close, otherwise a failed page load leaks a Chromium process.
        await browser.close()
    return synonymer

# Init

In [None]:
# Authenticate and mount Drive first: the SQLite file in conf["database_dir"]
# lives under the Drive mount point, so it cannot be opened before the mount.
getGoogleAccess()
getSheetData()

createDatabase()

In [None]:
# Smoke test: launches headless Chromium once and prints a confirmation.
await test_browser()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Config

# SQLite handlinger

### Gem ord og synonym

In [None]:
conn = getDbConnection()
cursor = getDbCursor()

try:
    for i, ordet in enumerate(ordliste_tags):
        # Skip blank cells instead of storing (and scraping) an empty word.
        if not ordet.strip():
            continue
        # The id column can be shorter than the tag column; store the word
        # without an id instead of failing the row with an IndexError.
        tagid = ordliste_ids[i] if i < len(ordliste_ids) else None
        try:
            gem_ord_med_synonymer(ordet.capitalize(), tagid)
        except Exception as e:
            # One failing word must not stop the rest of the list.
            print(e)

    conn.commit()
finally:
    # Release the handles even when the run is interrupted, and forget them
    # so that re-running the cell does not pick up closed objects.
    cursor.close()
    conn.close()
    conn = None
    cursor = None