In [None]:
# definition of functions to be used in ipynb notebooks

async def handle_request(route, request):
    """Playwright route handler that blocks JavaScript requests.

    Args:
        route: object exposing awaitable ``abort()`` and ``continue_()``.
        request: object exposing a ``resource_type`` attribute.

    A request whose ``resource_type`` equals ``"script"`` is aborted; any
    other request is allowed to continue unchanged.
    """
    is_script = request.resource_type == "script"
    # Select the route action first, then await it exactly once.
    action = route.abort if is_script else route.continue_
    await action()

async def read_webpage (url, tag, selector="h3"):
    """Scrape the text of every element matching ``selector`` on ``url``.

    The page is loaded in headless Chromium with JavaScript requests blocked
    (see ``handle_request``). The function then:

    * saves a full-page screenshot to ``my-data/shot-{tag}-{timestamp}.png``;
    * displays the collected titles as an HTML snippet in the notebook;
    * writes the titles to ``my-data/titles-{tag}-{timestamp}.txt`` (one per
      line) and ``my-data/titles-{tag}-{timestamp}.json`` (JSON list).

    Args:
        url: address of the page to load (navigation timeout is 5 seconds).
        tag: short label used in the output file names and the HTML heading.
        selector: CSS selector of the elements whose text is collected.

    Returns:
        None.

    Errors raised by Playwright (for example a navigation timeout) propagate
    to the caller; the browser and the Playwright driver are shut down in
    every case.
    """
    import datetime
    import html as html_lib
    import json
    import os

    from IPython.display import HTML, display
    from playwright.async_api import async_playwright

    # Make sure the output folder exists before anything is written into it.
    os.makedirs("my-data", exist_ok=True)

    # The context manager stops the Playwright driver even when an error is
    # raised; the try/finally does the same for the browser process.
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        try:
            page = await browser.new_page()

            # block js requests
            await page.route("**/*", handle_request)

            # give up on navigation after 5 s
            await page.goto(url, timeout=5000)

            # timestamp (ymd-his) shared by all files produced by this call
            now = datetime.datetime.now()
            snow = now.strftime('%Y%m%d-%H%M%S')
            filename = f"my-data/shot-{tag}-{snow}.png"
            await page.screenshot(path=filename, full_page=True)

            # collect the text of every matching element
            elements = await page.query_selector_all(selector)
            my_titles = [await element.inner_text() for element in elements]
        finally:
            await browser.close()

    # one title per line, each line terminated by a newline
    txt_titles = "".join(title + "\n" for title in my_titles)

    # The titles come from an arbitrary web page: escape them (and the tag)
    # so that markup characters are shown literally instead of being
    # interpreted by the notebook front end.
    safe_tag = html_lib.escape(str(tag))
    safe_titles = html_lib.escape(txt_titles)
    page_html = f"""
    <html>
    <head>
    <title>News {safe_tag}</title>
    </head>
    <body>
    <h1>News {safe_tag}</h1>
    <pre>{safe_titles}</pre>
    </body>
    </html>
    """
    # show html page in the notebook output
    display(HTML(page_html))

    # save titles in a plain file with one title per line; the encoding is
    # explicit so accented characters do not depend on the platform default
    with open(f"my-data/titles-{tag}-{snow}.txt", 'w', encoding='utf-8') as f:
        f.write(txt_titles)

    # save titles in a json file
    with open(f"my-data/titles-{tag}-{snow}.json", 'w', encoding='utf-8') as f:
        json.dump(my_titles, f)

# show screenshot
#from IPython.display import Image
#Image(filename=filename)




In [None]:
# definition of functions to be used in ipynb notebooks

async def handle_request_db(route, request):
    """Route handler used by ``read_webpage_db``: drop JavaScript, keep the rest.

    Args:
        route: object exposing awaitable ``abort()`` and ``continue_()``.
        request: object exposing a ``resource_type`` attribute.
    """
    if request.resource_type != "script":
        # Anything that is not a script goes through untouched.
        await route.continue_()
        return
    # Script requests are cancelled.
    await route.abort()

def insert_news_db (titles, tag, url, sdatetime, db_path='my-data/news.db'):
    """Insert titles into the ``news`` table of a SQLite database.

    The table and a unique index on the ``md5`` column are created when
    missing. Each title is stripped; empty titles are skipped. A title whose
    MD5 hash is already stored (whatever its tag or url) is not inserted again
    and is counted as a duplicate. The number of duplicates and the total
    number of rows are printed.

    Args:
        titles: iterable of title strings.
        tag: label stored in the ``tag`` column.
        url: address stored in the ``url`` column.
        sdatetime: string stored in the ``date`` column.
        db_path: location of the SQLite file; its parent folder is created
            when it does not exist.

    Returns:
        None.
    """
    import hashlib
    import os
    import sqlite3

    # sqlite3.connect() cannot create missing folders, so do it up front.
    parent_dir = os.path.dirname(db_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute('CREATE TABLE IF NOT EXISTS news (id INTEGER PRIMARY KEY, md5 TEXT, tag TEXT, title TEXT, url TEXT, date TEXT)')
        # add a unique index on md5 if it does not exist yet
        c.execute('CREATE UNIQUE INDEX IF NOT EXISTS idx_md5 ON news (md5)')

        # count duplicates
        ndup = 0
        for raw_title in titles:
            tt = raw_title.strip()
            # skip empty titles
            if not tt:
                continue

            # the md5 hash of the title is the de-duplication key
            mhash = hashlib.md5(tt.encode('utf-8')).hexdigest()
            try:
                c.execute('INSERT INTO news (md5, tag, title, url, date) VALUES (?, ?, ?, ?, ?)', (mhash, tag, tt, url, sdatetime))
            except sqlite3.IntegrityError:
                # unique index violated: this title is already stored
                ndup += 1

        print("Number of duplicates: " + str(ndup))
        # the count includes the rows inserted above (same connection)
        c.execute('SELECT COUNT(*) FROM news')
        print("Number of rows in table news: " + str(c.fetchone()[0]))

        conn.commit()
    finally:
        # release the database file even when an insert or query fails
        conn.close()

async def read_webpage_db (url, tag, selector="h3"):
    """Scrape titles from ``url`` and store them in the news database.

    The page is loaded in headless Chromium with JavaScript requests blocked
    (see ``handle_request_db``). The function then:

    * saves a full-page screenshot to ``my-data/shot-{tag}-{timestamp}.png``;
    * passes the stripped, non-empty titles to ``insert_news_db``;
    * displays the titles as an HTML snippet in the notebook;
    * writes the titles to ``my-data/titles-{tag}-{timestamp}.txt`` (one per
      line) and ``my-data/titles-{tag}-{timestamp}.json`` (JSON list).

    Args:
        url: address of the page to load (navigation timeout is 5 seconds).
        tag: short label stored with each title and used in file names.
        selector: CSS selector of the elements whose text is collected.

    Returns:
        None.

    Errors raised by Playwright (for example a navigation timeout) propagate
    to the caller; the browser and the Playwright driver are shut down in
    every case.
    """
    import datetime
    import html as html_lib
    import json
    import os

    from IPython.display import HTML, display
    from playwright.async_api import async_playwright

    # Make sure the output folder exists before anything is written into it.
    os.makedirs("my-data", exist_ok=True)

    # The context manager stops the Playwright driver even when an error is
    # raised; the try/finally does the same for the browser process.
    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch()
        try:
            page = await browser.new_page()

            # block js requests
            await page.route("**/*", handle_request_db)

            # give up on navigation after 5 s
            await page.goto(url, timeout=5000)

            # one timestamp, formatted for file names and for the database
            now = datetime.datetime.now()
            snow = now.strftime('%Y%m%d-%H%M%S')
            sdatetime = now.strftime('%Y-%m-%d %H:%M:%S')
            filename = f"my-data/shot-{tag}-{snow}.png"
            await page.screenshot(path=filename, full_page=True)

            # collect the stripped text of every matching element,
            # leaving out the ones that are empty
            my_titles = []
            for element in await page.query_selector_all(selector):
                tt = (await element.inner_text()).strip()
                if tt:
                    my_titles.append(tt)
        finally:
            await browser.close()

    # insert titles in db
    insert_news_db(my_titles, tag, url, sdatetime)

    # one title per line, each line terminated by a newline
    txt_titles = "".join(title + "\n" for title in my_titles)

    # The titles come from an arbitrary web page: escape them (and the tag)
    # so that markup characters are shown literally instead of being
    # interpreted by the notebook front end.
    safe_tag = html_lib.escape(str(tag))
    safe_titles = html_lib.escape(txt_titles)
    page_html = f"""
    <html>
    <head>
    <title>News {safe_tag}</title>
    </head>
    <body>
    <h1>News {safe_tag}</h1>
    <pre>{safe_titles}</pre>
    </body>
    </html>
    """
    # show html page in the notebook output
    display(HTML(page_html))

    # save titles in a plain file with one title per line; the encoding is
    # explicit so accented characters do not depend on the platform default
    with open(f"my-data/titles-{tag}-{snow}.txt", 'w', encoding='utf-8') as f:
        f.write(txt_titles)

    # save titles in a json file
    with open(f"my-data/titles-{tag}-{snow}.json", 'w', encoding='utf-8') as f:
        json.dump(my_titles, f)

# show screenshot
#from IPython.display import Image
#Image(filename=filename)




In [None]:
# read titles from db
def read_titles_db (tag="", db_path='my-data/news.db'):
    """Read titles from the ``news`` table of a SQLite database.

    Args:
        tag: when empty, every title is returned; otherwise only the titles
            whose ``tag`` column equals this value.
        db_path: location of the SQLite file.

    Returns:
        The list produced by ``fetchall()``: one 1-tuple ``(title,)`` per row.

    Errors raised by sqlite3 (for example when the ``news`` table does not
    exist) propagate to the caller; the connection is closed in every case.
    """
    import sqlite3
    from contextlib import closing

    # closing() guarantees conn.close() even when the query raises
    with closing(sqlite3.connect(db_path)) as conn:
        c = conn.cursor()
        if tag == "":
            c.execute('SELECT title FROM news')
        else:
            c.execute('SELECT title FROM news WHERE tag = ?', (tag,))
        return c.fetchall()

# count how often each word occurs in the titles stored in the db
def count_words_db (tag=""):
    """Count how often each word occurs in the titles stored in the database.

    Titles are read with ``read_titles_db(tag)`` and split on whitespace.
    Every token is lower-cased and stripped of surrounding punctuation.
    Empty tokens and stop words are left out, so every returned count is a
    positive number of occurrences.

    Args:
        tag: passed to ``read_titles_db``; an empty string selects all titles.

    Returns:
        dict mapping each word to its number of occurrences, in order of
        first appearance.
    """
    from collections import Counter

    # characters removed from both ends of a token
    punctuation = ",.;:!?()[]{}'\"«»"

    # stop words (mostly French); spelled as the author collected them
    raw_stopwords = [
        "a", "e", "u", "es", "de", "la", "le", "des", "les", "La", "-",
        "va", "nos",
        "en", "du", "à", "un", "une", "et", "aux", "se", "si",
        "ou", "mais", "ni", "car", "or", "donc", "or", 
        "pour", "sur", "Le", "au", "dans", 
        "Les", "plus", "son", "par", "est", 
        "avec", "sont", "qui", "que", 
        "ce", "ces", "ces", "ceux", "celui", "celle", "celles", 
        "leurs", "leur", "sa", "ses", "son", "sont", "soit", "soient",
        "pas", "ne", "non", "ni", "n'",
        "Au", "après", "nous", "être", "avoir", "avons", "avez", "ont", "c'est",
        "Avec", "d'une", "En", "vous", "veut", "faire", "fait", "faites", "fait",
        "on", "on", "elles", "ils", "il", "elle", "Il", "Elle",
        "À", "n'est", "selon", "d’une", "figaro", "cette", "cet",
        "notre", "entre", "où", "avant", "quand", "dans", "dès", 
        "chez", "sous", "sans", "contre", "devant", "derrière", 
        "près", "jusqu'à", "jusqu’au", "jusqu’aux",
        "faut-il",
    ]
    # Normalise the stop words exactly like the tokens below (lower-case,
    # punctuation stripped); otherwise entries such as "La" or "n'" could
    # never match. A set makes the membership test constant-time.
    stopwords = frozenset(
        entry.lower().strip(punctuation) for entry in raw_stopwords
    ) - {""}

    # read titles from db: one 1-tuple (title,) per row
    titles = read_titles_db(tag)
    # show number of titles
    print(f"Number of titles: {len(titles)}")

    words = Counter()
    for row in titles:
        for token in row[0].split():
            ww = token.lower().strip(punctuation)
            # FIXME: elided articles ("l'", "d'") are still attached to the
            # following word; str.lstrip("l'") is not a fix because it strips
            # characters, not a prefix.
            if not ww or ww in stopwords:
                continue
            words[ww] += 1

    # callers get a plain dict, as before
    return dict(words)

# show words in a bar chart
def show_words_db (max=50, tag=""):
    """Draw a horizontal bar chart of the most frequent words and print them.

    Args:
        max: maximum number of words shown; also drives the figure height
            (0.25 inch per word, never less than 2 inches).
        tag: passed to ``count_words_db``; an empty string selects all titles.

    Returns:
        None. The chart is shown with ``plt.show()`` and the selected
        ``{word: count}`` dict is printed.
    """
    # https://matplotlib.org/stable/gallery/lines_bars_and_markers/barh.html
    import matplotlib.pyplot as plt

    # The parameter shadows the built-in max(); give it a local alias.
    limit = max

    # count words, sort by descending count, keep only the first `limit`
    counts = count_words_db(tag)
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    words = dict(ranked[0:limit])

    # The size must be given when the figure is created: assigning
    # rcParams["figure.figsize"] after drawing only affects later figures
    # and changes global state. A floor avoids a zero-height figure.
    height = limit * 0.25
    if height < 2:
        height = 2
    fig, ax = plt.subplots(figsize=(10, height))

    # keys on the vertical axis, one horizontal bar per word
    positions = list(range(len(words)))
    ax.barh(positions, list(words.values()), align='center')
    # write the count at the end of each bar
    for i, v in enumerate(words.values()):
        ax.text(v, i, str(v))
    ax.set_yticks(positions)
    ax.set_yticklabels(list(words.keys()))

    # most frequent word at the top
    ax.invert_yaxis()

    # vertical grid lines only
    ax.grid(axis='x')

    # show chart
    plt.show()

    # print words
    print(words)