In [20]:
from datetime import datetime
import praw
import prawcore
import sys
from models.Author import Author
from models.document_factory import DocumentFactory
import urllib, urllib.request
import xmltodict
import dill
import os
from models.Corpus import Corpus
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# The Reddit credentials used by fetch_reddit_data are read from os.environ.
load_dotenv()

True

In [21]:
# Shared accumulator: the fetch functions append (origin, document) tuples to it.
listData = []


def fetch_reddit_data(query, limit=10):
    """
    Fetches the hot posts of a subreddit and appends them to the listData list.

    Posts without self text are skipped. Every kept post is converted with
    DocumentFactory.create_document and stored as a ("Reddit", document) tuple.
    The credentials are read from the REDDIT_CLIENT_ID, REDDIT_SECRET_ID and
    REDDIT_USER_AGENT environment variables.

    @param query: name of the subreddit to read (it is passed to reddit.subreddit)
    @param limit: the maximum number of posts to fetch (default: 10)
    @return: None
    @raise SystemExit: when Reddit answers with a status code other than
        302, 403, 404 or 500
    """
    from datetime import timezone  # only needed for the UTC conversion below

    rule = "********************************************************"
    wrong_name_hint = "Please make sure you entered the correct subreddit name."
    known_errors = {
        404: "The subreddit you entered does not exist.\n" + wrong_name_hint,
        403: "You are not allowed to access this subreddit.\n" + wrong_name_hint,
        302: "The subreddit you entered does not exist.\n" + wrong_name_hint,
        500: "An internal server error occured.\nPlease try again later.",
    }

    reddit = praw.Reddit(
        client_id=os.environ.get("REDDIT_CLIENT_ID"),
        client_secret=os.environ.get("REDDIT_SECRET_ID"),
        user_agent=os.environ.get("REDDIT_USER_AGENT"),
    )

    try:
        # The listing is consumed inside the try block because the HTTP
        # errors handled below surface while iterating over it.
        for post in reddit.subreddit(query).hot(limit=limit):
            if not post.selftext:
                continue

            # Collapse line breaks into single spaces: deleting them outright
            # would glue the words on both sides of the break together.
            titre = " ".join(
                post.title.encode("utf-8", errors="ignore").decode("utf-8").split()
            )
            texte = " ".join(post.selftext.split())

            # created_utc is a UTC timestamp, so it must not be interpreted in
            # the local timezone (the calendar day could shift by one).
            date = datetime.fromtimestamp(post.created_utc, tz=timezone.utc).strftime(
                "%Y/%m/%d"
            )

            doc = DocumentFactory.create_document(
                "Reddit",
                titre,
                str(post.author),
                date,
                post.url,
                texte,
            )
            listData.append(("Reddit", doc))
    except (
        prawcore.exceptions.Redirect,
        prawcore.exceptions.Forbidden,
        prawcore.exceptions.NotFound,
        prawcore.exceptions.ServerError,
        prawcore.exceptions.ResponseException,
    ) as e:
        status = e.response.status_code
        print("[ERROR] -- While fetching data from Reddit, an error occured:")
        explanation = known_errors.get(status)
        if explanation is None:
            # Unknown failure: report the raw status code and stop.
            print(status)
            print("Exiting...")
            sys.exit(1)
        print("\n" + rule + "\n" + explanation + "\n" + rule + "\n")


def fetch_arxiv_data(query, limit=10):
    """
    Fetches data from Arxiv and fills the listData list with it.

    The query is URL-encoded before being sent, the Atom answer is parsed
    with xmltodict, and every entry is converted with
    DocumentFactory.create_document and stored as an ("Arxiv", document) tuple.

    @param query: the query to search for (sent as "all:<query>")
    @param limit: the maximum number of posts to fetch (default: 10)
    @return: None
    """
    # Local imports: the file-level imports only cover urllib and urllib.request.
    from urllib.error import URLError
    from urllib.parse import quote

    # Encode the user text so spaces and other special characters cannot break
    # the URL; ":" and "+" stay untouched so field prefixes and already
    # "+"-joined terms produce the same request as before.
    url = (
        "http://export.arxiv.org/api/query?search_query=all:"
        + quote(str(query), safe=":+")
        + "&start=0&max_results="
        + str(limit)
    )

    try:
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            data = response.read().decode("utf-8")
    except URLError as e:
        print("[ERROR] -- While fetching data from Arxiv, an error occured:")
        print(e)
        return

    # transform the data from xml to a dict
    dataToJson = xmltodict.parse(data)
    feed = dataToJson["feed"]

    # the key 'entry' is missing when the query has no result
    if "entry" not in feed:
        print(
            """
[ERROR] -- While fetching data from Arxiv, an error occured:
********************************************************
No 'entry' key found in the (Arxiv) API response. Unable to retrieve document information.
********************************************************
            """
        )
        return

    # xmltodict yields a dict (not a list) when an element occurs only once,
    # which happens for a single result, a single author or a single link.
    adocs = feed["entry"]
    if not isinstance(adocs, list):
        adocs = [adocs]

    for aPost in adocs:
        # Collapse line breaks into single spaces: deleting them outright
        # would merge the words at the end and start of wrapped lines.
        titre = " ".join(aPost["title"].split())
        summary = " ".join((aPost.get("summary") or "").split())

        author_field = aPost["author"]
        if isinstance(author_field, list):
            authors = ", ".join(a["name"] for a in author_field)
        else:
            authors = author_field["name"]

        date = datetime.strptime(aPost["published"], "%Y-%m-%dT%H:%M:%SZ").strftime(
            "%Y/%m/%d"
        )

        links = aPost["link"]
        if not isinstance(links, list):
            links = [links]
        # Keep the historical choice (second link) and fall back to the only
        # link when the entry has just one.
        link = links[1] if len(links) > 1 else links[0]

        adoc = DocumentFactory.create_document(
            "Arxiv",
            titre,
            authors,
            date,
            link["@href"],
            summary,
        )
        listData.append(("Arxiv", adoc))


# Maps a generated "<origin>_<position>" identifier to its document.
id2doc = {}


def fillDocDict():
    """
    Indexes every document of listData in id2doc.

    The key is the origin of the document followed by its position in
    listData (the position counts all documents, whatever their origin).

    @return: None
    """
    for position, (source, document) in enumerate(listData):
        id2doc[f"{source}_{position}"] = document


# id2aut maps an author name to its Author object;
# authors maps a numeric author id (starting at 1) to the same object.
id2aut = {}
authors = {}


def fillAuthorsDict():
    """
    Registers the author of every document of id2doc.

    An Author is created the first time a name is met and stored in both
    authors (by numeric id) and id2aut (by name). Numbering continues after
    the highest id already present, so calling the function again never
    overwrites an author registered by a previous call.

    @return: None
    """
    idAuthor = max(authors, default=0)
    for doc in id2doc.values():
        if doc.auteur not in id2aut:
            idAuthor += 1
            authors[idAuthor] = Author(doc.auteur)
            id2aut[doc.auteur] = authors[idAuthor]


from models.Corpus import Corpus

# Keep a second name for the imported Corpus object, then remove the original
# `Corpus` name from the namespace (a later cell imports it again).
# NOTE(review): the alias is the very same object as `Corpus`, so despite its
# name it cannot by itself bypass a singleton wrapper — confirm the intent.
CorpusNoSingleton = Corpus
del Corpus
corpus = CorpusNoSingleton("Mon corpus")

# Running count of documents added to the corpus; incremented by fillCorpus().
nbPosts = 0


def fillCorpus(docType=""):
    """
    Adds the documents of listData to the global corpus.

    nbPosts is incremented once per added document.

    @param docType: when non-empty, only documents whose origin equals this
        value are added; with the default empty string every document is added
    @return: None
    """
    global nbPosts

    for origin, d in listData:
        # Previously both branches of the test added the document, so the
        # docType argument never filtered anything.
        if docType and docType != origin:
            continue
        corpus.add(d)
        nbPosts += 1


def get_existing_data():
    """
    Replaces the global corpus with the one stored in ./corpus.pkl.

    The number of documents found in the loaded corpus is printed.

    @return: the corpus read from the file
    """
    global corpus

    # NOTE(review): dill.load can run arbitrary code while unpickling; only
    # load a corpus.pkl that was produced locally.
    with open("./corpus.pkl", "rb") as pickle_file:
        restored = dill.load(pickle_file)

    corpus = restored
    print(f"{len(corpus.id2doc)} posts were retrieved from the corpus.")
    return corpus


def fetch_new_data(query, limit):
    """
    Runs the whole acquisition pipeline and returns the global corpus.

    The intermediate containers (listData, id2doc, id2aut, authors) are reset,
    then each step is announced on stdout and executed in order: Reddit fetch,
    Arxiv fetch, document index, author index, corpus filling, corpus saving.

    @param query: forwarded to fetch_reddit_data and fetch_arxiv_data
    @param limit: forwarded to fetch_reddit_data and fetch_arxiv_data
    @return: the global corpus
    """
    global listData, id2doc, id2aut, authors

    listData, id2doc, id2aut, authors = [], {}, {}, {}
    rule = "-" * 50

    # (announcement, action) pairs; the lambdas resolve the helpers only when
    # their step is reached.
    pipeline = (
        ("Fetching data from Reddit...", lambda: fetch_reddit_data(query, limit)),
        ("Fetching data from Arxiv...", lambda: fetch_arxiv_data(query, limit)),
        ("Filling the document dictionary...", lambda: fillDocDict()),
        ("Filling the authors dictionary...", lambda: fillAuthorsDict()),
        ("Filling the corpus...", lambda: fillCorpus()),
        ("Saving the corpus...", lambda: corpus.save()),
    )
    for announcement, step in pipeline:
        print(rule)
        print(announcement)
        step()

    print("Corpus saved successfully!")
    print(f"{len(corpus.id2doc)} posts were fetched and saved from Reddit and Arxiv.")
    print(rule)
    return corpus

In [22]:
import ipywidgets as widgets
from IPython.display import display
from models.decorators import singleton
from models.Corpus import Corpus

corpus = Corpus("Mon corpus")


# Form fields: the keyword to look for and the number of items requested
# from each source.
mot_clef_textbox = widgets.Text(description="Mot-clé :")
nb_articles_slider = widgets.IntSlider(
    description="Nombre d'articles :", min=1, max=100, value=10
)


# Show the form fields
display(mot_clef_textbox)
display(nb_articles_slider)


def on_submit_button_clicked(button):
    """
    Fetches fresh data for the keyword typed in the form and prints the corpus.

    Nothing is fetched when the keyword is empty or only made of whitespace:
    such a value would be sent as-is to both APIs.

    @param button: the clicked button (unused)
    @return: None
    """
    query = mot_clef_textbox.value.strip()
    if not query:
        print("Please enter a keyword before fetching data.")
        return

    limit = nb_articles_slider.value
    # Use the values entered by the user to query both APIs.
    fetched_corpus = fetch_new_data(query, limit)
    print(fetched_corpus)


def use_local_date(button):
    """
    Loads the corpus saved on disk through get_existing_data and prints it.

    @param button: the clicked button (unused)
    @return: None
    """
    loaded_corpus = get_existing_data()
    print(loaded_corpus)


# Submit button of the fetch form
submit_button = widgets.Button(description="Valider")
submit_button.on_click(on_submit_button_clicked)
display(submit_button)

# Alternative: reuse the data already saved locally
text = widgets.HTML(value="<h2>Ou utiliser des données local</h2>")
display(text)
existing_data_btn = widgets.Button(description="Valider")
existing_data_btn.on_click(use_local_date)
display(existing_data_btn)

Text(value='', description='Mot-clé :')

IntSlider(value=10, description="Nombre d'articles :", min=1)

Button(description='Valider', style=ButtonStyle())

In [23]:
search_box = widgets.Text(description="Enter a query to search for :")
search_length = widgets.IntSlider(
    description="Choose a context length :", min=1, max=100, value=30
)
search_btn = widgets.Button(description="Valider")


def on_submit_search_btn(button, query_box=search_box, length_slider=search_length):
    """
    Prints the passages of the corpus that contain the typed keyword.

    The two widgets are captured as default arguments when the function is
    defined. Later cells rebind the global names search_box and search_length
    to other widgets, so reading the globals at click time would use the
    wrong inputs.

    @param button: the clicked button (unused)
    @param query_box: text widget holding the keyword
    @param length_slider: slider holding the context length
    @return: None
    """
    query = query_box.value
    nb = length_slider.value
    passages = corpus.search(query, context_length=nb)
    if not passages:
        print("No matches found.")
    else:
        print(f"Number of passages containing the keyword '{query}': {len(passages)}\n")
        print("Passages containing the keyword:\n")
        for passage in passages:
            print(f"{passage}\n")


search_btn.on_click(on_submit_search_btn)

display(search_btn)
display(search_box)
display(search_length)

Button(description='Valider', style=ButtonStyle())

Text(value='', description='Enter a query to search for :')

IntSlider(value=30, description='Choose a context length :', min=1)

In [24]:
# Section title
text = widgets.HTML(value="<h2>See the concordance of a keyword</h2>")
display(text)

# Keyword input
text = widgets.HTML(value="<b>Enter a query to search for :</b>")
display(text)
search_box = widgets.Text()
display(search_box)

# Left context length (1 to 100, 30 by default)
text = widgets.HTML(value="<b>Choose a left context length :</b>")
display(text)
left_search_length = widgets.IntSlider(min=1, max=100, value=30)
display(left_search_length)

# Right context length (1 to 100, 30 by default)
text = widgets.HTML(value="<b>Choose a right context length :</b>")
display(text)
right_search_length = widgets.IntSlider(min=1, max=100, value=30)
display(right_search_length)


def on_submit_concordance_btn(button):
    """
    Prints the result of corpus.concorde for the typed keyword.

    The left and right context lengths are passed together as a pair.
    "No matches found." is printed when the result is empty.

    @param button: the clicked button (unused)
    @return: None
    """
    context_sizes = (left_search_length.value, right_search_length.value)
    df = corpus.concorde(search_box.value, taill=context_sizes)
    print("No matches found." if len(df) == 0 else df)


search_btn = widgets.Button(description="Valider")
search_btn.on_click(on_submit_concordance_btn)
display(search_btn)

HTML(value='<h2>Search for a keyword</h2>')

HTML(value='<b>Enter a query to search for :</b>')

Text(value='')

HTML(value='<b>Choose a context length :</b>')

IntSlider(value=30, min=1)

Button(description='Valider', style=ButtonStyle())

In [25]:
text = widgets.HTML(value="<h2>See the stats of the corpus</h2>")
display(text)


text = widgets.HTML(value="<b>Enter the number of most frequent words to show :</b>")
display(text)
search_length = widgets.IntSlider(min=1, max=100, value=30)
display(search_length)


def on_submit_cstats_btn(button, top_n_slider=search_length):
    """
    Calls corpus.stats with the number of words chosen on the slider.

    The slider is captured as a default argument when the function is
    defined, so the callback keeps reading this cell's slider even if the
    global name search_length is rebound. The former unused read of
    search_box.value was removed: it made this callback depend on a widget
    owned by another cell.

    @param button: the clicked button (unused)
    @param top_n_slider: slider holding the number of words to show
    @return: None
    """
    corpus.stats(top_n=top_n_slider.value)


search_btn = widgets.Button(description="Valider")
search_btn.on_click(on_submit_cstats_btn)

display(search_btn)

HTML(value='<h2>See the concordance of a keyword</h2>')

HTML(value='<b>Enter a query to search for :</b>')

Text(value='')

HTML(value='<b>Choose a left context length :</b>')

IntSlider(value=30, min=1)

HTML(value='<b>Choose a right context length :</b>')

IntSlider(value=30, min=1)

Button(description='Valider', style=ButtonStyle())

In [26]:
# Section title
text = widgets.HTML(value="<h2>See the vocabulary</h2>")
display(text)

search_btn = widgets.Button(description="Voir")


def on_submit_vocabulary_btn(button):
    """
    Calls corpus.get_vocabulary_stats(); its return value is ignored.

    @param button: the clicked button (unused)
    @return: None
    """
    corpus.get_vocabulary_stats()


search_btn.on_click(on_submit_vocabulary_btn)
display(search_btn)

HTML(value='<h2>See the stats of the corpus</h2>')

HTML(value='<b>Enter the number of most frequent words to show :</b>')

IntSlider(value=30, min=1)

Button(description='Valider', style=ButtonStyle())

In [27]:
# Section title
text = widgets.HTML(value="<h2>See word frequencies</h2>")
display(text)

search_btn = widgets.Button(description="Voir")


def on_submit_frequencies_btn(button):
    """
    Prints the value returned by corpus.get_word_frequencies().

    @param button: the clicked button (unused)
    @return: None
    """
    frequencies = corpus.get_word_frequencies()
    print(frequencies)


search_btn.on_click(on_submit_frequencies_btn)
display(search_btn)

HTML(value='<h2>See the vocabulary</h2>')

Button(description='Voir', style=ButtonStyle())

In [28]:
text = widgets.HTML(value="<h2>Use the search engine</h2>")
display(text)


text = widgets.HTML(value="<b>Enter keywords to search for :</b>")
display(text)
search_query = widgets.Text()
display(search_query)


# NOTE(review): this definition reuses the name on_submit_cstats_btn already
# used by the corpus-stats cell. The earlier button keeps its own callback
# because on_click received the function object, but the name now refers to
# the search-engine handler below.
def on_submit_cstats_btn(button):
    """
    Scores the corpus against the typed keywords and prints the ranked documents.

    The query goes through corpus.search_on_scoring, then
    corpus.rank_after_scoring; every (document id, score) pair of the result
    is printed with the title, author, text, URL and date of the document.
    A message is printed instead when the query is empty or when the ranking
    contains no entry.

    @param button: the clicked button (unused)
    @return: None
    """
    query = search_query.value
    if not query.strip():
        print("Please enter at least one keyword.")
        return

    scores = corpus.rank_after_scoring(corpus.search_on_scoring(query))
    if len(scores) == 0:
        print("No matches found.")
        return

    separator = "-" * 50
    print(f"Here are the best {len(scores)} result(s) for your query :\n")
    for doc_id, score in scores:
        # Look the document up once instead of once per printed field.
        doc = corpus.id2doc[doc_id]
        print(separator)
        print(
            f"Document:\nTitle : {doc.titre}\nAuthor : {doc.auteur}\n"
            f"Text : {doc.texte}\nURL : {doc.url}\nDate : {doc.date}\n"
        )
        print(f"This document has a score of {score} concerning your request\n")
        print(separator)


search_btn = widgets.Button(description="Search")
search_btn.on_click(on_submit_cstats_btn)

display(search_btn)

HTML(value='<h2>See word frequencies</h2>')

Button(description='Voir', style=ButtonStyle())