In [1]:
from datetime import datetime
import praw
import prawcore
import sys

from models.Author import Author


from models.document_factory import DocumentFactory


import urllib, urllib.request


import xmltodict
import dill
import os
from dotenv import load_dotenv
load_dotenv()


True

In [2]:
# Shared accumulator: fetch_reddit_data and fetch_arxiv_data append
# (origin, document) tuples to it, with origin being "Reddit" or "Arxiv".
listData = []




def fetch_reddit_data(query, limit=10):
    """
    Fetches the hot posts of a subreddit and appends them to the listData list.

    Only posts with a non-empty selftext are kept. Each kept post is turned
    into a document through DocumentFactory.create_document and appended to
    listData as a ("Reddit", document) tuple.

    The Reddit credentials are read from the REDDIT_CLIENT_ID, REDDIT_SECRET_ID
    and REDDIT_USER_AGENT environment variables.

    @param query: name of the subreddit whose hot posts are fetched
    @param limit: the maximum number of posts to fetch (default: 10)
    @return: None
    @raise SystemExit: when Reddit answers with a status code other than
        302, 403, 404 or 500
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    reddit = praw.Reddit(
        client_id=os.environ.get("REDDIT_CLIENT_ID"),
        client_secret=os.environ.get("REDDIT_SECRET_ID"),
        user_agent=os.environ.get("REDDIT_USER_AGENT"),
    )

    hot_posts = reddit.subreddit(query).hot(limit=limit)
    try:
        for post in hot_posts:
            if post.selftext == "":
                # Link-only posts carry no text to analyse.
                continue

            titre = (
                post.title.encode("utf-8", errors="ignore")
                .decode("utf-8")
                .replace("\n", "")
            )
            auteur = str(post.author)
            # created_utc is a UTC epoch: convert it in UTC, otherwise the
            # resulting day depends on the time zone of the machine.
            date = datetime.fromtimestamp(
                post.created_utc, tz=timezone.utc
            ).strftime("%Y/%m/%d")
            texte = post.selftext.replace("\n", "")

            doc = DocumentFactory.create_document(
                "Reddit",
                titre,
                auteur,
                date,
                post.url,
                texte,
            )
            listData.append(("Reddit", doc))
    except (
        prawcore.exceptions.Redirect,
        prawcore.exceptions.Forbidden,
        prawcore.exceptions.NotFound,
        prawcore.exceptions.ServerError,
        prawcore.exceptions.ResponseException,
    ) as e:
        print("[ERROR] -- While fetching data from Reddit, an error occured:")
        status = e.response.status_code
        if status in (404, 302):
            # Both "not found" and "redirected" are reported as an unknown subreddit.
            print(
                """
********************************************************
The subreddit you entered does not exist.
Please make sure you entered the correct subreddit name.
********************************************************
            """
            )
        elif status == 403:
            print(
                """
********************************************************
You are not allowed to access this subreddit.
Please make sure you entered the correct subreddit name.
********************************************************
            """
            )
        elif status == 500:
            print(
                """
********************************************************
An internal server error occured.
Please try again later.
********************************************************
            """
            )
        else:
            # Unknown failure: report the raw status code and stop.
            print(status)
            print("Exiting...")
            sys.exit(1)




def fetch_arxiv_data(query, limit=10):
    """
    Fetches data from Arxiv and fills the listData list with it.

    The Arxiv export API is queried with "all:<query>". Every returned entry is
    turned into a document through DocumentFactory.create_document and appended
    to listData as an ("Arxiv", document) tuple. When the response holds no
    entry, an error banner is printed and listData is left untouched.

    @param query: the query to search for (percent-encoded before being sent)
    @param limit: the maximum number of posts to fetch (default: 10)
    @return: None
    """
    # Local import so the block does not rely on urllib.parse having been
    # loaded as a side effect of importing urllib.request.
    from urllib.parse import quote

    url = (
        "http://export.arxiv.org/api/query?search_query=all:"
        + quote(str(query))  # spaces/reserved characters would break the URL
        + "&start=0&max_results="
        + str(limit)
    )

    # Close the HTTP response as soon as its body has been read.
    with urllib.request.urlopen(url) as response:
        data = response.read().decode("utf-8")

    # transform the data from xml to a dict
    feed = xmltodict.parse(data)["feed"]

    # The 'entry' key is absent when the search returns nothing.
    if "entry" not in feed:
        print(
            """
[ERROR] -- While fetching data from Arxiv, an error occured:
********************************************************
No 'entry' key found in the (Arxiv) API response. Unable to retrieve document information.
********************************************************
            """
        )
        return

    entries = feed["entry"]
    if not isinstance(entries, list):
        # A single <entry> is parsed as a dict rather than a one-element list.
        entries = [entries]

    for entry in entries:
        titre = entry["title"].replace("\n", "")

        # Same list-or-dict duality for the authors of an entry.
        raw_authors = entry["author"]
        if isinstance(raw_authors, list):
            author_names = ", ".join([a["name"] for a in raw_authors])
        else:
            author_names = raw_authors["name"]

        summary = entry["summary"].replace("\n", "")
        date = datetime.strptime(entry["published"], "%Y-%m-%dT%H:%M:%SZ").strftime(
            "%Y/%m/%d"
        )

        # Keep using the second link when there are several; fall back to the
        # only available one instead of failing.
        links = entry["link"]
        if isinstance(links, list):
            href = links[1]["@href"] if len(links) > 1 else links[0]["@href"]
        else:
            href = links["@href"]

        adoc = DocumentFactory.create_document(
            "Arxiv",
            titre,
            author_names,
            date,
            href,
            summary,
        )
        listData.append(("Arxiv", adoc))







# Maps a generated document id ("<origin>_<position>") to its document.
id2doc = {}


def fillDocDict():
    """
    Indexes every document of listData in the id2doc dictionary.

    The key of a document is "<origin>_<position>", where position is the
    zero-based index of its (origin, document) tuple in listData.
    @return: None
    """
    id2doc.update(
        {
            f"{source}_{position}": document
            for position, (source, document) in enumerate(listData)
        }
    )


# id2aut maps an author name to its Author object; authors maps a numeric id
# (starting at 1) to the same Author object.
id2aut = {}
authors = {}


def fillAuthorsDict():
    """
    Registers the author of every document of id2doc.

    An Author is created the first time an author name (doc.auteur) is seen;
    it is stored in authors under a new numeric id and in id2aut under the
    author name. Names already present in id2aut are skipped.
    @return: None
    """
    # Continue numbering after the highest id already handed out, so that a
    # later call never overwrites authors registered by a previous call.
    idAuthor = max(authors, default=0)
    for doc in id2doc.values():
        if doc.auteur not in id2aut:
            idAuthor += 1
            authors[idAuthor] = Author(doc.auteur)
            id2aut[doc.auteur] = authors[idAuthor]


# Keep the imported Corpus class under the name CorpusNoSingleton and drop the
# `Corpus` name itself; `Corpus` is defined again further down in this file as
# a subclass of CorpusNoSingleton.
from models.Corpus import Corpus

CorpusNoSingleton = Corpus
del Corpus
# Module-level corpus that fillCorpus() adds documents to.
corpus = CorpusNoSingleton("Mon corpus")

# Running count of the documents added to the corpus by fillCorpus().
nbPosts = 0


def fillCorpus(docType=""):
    """
    Adds the documents of listData to the module-level corpus.

    nbPosts is incremented once for every document added.

    @param docType: origin to keep ("Reddit" or "Arxiv"); the default empty
        string keeps the documents of every origin
    @return: None
    """
    global nbPosts
    for origin, d in listData:
        # An empty docType means "no filter"; otherwise skip other origins.
        if docType and docType != origin:
            continue
        corpus.add(d)
        nbPosts += 1







In [3]:
import ipywidgets as widgets
from IPython.display import display
from models.decorators import singleton


# Rebinds the module-level `corpus` to None; on_submit_button_clicked assigns
# the actual instance when the form is submitted.
corpus = None  # Initialize the corpus variable


# CorpusNoSingleton subclass wrapped by the project's `singleton` decorator.
# NOTE(review): presumably every Corpus(...) call then returns one shared
# instance -- confirm against models.decorators.
@singleton
class Corpus(CorpusNoSingleton):
    pass


# Form fields: a free-text keyword and the number of articles to fetch.
mot_clef_textbox = widgets.Text(description="Mot-clé :")
nb_articles_slider = widgets.IntSlider(
    value=10,
    min=1,
    max=100,
    description="Nombre d'articles :",
)

# Show the form fields, keyword box first.
display(mot_clef_textbox, nb_articles_slider)



# Callback that fetches the data once the user submits the form.


def on_submit_button_clicked(button):
    """
    Builds the corpus from the values currently entered in the form.

    Reads the keyword and the article count from the widgets, fetches Reddit
    and Arxiv data, fills the document/author dictionaries and the corpus,
    then writes the corpus to "<keyword>_corpus.pkl" with dill and reloads it
    from that file before printing it.

    @param button: the clicked button (passed by ipywidgets, unused)
    @return: None
    """
    global corpus
    corpus = Corpus("Mon corpus")

    query = mot_clef_textbox.value
    limit = nb_articles_slider.value

    # Start from an empty result list. The list must be emptied in place:
    # `listData = []` would only bind a local name and leave the module-level
    # list (the one the fetch/fill functions use) growing on every click, so
    # earlier documents would be added to the corpus again.
    listData.clear()
    # id2doc is an index of listData, so it is reset together with it.
    id2doc.clear()

    # Use the submitted values to query both APIs. The requested total is
    # split between them; Arxiv gets the extra article when the total is odd.
    reddit_limit = limit // 2
    arxiv_limit = limit - reddit_limit
    fetch_reddit_data(query, reddit_limit)
    fetch_arxiv_data(query, arxiv_limit)

    fillDocDict()
    fillAuthorsDict()
    fillCorpus()

    # Round-trip the corpus through its pickle file.
    pickle_path = str(query) + "_" + "corpus.pkl"
    with open(pickle_path, "wb") as f:
        dill.dump(corpus, f)

    with open(pickle_path, "rb") as f:
        corpus = dill.load(f)

    print("Nombre total de posts dans le corpus :", nbPosts)
    print(corpus)



# Create the form's submit button


submit_button = widgets.Button(description="Valider")


# Run on_submit_button_clicked each time the button is clicked
submit_button.on_click(on_submit_button_clicked)


display(submit_button)

Text(value='', description='Mot-clé :')

IntSlider(value=10, description="Nombre d'articles :", min=1)

Button(description='Valider', style=ButtonStyle())