In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json
from nltk.stem.snowball import SnowballStemmer
import itertools
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


### Loading the camembert model and its tokenizer

In [2]:
# The same checkpoint identifier is passed for both the model and the tokenizer.
QA_CHECKPOINT = 'etalab-ia/camembert-base-squadFR-fquad-piaf'
nlp = pipeline(
    'question-answering',
    model=QA_CHECKPOINT,
    tokenizer=QA_CHECKPOINT,
)

### Text preprocessing

In [3]:
# Module-level resources shared by the text-preprocessing functions.
punctuations = set(string.punctuation)      # individual ASCII punctuation characters
stemmer = SnowballStemmer('french')         # French Snowball stemmer
stop_words = set(stopwords.words('french')) # set for O(1) membership tests

In [6]:
def process_tokens(words):
    """Normalises a list of raw words for TF-IDF matching.

    Each word is stripped of surrounding punctuation and whitespace, dropped if
    it is empty or a French stop word, stemmed, and finally dropped if its stem
    contains any numeric character.

    Args:
        words: list of words to be processed
    Returns: list of stemmed tokens, in their original order
    """
    # Characters removed from both ends of every word. Whitespace splitting
    # leaves punctuation glued to words ("finances?"), which would otherwise
    # defeat both the stop-word lookup and the stemmer.
    strip_chars = ''.join(punctuations) + string.whitespace

    tokens = []
    for word in words:
        word = word.strip(strip_chars)
        # An empty result means the word was blank or made only of punctuation.
        if not word or word.lower() in stop_words:
            continue
        stem = stemmer.stem(word)
        # Discard anything carrying a digit (years, amounts, article numbers...).
        if not any(char.isnumeric() for char in stem):
            tokens.append(stem)

    return tokens

def process_document(document):
    """Processes a string of text
    Args:
        document: string to be processed
    Returns: processed string (the tokens kept by process_tokens, joined by
        single spaces)
    """
    # split() with no argument breaks on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty strings; split(' ') only broke on single
    # spaces, leaving newlines inside tokens and producing empty tokens.
    words = document.split()
    return ' '.join(process_tokens(words))



def get_documents(json_file):
    """Loads the text of every scraped document
    Arguments:
        json_file: name of a JSON file located in ./data; it must hold a list
            of objects that each have a 'content' key
    Returns: list with the 'content' value of each object, in file order
    """
    path = f'./data/{json_file}'
    with open(path, encoding='UTF-8') as handle:
        records = json.load(handle)
    return [record['content'] for record in records]
        


def get_relevant_doc(query, documents):
    """Retrieves the most relevant document from a list of documents using TF-IDF
    Args:
        query: input query from the user
        documents: list of scraped documents (raw strings)
    Returns: the original (unprocessed) document with the highest similarity
        to the query. On ties, including the case where no document shares a
        term with the query, the first such document is returned.
    Raises:
        ValueError: if documents is empty
    """
    if len(documents) == 0:
        raise ValueError('documents must contain at least one document')

    processed_documents = [process_document(d) for d in documents]
    processed_query = process_document(query)

    # Fit the vocabulary on the corpus only, then project the query onto it.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_documents)
    query_vector = vectorizer.transform([processed_query])

    # Use the sparse-aware matrix product: np.dot does not understand SciPy
    # sparse matrices. TfidfVectorizer L2-normalises rows by default, so this
    # dot product is the cosine similarity. One score per document.
    similarities = (tfidf_matrix @ query_vector.T).toarray().ravel()
    best_document_index = int(similarities.argmax())
    return documents[best_document_index]

### Fetching documents

In [7]:
# Load the 'content' field of every record in ./data/finance_glob.json.
documents = get_documents('finance_glob.json')

### Getting query from user

In [11]:
# Example question; it is used both to retrieve a document and as the QA question.
query = 'que fait le ministère de finances?'

### Fetching the most relevant document

In [12]:
# Pick the single document whose TF-IDF vector is most similar to the query;
# it becomes the context passed to the question-answering model.
doc = get_relevant_doc(query, documents)

### Using camembert for Reading Comprehension

In [13]:

# Ask the question-answering pipeline to extract the answer from the retrieved document.
qa_input = {'question': query, 'context': doc}
result = nlp(qa_input)
print(result['answer'])

 assure la gestion du Trésor public.
