In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

In [24]:
links = ["https://fr.wikipedia.org/wiki/Apprentissage_automatique",
        "https://fr.wikipedia.org/wiki/R%C3%A9seau_de_neurones_artificiels",
        "https://dataanalyticspost.com/Lexique/reduction-de-dimensionnalite/",
        "https://economy-pedia.com/11033079-analysis-of-data",
        "https://datascientest.com/data-analysis-tout-savoir",
        "https://docs.microsoft.com/fr-fr/analysis-services/data-mining/data-mining-concepts?view=asallproducts-allversions",
        "https://www.oracle.com/fr/database/data-mining-definition.html#:~:text=Le%20Data%20Mining%20implique%20la,utilise%20des%20algorithmes%20math%C3%A9matiques%20sophistiqu%C3%A9s.",
        "https://fr.wikipedia.org/wiki/Exploration_de_donn%C3%A9es",
        "https://fr.wikipedia.org/wiki/Partitionnement_de_donn%C3%A9es",
        "https://fr.wikipedia.org/wiki/TensorFlow",
        "https://stats.stackexchange.com/questions/100891/create-a-matrix-of-tf-idf-values-from-documents",
        "https://fr.wikipedia.org/wiki/Similarit%C3%A9_cosinus",
        "https://www.ibm.com/fr-fr/cloud/learn/neural-networks#:~:text=Les%20r%C3%A9seaux%20neuronaux%20imitent%20le,et%20de%20l'apprentissage%20profond.",
        "https://www.juripredis.com/fr/blog/id-19-demystifier-le-machine-learning-partie-2-les-reseaux-de-neurones-artificiels",
        "https://fr.blog.businessdecision.com/tutoriel-machine-learning-comprendre-ce-quest-un-reseau-de-neurones-et-en-creer-un/",
        "https://en.wikipedia.org/wiki/Machine_learning"]

In [25]:
from goose3 import Goose
def get_article_text_goos(link):
    g = Goose({'browser_user_agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11'})
    article = g.extract(url=link)
    return article.cleaned_text

In [26]:
from boilerpy3 import extractors
def get_article_text(link: str):
    """
    Get the text of the article from the link"""

    extractor = extractors.ArticleExtractor()
    # From a URL
    content = extractor.get_content_from_url(link)

    return content

In [27]:
corpus = []
for link in links:
    corpus.append(get_article_text_goos(link))

In [None]:
corpus[1]

In [7]:
import spacy
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


nlp = spacy.load('fr_core_news_md')
def clean_text(text):
    with open("/home/merouane/Gitlab/Ecole_IA/projet-5-groupe-3/Veille_IA/utils/stop_words_french.txt") as file:
        stop_words = file.read()
    stop_words = stop_words.split()
    
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.match('[a-zA-Z]', token):
            filtered_tokens.append(token)
            
    token_text=" ".join(filtered_tokens)
    #Lemmatizations
    doc = nlp(token_text)
    cleaned_text = [token.lemma_ for token in doc if str(token) not in stop_words and len(str(token))>3]
    cleaned_text = " ".join(cleaned_text)
    cleaned_text = cleaned_text.lower()
    return cleaned_text

In [28]:
cleaned_corpus = []

for text in corpus:
    cleaned_corpus.append(clean_text(text))

In [29]:
df = pd.DataFrame(cleaned_corpus, columns=['clean_corpus'])
df["link"] = links

In [10]:
df

Unnamed: 0,clean_corpus,link
0,apprentissage automatique anglais machine lear...,https://fr.wikipedia.org/wiki/Apprentissage_au...
1,réseau neurone artificiel réseau neuronal arti...,https://fr.wikipedia.org/wiki/R%C3%A9seau_de_n...
2,désigne méthode permettre projeter issu espace...,https://dataanalyticspost.com/Lexique/reductio...
3,analyse étude exhaustif ensemble information o...,https://economy-pedia.com/11033079-analysis-of...
4,existe type analyse voici méthode technique co...,https://datascientest.com/data-analysis-tout-s...
5,exploration processus recherche information ut...,https://docs.microsoft.com/fr-fr/analysis-serv...
6,tout devoir data mining data mining pratique c...,https://www.oracle.com/fr/database/data-mining...
7,exploration note connaître expression fouille ...,https://fr.wikipedia.org/wiki/Exploration_de_d...
8,partitionnement dater clustering anglais métho...,https://fr.wikipedia.org/wiki/Partitionnement_...
9,tensorflow outil open source apprentissage aut...,https://fr.wikipedia.org/wiki/TensorFlow


In [30]:
## Create Vocabulary
vocabulary = set()

for doc in df.clean_corpus:
    vocabulary.update(doc.split(' '))
    
vocabulary = list(vocabulary)

In [12]:
'apprentissage' in vocabulary

True

In [13]:
with open("/home/merouane/Gitlab/Ecole_IA/projet-5-groupe-3/Veille_IA/utils/stop_words_french.txt") as file:
    stop_words = file.read()
stop_words = stop_words.split()

In [31]:
vectorizer = TfidfVectorizer(stop_words=stop_words, vocabulary=vocabulary)
X = vectorizer.fit_transform(cleaned_corpus)



In [32]:
def gen_vector_T(tokens):
    Q = np.zeros((len(vocabulary)))   
    x= vectorizer.transform(tokens)
    #print(tokens[0].split(','))
    for token in tokens:
        try:
            ind = vocabulary.index(token)
            Q[ind] = x[0, vectorizer.vocabulary_[token]]
        except:
            pass
    return Q

In [33]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [34]:
def cosine_similarity_T(k, query):
    preprocessed_query = preprocessed_query = re.sub("\W+", " ", query).strip()
    tokens = word_tokenize(str(preprocessed_query))
    q_df = pd.DataFrame(columns=['q_clean'])
    q_df.loc[0,'q_clean'] =tokens
    q_df['q_clean'] = clean_text(tokens[0])
    d_cosines = []
    query_vector = gen_vector_T(q_df['q_clean'])
    for d in X.A:

        d_cosines.append(cosine_sim(query_vector, d))

                    
    out = np.array(d_cosines).argsort()[-k:][::-1]
    #print("")
    d_cosines.sort()
    a = pd.DataFrame()
    for i,index in enumerate(out):
        a.loc[i,'index'] = str(index)
        a.loc[i,'Subject'] = df['link'][index]
    for j,simScore in enumerate(d_cosines[-k:][::-1]):
        a.loc[j,'Score'] = simScore
    return a

In [43]:
cosine_similarity_T(6,'clustering')

Unnamed: 0,index,Subject,Score
0,8,https://fr.wikipedia.org/wiki/Partitionnement_...,0.180968
1,15,https://en.wikipedia.org/wiki/Machine_learning,0.014318
2,14,https://fr.blog.businessdecision.com/tutoriel-...,0.0
3,13,https://www.juripredis.com/fr/blog/id-19-demys...,0.0
4,12,https://www.ibm.com/fr-fr/cloud/learn/neural-n...,0.0
5,11,https://fr.wikipedia.org/wiki/Similarit%C3%A9_...,0.0
