## Importation de bibliothèques

In [None]:
!pip install sentence-transformers

In [None]:
import requests
from bs4 import BeautifulSoup
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
import en_core_web_sm
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import numpy as np
from textblob import TextBlob
from gensim.models import Word2Vec
from sentence_transformers import SentenceTransformer, util

---

## Extraction et Préparation de Données

In [13]:
#   Fonction pour extraire le texte d'une page web
def get_text_from_url(url):

    response = requests.get(url)

    # Vérifier si la requête a réussi : Un code 200 signifie que la requête a réussi ( l'erreur 404 signifie que la page n'a pas été trouvée)
    if response.status_code == 200:
        # Parser le contenu HTML de la page
        soup = BeautifulSoup(response.content, 'lxml')

        # Extraire tout le texte à l'intérieur des balises <p> (paragraphes)
        paragraphs = soup.find_all('p')
        text = " ".join([para.get_text() for para in paragraphs])

        return text.strip()

# Prétraitement du texte
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    words = nltk.word_tokenize(text.lower())
    words = [lemmatizer.lemmatize(word) for word in words if word.isalnum() and word not in stop_words]
    return ' '.join(words)

In [14]:
# URL des deux documents fournis dans le mail
url1 = "https://english.elpais.com/technology/2024-07-16/artificial-intelligence-is-already-an-environmental-problem.html"
url2 = "https://www.linkedin.com/pulse/ai-environment-how-artificial-intelligence-helping-save-planet"

text1 = get_text_from_url(url1)
text2 = get_text_from_url(url2)
print(f"Extrait du document 1 :\n{text1[:50]}...\n")
print(f"Extrait du document 2 :\n{text2[:50]}...")

Extrait du document 1 :
The era of generative artificial intelligence (AI)...

Extrait du document 2 :
Agree & Join LinkedIn
             
      By click...


In [15]:
text1 = text1[:len(text1)-870] # Nettoyage du texte
text1_processed = preprocess_text(text1)

text2 = text2[650:len(text2)-260] # Nettoyage du texte
text2_processed = preprocess_text(text2)

print(f"Extrait du document 1 après prétraitement :\n{text1_processed[:150]}...\n")
print(f"Extrait du document 2 après prétraitement :\n{text2_processed[:150]}...")

Extrait du document 1 après prétraitement :
era generative artificial intelligence ai changing world figuratively literally energy water consumption large technology company main developer techn...

Extrait du document 2 après prétraitement :
icy cookie policy world continues grapple environmental challenge climate change deforestation pollution growing interest technology help address issu...


---

## Approche Basée sur la Similarité Lexicale : (TF-IDF, Cosine Similarity)

In [20]:
documents = [text1_processed, text2_processed]

vectorizer = TfidfVectorizer(ngram_range=(1, 2)) # Initialisation du TfidfVectorizer

# Calcul des TF-IDF pour chaque document
tfidf_matrix = vectorizer.fit_transform(documents)

# Calcul de la similarité cosinus
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])

print("Similarité Cosinus entre les deux documents:", cosine_sim[0][0],"\n")

# Affichage des termes avec les poids les plus élevés pour chaque document
feature_names = vectorizer.get_feature_names_out()
dense = tfidf_matrix.todense()
doc1_tfidf = dense[0].tolist()[0]
doc2_tfidf = dense[1].tolist()[0]

# Obtenir les termes les plus significatifs (top_n)
top_n = 10
indices1 = np.argsort(doc1_tfidf)[::-1][:top_n]
indices2 = np.argsort(doc2_tfidf)[::-1][:top_n]

print("Termes significatifs dans Document 1:")
for idx in indices1:
    print(feature_names[idx], ":", doc1_tfidf[idx])

print("\n--------------------------------------")

print("\nTermes significatifs dans Document 2:")
for idx in indices2:
    print(feature_names[idx], ":", doc2_tfidf[idx])

Similarité Cosinus entre les deux documents: 0.1951066470124011 

Termes significatifs dans Document 1:
year : 0.20695517880250144
water : 0.20246918206064746
data : 0.18406289278240678
ai : 0.18406289278240678
company : 0.1472503142259254
consumption : 0.1472503142259254
google : 0.12934698675156342
center : 0.12934698675156342
data center : 0.12934698675156342
increase : 0.12884402494768474

--------------------------------------

Termes significatifs dans Document 2:
ai : 0.4928816962712112
help : 0.23090934217811776
waste : 0.21380494646122017
environmental : 0.20688861324964422
used : 0.19471869482319457
reduce : 0.17959615502742493
potential : 0.1453873635936297
system : 0.13386910269094626
risk : 0.1282829678767321
energy : 0.12778414347772143


---
---

## Analyse Sémantique et Similarité Basée sur les Embeddings

### Word2Vec

In [24]:
# Tokenisation et entraînement du modèle Word2Vec
tokenized_docs = [nltk.word_tokenize(doc.lower()) for doc in [text1, text2]]
model = Word2Vec(sentences=tokenized_docs, vector_size=100, window=5, min_count=1, workers=4)

# Fonction pour obtenir l'embedding moyen d'une phrase
def get_avg_embedding(text, model):
    words = [word for word in nltk.word_tokenize(text.lower()) if word in model.wv]

    return np.mean([model.wv[word] for word in words], axis=0)

# Fonction pour détecter la polarité avec TextBlob
def get_polarity(text):
    return TextBlob(text).sentiment.polarity

# Diviser les documents en paragraphes/phrases
paragraphs_doc1 = nltk.sent_tokenize(text1)
paragraphs_doc2 = nltk.sent_tokenize(text2)

# Listes pour stocker les résultats
divergent_paragraphs = []
similar_paragraphs = []

# Comparer chaque paragraphe de doc1 avec chaque paragraphe de doc2
for para1 in paragraphs_doc1:
    embedding_para1 = get_avg_embedding(para1, model)
    polarity_para1 = get_polarity(para1)
    for para2 in paragraphs_doc2:
        embedding_para2 = get_avg_embedding(para2, model)
        polarity_para2 = get_polarity(para2)

        # Calcul de la similarité cosinus
        sim_score = cosine_similarity([embedding_para1], [embedding_para2])[0][0]

        # Identifier les paragraphes divergents
        if polarity_para1 * polarity_para2 < 0:  # Polarités opposées
            polarity_diff = abs(polarity_para1 - polarity_para2)
            divergent_paragraphs.append((para1, para2, polarity_para1, polarity_para2, polarity_diff))

        # Identifier les paragraphes similaires
        if sim_score > 0.7:  # Forte similarité
            similar_paragraphs.append((para1, para2, sim_score, polarity_para1, polarity_para2))

# Trier les paragraphes similaires par score de similarité et sélectionner les 3 premiers
similar_paragraphs_sorted = sorted(similar_paragraphs, key=lambda x: x[2], reverse=True)[:3]

# Trier les paragraphes divergents par différence de polarité et sélectionner les 3 premiers
divergent_paragraphs_sorted = sorted(divergent_paragraphs, key=lambda x: x[4], reverse=True)[:3]

# Afficher les résultats
print("Top 3 Paires de paragraphes similaires (Document 1 vs Document 2):")
for para1, para2, score, pol1, pol2 in similar_paragraphs_sorted:
    print(f"Doc1: {para1} \nDoc2: {para2} \nScore de Similarité: {score}\n")

print("\nTop 3 Paires de paragraphes divergents (Document 1 vs Document 2):")
for para1, para2, pol1, pol2, pol_diff in divergent_paragraphs_sorted:
    print(f"Doc1: {para1} (Polarité: {pol1})\nDoc2: {para2} (Polarité: {pol2})\nDifférence de Polarité: {pol_diff}\n")

Top 3 Paires de paragraphes similaires (Document 1 vs Document 2):
Doc1: Shaolei Ren, an associate professor of electrical and computer engineering at the University of California, Riverside and a specialist in AI sustainability, believes it is safe to infer that AI is responsible for this escalation in pollution and resource consumption. 
Doc2: To maximize the potential benefits of AI and minimize its risks, it is important to develop ethical and responsible AI systems that prioritize environmental protection and sustainability. 
Score de Similarité: 0.9952789545059204

Doc1: The companies only provide data on the water they use to cool the data centers, but do not include in their reports either the water used to generate the electricity they consume or the water used in the supply chain of the products (mainly in the manufacturing of chips and other hardware), as is the case, for example, with carbon emissions. 
Doc2: The Potential Risks of AI for the Environment While AI has the po

### SBERT

In [None]:
from sentence_transformers import SentenceTransformer, util
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

In [23]:
# Charger SBERT
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Diviser les documents en paragraphes
paragraphs_doc1 = nltk.sent_tokenize(text1)
paragraphs_doc2 = nltk.sent_tokenize(text2)

# Calculer les embeddings de chaque paragraphe
embeddings_doc1 = model.encode(paragraphs_doc1, convert_to_tensor=True)
embeddings_doc2 = model.encode(paragraphs_doc2, convert_to_tensor=True)

# Pour l'analyse de polarité
sia = SentimentIntensityAnalyzer()

# Listes pour stocker les résultats
similar_paragraphs = []
divergent_paragraphs = []

# Comparer chaque paragraphe de doc1 avec chaque paragraphe de doc2
for i, embedding1 in enumerate(embeddings_doc1):
    for j, embedding2 in enumerate(embeddings_doc2):
        # Calcul de la similarité cosinus
        sim_score = util.cos_sim(embedding1, embedding2).item()

        # Calcul de la polarité
        polarity1 = sia.polarity_scores(paragraphs_doc1[i])['compound']
        polarity2 = sia.polarity_scores(paragraphs_doc2[j])['compound']
        polarity_diff = abs(polarity1 - polarity2)  # Différence de polarité

        # Ajustement du score de divergence avec la polarité
        adjusted_divergence_score = sim_score - polarity_diff * 0.2

        # Stocker les paires similaires (score élevé)
        if sim_score > 0.5:  # Ajustez le seuil si nécessaire
            similar_paragraphs.append((paragraphs_doc1[i], paragraphs_doc2[j], sim_score))

        # Stocker les paires divergentes (score faible ou contexte opposé)
        if adjusted_divergence_score < -0.1:  # Seuil pour divergence
            divergent_paragraphs.append((paragraphs_doc1[i], paragraphs_doc2[j], adjusted_divergence_score))

similar_paragraphs_sorted = sorted(similar_paragraphs, key=lambda x: x[2], reverse=True)[:3]
divergent_paragraphs_sorted = sorted(divergent_paragraphs, key=lambda x: x[2])[:3]

# Afficher les résultats
print("Top 3 Paires de paragraphes similaires (Document 1 vs Document 2):")
for para1, para2, score in similar_paragraphs_sorted:
    print(f"Doc1: {para1} \nDoc2: {para2} \nScore de Similarité: {score}\n")

print("\nTop 3 Paires de paragraphes divergents (Document 1 vs Document 2):")
for para1, para2, score in divergent_paragraphs_sorted:
    print(f"Doc1: {para1} \nDoc2: {para2} \nScore de Divergence (ajusté): {score}\n")

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Top 3 Paires de paragraphes similaires (Document 1 vs Document 2):
Doc1: “The main driver of the increase in global carbon emissions is that associated with the manufacturing of AI chips and the construction of data centers,” he explains. 
Doc2: The potential impact of AI on the environment is significant. 
Score de Similarité: 0.6001753211021423

Doc1: The era of generative artificial intelligence (AI) is changing the world, both figuratively and literally. 
Doc2:  lot of attention is artificial intelligence (AI). 
Score de Similarité: 0.5985528230667114

Doc1: “The main driver of the increase in global carbon emissions is that associated with the manufacturing of AI chips and the construction of data centers,” he explains. 
Doc2: While AI can help reduce the environmental impact of these industries, it could also enable them to operate more efficiently, leading to increased greenhouse gas emissions and other environmental damage. 
Score de Similarité: 0.5799895524978638


Top 3 Paire

---
---

## Résumés de Texte et Comparaison Basée sur les Embeddings

SUMY

In [None]:
!pip install sumy
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.lsa import LsaSummarizer

In [16]:
def summarize_text(input_text, sentences_count):
    # Parser le texte d'entrée
    parser = PlaintextParser.from_string(input_text, Tokenizer("english"))
    # Créer un résumé LSA
    summarizer = LsaSummarizer()
    # Générer le résumé
    summary = summarizer(parser.document, sentences_count)
    return summary

summary_text1 = summarize_text(text1, sentences_count=15)
summary_text2 = summarize_text(text2, sentences_count=15)

# Afficher les résumés
print("Résumé du Document 1:")
for sentence in summary_text1:
    print(sentence)

print("\nRésumé du Document 2:")
for sentence in summary_text2:
    print(sentence)

Résumé du Document 1:
The energy and water consumption of large technology companies, the main developers of this technology, as well as their carbon emissions, have skyrocketed in recent years.
And projections show that the trend will not change.
Shaolei Ren, an associate professor of electrical and computer engineering at the University of California, Riverside and a specialist in AI sustainability, believes it is safe to infer that AI is responsible for this escalation in pollution and resource consumption.
Microsoft, owner of Copilot and which has lent its infrastructure to OpenAI to develop all versions of ChatGPT and the Dall-E image generator, has recorded a growth of 28.7%, as reflected in its annual sustainability report.
Almost the same thing has happened at Google, with an increase of 67% in this period.
All this activity has stretched energy demand, to the point that some companies, aware that the trend will continue to rise for some time, are studying developing small nucl

In [17]:
# Joindre le texte résumé en une seule chaîne
sum_text1 = " ".join([str(sentence) for sentence in summary_text1])
sum_text2 = " ".join([str(sentence) for sentence in summary_text2])

# Charger le modèle SBERT
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Diviser les documents en paragraphes
paragraphs_doc1 = nltk.sent_tokenize(sum_text1)
paragraphs_doc2 = nltk.sent_tokenize(sum_text2)

# Calculer les embeddings pour chaque paragraphe
embeddings_doc1 = model.encode(paragraphs_doc1, convert_to_tensor=True)
embeddings_doc2 = model.encode(paragraphs_doc2, convert_to_tensor=True)

# Listes pour stocker les résultats
similar_paragraphs = []
divergent_paragraphs = []

# Comparer chaque paragraphe de doc1 avec chaque paragraphe de doc2
for i, embedding1 in enumerate(embeddings_doc1):
    for j, embedding2 in enumerate(embeddings_doc2):
        # Calcul de la similarité cosinus
        sim_score = util.cos_sim(embedding1, embedding2).item()

        # Stocker les paires similaires (cosinus élevé)
        if sim_score > 0.4:  # Ajustez le seuil si nécessaire
            similar_paragraphs.append((paragraphs_doc1[i], paragraphs_doc2[j], sim_score))

        # Stocker les paires divergentes (cosinus négatif)
        if sim_score < 0:  # Seuil pour divergence
            divergent_paragraphs.append((paragraphs_doc1[i], paragraphs_doc2[j], sim_score))

# Trier les paragraphes similaires et divergents par score
similar_paragraphs_sorted = sorted(similar_paragraphs, key=lambda x: x[2], reverse=True)[:3]
divergent_paragraphs_sorted = sorted(divergent_paragraphs, key=lambda x: x[2])[:3]

# Afficher les résultats
print("Top 3 Paires de paragraphes similaires (Document 1 vs Document 2):")
for para1, para2, score in similar_paragraphs_sorted:
    print(f"Doc1: {para1} \nDoc2: {para2} \nScore de Similarité: {score}\n")

print("\nTop 3 Paires de paragraphes divergents (Document 1 vs Document 2):")
for para1, para2, score in divergent_paragraphs_sorted:
    print(f"Doc1: {para1} \nDoc2: {para2} \nScore de Divergence: {score}\n")



Top 3 Paires de paragraphes similaires (Document 1 vs Document 2):
Doc1: Shaolei Ren, an associate professor of electrical and computer engineering at the University of California, Riverside and a specialist in AI sustainability, believes it is safe to infer that AI is responsible for this escalation in pollution and resource consumption. 
Doc2: Finally, there is a risk that AI could be used to exacerbate existing environmental injustices. 
Score de Similarité: 0.5769515633583069

Doc1: Shaolei Ren, an associate professor of electrical and computer engineering at the University of California, Riverside and a specialist in AI sustainability, believes it is safe to infer that AI is responsible for this escalation in pollution and resource consumption. 
Doc2: For example, there is a risk that AI algorithms could be biased or inaccurate, leading to unintended environmental consequences. 
Score de Similarité: 0.5259228944778442

Doc1: That same year, the energy demand of AI will be between 

## Approche Basée un Large Language Model (LLM) :

Nous utilisons le modèle pré-entraîné **Qwen/Qwen2.5-1.5B-Instruct** disponible dans la bibliothèque Hugging Face Transformers.

In [2]:
! huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineGr

In [20]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")

In [24]:
import torch

def compare_texts(text1, text2):

    prompt = f"Compare the following two texts and highlight their common ideas as well as their differences:\n\nText 1: {text1}\n\nText 2: {text2}.\n I want the result in this format \nCommon ideas:\n\nDifferences:"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)  # Envoyer les entrées sur le même périphérique que le modèle
    outputs = model.generate(inputs['input_ids'], max_length=2000)  # Utiliser les ID d'entrée uniquement
    comparison = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return comparison

result = compare_texts(sum_text1, sum_text2)
print(result)

Compare the following two texts and highlight their common ideas as well as their differences:

Text 1: The energy and water consumption of large technology companies, the main developers of this technology, as well as their carbon emissions, have skyrocketed in recent years. And projections show that the trend will not change. Shaolei Ren, an associate professor of electrical and computer engineering at the University of California, Riverside and a specialist in AI sustainability, believes it is safe to infer that AI is responsible for this escalation in pollution and resource consumption. Microsoft, owner of Copilot and which has lent its infrastructure to OpenAI to develop all versions of ChatGPT and the Dall-E image generator, has recorded a growth of 28.7%, as reflected in its annual sustainability report. Almost the same thing has happened at Google, with an increase of 67% in this period. All this activity has stretched energy demand, to the point that some companies, aware that