## <font color="#F0000"> Text summarization avec nltk

### <font color="#48dbfb"> Import les packages

In [2]:
# Importer packages
from bs4 import BeautifulSoup as bs
from requests import get
import re
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\morcodou.seck\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\morcodou.seck\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### <font color="#48dbfb">Scraper des articles sur Wikipedia

In [3]:
# Importer des données sur la cuisine iraniéne
resp = get('https://en.wikipedia.org/wiki/Iranian_cuisine')
# resp = get('https://fr.wikipedia.org/wiki/Cuisine_iranienne')

article_soup = bs(resp.text)

paragraphs = article_soup.find_all('p')

article_text = ""

for p in paragraphs:
    article_text += p.text

In [3]:
# Afficher article_text
article_text

'\nIranian cuisine (Persian: آشپزی ایرانی, romanized:\xa0Āshpazī Irānī) are the culinary traditions of Iran. Due to the historically common usage of the term "Persia" to refer to Iran in the Western world,[2][3][4] it is alternatively known as Persian cuisine, despite Persians being only one of a multitude of Iranian ethnic groups who have contributed to Iran\'s culinary traditions.[a]\nThe cuisine of Iran has made extensive contact throughout its history with the cuisines of its neighbouring regions, including Caucasian cuisine, Central Asian cuisine, Greek cuisine, Levantine cuisine, Mesopotamian cuisine, Russian cuisine and Turkish cuisine.[6][7][8][9] Aspects of Iranian cuisine have also been significantly adopted by Indian cuisine and Pakistani cuisine through various historical Persianate sultanates that flourished during Muslim rule on the Indian subcontinent, with the most notable and impactful of these polities being the Mughal Empire.[10][11][12]\nTypical Iranian main dishes 

### <font color="#48dbfb"> Nettoyer les données

In [4]:
# Supprimer [\w]*
article_text = re.sub(r'[[\w]*]', ' ', article_text)
# Supprimer les chaines de \xa0, \u200c
article_text = re.sub(r'\xa0|\u200c', ' ', article_text)
# Remplacer les espaces multiples par l'espace simple
article_text = re.sub(r'/s+', ' ', article_text)
# Remplacer l'espace en debut et fin de corpus
article_text = re.sub(r'^\s|\s$', '', article_text)

  article_text = re.sub(r'[[\w]*]', ' ', article_text)


In [7]:
# Afficher article_text
article_text

'Iranian cuisine (Persian: آشپزی ایرانی, romanized: Āshpazī Irānī) are the culinary traditions of Iran. Due to the historically common usage of the term "Persia" to refer to Iran in the Western world,    it is alternatively known as Persian cuisine, despite Persians being only one of a multitude of Iranian ethnic groups who have contributed to Iran\'s culinary traditions. \nThe cuisine of Iran has made extensive contact throughout its history with the cuisines of its neighbouring regions, including Caucasian cuisine, Central Asian cuisine, Greek cuisine, Levantine cuisine, Mesopotamian cuisine, Russian cuisine and Turkish cuisine.     Aspects of Iranian cuisine have also been significantly adopted by Indian cuisine and Pakistani cuisine through various historical Persianate sultanates that flourished during Muslim rule on the Indian subcontinent, with the most notable and impactful of these polities being the Mughal Empire.   \nTypical Iranian main dishes are combinations of rice with 

### <font color="#48dbfb"> Text Summarization

#### <font color="#fd79a8">Tokeniser en phrase

In [5]:
# Tokeniser en phrase
sentence_list = nltk.sent_tokenize(article_text)

#### <font color="#fd79a8"> Calculer les frequences des mots

In [6]:
# Stopwords
stopwords = nltk.corpus.stopwords.words('english')
# Dictionnaire de fréquences des mots
word_frequencies = {}
for word in nltk.word_tokenize(article_text):
    if word not in stopwords:
        if word not in word_frequencies.keys():
            word_frequencies[word] = 1
        else:
            word_frequencies[word] += 1

In [None]:
# Afficher word_frequencies
word_frequencies

#### <font color="#fd79a8"> Fréquence pondérée de chaque mots

In [7]:
# Fréquence maximale
maximum_frequency = max(word_frequencies.values())
# Calculer la fréquence pondérée
for word in word_frequencies.keys():
    word_frequencies[word] = word_frequencies[word] / maximum_frequency

In [None]:
# afficher word_frequencies
word_frequencies

#### <font color="#fd79a8"> Score des phrases

In [8]:
# Liste des scores de chaque phrase
sentence_scores = {}
# Calculer le score de chaque phrase
for sent in sentence_list:
    for word in nltk.word_tokenize(sent.lower()):
        if word in word_frequencies.keys():
            if len(sent.split(' ')) < 30:
                if sent not in sentence_scores.keys():
                    sentence_scores[sent] = word_frequencies[word]
                else:
                    sentence_scores[sent] += word_frequencies[word]

In [None]:
# Afficher sentences scores
sentence_scores

#### <font color="#fd79a8"> Résumé de l'article

In [9]:
# Ordonner les phrases par pondération et recupérer les 10 premières phrases
summary_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=False)[:10]
# regrouper ensemble les phrases qui ont les poids les plus élévés
summary = ' '.join(summary_sentences)

# Afficher le résumé
summary_sentences

['The measurements and directions are not as detailed as in the earlier book.',
 'Iranians traditionally put a lump of sugar cube in the mouth before drinking the tea.',
 'The food of southern Iran is typically spicy.',
 "Iran's Turkmen people are predominantly centered in the Iranian provinces of Golestan and North Khorasan.",
 'The agriculture of Iran produces many fruits and vegetables.',
 'The dolma is then simmered in meat broth or ascallions sweet-and-sour sauce.',
 'Other contemporary cooks and their specialties are also mentioned.',
 'The following is a list of several Iranian desserts.',
 'It is also mixed with vinegar into which broad beans are dipped before eating.',
 'Fruit dolma is probably a specialty of Iranian cuisine.']

## <font color="#F0000"> Text summarization avec sumy

In [10]:
!pip install sumy




[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [11]:
# Importer les packages
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer

# Créer un text parser utilisant de tokenisation
parser = PlaintextParser.from_string(article_text, Tokenizer('english'))

In [None]:
parser.document.sentences

### <font color="#48dbfb">  TextRankSummarizer

In [12]:
# Importer le TextRankSummarizer
from sumy.summarizers.text_rank import TextRankSummarizer

# Initialiser le modèle
summarizer_textrank = TextRankSummarizer()

# Summariser en 5 phrases
summary = summarizer_textrank(parser.document, 5)

# Regrouper les phrases
text_summary = ""
for sentence in summary:
    text_summary += str(sentence)

# Afficher le summary
text_summary

'Among the writings available from the Middle Persian scripts, the treatise of Khosrow and Ridag, points about stews and foods and the way of using them and how they are obtained in the Sassanid period are found as valid references in compiling the history of cooking in Iran.Although the Arabic cookbooks written under the rule of the Abbasid Caliphate—one of the Arab caliphates which ruled Iran after the Muslim invasion—include some recipes with Iranian names, the earliest surviving classical cookbooks in Persian are two volumes from the Safavid period.The large quantities specified, as well as the generous use of such luxury ingredients as saffron, suggest that these dishes were prepared for large aristocratic households, even though in his introduction, the author claimed to have written it "for the benefit of the nobility, as well as the public."Traditionally, rice was most prevalent as a major staple item in northern Iran and the homes of the wealthy, while bread was the dominant s

### <font color="#48dbfb">  LexRankSummarizer

In [13]:
# Importer LexRankSummarizer
from sumy.summarizers.lex_rank import LexRankSummarizer
# Initialiser le modèle
summarizer_lexrank = LexRankSummarizer()

# Summariser en 5 phrases
summary = summarizer_lexrank(parser.document, 5)

# Regrouper les phrases
text_summary = ""
for sentence in summary:
    text_summary += str(sentence)
    
# Afficher le summary
text_summary

'The cuisine of Iran has made extensive contact throughout its history with the cuisines of its neighbouring regions, including Caucasian cuisine, Central Asian cuisine, Greek cuisine, Levantine cuisine, Mesopotamian cuisine, Russian cuisine and Turkish cuisine.Typical Iranian main dishes are combinations of rice with meat, vegetables and nuts.A polow dish includes rice stuffed with cuts of vegetables, fruits, and beans, usually accompanied by either chicken or red meat.Turkish coffee is also popular in Iran, more specifically among Iranian Azeris.Prior to the 1979 Revolution, it had been produced traditionally in several cities of Iran.'

### <font color="#48dbfb"> LsaSummarizer

In [14]:
# Importer LsaSummarizer
from sumy.summarizers.lsa import LsaSummarizer
# Initialiser le modèle
summarizer_lsa = LsaSummarizer()

# Summariser en 5 phrases
summary = summarizer_lsa(parser.document, 5)

# Regrouper les phrases
text_summary = ""
for sentence in summary:
    text_summary += str(sentence)

# Afficher le summary
print(text_summary)

Thus, a bowl of fresh fruit is common on Iranian tables, and vegetables are standard side dishes in most meals.A polow dish includes rice stuffed with cuts of vegetables, fruits, and beans, usually accompanied by either chicken or red meat.Araq sagi, literally meaning "doggy distillate", is a type of distilled alcoholic beverage in Iran which contains at least 65% pure ethanol.Smoked fish (Persian: ماهی دودی, Romanized: Mahi doodi) is also popular in   and usually incorporated into rice by steaming the two together.Another notable dessert from this region is Reshteh Khoshkar (Persian: رشته خشکار), consisting of fried rice flour dough filled with sugar and nuts.
