In [1]:
import nltk
import requests
import re
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from collections import Counter
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import spacy
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [2]:
# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# Load spaCy's small English model
nlp = spacy.load("en_core_web_sm")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
# URLs of the books
urls = [
    "https://www.gutenberg.org/files/11/11-0.txt",  # Alice’s Adventures in Wonderland
    "https://www.gutenberg.org/files/12/12-0.txt",  # Through the Looking-Glass
    "https://www.gutenberg.org/files/851/851-0.txt"  # A Tangled Tale
]


In [7]:
def load_texts(urls):
    corpus = []
    for url in urls:
        response = requests.get(url)
        text = response.text

        # Remove non-word characters and digits
        text = re.sub(r'[^A-Za-z\s]', '', text)
        corpus.append(text)
    return corpus


In [10]:
books = load_texts(urls)
print(type(books))  # Vérifie que c'est bien une liste
print(len(books))   # Affiche combien de livres ont été chargés

<class 'list'>
3


In [11]:
for text in books:
    print(text[:200])

 START OF THE PROJECT GUTENBERG EBOOK  
Illustration




Alices Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 

Contents

 CHAPTER I     Down the RabbitHole
 START OF THE PROJECT GUTENBERG EBOOK  
Illustration




THROUGH THE LOOKINGGLASS

And What Alice Found There

By Lewis Carroll

The Millennium Fulcrum Edition 




DRAMATIS PERSON
A
DOCTYPE html
html classclientnojs langen dirltr
head
 meta charsetUTF

title  Project Gutenbergtitle
 link relstylesheet hrefgutenbergstylecssv
 link relstylesheet hrefgutenbergcollapsiblecss
 link re


Afin de retirer les partie du texte non pertiente, nous allons repérer les partie commençons pas "START ou "END".

In [5]:
# Load the texts
books = load_texts(urls)

# Print first 200 characters of each book
for i, book in enumerate(books):
    print(f"Book {i+1} (First 200 characters):\n", book[:200], "\n")


Book 1 (First 200 characters):
  START OF THE PROJECT GUTENBERG EBOOK  
Illustration




Alices Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 

Contents

 CHAPTER I     Down the RabbitHole 

Book 2 (First 200 characters):
  START OF THE PROJECT GUTENBERG EBOOK  
Illustration




THROUGH THE LOOKINGGLASS

And What Alice Found There

By Lewis Carroll

The Millennium Fulcrum Edition 




DRAMATIS PERSON
A 

Book 3 (First 200 characters):
 DOCTYPE html
html classclientnojs langen dirltr
head
 meta charsetUTF

title  Project Gutenbergtitle
 link relstylesheet hrefgutenbergstylecssv
 link relstylesheet hrefgutenbergcollapsiblecss
 link re 



In [16]:
import nltk
nltk.download('punkt', force=True)
# nltk.download('averaged_perceptron_tagger', force=True)
# nltk.download('maxent_ne_chunker', force=True)
# nltk.download('words', force=True

# Tokenize text
books_tokens = [word_tokenize(book.lower()) for book in books]

# Remove stopwords
stop_words = set(stopwords.words('english'))
books_tokens_cleaned = [[word for word in tokens if word not in stop_words] for tokens in books_tokens]

# Verify stopwords removal
print("Sample tokens after stopword removal:", books_tokens_cleaned[0][:50])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [7]:
# Stemming
stemmer = PorterStemmer()
stemmed_books = [[stemmer.stem(word) for word in tokens] for tokens in books_tokens_cleaned]
print("First 50 stemmed words:", stemmed_books[0][:50])

# Lemmatization
lemmatized_books = [[token.lemma_ for token in nlp(' '.join(tokens))] for tokens in books_tokens_cleaned]
print("First 50 lemmatized words:", lemmatized_books[0][:50])

# POS tagging
books_pos_tags = [[(word, nltk.pos_tag([word])[0][1]) for word in tokens] for tokens in books_tokens_cleaned]
print("Sample POS tags:", books_pos_tags[0][:50])

NameError: name 'books_tokens_cleaned' is not defined

In [8]:
# Named Entity Recognition
books_entities = [[(ent.text, ent.label_) for ent in nlp(' '.join(tokens)).ents] for tokens in books_tokens_cleaned]
print("Sample Named Entities:", books_entities[0][:10])

NameError: name 'books_tokens_cleaned' is not defined

In [9]:
# WordCloud generation
def generate_wordcloud(text, title):
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(text))
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()

for i, book in enumerate(books_tokens_cleaned):
    generate_wordcloud(book, f"Word Cloud for Book {i+1}")


NameError: name 'books_tokens_cleaned' is not defined

In [10]:
# Bag of Words
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([' '.join(tokens) for tokens in books_tokens_cleaned])

NameError: name 'books_tokens_cleaned' is not defined

In [11]:
# Get the most frequent words
word_freq = np.array(X.sum(axis=0)).flatten()
words = vectorizer.get_feature_names_out()
word_freq_dict = dict(zip(words, word_freq))
top_words = Counter(word_freq_dict).most_common(5)

NameError: name 'X' is not defined

In [12]:
# Pie chart for top 5 words
def plot_pie_chart(word_freqs, title):
    words, counts = zip(*word_freqs)
    plt.figure(figsize=(8, 8))
    plt.pie(counts, labels=[f"{word}: {count}" for word, count in word_freqs], autopct='%1.1f%%', colors=sns.color_palette('pastel'))
    plt.title(title)
    plt.show()

plot_pie_chart(top_words, "Top 5 Most Frequent Words")

NameError: name 'top_words' is not defined

In [13]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(min_df=1, max_df=2)
X_tfidf = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in books_tokens_cleaned])

tfidf_freq = np.array(X_tfidf.sum(axis=0)).flatten()
tfidf_words = tfidf_vectorizer.get_feature_names_out()
tfidf_freq_dict = dict(zip(tfidf_words, tfidf_freq))
top_tfidf_words = Counter(tfidf_freq_dict).most_common(5)

plot_pie_chart(top_tfidf_words, "Top 5 Most Relevant Words (TF-IDF)")

NameError: name 'books_tokens_cleaned' is not defined