In [3]:
import fitz  # PyMuPDF
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import spacy

# Download the NLTK resources used below: 'punkt' backs sent_tokenize and
# word_tokenize, 'stopwords' backs stopwords.words('french').
# These calls run every time the cell/module is executed.
nltk.download('punkt')
nltk.download('stopwords')

# Load the small French spaCy pipeline.
# NOTE(review): in the code visible here `nlp` is only referenced by the
# commented-out lemmatization step in preprocess_sentence — confirm whether
# anything else still needs it before removing.
nlp = spacy.load('fr_core_news_sm')

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``fitz.open``.

    Returns:
        str: The output of ``page.get_text()`` for each page, joined without
        any separator. An empty string if the document has no pages.
    """
    # The context manager closes the document even if text extraction raises,
    # so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string with ``+=`` page by page.
        return "".join(page.get_text() for page in doc)

def preserve_emails_and_urls(text):
    """Replace e-mail addresses and URLs in *text* with indexed placeholders.

    Every URL becomes ``__URL__<i>__`` and every e-mail address becomes
    ``__EMAIL__<i>__``, where ``<i>`` is the index of the first occurrence of
    that exact string in the corresponding returned list.

    Args:
        text: The raw text to scan.

    Returns:
        tuple: ``(text_with_placeholders, emails, urls)``. ``emails`` and
        ``urls`` hold every match found in the original text, in order of
        appearance and including duplicates.
    """
    # The TLD part accepts letters only; the former ``[A-Z|a-z]`` class also
    # accepted a literal "|" character.
    email_pattern = r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b'
    url_pattern = r'(https?://[^\s]+)'

    emails = re.findall(email_pattern, text)
    urls = re.findall(url_pattern, text)

    def _substitute(pattern, found, label, source):
        # Map each distinct match to the index of its first occurrence so that
        # repeated values share one placeholder.
        first_index = {}
        for position, value in enumerate(found):
            first_index.setdefault(value, position)

        def _placeholder(match):
            value = match.group(0)
            if value not in first_index:
                # Not one of the recorded matches: leave the text untouched.
                return value
            return f"__{label}__{first_index[value]}__"

        # One regex pass replaces exactly the matched spans, instead of every
        # substring occurrence as chained ``str.replace`` calls would.
        return re.sub(pattern, _placeholder, source)

    # URLs go first: a URL may embed an e-mail address, and replacing that
    # address first would alter the URL so it could no longer be protected.
    text = _substitute(url_pattern, urls, "URL", text)
    text = _substitute(email_pattern, emails, "EMAIL", text)

    return text, emails, urls

def preprocess_sentence(sentence, emails, urls):
    """Normalize one sentence and restore its e-mail/URL placeholders.

    The sentence is lowercased, tokenized with NLTK's French word tokenizer,
    stripped of non-alphanumeric tokens and French stop words, and finally
    the lowercased placeholders ``__email__<i>__`` / ``__url__<i>__`` are
    replaced by ``emails[i]`` / ``urls[i]``.

    Args:
        sentence: The sentence to process; it may contain placeholders.
        emails: Original e-mail strings, indexed by placeholder number.
        urls: Original URL strings, indexed by placeholder number.

    Returns:
        str: The remaining tokens joined with single spaces.
    """
    # Lowercasing also lowercases the placeholders, hence the lowercase
    # markers used in the filter and in the restoration step below.
    sentence = sentence.lower()

    tokens = word_tokenize(sentence, language='french')

    # Keep alphanumeric tokens plus anything carrying a placeholder marker.
    tokens = [
        word for word in tokens
        if word.isalnum() or "__email__" in word or "__url__" in word
    ]

    # The stop-word set is cached so it is not reloaded for every sentence.
    stop_words = _stop_words_for('french')
    tokens = [word for word in tokens if word not in stop_words]

    # NOTE: lemmatization with the spaCy pipeline is currently disabled.

    # E-mails are restored before URLs.
    tokens = _restore_placeholders(tokens, "email", emails)
    tokens = _restore_placeholders(tokens, "url", urls)

    return ' '.join(tokens)


_STOP_WORDS_CACHE = {}


def _stop_words_for(language):
    """Return the NLTK stop-word set for *language*, loading it only once."""
    try:
        return _STOP_WORDS_CACHE[language]
    except KeyError:
        words = _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
        return words


def _restore_placeholders(tokens, marker, originals):
    """Swap each ``__<marker>__<i>__`` found in *tokens* for ``originals[i]``."""
    lookup = {
        f"__{marker}__{index}__": original
        for index, original in enumerate(originals)
    }
    if not lookup:
        return tokens

    pattern = re.compile(rf"__{marker}__[0-9]+__")

    def _original(match):
        # Placeholders with no matching entry are left as they are.
        return lookup.get(match.group(0), match.group(0))

    # A single regex pass per token replaces one full list rebuild per
    # e-mail/URL.
    return [pattern.sub(_original, word) for word in tokens]

def clean_text(text):
    """Tidy *text*: drop underscore runs, isolate placeholders, fix spacing.

    Runs of two or more underscores are replaced by a space, except where
    they belong to a ``__EMAIL__<n>__`` or ``__URL__<n>__`` placeholder.
    Placeholders are surrounded by single spaces, consecutive spaces are
    collapsed, and leading/trailing whitespace is removed.

    Note that any other text containing a double underscore (for example an
    already restored address such as ``a__b@x.org``) is still affected by the
    underscore removal.

    Args:
        text: The text to clean.

    Returns:
        str: The cleaned text.
    """
    # Split on placeholders first. Removing underscore runs from the whole
    # string would strip the placeholder delimiters, so the spacing step
    # could never find a placeholder to pad.
    pieces = re.split(r'(__(?:EMAIL|URL)__[0-9]+__)', text)

    rebuilt = []
    for position, piece in enumerate(pieces):
        if position % 2:
            # Odd positions hold the captured placeholders: pad, don't alter.
            rebuilt.append(f' {piece} ')
        else:
            rebuilt.append(re.sub(r'_{2,}', ' ', piece))

    # Collapse spaces last so the padding added above cannot leave doubles.
    text = re.sub(r' +', ' ', ''.join(rebuilt))

    return text.strip()

def preprocess_text(text, emails, urls):
    """Run the sentence-level preprocessing over a whole text.

    The text is split into sentences with NLTK's French sentence tokenizer,
    each sentence goes through ``preprocess_sentence``, the results are
    joined with single spaces, and the joined string is passed through
    ``clean_text``.

    Args:
        text: The text to process.
        emails: E-mail strings forwarded to ``preprocess_sentence``.
        urls: URL strings forwarded to ``preprocess_sentence``.

    Returns:
        str: The cleaned, preprocessed text.
    """
    cleaned_parts = []
    for raw_sentence in sent_tokenize(text, language='french'):
        cleaned_parts.append(preprocess_sentence(raw_sentence, emails, urls))

    # Sentences are merged with one space before the final clean-up pass.
    return clean_text(' '.join(cleaned_parts))

# Example usage: run the full pipeline on a PDF and print the result.
# The path is relative, so it is resolved against the current working directory.
pdf_path = 'data.pdf'
# 1. Pull the raw text out of every page.
text = extract_text_from_pdf(pdf_path)
# 2. Swap e-mails/URLs for placeholders; `text` is rebound to the
#    placeholder version and the originals are kept in `emails` / `urls`.
text, emails, urls = preserve_emails_and_urls(text)
# 3. Tokenize, filter and clean the text.
processed_text = preprocess_text(text, emails, urls)

print(processed_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


cycle formation nom di genie electrique management industriel formation url https://fstt.ac formation details formation proposee specialite genie electrique fst tanger a but e fournir futur ingenieur genie electrique tous elements indispensables a inse rtion harmonieuse monde industriel enseignements associent theorie techniq ues experimentation projets realisations enseignements dispenses repartis quatre groupes matieres enseignements a caractere general disciplines fonda mentales enseignement professionnel formation pratique biais projets stages mathematiques elect instrumentation energetique communication professionnelle semestre ingenierie automatique lineaire continue conce ption procedes culture gestion anglais echnique semestre mathematiques ingenieur traitement sources energies reseaux machines grh dro it travail semestre gestion maintenance surete machines ele electronique systemes a microprocesseurs a oft analyse gestion comptabilite generale analytique semestre modeli sation