# LE PRETRAITEMENT EN NLP

## A-préparation de données

importer dataset

In [92]:
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, trainers
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import unicodedata
import string
import re

ModuleNotFoundError: No module named 'tokenizers'

In [4]:
df = pd.read_csv('./spooky.csv')

afficher les 10 premiers échantillons 

In [7]:
df.head(10)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
5,id22965,"A youth passed in solitude, my best years spen...",MWS
6,id09674,"The astronomer, perhaps, at this point, took r...",EAP
7,id13515,The surcingle hung in ribands from my body.,EAP
8,id19322,I knew that you could not say to yourself 'ste...,EAP
9,id00912,I confess that neither the structure of langua...,MWS


In [9]:
df.shape

(19579, 3)

In [11]:
df.isnull().sum()

id        0
text      0
author    0
dtype: int64

In [13]:
df.duplicated().sum()

0

## B-nettoyage d’un texte

gérer les caractères répétitifs (par exemple « cooooool » → « cool »)

In [16]:
df['text']

0        This process, however, afforded me no means of...
1        It never once occurred to me that the fumbling...
2        In his left hand was a gold snuff box, from wh...
3        How lovely is spring As we looked from Windsor...
4        Finding nothing else, not even gold, the Super...
                               ...                        
19574    I could have fancied, while I looked at it, th...
19575    The lids clenched themselves together as if in...
19576    Mais il faut agir that is to say, a Frenchman ...
19577    For an item of news like this, it strikes us i...
19578    He laid a gnarled claw on my shoulder, and it ...
Name: text, Length: 19579, dtype: object

In [28]:
repeated_chars = re.compile(r'(\w*)(\w*)\2(\w*)')
repl = r'\1\2\3'

def replace(word):
    if wordnet.synsets(word):
        return word
    repl_word = repeated_chars.sub(repl, word)

    if repl_word != word:
        return replace(repl_word)
    else:
        return repl_word
    
def remove_repeating_chars(text):
    words = text.split()
    new_words = []
    for word in words:
        new_words.append(replace(word))
    return ' '.join(new_words)

In [None]:
remove_repeating_chars('cooooool')

LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/home/mina/nltk_data'
    - '/home/mina/anaconda3/nltk_data'
    - '/home/mina/anaconda3/share/nltk_data'
    - '/home/mina/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [97]:
df['text'] = remove_repeating_chars(df['text'])

AttributeError: 'Series' object has no attribute 'split'

manipuler des homoglyphes (par exemple « $tupide » → « stupide »)

transformer les entrées spéciales telles que les URL, les adresses e-mail et les balises HTML à une forme canonique

In [60]:
def remove_url(text):
    return re.sub(r'https?://\S+|www\.\S+', '[url]', text)

In [61]:
remove_url('https://www.github.com')

'[url]'

In [58]:
def remove_email(text):
    return re.sub(r'([A-Za-z0-9_.-]+@[A-Za-z0-9-]+\.[A-Za-z0-9.]+)', '[email]', text)

In [59]:
remove_email('a.ghandouz@esi-sba.dz')

'[email]'

In [54]:
def remove_html_balises(text):
    return BeautifulSoup(text, 'html.parser').get_text()

In [57]:
remove_html_balises('<html><head></head><body><header>hey there</header></body></html>')

'hey there'

mettre tous les caracteres en minuscule

In [44]:
def text_to_lower(text):
    return text.lower()

In [47]:
text_to_lower("HeyY thERe")

'heyy there'

In [106]:
i = 0
while i <= 10:
    df['text'][i] = text_to_lower(df['text'][i])
    i+=1

supprimer les ponctuation

In [49]:
def remove_punc(text):
    return ("".join([i for i in text if i not in string.punctuation]))

In [50]:
remove_punc("hey! bye.")

'hey bye'

supprimer les mots-vide

In [73]:
def remove_emptywords(text):
    text = text_to_lower(text)
    empty_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]
    filtered_text = ' '.join(filtered_words)
    return filtered_text

In [72]:
remove_emptywords('hey, bye, this is sentence')

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - '/home/mina/nltk_data'
    - '/home/mina/anaconda3/nltk_data'
    - '/home/mina/anaconda3/share/nltk_data'
    - '/home/mina/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## C-segmentation

segmenter chaque phrase sur les espaces/ponctuation

In [80]:
def tokenize_by_space_punc(text):
    to_be_removes = set(string.punctuation)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if not all(char in to_be_removes for char in token)]
    return tokens

In [81]:
tokenize_by_space_punc('hey, it\'s a sentence. bye .')

LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/home/mina/nltk_data'
    - '/home/mina/anaconda3/nltk_data'
    - '/home/mina/anaconda3/share/nltk_data'
    - '/home/mina/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


segmenter chaque phrase avec un algo de segmentation base sur des regles

In [90]:
def tokenize_by_rules(text):
   text = re.sub(r"([’'`])\s", "", text)
   text = re.sub(r"\s(n't|N'T|'(s|S|m|M|ll|LL|d|D|ve|VE))\b", "", text)

   text = re.sub(r"(\S+)-\s*(\S+)", "", text)

   text = re.sub(r"([^\w\s\.,])", "", text)

   text = re.sub(r"(?<!\d),(?!\d)", r" , ", text)

   tokens = re.split(r"\s+", text.strip())
   return tokens

In [91]:
tokenize_by_rules('heyy. this\'is NLP. bye.')

['heyy.', 'thisis', 'NLP.', 'bye.']

segmenter chaque phrase avec un algo de segmentation en sous mots (subword tkenization)

In [95]:
def tokenize_by_subwords(text):
    bpe_model = models.BPE()
    trainer = trainers.BpeTrainer(special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    tokenizer = Tokenizer(bpe_model)
    tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
    tokenizer.decoder = decoders.BPEDecoder()
    tokenizer.train_from_iterator([text], trainer=trainer)
    subwords = tokenizer.encode(text).tokens
    return subwords

In [96]:
tokenize_by_subwords("heyy, this isexample to be used with tokenization with subwords.")

NameError: name 'models' is not defined

## D-reconnaissance d'entite nommee

pour chaque phrase representait les entites nommees (ntlk/spacy)

## E-reduction des formes

utiliser la lemmatisation/racinisation avec nltm

## F-analyse des frequences

compter le nombre de phrases pour chasue auteur ou apparait le mot Great

utiliser pywaffle pour obtenir un graphique qui resume de maniere synthetique le nombre d'occurrences du mots "prepare par DR Bousmaha great" par auteur

refaire l'analyse avec le mots "impossible"