<div style="font-size:1.5em">
    <p>📜 Table des matières:</p>
    <ul>
       <li><a href="#nltk">Text Preprocessing with NLTK</a></li>  
       <li>
          <a href="#spacy">Text Preprocessing with spaCy</a>
       </li>
       <li>
          <a href="#ner">Definig NER for nltk and spacy</a>
       </li>
       <li><a href="#comp">Comparing NER before/After preprocessing</a>
       <li><a href="#param">testing preprocessing filters</a></li>
       </li>
    </ul>
</div>

<h1 id ="nltk">Text Preprocessing with NLTK</h1>

In [1]:
#import libraries
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import WhitespaceTokenizer , TreebankWordTokenizer,WordPunctTokenizer,word_tokenize
from nltk.corpus import stopwords
import string
import contractions
from spellchecker import SpellChecker
import re
from french_lefff_lemmatizer.french_lefff_lemmatizer import FrenchLefffLemmatizer
import unicodedata
from langdetect import detect
from itertools import chain , combinations
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('words')
nltk.download('wordnet')

def remove_urls(text):
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)

def remove_accented_chars(text):
    new_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return new_text


def text_preprocessing(text,accented=True,stopw=True,punctuation=True,lowercase=True,lemmatize=True,spelling=True,expand_contraction=True,urls=True):
    if detect(text)=='en':
        stopword =stopwords.words('english')
        lemmatizer = WordNetLemmatizer()
        spell = SpellChecker()
    else :#if lang="french"
        stopword = stopwords.words('french')
        lemmatizer = FrenchLefffLemmatizer()
        spell = SpellChecker(language="fr")
    if lowercase:
        #lowercase the text 
        text = text.lower()
    if urls:
        #remove urls
        text=remove_urls(text)
    #tokenize the text 
    tokens =WhitespaceTokenizer().tokenize(text)
    if expand_contraction:
        #expand contractions
        tokens = [contractions.fix(token) for token in tokens]
    if punctuation:
        #remove punctuation
        tokens = [token for token in tokens if token not in string.punctuation]
    if stopw:
        #remove stopwords
        tokens = [token for token in tokens if token not in stopword]
    if accented:
        tokens = [remove_accented_chars(token) for token in tokens]
    if spelling:
        #spell check:
        tokens = [spell.correction(token) for token in tokens]
    if lemmatize:
        #lemmatization : 
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(word for word in tokens)

    #Some tests:
file = 'C:/Users/PC2/Downloads/test-ex.txt'
f=open(file,'r')
data = f.read()
print(text_preprocessing(data))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PC2\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PC2\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\PC2\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PC2\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


gisevciconia zero solution objet ordre virement Monsieur habit compte :_rib: 022 222 222 172 22 29576475 53 set agilisys zero solution swift agilamcxxx virer montant 89 250,00 dirais (quatre-vingt-neuf mille deux cent cinquante dirais de profit i parler export switzerland ian chez 0483 555 555 0266600 0 swift creschzz80a motif 22a005 signature agilisys industrie sara siège social angle na rue ibn aisha boulevard abdelkrim el khattabi millimètre pari haan étage appât nana gueulez amarrakech tu +212 (0) 999 99 6161 f +212 (0) 999 999 130 sara capital 500.000 000 des rcna35 33 if na 3939333 tina 3939 cnss:939 mt-103-stp 2838383393 et point pote trot cc cs con instance type and transmission de original receive from


<h1 id ="spacy">Text Preprocessing with spaCy</h1>

In [2]:

#Now we ll do preprocessing using mainly spacy
import spacy
#load only french and english models tokenizers
nlp_en = spacy.load("en_core_web_sm", disable=['parser', 'tagger', 'ner'])
nlp_fr = spacy.load("fr_core_news_sm", disable=['parser', 'tagger', 'ner'])

def spacy_preprocessing(text,lowercase=True,stopw=True,punctuation=True,alphabetic=True,lemmatize=True,):
    if detect(text)=="en":
        nlp = nlp_en
    else :
        nlp = nlp_fr
    if lowercase:
        text = text.lower()
    remove_accented_chars(text)
    #tokenize with spacy's default tokenizer
    tokens = nlp(text)
    if stopw :
        tokens = [token for token in tokens if not token.is_stop]
    if lemmatize :
        tokens = [token.lemma_.strip() for token in tokens]
    if punctuation :
        tokens = [re.sub('<[^>]*>', '', token) for token in tokens]
    if alphabetic:
        tokens = [re.sub('[\W]+','',token) for token in tokens]
    return ' '.join(word for word in tokens)

file = 'C:/Users/PC2/Downloads/test-ex.txt'
f=open(file,'r')
data = f.read()
print(spacy_preprocessing(data,lowercase=False))

 Gisevciconia  aero solutions  Objet  ordre virement  monsieur   dÃ  bit compte  _ rib  022 222 222 172 22 29576475 53  swt  agilisy aero solution  SWIFT  agilamcxx   virer montant  89 25000 Dirhams  quatrevingtneuf Dirhams     profit  CARLEX export Switzerland  IBAN  ch35 0483 555 555 0266600 0  SWIFT  creschzz80a  Motif  22A005  Signature   agilisy industrier sarl  SiÃ  ge social  Angle nÂ degré 2 rue ibn aicha bd Abdelkrim el khattabi imm Paris 2Â degré â  ã  tage appt nâ degré 23 gueliz âmarrakech  t  212  0  999 99 6161 l f  212  0  999 999 130   sarl capital 500000 000 dhs  RCNÂ degré 35 33  IF nâ degré 3939333  tpnâ degré 3939  cnss939  mt103stp 2838383393  mt Print   poet trot ccc cs econ instance type and transmission   Original received from 


<h1 id ="ner">Definig NER for nltk and spacy</h1>

In [3]:
import spacy

In [4]:
def ner_nltk(text):
    assert detect(text) =='en' 'text should be english to be parsed with nltk'
    tokens = word_tokenize(text)
    tagged_tokens  = nltk.pos_tag(tokens)
    chunked = nltk.ne_chunk(tagged_tokens)
    for chunk in chunked : 
        if hasattr(chunk,"label") and chunk.label == "NE":
            print(chunk)

In [5]:
def ner_spacy(text):
    if detect(text) == "en":
        ner  = spacy.load("en_core_web_sm",disable=["tagger","parser"])
    else:
        ner  = spacy.load("fr_core_news_sm",disable=["tagger","parser"])
    return ner(text).ents


<h1 id ="comp">Comparing results before\after preprocessing</h1>

In [6]:
ner_spacy(data)

(Messieurs,
 AGILAMCXXX
 |,
 Montant,
 Dirhams,
 Mille deux cent cinquante,
 Dirhams,
 CH35,
 SWIFT,
 CRESCHZZ80A
 
  
 
 Motif,
 INDUSTRIES SARL |,
 Angle nÂ,
 Abdelkrim el khattabi,
 Paris 2Â,
 Ã,
 nÂ,
 F,
 Capital,
 dhs | RCNÂ,
 MT Print,
 Instance Type and Transmission)

In [7]:
preprocessed_nltk=text_preprocessing(data)
ner_spacy(preprocessed_nltk)

(gisevciconia zero,
 ordre virement Monsieur,
 zero solution swift,
 switzerland ian,
 el khattabi,
 haan étage appât nana gueulez,
 cnss:939 mt-103-stp 2838383393)

In [8]:
preprocessed_spacy=spacy_preprocessing(data,lowercase=False)
ner_spacy(preprocessed_spacy)

(dÃ  bit,
 swt  agilisy aero,
 SWIFT,
 Dirhams,
 Dirhams     ,
 Switzerland  IBAN  ch35,
 SWIFT  creschzz80a  Motif  22A005  Signature   agilisy,
 SiÃ,
 Angle nÂ,
 Abdelkrim el khattabi,
 Paris 2Â,
 dhs  RCNÂ,
 Original received from)

<h2 id="param">Testing preprocessing filters</h2>

In [9]:
from itertools import chain , combinations
def powerset(iterable): # return all possible subsets of iterable
    s = list(iterable)
    return list(chain.from_iterable(combinations(s, r) for r in range(len(s)+1)))

[accented,stopw,punctuation,lowercase,lemmatize,spelling,expand_contraction,urls] = [True for i in range(8)]
list_of_params = [accented,stopw,punctuation,lowercase,lemmatize,spelling,expand_contraction,urls]

def filter_combinations(params):#return all the possible filter combinations, param; list of parametrs with a default value
    list_of_combs = []
    combs=powerset([i for i in range(len(params)) ])
    for comb in combs:
        initial_conf = [True for i in range(len(params))]
        if len(comb)==0 :
            list_of_combs.append(params)
        else:
            for i in comb:
                initial_conf[i]=False
            list_of_combs.append(initial_conf)
    return list_of_combs
filter_combinations(list_of_params)

[[True, True, True, True, True, True, True, True],
 [False, True, True, True, True, True, True, True],
 [True, False, True, True, True, True, True, True],
 [True, True, False, True, True, True, True, True],
 [True, True, True, False, True, True, True, True],
 [True, True, True, True, False, True, True, True],
 [True, True, True, True, True, False, True, True],
 [True, True, True, True, True, True, False, True],
 [True, True, True, True, True, True, True, False],
 [False, False, True, True, True, True, True, True],
 [False, True, False, True, True, True, True, True],
 [False, True, True, False, True, True, True, True],
 [False, True, True, True, False, True, True, True],
 [False, True, True, True, True, False, True, True],
 [False, True, True, True, True, True, False, True],
 [False, True, True, True, True, True, True, False],
 [True, False, False, True, True, True, True, True],
 [True, False, True, False, True, True, True, True],
 [True, False, True, True, False, True, True, True],
 [T

In [10]:
from collections import defaultdict
ff=filter_combinations(list_of_params)
def results(text,preprocessing_function):
    result_dict = defaultdict(lambda :[])
    i=0 #used for ff[i] to give the configuration that gave the result number i 
    for comb in ff:#comb is a combinations of active filters 
        preprocessed_text=preprocessing_function(text,*comb)
        entities = ner_spacy(preprocessed_text)
        for ent in entities:
            if len(result_dict[ent.text])==0:
                result_dict[ent.text]=[(ent.label_,i)]
            else :
                if result_dict[ent.text][len(result_dict[ent.text])-1][0] != ent.label_:#store only if the value changes 
                    result_dict[ent.text].append((ent.label_,i))
        i+=1
    return result_dict




In [11]:
results(data,text_preprocessing)



defaultdict(<function __main__.results.<locals>.<lambda>()>,
            {'gisevciconia zero': [('PER', 0)],
             'ordre virement Monsieur': [('ORG', 0)],
             'zero solution swift': [('MISC', 0)],
             'switzerland ian': [('MISC', 0)],
             'el khattabi': [('ORG', 0),
              ('LOC', 2),
              ('ORG', 3),
              ('LOC', 9),
              ('ORG', 10),
              ('LOC', 16),
              ('ORG', 19),
              ('LOC', 20),
              ('ORG', 25),
              ('LOC', 37),
              ('ORG', 40),
              ('LOC', 41),
              ('ORG', 46),
              ('LOC', 61),
              ('ORG', 67),
              ('LOC', 72),
              ('ORG', 77),
              ('LOC', 96),
              ('ORG', 102),
              ('LOC', 107),
              ('ORG', 112),
              ('LOC', 137),
              ('ORG', 144),
              ('LOC', 172),
              ('ORG', 179)],
             'haan étage appât nana gueulez':