# word repetition detection

## spacy

In [1]:
import spacy

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# execute the following commands in shell
# export LD_LIBRARY_PATH=~/micromamba/lib
# python -m spacy download fr_core_news_lg

In [2]:
frnlp = spacy.load("fr_core_news_lg", disable=['parser','attribute_ruler','ner'])

## quick start

In [17]:
pos_example = "Expliquez et expliquez les lois de la friction des limites."
neg_example = "Énoncez et expliquez les lois de la friction des limites."

In [13]:
tokens = frnlp(pos_example)

In [14]:
for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

Expliquez True 28.721529 True
et True 34.778656 True
expliquez True 29.138958 True
les True 32.644928 True
lois True 30.863085 True
de True 29.462963 True
la True 33.883762 True
friction True 31.723007 True
des True 33.48726 True
limites True 31.948713 True
. True 32.006744 True


In [16]:
type(token), type(tokens)

(spacy.tokens.token.Token, spacy.tokens.doc.Doc)

## tokenizer

In [252]:
from spacy.tokenizer import Tokenizer

In [253]:
from spacy.lang.fr import French

In [254]:
nlp = French()

In [20]:
tokenizer = Tokenizer(nlp.vocab)

In [22]:
tokens = tokenizer(pos_example)

In [25]:
for token in tokens:
    print(token.text)

Expliquez
et
expliquez
les
lois
de
la
friction
des
limites.


It seems we prefer the first tokenizer (the one built into the `frnlp` pipeline), which also separates punctuation from the words.

In [42]:
tokens = frnlp(pos_example)
for token in tokens: print(token.text)

Expliquez
et
expliquez
les
lois
de
la
friction
des
limites
.


## lemmatizer

In [46]:
lemmas = [token.lemma_ for token in tokens]

In [48]:
Expliquez = tokens[0]
Expliquez.lemma_

'expliquez'

In [50]:
frnlp(Expliquez.lemma_)[0].lemma_

'expliquer'

In [53]:
frnlp('expliquer')[0].lemma_

'expliquer'

In [47]:
lemmas

['expliquez',
 'et',
 'expliquer',
 'le',
 'loi',
 'de',
 'le',
 'friction',
 'de',
 'limite',
 '.']

In [38]:
ts = frnlp("les lois")
ls = [t.lemma_ for t in ts]
print(ls)

['le', 'loi']


## morphologizer

In [41]:
ms = [t.morph for t in ts]
print(ms)

[Definite=Def|Number=Plur|PronType=Art, Gender=Fem|Number=Plur]


In [56]:
bs = frnlp('GOOD Golden Bridge change')
ls = [t.lemma_ for t in bs]
print(ls)

['GOOD', 'Golden', 'Bridge', 'change']


## tagger

In [72]:
# Compare the fine-grained tag (`tag_`) with the coarse-grained
# part-of-speech label (`pos_`) for every token of "les lois".
tags = [t.tag_ for t in ts]
print(tags)
poss = [t.pos_ for t in ts]
# Print the POS list itself; printing `tags` a second time would hide
# any difference between the two attributes.
print(poss)

['DET', 'NOUN']
['DET', 'NOUN']


## pipeline

In [244]:
nlp = frnlp
def lemmatizer(word):
    """Lemmatize ``word`` repeatedly until the result stops changing.

    A single pass of the pipeline does not always reach the dictionary form
    (e.g. 'Expliquez' -> 'expliquez' -> 'expliquer'), so the lemma is fed
    back into ``nlp`` until the first token's text equals its own lemma.

    Args:
        word: Text to lemmatize; only the first token of the parsed text is
            considered.

    Returns:
        The first token of the last parsed text (a token object, not a
        string; use ``.text`` or ``.lemma_`` to get the string).

    Raises:
        IndexError: If ``word`` yields no tokens (e.g. an empty string).
    """
    text = word
    # Texts already fed to the pipeline; used to break lemma cycles
    # (A -> B -> A) that would otherwise never terminate.
    visited = set()
    while True:
        token = nlp(text)[0]
        lemma = token.lemma_
        # Stop on a fixed point, on a missing lemma (nothing left to parse),
        # or when the lemma would send us back to a text we already tried.
        if token.text == lemma or not lemma or lemma in visited:
            return token
        visited.add(text)
        # Feed the lemma back as a plain string rather than as a parsed
        # document, so each step runs the pipeline exactly once.
        text = lemma

In [68]:
lemmatizer('Expliquez')

expliquer

In [225]:
'Expliquez'[0].lower()

'e'

We first process a sentence with the pipeline, which automatically performs tokenization, lemmatization and POS tagging. We then need to count the repeated words (compared by their `lemmas`) that belong to an open word class.

## experiment

In [4]:
# Coarse part-of-speech labels grouped by word class.
# Open-class (content) words: the default set checked for repetitions.
is_open = [
    'ADJ',
    'ADV',
    'INTJ',
    'NOUN',
    'PROPN',
    'VERB',
]
# Closed-class (function) words.
is_close = [
    'ADP',
    'AUX',
    'CCONJ',
    'DET',
    'NUM',
    'PART',
    'PRON',
    'SCONJ',
]
# Everything else: punctuation, symbols and unclassified tokens.
is_other = [
    'PUNCT',
    'SYM',
    'X',
]

In [5]:
frnlp('mot')[0].pos_

'NOUN'

In [6]:
from collections import defaultdict

In [9]:
def counter(tokens, idx):
    """Summarise how often, and how closely together, each token occurs.

    Args:
        tokens: Sequence of hashable items (e.g. lemma strings).
        idx: Sequence parallel to ``tokens`` holding an index to report for
            each item (e.g. its position in the original sentence).

    Returns:
        A dict mapping each distinct token to a tuple
        ``(count, min_gap, rep_idx)`` where ``count`` is the number of
        occurrences, ``min_gap`` is the smallest distance between two
        consecutive occurrences measured in positions of ``tokens``, and
        ``rep_idx`` is the ``idx`` value of the later occurrence of that
        closest pair. For a token seen only once, ``min_gap`` is ``inf`` and
        ``rep_idx`` is ``-inf``.
    """
    stats = {}
    for position, (item, source_index) in enumerate(zip(tokens, idx)):
        record = stats.get(item)
        if record is None:
            # First sighting: nothing to measure a gap against yet.
            stats[item] = {
                "count": 1,
                "gap": float('inf'),
                "last": position,
                "where": float('-inf'),
            }
            continue
        record["count"] += 1
        gap = position - record["last"]
        # Only a strictly smaller gap replaces the recorded closest pair.
        if gap < record["gap"]:
            record["gap"] = gap
            record["where"] = source_index
        record["last"] = position
    return {
        item: (record["count"], record["gap"], record["where"])
        for item, record in stats.items()
    }

In [10]:
# Doc

In [11]:
def rep_detect(sent, account=is_open, min_dist=3):
    """Find lemmas that are repeated close together in a sentence.

    The sentence is parsed with ``frnlp``; only tokens whose ``pos_`` is in
    ``account`` are kept, and their lemmas are passed to ``counter`` together
    with their token indices in the parsed sentence.

    Args:
        sent: Sentence text to analyse.
        account: Collection of part-of-speech labels to take into account.
        min_dist: A repetition is reported only when the smallest gap is
            strictly below this value. The gap comes from ``counter`` and is
            therefore measured over the kept tokens only.

    Returns:
        A dict mapping each repeated lemma to the ``(count, min_gap,
        rep_idx)`` tuple produced by ``counter``; empty if nothing qualifies.
    """
    doc = frnlp(sent)
    # (token index in the sentence, lemma) for every token that counts.
    kept = [
        (position, tok.lemma_)
        for position, tok in enumerate(doc)
        if tok.pos_ in account
    ]
    idx = [position for position, _ in kept]
    lemmas = [lemma for _, lemma in kept]

    repeated = {}
    for lemma, stats in counter(lemmas, idx).items():
        if stats[0] > 1 and stats[1] < min_dist:
            repeated[lemma] = stats
    return repeated

In [167]:
'\u0332'.join('a')

'a'

In [168]:
print("\033[4mUnderlined text\033[0m")

[4mUnderlined text[0m


In [171]:
rep_detect(pos_example, is_open)

{'expliquer': (2, 1, 2)}

In [12]:
from tqdm import tqdm

# Load every line of the test corpus; the file only needs to stay open
# while it is being read.
with open('./corpus/test.fr', encoding='utf-8') as f:
    reps = []
    lines = list(f)

# Keep (line number, repetitions) for each line where rep_detect reports
# at least one close repetition of an open-class lemma.
for i, l in tqdm(enumerate(lines), total=len(lines)):
    rep = rep_detect(l, is_open)
    if not rep:
        continue
    reps.append((i, rep))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3027/3027 [00:26<00:00, 113.59it/s]


In [21]:
def probe(corpus, i, rep):
    """Print sentence ``i`` of ``corpus`` with the detected repetitions underlined.

    Args:
        corpus: Indexable collection of sentence strings.
        i: Index of the sentence to display.
        rep: Mapping whose values carry, at position 2, the index of the
            token to highlight (as returned by ``rep_detect``); may be empty
            or ``None``, in which case nothing is highlighted.

    The sentence is re-tokenized with ``frnlp`` and the token texts are
    joined with single spaces, so the printed spacing can differ from the
    original text. Highlighting uses ANSI underline escape codes.
    """
    words = [tok.text for tok in frnlp(corpus[i])]
    for stats in (rep or {}).values():
        hit = stats[2]
        # Wrap the repeated token in ANSI "underline on" / "reset" codes.
        words[hit] = f"\033[4m{words[hit]}\033[0m"
    print(' '.join(words))

In [15]:
reps

[(13, {'-': (2, 2, 20)}),
 (25, {'financier': (2, 2, 33)}),
 (47, {'tel': (2, 1, 42)}),
 (92, {'mp3': (2, 2, 14)}),
 (116, {'temps': (2, 1, 13)}),
 (137, {'plus': (2, 1, 8)}),
 (138, {'internet': (2, 1, 25)}),
 (148, {'avoir': (2, 2, 10)}),
 (163, {'même': (2, 2, 6), 'heure': (2, 1, 39)}),
 (180, {'Mobile': (2, 2, 16)}),
 (190, {'moins': (2, 2, 15)}),
 (235, {'opérateur': (3, 2, 26)}),
 (268, {'question': (2, 2, 44)}),
 (279, {'plus': (2, 2, 8)}),
 (281, {'opérateur': (2, 2, 14)}),
 (282, {'trop': (2, 2, 17)}),
 (312, {'jour': (2, 1, 10)}),
 (372, {'image': (2, 1, 23)}),
 (400, {'accuser': (2, 2, 19)}),
 (403, {'pouvoir': (2, 2, 12)}),
 (448, {'métier': (2, 1, 14)}),
 (513, {'financier': (3, 2, 29)}),
 (535, {'peu': (2, 1, 19)}),
 (561, {'pas': (2, 2, 8)}),
 (570, {'Courir': (2, 1, 4)}),
 (591, {'banque': (2, 2, 9)}),
 (598, {'-': (2, 2, 44)}),
 (608, {'devoir': (2, 2, 21)}),
 (609, {'fédéral': (2, 2, 6)}),
 (619, {'crise': (3, 2, 43)}),
 (624, {'prochain': (2, 2, 28)}),
 (628, {'plus'

In [16]:
probe(lines, reps[25][0], reps[25][1])

L' argent provient autant de banques privées que de [4mbanques[0m de droit public . 



In [18]:
rep = rep_detect(pos_example)

In [27]:
reps = [(0, rep)]

In [28]:
reps

[(0, {'expliquer': (2, 1, 2)})]

In [29]:
probe([pos_example], reps[0][0], reps[0][1])

Expliquez et [4mexpliquez[0m les lois de la friction des limites .


In [30]:
pos_example, neg_example

('Expliquez et expliquez les lois de la friction des limites.',
 'Énoncez et expliquez les lois de la friction des limites.')