In [30]:
from unidecode import unidecode
from deepmultilingualpunctuation import PunctuationModel
import re
import time
import pandas as pd

with open("dictionnaire.txt", 'r', encoding='latin1') as file1:
    with open("lexiqueorgdico.txt", 'r', encoding='latin1') as file2:
        file = list(set(file1).union(set(file2)))
        french_words = [unidecode(word.lower().strip()) for word in file if all(c in "abcdefghijklmnopqrstuvwxyzéèàçù" for c in word.strip().lower())]

def capitalize_sentences(text):
    # Utilisation de re.sub pour identifier chaque phrase et mettre la première lettre en majuscule
    return re.sub(r'(^|(?<=[.!?…])\s+)([a-z])', lambda match: match.group(1) + match.group(2).upper(), text)

model = PunctuationModel()

Device set to use cpu


In [2]:
def maxMatch(string):
    tokens = []
    not_in = ""
    i = 0
    while i < len(string):
        maxWord = ""
        for j in range(i, len(string)):
            tempWord = string[i:j+1]
            if tempWord in french_words and len(tempWord) > len(maxWord):
                maxWord = tempWord
        if len(maxWord) == 0:
            not_in = string[i:j+1]
            break
        i = i+len(maxWord)
        tokens.append(maxWord)
    return tokens, not_in

string = "jaimeletempslactionlamertume"
maxMatch(string)

(['j', 'aime', 'let'], 'empslactionlamertume')

In [3]:
def completeMaxMatch(string):
    tokens, not_in = maxMatch(string)
    not_in_array = [0] * len(tokens)
    while len(not_in) > 0:
        tokens.append(not_in[0])
        not_in_array.append(1)
        not_in  = not_in[1:]
        if len(not_in) > 0:
            tokens_, not_in = maxMatch(not_in)
            tokens.extend(tokens_)
            not_in_array.extend([0] * len(tokens_))
    return  tokens, not_in_array

string = "jaimeletempslactionlamertume"
completeMaxMatch(string)

(['j',
  'aime',
  'let',
  'e',
  'm',
  'p',
  's',
  'lac',
  't',
  'ion',
  'lamer',
  'tu',
  'me'],
 [0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

In [4]:
def reverseMaxMatch(string):
    tokens = []
    not_in = ""
    i = len(string)
    while i > 0:
        maxWord = ""
        for j in range(i - 1, -1, -1):
            tempWord = string[j:i]
            if tempWord in french_words and len(tempWord) > len(maxWord):
                maxWord = tempWord
        if len(maxWord) == 0:
            not_in = string[j:i]
            break
        i = i - len(maxWord)
        tokens.append(maxWord)
    return tokens[::-1], not_in

string = "jaimeletempslactionlamertume"
reverseMaxMatch(string)

(['amertume'], 'jaimeletempslactionl')

In [5]:
def completeReverseMaxMatch(string):
    tokens, not_in = reverseMaxMatch(string)
    not_in_array = [0] * len(tokens)
    while len(not_in) > 0:
        tokens.insert(0, not_in[-1])
        not_in_array.insert(0, 1)
        not_in  = not_in[:-1]
        if len(not_in) > 0:
            tokens_, not_in = reverseMaxMatch(not_in)
            tokens = tokens_ + tokens 
            not_in_array = [0] * len(tokens_) + not_in_array
    return tokens, not_in_array

string = "jaimeletempslactionlamertume"
completeReverseMaxMatch(string)

(['j', 'aime', 'le', 'temps', 'l', 'action', 'l', 'amertume'],
 [0, 0, 0, 0, 1, 0, 1, 0])

In [6]:
def get_splits_idx(all_tokens_l):
    l_splits_idx = []
    sum_ = 0
    for token in all_tokens_l:
        sum_ += len(token)
        l_splits_idx.append(sum_)
    return l_splits_idx

def compare_tokens_list(all_tokens_l1, not_in_array_l1, all_tokens_l2, not_in_array_l2):
    if sum(not_in_array_l1) <  sum(not_in_array_l2):
        return all_tokens_l1
    elif sum(not_in_array_l1) > sum(not_in_array_l2):
        return all_tokens_l2
    else:
        sum_square_len_l1 = sum([len(word) ** 2 for word in all_tokens_l1])
        sum_square_len_l2 = sum([len(word) ** 2 for word in all_tokens_l2])
        return all_tokens_l1 if sum_square_len_l1 > sum_square_len_l2 else all_tokens_l2

def mix_algo(string):
    tokens_max, not_in_array_max = completeMaxMatch(string)
    tokens_reverse_max, not_in_array_reverse_max = completeReverseMaxMatch(string)

    max_splits_idx = get_splits_idx(tokens_max)
    reverse_max_splits_idx = get_splits_idx(tokens_reverse_max)
    
    finals_words = []
    i_act = 0
    j_act = 0
    for i in range(len(max_splits_idx)):
        for j in range(j_act, len(reverse_max_splits_idx)):
            if max_splits_idx[i] == reverse_max_splits_idx[j]:
                finals_words.extend(compare_tokens_list(tokens_max[i_act : i + 1], not_in_array_max[i_act : i + 1], tokens_reverse_max[j_act : j + 1], not_in_array_reverse_max[j_act : j + 1]))
                i_act = i + 1
                j_act = j + 1
                break
    return finals_words

In [7]:
def algo_optimise(string):
    MAX_WORD_LENGTH = 25
    confirmed_ = []
    next_ = []
    next_raw  = []
    i = 0
    while True :
        next_ = mix_algo(string[i:i+MAX_WORD_LENGTH])
        if i + MAX_WORD_LENGTH >= len(string):
            next_raw = string[i:i+MAX_WORD_LENGTH]
            break
        else:
            confirmed_.append(next_[0])
            i += len(next_[0])
        
    return confirmed_, next_, next_raw

In [None]:
all_keys = "lecielestbleujaidesamisquisontaussimesamoureuxetdescheminsquisontaussiunpeulesmiens"
next_raw = ""
confirmed  = []

for key in all_keys:
    next_raw += key # Ajout d'une lettre
    start = time.time()
    confirmed = mix_algo(next_raw)
    result = re.sub(r"\b([JjLlCc]) (\w+)", r"\1'\2", capitalize_sentences(model.restore_punctuation(" ".join(confirmed))))
    end = time.time()
    print(end - start)
    
print(result)

0.3642868995666504
0.08113861083984375
0.09110474586486816
0.09232091903686523
0.10866689682006836
0.10566186904907227
0.12026357650756836
0.1308591365814209
0.13041353225708008
0.16025900840759277
0.17586255073547363
0.18616628646850586
0.18634653091430664
0.19272351264953613
0.22929668426513672
0.24340486526489258
0.27930283546447754
0.25632190704345703
0.2666773796081543
0.33224964141845703
0.3291130065917969
0.3187687397003174
0.3187122344970703
0.3712291717529297
0.43076109886169434
0.39023470878601074
0.5013022422790527
0.6316475868225098
0.576901912689209
0.503044843673706
0.49495625495910645
0.5639760494232178
0.646714448928833
0.7658421993255615
0.7108008861541748
0.7101924419403076
0.756040096282959
0.6435153484344482
0.6588184833526611
0.7362382411956787
0.8151490688323975
0.7497823238372803
0.851689338684082
0.8791098594665527
0.8757216930389404
0.8282585144042969
0.8694193363189697
0.8721764087677002
0.9860200881958008
1.2975318431854248
1.1001019477844238
1.08877635002136

In [9]:
all_keys = "lecielestbleujaidesamisquisontaussimesamoureuxetdescheminsquisontaussiunpeulesmiens"
next_raw = ""
next_ = []
confirmed  = []

for key in all_keys:
    next_raw += key # Ajout d'une lettre
    start = time.time()
    confirmed_, next_, next_raw = algo_optimise(next_raw)
    result = capitalize_sentences(model.restore_punctuation(" ".join(confirmed + next_)))
    end = time.time()
    print(end - start)
    confirmed.extend(confirmed_)
    
print(result)

0.10761380195617676
0.09238553047180176
0.10279297828674316
0.09963607788085938
0.12524747848510742
0.12404990196228027
0.13698601722717285
0.13226938247680664
0.12989425659179688
0.15300726890563965
0.1855030059814453
0.21556448936462402
0.2089405059814453
0.23837542533874512
0.2292780876159668
0.2295215129852295
0.26164746284484863
0.24991965293884277
0.27812981605529785
0.3737657070159912
0.479999303817749
0.43828463554382324
0.4238002300262451
0.4588890075683594
0.40859556198120117
0.677424430847168
0.3771078586578369
0.6629929542541504
0.36408019065856934
0.33022522926330566
0.41300034523010254
0.6243979930877686
0.40465760231018066
0.4186713695526123
0.7170510292053223
0.39542722702026367
0.34298133850097656
0.39655590057373047
0.534895658493042
0.43728184700012207
0.2968776226043701
0.39327406883239746
0.35272717475891113
0.3894221782684326
0.5351588726043701
0.35786938667297363
0.3358163833618164
0.3562591075897217
0.5050718784332275
0.3605508804321289
0.43928027153015137
0.496

# Test models

In [62]:
def nettoyer_texte(texte):
    # Enlever les accents
    texte = unidecode(texte)
    # Remplacer les apostrophes par un espace
    texte = texte.replace("'", " ")
    # Enlever la ponctuation
    texte = re.sub(r"[^\w\s]", "", texte)
    # Mettre en minuscule
    texte = texte.lower()
    return texte

df = pd.read_csv("questoin-reponse.csv")
df["phrase_espace"] =  df["question"].apply(nettoyer_texte).str.strip()
df["phrase_concat"] = df["phrase_espace"].apply(lambda x: x.replace(" ", ""))
del  df["question"]
del df["reponse"]

df = df[:1000]
df.head()

Unnamed: 0,phrase_espace,phrase_concat
0,quel architecte fut a l origine des plans du w...,quelarchitectefutaloriginedesplansduwoolworthb...
1,ou se trouvait franck woolworth lors de l inau...,ousetrouvaitfranckwoolworthlorsdelinauguration...
2,comment fut paye le batiment commande par fran...,commentfutpayelebatimentcommandeparfranckwoolw...
3,en quelle annee ouvrit le woolworth building,enquelleanneeouvritlewoolworthbuilding
4,qui commanda la construction du woolworth buil...,quicommandalaconstructionduwoolworthbuilding


In [63]:
for index, row in df.iterrows():
    real_tokens = row["phrase_espace"].split()
    tokens_mix_algo = mix_algo(row["phrase_concat"])
    pred_mix_algo = " ".join(tokens_mix_algo)
    df.at[index, "pred_mix_algo"] = pred_mix_algo
    relative_accuracy_mix_algo = len(set(real_tokens).intersection(set(tokens_mix_algo))) / len(set(real_tokens))
    df.at[index, "relative_accuracy_mix_algo"] = relative_accuracy_mix_algo
    tokens_max_match = completeMaxMatch(row["phrase_concat"])[0]
    pred_max_match = " ".join(tokens_max_match)
    df.at[index, "pred_max_match"] = pred_max_match
    relative_accuracy_max_match = len(set(real_tokens).intersection(set(tokens_max_match))) / len(set(real_tokens))
    df.at[index, "relative_accuracy_max_match"] = relative_accuracy_max_match
    tokens_reverse_max_match = completeReverseMaxMatch(row["phrase_concat"])[0]
    pred_reverse_max_match = " ".join(tokens_reverse_max_match)
    df.at[index, "pred_reverse_max_match"] = pred_reverse_max_match
    relative_accuracy_reverse_max_match = len(set(real_tokens).intersection(set(tokens_reverse_max_match))) / len(set(real_tokens))
    df.at[index, "relative_accuracy_reverse_max_match"] = relative_accuracy_reverse_max_match

In [64]:
mean_algo = sum(df["pred_mix_algo"] == df["phrase_espace"]) / len(df)
relative_accuracy_algo = df["relative_accuracy_mix_algo"].mean()
mean_max_match = sum(df["pred_max_match"] == df["phrase_espace"]) / len(df)
relative_accuracy_max_match = df["relative_accuracy_max_match"].mean()
mean_reverse_max_match = sum(df["pred_reverse_max_match"] == df["phrase_espace"]) / len(df)
relative_accuracy_reverse_max_match = df["relative_accuracy_reverse_max_match"].mean()
print("Algo mixte: ")
print(mean_algo * 100, "%", "de précision exacte")
print(relative_accuracy_algo * 100, "%", "de mots corrects en moyenne")
print("Max match: ")
print(mean_max_match * 100, "%", "de précision exacte")
print(relative_accuracy_max_match * 100, "%", "de mots corrects en moyenne")
print("Reverse max match: ")
print(mean_reverse_max_match * 100, "%", "de précision exacte")
print(relative_accuracy_reverse_max_match * 100, "%", "de mots corrects en moyenne")

Algo mixte: 
4.2 % de précision exacte
65.79488167104653 % de mots corrects en moyenne
Max match: 
1.9 % de précision exacte
59.57251336254045 % de mots corrects en moyenne
Reverse max match: 
3.5000000000000004 % de précision exacte
63.67959902111991 % de mots corrects en moyenne
