In [None]:
from unidecode import unidecode
from deepmultilingualpunctuation import PunctuationModel
import re
import time

with open("dictionnaire.txt", 'r', encoding='latin1') as file1:
    with open("lexiqueorgdico.txt", 'r', encoding='latin1') as file2:
        file = list(set(file1).union(set(file2)))
        french_words = [unidecode(word.lower().strip()) for word in file if all(c in "abcdefghijklmnopqrstuvwxyzéèàçù" for c in word.strip().lower())]

def capitalize_sentences(text):
    # Utilisation de re.sub pour identifier chaque phrase et mettre la première lettre en majuscule
    return re.sub(r'(^|(?<=[.!?…])\s+)([a-z])', lambda match: match.group(1) + match.group(2).upper(), text)

model = PunctuationModel()

In [56]:
def maxMatch(string):
    tokens = []
    not_in = ""
    i = 0
    while i < len(string):
        maxWord = ""
        for j in range(i, len(string)):
            tempWord = string[i:j+1]
            if tempWord in french_words and len(tempWord) > len(maxWord):
                maxWord = tempWord
        if len(maxWord) == 0:
            not_in = string[i:j+1]
            break
        i = i+len(maxWord)
        tokens.append(maxWord)
    return tokens, not_in

string = "jaimeletempslactionlamertume"
maxMatch(string)

(['j', 'aime', 'let'], 'empslactionlamertume')

In [57]:
def completeMaxMatch(string):
    tokens, not_in = maxMatch(string)
    not_in_array = [0] * len(tokens)
    while len(not_in) > 0:
        tokens.append(not_in[0])
        not_in_array.append(1)
        not_in  = not_in[1:]
        if len(not_in) > 0:
            tokens_, not_in = maxMatch(not_in)
            tokens.extend(tokens_)
            not_in_array.extend([0] * len(tokens_))
    return  tokens, not_in_array

string = "jaimeletempslactionlamertume"
completeMaxMatch(string)

(['j',
  'aime',
  'let',
  'e',
  'm',
  'p',
  's',
  'lac',
  't',
  'ion',
  'lamer',
  'tu',
  'me'],
 [0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0])

In [58]:
def reverseMaxMatch(string):
    tokens = []
    not_in = ""
    i = len(string)
    while i > 0:
        maxWord = ""
        for j in range(i - 1, -1, -1):
            tempWord = string[j:i]
            if tempWord in french_words and len(tempWord) > len(maxWord):
                maxWord = tempWord
        if len(maxWord) == 0:
            not_in = string[j:i]
            break
        i = i - len(maxWord)
        tokens.append(maxWord)
    return tokens[::-1], not_in

string = "jaimeletempslactionlamertume"
reverseMaxMatch(string)

(['amertume'], 'jaimeletempslactionl')

In [59]:
def completeReverseMaxMatch(string):
    tokens, not_in = reverseMaxMatch(string)
    not_in_array = [0] * len(tokens)
    while len(not_in) > 0:
        tokens.insert(0, not_in[-1])
        not_in_array.insert(0, 1)
        not_in  = not_in[:-1]
        if len(not_in) > 0:
            tokens_, not_in = reverseMaxMatch(not_in)
            tokens = tokens_ + tokens 
            not_in_array = [0] * len(tokens_) + not_in_array
    return tokens, not_in_array

string = "jaimeletempslactionlamertume"
completeReverseMaxMatch(string)

(['j', 'aime', 'le', 'temps', 'l', 'action', 'l', 'amertume'],
 [0, 0, 0, 0, 1, 0, 1, 0])

In [60]:
def get_splits_idx(all_tokens_l):
    l_splits_idx = []
    sum_ = 0
    for token in all_tokens_l:
        sum_ += len(token)
        l_splits_idx.append(sum_)
    return l_splits_idx

def compare_tokens_list(all_tokens_l1, not_in_array_l1, all_tokens_l2, not_in_array_l2):
    if sum(not_in_array_l1) <  sum(not_in_array_l2):
        return all_tokens_l1
    elif sum(not_in_array_l1) > sum(not_in_array_l2):
        return all_tokens_l2
    else:
        sum_square_len_l1 = sum([len(word) ** 2 for word in all_tokens_l1])
        sum_square_len_l2 = sum([len(word) ** 2 for word in all_tokens_l2])
        return all_tokens_l1 if sum_square_len_l1 > sum_square_len_l2 else all_tokens_l2

def mix_algo(string):
    tokens_max, not_in_array_max = completeMaxMatch(string)
    tokens_reverse_max, not_in_array_reverse_max = completeReverseMaxMatch(string)

    max_splits_idx = get_splits_idx(tokens_max)
    reverse_max_splits_idx = get_splits_idx(tokens_reverse_max)
    
    finals_words = []
    i_act = 0
    j_act = 0
    for i in range(len(max_splits_idx)):
        for j in range(j_act, len(reverse_max_splits_idx)):
            if max_splits_idx[i] == reverse_max_splits_idx[j]:
                finals_words.extend(compare_tokens_list(tokens_max[i_act : i + 1], not_in_array_max[i_act : i + 1], tokens_reverse_max[j_act : j + 1], not_in_array_reverse_max[j_act : j + 1]))
                i_act = i + 1
                j_act = j + 1
                break
    return finals_words

In [61]:
def algo_optimise(string):
    MAX_WORD_LENGTH = 25
    confirmed_ = []
    next_ = []
    next_raw  = []
    i = 0
    while True :
        next_ = mix_algo(string[i:i+MAX_WORD_LENGTH])
        if i + MAX_WORD_LENGTH >= len(string):
            next_raw = string[i:i+MAX_WORD_LENGTH]
            break
        else:
            confirmed_.append(next_[0])
            i += len(next_[0])
        
    return confirmed_, next_, next_raw

In [68]:
all_keys = "lecielestbleujaidesamisquisontaussimesamoureuxetdescheminsquisontaussiunpeulesmiens"
next_raw = ""
confirmed  = []

for key in all_keys:
    next_raw += key # Ajout d'une lettre
    start = time.time()
    confirmed = mix_algo(next_raw)
    result = capitalize_sentences(model.restore_punctuation(" ".join(confirmed)))
    end = time.time()
    print(end - start)
    
print(result)

0.15477252006530762
0.10584473609924316
0.10550451278686523
0.11718916893005371
0.12906432151794434
0.10823178291320801
0.20311999320983887
0.15661883354187012
0.1665041446685791
0.21157455444335938
0.25013160705566406
0.284865140914917
0.3035421371459961
0.27565574645996094
0.3428051471710205
0.368541955947876
0.42994093894958496
0.3875441551208496
0.409074068069458
0.4746551513671875
0.6374962329864502
0.7803544998168945
0.5229465961456299
0.6431798934936523
0.6461389064788818
0.6147212982177734
0.689159631729126
0.833449125289917
0.7188584804534912
0.7421252727508545
0.9627201557159424
0.8952233791351318
0.9486312866210938
1.0387685298919678
0.9356396198272705
1.095484972000122
1.0064282417297363
1.0479042530059814
1.0845093727111816
1.183119297027588
1.3140020370483398
1.325807809829712
1.2718002796173096
1.454578161239624
1.4529802799224854
1.4450972080230713
1.5638175010681152
1.6962876319885254
1.7902803421020508
1.8761231899261475
1.9498507976531982
2.032971143722534
2.14407086

In [67]:
all_keys = "lecielestbleujaidesamisquisontaussimesamoureuxetdescheminsquisontaussiunpeulesmiens"
next_raw = ""
next_ = []
confirmed  = []

for key in all_keys:
    next_raw += key # Ajout d'une lettre
    start = time.time()
    confirmed_, next_, next_raw = algo_optimise(next_raw)
    result = capitalize_sentences(model.restore_punctuation(" ".join(confirmed + next_)))
    end = time.time()
    print(end - start)
    confirmed.extend(confirmed_)
    
print(result)

0.23189163208007812
0.07820439338684082
0.08375239372253418
0.08230185508728027
0.09566950798034668
0.09237122535705566
0.16239643096923828
0.17587018013000488
0.1685950756072998
0.1935744285583496
0.2588691711425781
0.2996633052825928
0.2620089054107666
0.3299384117126465
0.3332524299621582
0.4159719944000244
0.38338160514831543
0.3649318218231201
0.38080525398254395
0.43103933334350586
0.4924020767211914
0.4734516143798828
0.481647253036499
0.557978630065918
0.6456680297851562
1.05552077293396
0.5904791355133057
0.9886829853057861
0.5001826286315918
0.5040266513824463
0.5784962177276611
0.9346165657043457
0.6198136806488037
0.6038661003112793
1.038999080657959
0.5195283889770508
0.5176815986633301
0.5639767646789551
0.8640429973602295
0.8885548114776611
0.6374642848968506
0.477916955947876
0.5476696491241455
0.690087080001831
0.9924817085266113
0.5313417911529541
0.5498156547546387
0.5143642425537109
0.9435431957244873
0.5543036460876465
0.5396573543548584
1.0497260093688965
0.585173