# Process Data

In [190]:
import pandas as pd
import re
import string
import numpy as np
import random
import matplotlib.pyplot as plt

from collections import Counter

import nltk
from os import getcwd

nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GUEGUEN\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [191]:
with open("Test1.txt", "r") as file1:
    FileContent = file1.read() + " 200 ml de gni et 455 kg de ligheur"
    print(FileContent)

J'ai bu 245 ml de gin et 100 ml de Liqeur de café et 350 ml de gan  200 ml de gni et 455 kg de ligheur


In [192]:
def process_tweet(sentence):
    """Process tweet function.
    Input:
        tweet: a string containing a tweet
    Output:
        tweets_clean: a list of words containing the processed tweet

    """
    sentence = sentence.lower()
    stopwords_french = stopwords.words('french')
    # remove stock market tickers like $GE
    sentence = re.sub(r'\$\w*', '', sentence)
    # remove old style retweet text "RT"
    sentence = re.sub(r'^RT[\s]+', '', sentence)
    # remove hyperlinks    
    sentence = re.sub(r'https?://[^\s\n\r]+', '', sentence)
    # remove hashtags
    # only removing the hash # sign from the word
    sentence = re.sub(r'#', '', sentence)
    # Suppression des apostrophes
    sentence = re.sub(r'\'', ' ', sentence) 
    # tokenize sentence
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True,
                               reduce_len=True)
    sentence_tokens = tokenizer.tokenize(sentence)
    

    sentence_clean = []
    for word in sentence_tokens:
        if (word not in stopwords_french and word not in string.punctuation):  # remove punctuation
            # tweets_clean.append(word)
            sentence_clean.append(word)
            
    return sentence_clean

In [193]:
M = process_tweet(FileContent)
M

['bu',
 '245',
 'ml',
 'gin',
 '100',
 'ml',
 'liqeur',
 'café',
 '350',
 'ml',
 'gan',
 '200',
 'ml',
 'gni',
 '455',
 'kg',
 'ligheur']

# Autocorrect

In [194]:
df = pd.read_csv(r"C:\Users\GUEGUEN\Desktop\WSApp\tracker_alimentaire\package\nlp\data_base\processed_aliment.csv")
vocab = df["word"]

In [195]:
def levenshtein_distance(s1, s2):
    if len(s1) > len(s2):
        s1, s2 = s2, s1

    distances = range(len(s1) + 1)
    for index2, char2 in enumerate(s2):
        new_distances = [index2 + 1]
        for index1, char1 in enumerate(s1):
            if char1 == char2:
                new_distances.append(distances[index1])
            else:
                new_distances.append(1 + min((distances[index1], distances[index1 + 1], new_distances[-1])))
        distances = new_distances

    return distances[-1]

def similar_names(name1, name2, threshold=0.84):
    """
    Compare two names and determine if they are similar using the Levenshtein distance.

    :param name1: First name to compare.
    :param name2: Second name to compare.
    :param threshold: Similarity threshold between 0 and 1, where 1 indicates a perfect match.
    :return: True if the names are considered similar, otherwise False.
    """
    
    # Calculate the similarity ratio based on the Levenshtein distance.
    max_len = max(len(name1), len(name2))
    if max_len == 0:
        return True

    similarity = 1 - (levenshtein_distance(name1, name2) / max_len)
    
    return similarity > threshold

def has_similar_authors(authors_list):
    """
    Checks if a list of authors contains similar authors.

    :param authors_list: List of author names.
    :return: True if similar authors are found, otherwise False.
    """

    for i in range(len(authors_list)):
        for j in range(i+1, len(authors_list)):
            if similar_names(authors_list[i], authors_list[j]):
                return True
    return False

books_df['has_similar'] = books_df['authors_list'].apply(has_similar_authors)

similar_authors_df = books_df[books_df['has_similar'] == True]

print(len(similar_authors_df))
similar_authors_df

NameError: name 'books_df' is not defined

### Get count

In [196]:
def get_count(word_l):
    '''
    Input:
        word_l: a set of words representing the corpus. 
    Output:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    '''
    
    word_count_dict = {}  # fill this with word counts
    for word in word_l:
        if word in word_count_dict:
            word_count_dict[word] += 1  # Increment the word count
        else:
            word_count_dict[word] = 1  # Add the word with count 1
    return word_count_dict

In [197]:
vocab_count = get_count(vocab)

### Get probs

In [198]:
# UNQ_C3 GRADED FUNCTION: get_probs
def get_probs(word_count_dict):
    '''
    Input:
        word_count_dict: The wordcount dictionary where key is the word and value is its frequency.
    Output:
        probs: A dictionary where keys are the words and the values are the probability that a word will occur. 
    '''
    probs = {}  # return this variable correctly
        
    total_count = sum(word_count_dict.values())  # Calculate the total count of words
    
    for word in word_count_dict:
        probs[word] = word_count_dict[word] / total_count  # Calculate the probability for each word
      
    return probs

In [199]:
probs = get_probs(vocab_count)
probs

{'gin': 0.1111111111111111,
 'liqueur': 0.1111111111111111,
 'cafe': 0.1111111111111111,
 nan: 0.6666666666666666}

### delete letter

In [200]:
def delete_letter(word, verbose=False):
    '''
    Input:
        word: the string/word for which you will generate all possible words 
                in the vocabulary which have 1 missing character
    Output:
        delete_l: a list of all possible strings obtained by deleting 1 character from word
    '''
    
    delete_l = []
    split_l = []
    
    # Generate all possible splits of the word
    split_l = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    
    # Generate all words that result from deleting one character
    # Exclude the last split since it doesn't make sense to delete a character after the end of the word
    delete_l = [L + R[1:] for L, R in split_l if R]  

    if verbose: print(f"input word {word}, \nsplit_l = {split_l}, \ndelete_l = {delete_l}")

    return  delete_l

In [201]:
for word in M:
    print(delete_letter(word, verbose=False))

['u', 'b']
['45', '25', '24']
['l', 'm']
['in', 'gn', 'gi']
['00', '10', '10']
['l', 'm']
['iqeur', 'lqeur', 'lieur', 'liqur', 'liqer', 'liqeu']
['afé', 'cfé', 'caé', 'caf']
['50', '30', '35']
['l', 'm']
['an', 'gn', 'ga']
['00', '20', '20']
['l', 'm']
['ni', 'gi', 'gn']
['55', '45', '45']
['g', 'k']
['igheur', 'lgheur', 'liheur', 'ligeur', 'lighur', 'ligher', 'ligheu']


### Switch letter

In [202]:
def switch_letter(word, verbose=False):
    '''
    Input:
        word: input string
     Output:
        switches: a list of all possible strings with one adjacent charater switched
    ''' 
    
    switch_l = []
    split_l = []
    
    # Generate all possible splits of the word
    split_l = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    
    # Form strings by swapping the first two characters of R for each split, where R has at least two characters
    switch_l = [L + R[1] + R[0] + R[2:] for L, R in split_l if len(R) > 1]
    
    if verbose: print(f"Input word = {word} \nsplit_l = {split_l} \nswitch_l = {switch_l}") 
    
    return switch_l

In [203]:
for word in M:
    print(switch_letter(word, verbose=True))

Input word = bu 
split_l = [('', 'bu'), ('b', 'u'), ('bu', '')] 
switch_l = ['ub']
['ub']
Input word = 245 
split_l = [('', '245'), ('2', '45'), ('24', '5'), ('245', '')] 
switch_l = ['425', '254']
['425', '254']
Input word = ml 
split_l = [('', 'ml'), ('m', 'l'), ('ml', '')] 
switch_l = ['lm']
['lm']
Input word = gin 
split_l = [('', 'gin'), ('g', 'in'), ('gi', 'n'), ('gin', '')] 
switch_l = ['ign', 'gni']
['ign', 'gni']
Input word = 100 
split_l = [('', '100'), ('1', '00'), ('10', '0'), ('100', '')] 
switch_l = ['010', '100']
['010', '100']
Input word = ml 
split_l = [('', 'ml'), ('m', 'l'), ('ml', '')] 
switch_l = ['lm']
['lm']
Input word = liqeur 
split_l = [('', 'liqeur'), ('l', 'iqeur'), ('li', 'qeur'), ('liq', 'eur'), ('liqe', 'ur'), ('liqeu', 'r'), ('liqeur', '')] 
switch_l = ['ilqeur', 'lqieur', 'liequr', 'liquer', 'liqeru']
['ilqeur', 'lqieur', 'liequr', 'liquer', 'liqeru']
Input word = café 
split_l = [('', 'café'), ('c', 'afé'), ('ca', 'fé'), ('caf', 'é'), ('café', '')] 
sw

### Replace letter

In [204]:
def replace_letter(word, verbose=False):
    '''
    Input:
        word: the input string/word 
    Output:
        replaces: a list of all possible strings where we replaced one letter from the original word. 
    ''' 
    
    letters = 'abcdefghijklmnopqrstuvwxyz'
    
    replace_l = []
    split_l = []
        
    # Generate all possible splits of the word
    split_l = [(word[:i], word[i:]) for i in range(len(word))]
    
    # Form strings by replacing the first character of R in each split with each letter in the alphabet
    replace_l = [L + c + R[1:] for L, R in split_l if R for c in letters]
    
    # Convert the list to a set to remove duplicates
    replace_set = set(replace_l)
    
    # Remove the original word from the set
    replace_set.discard(word)
    
    # turn the set back into a list and sort it, for easier viewing
    replace_l = sorted(list(replace_set))
    
    if verbose: print(f"Input word = {word} \nsplit_l = {split_l} \nreplace_l {replace_l}")   
    
    return replace_l

In [205]:
for word in M:
    print(replace_letter(word, verbose=False))

['au', 'ba', 'bb', 'bc', 'bd', 'be', 'bf', 'bg', 'bh', 'bi', 'bj', 'bk', 'bl', 'bm', 'bn', 'bo', 'bp', 'bq', 'br', 'bs', 'bt', 'bv', 'bw', 'bx', 'by', 'bz', 'cu', 'du', 'eu', 'fu', 'gu', 'hu', 'iu', 'ju', 'ku', 'lu', 'mu', 'nu', 'ou', 'pu', 'qu', 'ru', 'su', 'tu', 'uu', 'vu', 'wu', 'xu', 'yu', 'zu']
['24a', '24b', '24c', '24d', '24e', '24f', '24g', '24h', '24i', '24j', '24k', '24l', '24m', '24n', '24o', '24p', '24q', '24r', '24s', '24t', '24u', '24v', '24w', '24x', '24y', '24z', '2a5', '2b5', '2c5', '2d5', '2e5', '2f5', '2g5', '2h5', '2i5', '2j5', '2k5', '2l5', '2m5', '2n5', '2o5', '2p5', '2q5', '2r5', '2s5', '2t5', '2u5', '2v5', '2w5', '2x5', '2y5', '2z5', 'a45', 'b45', 'c45', 'd45', 'e45', 'f45', 'g45', 'h45', 'i45', 'j45', 'k45', 'l45', 'm45', 'n45', 'o45', 'p45', 'q45', 'r45', 's45', 't45', 'u45', 'v45', 'w45', 'x45', 'y45', 'z45']
['al', 'bl', 'cl', 'dl', 'el', 'fl', 'gl', 'hl', 'il', 'jl', 'kl', 'll', 'ma', 'mb', 'mc', 'md', 'me', 'mf', 'mg', 'mh', 'mi', 'mj', 'mk', 'mm', 'mn', '

### Insert letter

In [206]:
def insert_letter(word, verbose=False):
    '''
    Input:
        word: the input string/word 
    Output:
        inserts: a set of all possible strings with one new letter inserted at every offset
    ''' 
    letters = 'abcdefghijklmnopqrstuvwxyz'
    insert_l = []
    split_l = []
    
    # Generate all possible splits of the word
    split_l = [(word[:i], word[i:]) for i in range(len(word) + 1)]
    
    # Form strings by inserting a letter from 'letters' at every offset
    insert_l = [L + c + R for L, R in split_l for c in letters]
    
    if verbose: print(f"Input word {word} \nsplit_l = {split_l} \ninsert_l = {insert_l}")
    
    return insert_l

In [207]:
for word in M:
    print(insert_letter(word, verbose=False))

['abu', 'bbu', 'cbu', 'dbu', 'ebu', 'fbu', 'gbu', 'hbu', 'ibu', 'jbu', 'kbu', 'lbu', 'mbu', 'nbu', 'obu', 'pbu', 'qbu', 'rbu', 'sbu', 'tbu', 'ubu', 'vbu', 'wbu', 'xbu', 'ybu', 'zbu', 'bau', 'bbu', 'bcu', 'bdu', 'beu', 'bfu', 'bgu', 'bhu', 'biu', 'bju', 'bku', 'blu', 'bmu', 'bnu', 'bou', 'bpu', 'bqu', 'bru', 'bsu', 'btu', 'buu', 'bvu', 'bwu', 'bxu', 'byu', 'bzu', 'bua', 'bub', 'buc', 'bud', 'bue', 'buf', 'bug', 'buh', 'bui', 'buj', 'buk', 'bul', 'bum', 'bun', 'buo', 'bup', 'buq', 'bur', 'bus', 'but', 'buu', 'buv', 'buw', 'bux', 'buy', 'buz']
['a245', 'b245', 'c245', 'd245', 'e245', 'f245', 'g245', 'h245', 'i245', 'j245', 'k245', 'l245', 'm245', 'n245', 'o245', 'p245', 'q245', 'r245', 's245', 't245', 'u245', 'v245', 'w245', 'x245', 'y245', 'z245', '2a45', '2b45', '2c45', '2d45', '2e45', '2f45', '2g45', '2h45', '2i45', '2j45', '2k45', '2l45', '2m45', '2n45', '2o45', '2p45', '2q45', '2r45', '2s45', '2t45', '2u45', '2v45', '2w45', '2x45', '2y45', '2z45', '24a5', '24b5', '24c5', '24d5', '24e

### Edit one letter

In [208]:
def edit_one_letter(word, allow_switches = True):
    """
    Input:
        word: the string/word for which we will generate all possible wordsthat are one edit away.
    Output:
        edit_one_set: a set of words with one possible edit. Please return a set. and not a list.
    """
    
    edit_one_set = set()
    
    # Use the previously implemented functions
    deletes = delete_letter(word)
    replaces = replace_letter(word)
    inserts = insert_letter(word)
    # Optionally include switches
    if allow_switches:
        switches = switch_letter(word)
    else:
        switches = []
    
    # Combine the results and convert to a set to eliminate duplicates
    edit_one_set = set(deletes + replaces + inserts + switches)
    
    # return this as a set and not a list
    return set(edit_one_set)

In [209]:
for word in M[2 : 4]:
    print(edit_one_letter(word, allow_switches = True))

{'mu', 'mi', 'eml', 'mcl', 'mr', 'mlr', 'mn', 'wl', 'nml', 'ol', 'cml', 'bl', 'pl', 'fml', 'mbl', 'mxl', 'mlx', 'mo', 'mle', 'me', 'mli', 'uml', 'mln', 'mx', 'mnl', 'mlo', 'mv', 'mql', 'mzl', 'mlf', 'ms', 'mg', 'oml', 'mk', 'zl', 'mlb', 'vml', 'xl', 'mjl', 'el', 'bml', 'mll', 'mdl', 'lm', 'mw', 'mtl', 'mlz', 'mlt', 'mt', 'dml', 'mly', 'wml', 'jml', 'm', 'my', 'mld', 'mgl', 'msl', 'mlw', 'myl', 'mla', 'tl', 'mlj', 'qml', 'mls', 'sml', 'vl', 'ma', 'mf', 'mp', 'mb', 'kl', 'mwl', 'gl', 'gml', 'ul', 'mlm', 'mlh', 'mml', 'mlp', 'mul', 'll', 'l', 'pml', 'yml', 'mil', 'xml', 'rml', 'yl', 'rl', 'mlk', 'mc', 'zml', 'mm', 'mlg', 'fl', 'kml', 'sl', 'mpl', 'mel', 'mrl', 'mj', 'mfl', 'ql', 'il', 'aml', 'nl', 'hml', 'mlq', 'mal', 'mvl', 'lml', 'mkl', 'jl', 'dl', 'al', 'mz', 'mq', 'hl', 'mol', 'tml', 'mlu', 'mlc', 'mhl', 'mh', 'iml', 'md', 'mlv', 'cl'}
{'ginz', 'sin', 'ginf', 'givn', 'giin', 'jgin', 'gvin', 'gbin', 'lgin', 'gtin', 'gxin', 'gihn', 'ugin', 'iin', 'gifn', 'xin', 'gie', 'gmin', 'dgin', 'i

### Edit two letters

In [210]:
def edit_two_letters(word, allow_switches = True):
    '''
    Input:
        word: the input string/word 
    Output:
        edit_two_set: a set of strings with all possible two edits
    '''
    
    edit_two_set = set()
    
    # First, get all words that are one edit away
    edit_one_set = edit_one_letter(word, allow_switches)
    
    # Then, for each of these words, get all words that are one edit away from them
    for edit_one_word in edit_one_set:
        edit_two_temp_set = edit_one_letter(edit_one_word, allow_switches)
        
        # Combine the results using union
        edit_two_set = edit_two_set.union(edit_two_temp_set)
    
    # Make sure to include the first edit set as well to capture all possibilities
    edit_two_set = edit_two_set.union(edit_one_set)
    
    # return this as a set instead of a list
    return set(edit_two_set)

In [211]:
for word in M[2 : 3]:
    print(edit_two_letters(word, allow_switches = True))

{'', 'dmlz', 'mlfg', 'mhvl', 'ile', 'wl', 'tme', 'mqa', 'url', 'oqml', 'mdlb', 'vmul', 'zmy', 'smj', 'amu', 'zmal', 'plg', 'wmls', 'lrm', 'nv', 'qqml', 'mljh', 'xjl', 'ej', 'mwlt', 'idl', 'xmlt', 'kt', 'dc', 'zfl', 'elml', 'mff', 'bmlr', 'amkl', 'cmil', 'mti', 'emlc', 'omil', 'jx', 'ymx', 'mlau', 'pmsl', 'ruml', 'dmx', 'mjlm', 'tmu', 'mjs', 'beml', 'nmlj', 'lzl', 'vmo', 'mja', 'ivl', 'pcml', 'dk', 'mlxm', 'cmld', 'mlja', 'ijml', 'mnm', 'zmlw', 'mlf', 'ymd', 'nj', 'zgml', 'mlbp', 'zmg', 'mlkc', 'mflg', 'vmh', 'mjlz', 'mym', 'mlkl', 'ucl', 'wgl', 'myo', 'pmv', 'alw', 'lmy', 'cu', 'mlik', 'mlkj', 'mlvq', 'hlz', 'slp', 'ce', 'mgly', 'mlwc', 'omn', 'kmlh', 'cfl', 'mkk', 'mdml', 'zmj', 'eqml', 'xlj', 'kmfl', 'mbld', 'mglh', 'n', 'vhml', 'amul', 'mud', 'aeml', 'nlq', 'fsl', 'hmli', 'nt', 'iq', 'nzl', 'olq', 'hgml', 'mjbl', 'mulz', 'mylp', 'dlk', 'mrla', 'tlq', 'tkml', 'myql', 'rmz', 'mlrh', 'ppl', 'mdlu', 'mklp', 'zms', 'mtw', 'blc', 'vld', 'cv', 'vvml', 'mvlh', 'mug', 'ebl', 'dlj', 'tmlw', '

### Correction

In [212]:
def get_corrections(word, probs, vocab, n=2, verbose = False):
    '''
    Input: 
        word: a user entered string to check for suggestions
        probs: a dictionary that maps each word to its probability in the corpus
        vocab: a set containing all the vocabulary
        n: number of possible word corrections you want returned in the dictionary
    Output: 
        n_best: a list of tuples with the most probable n corrected words and their probabilities.
    '''
    
    suggestions = []
    n_best = []
    
        # Step 1: create suggestions as described above
    # If the word is in the vocabulary, suggest the word itself.
    if word in vocab:
        suggestions = [word]
    else:
        # Otherwise, try getting suggestions from one edit away words.
        one_edit_away = edit_one_letter(word)
        two_edits_away = edit_two_letters(word)
        
        # Use set intersection to find suggestions in vocabulary
        suggestions_one_edit = one_edit_away.intersection(vocab)
        suggestions_two_edits = two_edits_away.intersection(vocab)
        
        # Logical 'or' to select the first non-empty set of suggestions
        suggestions = (suggestions_one_edit or suggestions_two_edits or [word])
                    
    # Step 2: determine probability of suggestions
    # Create a dictionary of suggestions with their probabilities
    suggestions_probs = {suggestion: probs.get(suggestion, 0) for suggestion in suggestions}
    
    # Step 3: Get all your best words and return the most probable top n_suggested words as n_best
    # Use the Counter object to find n most common words based on their probabilities
    from collections import Counter
    n_best = Counter(suggestions_probs).most_common(n)
    
    
    if verbose: print("entered word = ", word, "\nsuggestions = ", suggestions)

    return n_best

In [213]:

for word in M:
    get_corrections(word, probs, vocab, n=2, verbose = True)

entered word =  bu 
suggestions =  ['bu']
entered word =  245 
suggestions =  ['245']
entered word =  ml 
suggestions =  ['ml']
entered word =  gin 
suggestions =  {'gin'}
entered word =  100 
suggestions =  ['100']
entered word =  ml 
suggestions =  ['ml']
entered word =  liqeur 
suggestions =  {'liqueur'}
entered word =  café 
suggestions =  {'cafe'}
entered word =  350 
suggestions =  ['350']
entered word =  ml 
suggestions =  ['ml']
entered word =  gan 
suggestions =  {'gin'}
entered word =  200 
suggestions =  ['200']
entered word =  ml 
suggestions =  ['ml']
entered word =  gni 
suggestions =  {'gin'}
entered word =  455 
suggestions =  ['455']
entered word =  kg 
suggestions =  ['kg']
entered word =  ligheur 
suggestions =  {'liqueur'}
