In [1]:
#Contractions in NLP

In [3]:
# !pip install contractions
import contractions
# contracted text
text = '''I'll be there within 5 min. Shouldn't you be there too?
          I'd love to see u there my dear. It's awesome to meet new friends.
          We've been waiting for this day for so long.'''

# creating an empty list
expanded_words = []
for word in text.split():
  # using contractions.fix to expand the shortened words
  expanded_words.append(contractions.fix(word))

expanded_text = ' '.join(expanded_words)
print('Original text: ' + text)
print('Expanded_text: ' + expanded_text)

Original text: I'll be there within 5 min. Shouldn't you be there too? 
          I'd love to see u there my dear. It's awesome to meet new friends.
          We've been waiting for this day for so long.
Expanded_text: I will be there within 5 min. Should not you be there too? I would love to see you there my dear. It is awesome to meet new friends. We have been waiting for this day for so long.


**Ambiguity of contractions**
---

It's observed that some contractions represent multiple word combinations.

*"ain't": "am not / are not / is not / has not / have not"*

**Solution to the above:**

The pycontractions library
We can also use the pycontractions library to expand the contractions. It works in the following way:

Case 1: If a contraction corresponds to only one sequence of words, pycontractions replaces the contraction with that word sequence.

Case 2: If a contraction corresponds to many possible expansions. Then, in that case, pycontractions produces all the possible expansions and then uses a spell checker. The grammatically incorrect options are discarded, and the correct choice is selected.

It has been observed that pycontractions is more accurate than the contractions library of python as it takes into account the grammar of the text.

https://pypi.org/project/pycontractions/


In [4]:
import contractions

text = '''I ain't doing that.'''

expanded_text = []
for word in text.split():
  expanded_text.append(contractions.fix(word))

expanded_text = ' '.join(expanded_text)
print('Input : ' + text)
print('\n')
print('Output: ' + expanded_text)

Input : I ain't doing that.


Output: I are not doing that.


*Tokenization*

In [6]:
text = 'LLMs aka Large Language Models have been the talk of the town for some time.'
words = text.split(" ") # split the text on spaces

tokens = {v: k for k, v in enumerate(words)} # generate a word to index mapping

In [7]:
#mapped every word in the text to a numerical index
tokens

{'LLMs': 0,
 'aka': 1,
 'Large': 2,
 'Language': 3,
 'Models': 4,
 'have': 5,
 'been': 6,
 'the': 10,
 'talk': 8,
 'of': 9,
 'town': 11,
 'for': 12,
 'some': 13,
 'time.': 14}

Byte Pair Encoding

In [8]:
from collections import Counter

def tokenize(text):
    return list(text)

def get_pair_frequencies(tokens):
    pairs = Counter()
    for i in range(len(tokens) - 1):
        pair = (tokens[i], tokens[i+1])
        pairs[pair] += 1
    return pairs

def merge_most_frequent_pair(tokens, max_merge_operations):
    final_tokens = list(tokens)  # Copy the initial tokens into final_tokens
    new_tokens = []  # Initialize a list to store the new tokens
    for _ in range(max_merge_operations):
        pair_frequencies = get_pair_frequencies(final_tokens)  # Count pairs in the current final_tokens
        if not pair_frequencies:
            break
        most_frequent_pair = max(pair_frequencies, key=pair_frequencies.get)
        new_token = ''.join(most_frequent_pair)
        merged_tokens = []
        i = 0
        while i < len(final_tokens):
            if i < len(final_tokens) - 1 and (final_tokens[i], final_tokens[i+1]) == most_frequent_pair:
                merged_tokens.append(new_token)
                i += 2
            else:
                merged_tokens.append(final_tokens[i])
                i += 1
        final_tokens = merged_tokens
        new_tokens.append(new_token)  # Add the new token to the list
    return final_tokens, new_tokens  # Return both final_tokens and new_tokens

text = "car,cable,table,watch,chair,mouse"
tokens = tokenize(text)
final_tokens, new_tokens = merge_most_frequent_pair(tokens, 15)
print("Final Tokens:", final_tokens)
print("New Tokens:", new_tokens)

Final Tokens: ['car,cable,table,wat', 'ch', ',', 'ch', 'a', 'i', 'r,', 'm', 'o', 'u', 's', 'e']
New Tokens: ['ca', 'r,', 'bl', 'ble', 'ble,', 'ch', 'car,', 'car,ca', 'car,cable,', 'car,cable,t', 'car,cable,ta', 'car,cable,table,', 'car,cable,table,w', 'car,cable,table,wa', 'car,cable,table,wat']
