## Prepared by : Abhishek Choraria
  **Linkedin** : https://www.linkedin.com/in/abhishek-choraria-b0966a67/

  **Github** : https://github.com/abhiemj

In [10]:
import torch
import re

In [11]:
## Load the vocabulary file: every line of the file becomes one list entry

list_of_unique_words = []
with open('/content/unique_words.txt', mode='r', encoding='utf-8') as vocab_file:
  ## materialise all lines at once; each entry keeps its trailing newline
  list_of_unique_words = list(vocab_file)

## preview the first five raw entries
list_of_unique_words[:5]

['a\n', 'aa\n', 'aaa\n', 'aaaa\n', 'aabattery\n']

In [12]:
## strip surrounding whitespace (including the trailing newline) from every line
unique_words = [raw_line.strip() for raw_line in list_of_unique_words]

In [13]:
## preview the first ten cleaned words
unique_words[:10]

['a',
 'aa',
 'aaa',
 'aaaa',
 'aabattery',
 'aaczel',
 'aak',
 'aan',
 'aand',
 'aandb']

In [14]:
## join all words into one space-separated string
## NOTE: this rebinds `unique_words` from a list to a str, so running this
## cell a second time would join the individual characters of that string

unique_words = ' '.join(unique_words)
unique_words



## Define special tokens

In [15]:
## markers placed at the start and at the end of every encoded sequence
initial_token = '[initial]'
final_token = '[final]'
## stand-in used for any word that is not in the vocabulary
mysterious_token = '[mysterious]'
padded_token = '[PAD]' # add pad in extra space

In [16]:
## define a function to tokenize the words

def create_tokens(text):
  """
  Split text into lowercase tokens.

  A token is either a run of word characters and apostrophes, or a single
  punctuation mark out of . , ! ?  Any other character is dropped.
  """
  token_pattern = r"[\w']+|[.,!?]"
  lowered = text.lower()
  return re.findall(token_pattern, lowered)

In [17]:
## quick check of create_tokens on a sample sentence
text = "I am builduing a tokenizer"

tokens = create_tokens(text)

print(tokens)

['i', 'am', 'builduing', 'a', 'tokenizer']


In [18]:
## tokenize the joined vocabulary string
tokens = create_tokens(unique_words)

## drop duplicates and put the tokens in sorted order in a single step
unique_tokens = sorted(set(tokens))

print(unique_tokens)



In [19]:
## keep every ordinary token, leaving out any that equals a special token
reordered_tokens = [
    candidate
    for candidate in unique_tokens
    if candidate not in (initial_token, final_token, mysterious_token, padded_token)
]

## special tokens are placed at the end of the list, in a fixed order
reordered_tokens += [initial_token, final_token, mysterious_token, padded_token]

In [20]:
## show both ends of the list: ordinary tokens first, special tokens last
print("first 10 tokens:", reordered_tokens[:10])
print("Last 10 tokens:", reordered_tokens[-10:])

first 10 tokens: ['a', 'aa', 'aaa', 'aaaa', 'aabattery', 'aaczel', 'aak', 'aan', 'aand', 'aandb']
Last 10 tokens: ['zwill', 'zwith', 'zx', 'zz', 'zzz', 'zzzz', '[initial]', '[final]', '[mysterious]', '[PAD]']


In [21]:
## Assign an index to each token: the token's position in reordered_tokens.
## A dict comprehension keeps the loop variables out of the notebook's global
## namespace, so the built-in id() is not rebound to an int by this cell.
word_to_id = {token: index for index, token in enumerate(reordered_tokens)}



In [22]:
## look up the id assigned to a word
word_to_id['above']

53

In [23]:
## id to word: invert word_to_id.
## A dict comprehension keeps the loop variables local, so the built-in id()
## is not rebound at notebook level by this cell.
id_to_word = {index: token for token, index in word_to_id.items()}

In [24]:
## look up the word stored under an id
id_to_word[53]

'above'

In [25]:
## define token ids for special tokens
## (the mixed capitalisation of these names is left as is: decoder() reads
## them under exactly these names)
Initial_token_id = word_to_id[initial_token]
final_token_id = word_to_id[final_token]
Mysterious_token_id = word_to_id[mysterious_token]
pad_token_id = word_to_id[padded_token]

In [26]:
## report each special token together with its id
print(f"start token : { initial_token} - ID {Initial_token_id}")
print(f"End token : {final_token} - ID {final_token_id}")
print(f"Mysterious token : {mysterious_token} - ID {Mysterious_token_id}")
## this line reports the padding token, so it carries its own label
print(f"Pad token : {padded_token} - ID {pad_token_id}")

start token : [initial] - ID 17091
End token : [final] - ID 17092
Mysterious token : [mysterious] - ID 17093
start token : [PAD] - ID 17094


## Encode Function

In [27]:
import torch


def encoder(text, max_length=None, truncate=False, return_tensors=False):
    """
    Encode the text into a sequence of token IDs, with optional truncation.

    The text is tokenized with create_tokens, every token is looked up in
    word_to_id (unknown tokens map to the id of mysterious_token), and the
    result is wrapped in the ids of initial_token and final_token.

    Parameters:
    - text: The text to encode.
    - max_length: The maximum length of the token sequence after encoding,
      counting the start and end tokens. Only used when truncate is True.
    - truncate: Whether to truncate the sequence to max_length. A sequence
      that already fits is returned unchanged; a longer one is cut and its
      last kept position is replaced by the end token.
    - return_tensors: If True, return torch.tensor([ids]) (one outer batch
      dimension of size 1) instead of a plain list.

    Returns:
    - A list of token IDs representing the encoded text, or a tensor holding
      that list when return_tensors is True.

    Raises:
    - ValueError: if truncate is True and max_length is smaller than 1.
    """
    unknown_id = word_to_id[mysterious_token]
    end_id = word_to_id[final_token]

    # start token + one id per token + end token
    encoded_tokens = [word_to_id[initial_token]]
    encoded_tokens.extend(word_to_id.get(token, unknown_id) for token in create_tokens(text))
    encoded_tokens.append(end_id)

    # Handling truncation
    if truncate and max_length is not None:
        if max_length < 1:
            raise ValueError("max_length must be at least 1 when truncate is True")
        # Only cut when the sequence is actually too long; otherwise slicing
        # and re-appending the end token would duplicate it.
        if len(encoded_tokens) > max_length:
            encoded_tokens = encoded_tokens[:max_length - 1] + [end_id]

    # Convert to tensor if return_tensors is True
    if return_tensors:
        encoded_tokens = torch.tensor([encoded_tokens])

    return encoded_tokens


In [28]:
## encode a sample sentence with the default options (no truncation, plain list)
text = "Distinguish between qualitative and quantitative data"
encoded_tokens = encoder(text)
encoded_tokens

[17091, 4227, 1894, 11613, 657, 11626, 3646, 17092]

## Decode Function

In [29]:
## Decode Function

In [30]:
def decoder(index,skip_special_tokens=False):
  """
  Turn a sequence of token ids back into a space-separated string.

  Parameters:
  - index: A list of token ids, or a torch.Tensor of ids. A batch that holds
    exactly one sequence (the form encoder(..., return_tensors=True)
    produces) is unwrapped to that sequence.
  - skip_special_tokens: If True, the ids of the initial, final, mysterious
    and pad tokens are left out of the result.

  Returns:
  - The words for the remaining ids joined by single spaces.

  Raises:
  - KeyError: if an id has no entry in id_to_word.
  """
  if isinstance(index,torch.Tensor):
    index = index.tolist()

  # torch.tensor([ids]).tolist() gives [[...]]; a nested list cannot be tested
  # for set membership below, so peel off a single-row batch dimension first
  if len(index) == 1 and isinstance(index[0], (list, tuple)):
    index = index[0]

  skipped_ids = set()
  if skip_special_tokens:
    skipped_ids = {
        Initial_token_id,
        final_token_id,
        Mysterious_token_id,
        pad_token_id,
    }

  return ' '.join(
      id_to_word[token_id] for token_id in index if token_id not in skipped_ids
  )

In [31]:
## decode the ids produced by encoder; special tokens are kept by default
decode_text = decoder(encoded_tokens)
decode_text

'[initial] distinguish between qualitative and quantitative data [final]'

## Put everything together and define a class

In [34]:
class myTokenizer:
  """
  Word-level tokenizer whose vocabulary is built from the text given at
  construction time.

  Every distinct token of that text gets an id in sorted order; the four
  special tokens ([initial], [final], [mysterious], [PAD]) receive the ids
  after the last ordinary token. All lookups use the instance's own tables,
  so the class does not depend on any module-level variable.
  """

  def __init__(self,text):
    ## define special tokens
    self.initial_token = '[initial]'
    self.final_token = '[final]'
    self.mysterious_token = '[mysterious]'
    self.padded_token = '[PAD]'
    special_tokens = (
        self.initial_token,
        self.final_token,
        self.mysterious_token,
        self.padded_token,
    )

    ## unique tokens of the input text, sorted to ensure consistent indexing
    unique_tokens = sorted(set(self.create_tokens(text)))

    ## ordinary tokens first, special tokens appended at the end
    reordered_tokens = [token for token in unique_tokens if token not in special_tokens]
    reordered_tokens.extend(special_tokens)

    # Assign index to each token
    self.word_to_id = {token: index for index, token in enumerate(reordered_tokens)}

    # Assign token to each index (built from this instance's own mapping)
    self.id_to_word = {index: token for token, index in self.word_to_id.items()}

    ## define token ids for special tokens
    self.Initial_token_id = self.word_to_id[self.initial_token]
    self.final_token_id = self.word_to_id[self.final_token]
    self.Mysterious_token_id = self.word_to_id[self.mysterious_token]
    self.pad_token_id = self.word_to_id[self.padded_token]

  ## define a function to tokenize the words
  def create_tokens(self,text):
    """
    Split text into lowercase tokens: runs of word characters and
    apostrophes, or a single punctuation mark out of . , ! ?
    """
    return re.findall(r"[\w']+|[.,!?]",text.lower())

  ## define encoder
  def encoder(self,text, max_length=None, truncate=False, return_tensors=False):
    """
    Encode the text into a sequence of token IDs, with optional truncation.

    Parameters:
    - text: The text to encode.
    - max_length: The maximum length of the token sequence after encoding,
      counting the start and end tokens. Only used when truncate is True.
    - truncate: Whether to truncate the sequence to max_length. A sequence
      that already fits is returned unchanged; a longer one is cut and its
      last kept position is replaced by the end token.
    - return_tensors: If True, return torch.tensor([ids]) (one outer batch
      dimension of size 1) instead of a plain list.

    Returns:
    - A list of token IDs representing the encoded text, or a tensor holding
      that list when return_tensors is True.

    Raises:
    - ValueError: if truncate is True and max_length is smaller than 1.
    """
    unknown_id = self.Mysterious_token_id

    # start token + one id per token (unknown words share one id) + end token
    encoded_tokens = [self.Initial_token_id]
    encoded_tokens.extend(
        self.word_to_id.get(token, unknown_id) for token in self.create_tokens(text)
    )
    encoded_tokens.append(self.final_token_id)

    # Handling truncation
    if truncate and max_length is not None:
      if max_length < 1:
        raise ValueError("max_length must be at least 1 when truncate is True")
      # Only cut when the sequence is actually too long; otherwise slicing
      # and re-appending the end token would duplicate it.
      if len(encoded_tokens) > max_length:
        encoded_tokens = encoded_tokens[:max_length - 1] + [self.final_token_id]

    # Convert to tensor if return_tensors is True
    if return_tensors:
      encoded_tokens = torch.tensor([encoded_tokens])

    return encoded_tokens

  ## define decoder
  def decoder(self,index,skip_special_tokens=False):
    """
    Turn a sequence of token ids back into a space-separated string.

    Parameters:
    - index: A list of token ids, or a torch.Tensor of ids. A batch that
      holds exactly one sequence (the form encoder(..., return_tensors=True)
      produces) is unwrapped to that sequence.
    - skip_special_tokens: If True, the ids of the initial, final,
      mysterious and pad tokens are left out of the result.

    Returns:
    - The words for the remaining ids joined by single spaces.

    Raises:
    - KeyError: if an id has no entry in id_to_word.
    """
    if isinstance(index,torch.Tensor):
      index = index.tolist()

    # torch.tensor([ids]).tolist() gives [[...]]; a nested list cannot be
    # tested for set membership below, so peel off a single-row batch first
    if len(index) == 1 and isinstance(index[0], (list, tuple)):
      index = index[0]

    skipped_ids = set()
    if skip_special_tokens:
      skipped_ids = {
          self.Initial_token_id,
          self.final_token_id,
          self.Mysterious_token_id,
          self.pad_token_id,
      }

    return ' '.join(
        self.id_to_word[token_id] for token_id in index if token_id not in skipped_ids
    )








In [35]:
## build the tokenizer from the joined vocabulary string and round-trip a sentence
tokenizer = myTokenizer(unique_words)
encoded_text = tokenizer.encoder("Distinguish between qualitative and quantitative data")
decoded_text = tokenizer.decoder(encoded_text)

print(f"Encoded : {encoded_text}")
print(f"Decoded : {decoded_text}")

Encoded : [17091, 4227, 1894, 11613, 657, 11626, 3646, 17092]
Decoded : [initial] distinguish between qualitative and quantitative data [final]
