## NLP Assignment 1
---

### Task 1

In [1]:
import re
import os
from collections import defaultdict

In [2]:
class Tokenizer:
    """Byte-pair-encoding (BPE) style sub-word tokenizer.

    Words are obtained by whitespace splitting. Every word is represented as
    a sequence of symbols that starts out as its individual characters
    followed by the end-of-word marker ``$``. Learning repeatedly merges the
    most frequent adjacent symbol pair into a single symbol.

    Attributes:
        vocab: maps a word, written as space-separated symbols, to the number
            of times it occurs in the training corpus.
        all_tokens: every symbol ever produced, from single characters to
            the last merge. Only the keys are meaningful; the counts are
            incremented once per distinct vocabulary entry on every merge
            pass and are not corpus frequencies.
        merge_rules: learned ``(left, right)`` pairs in the order learned.
        final_tokens: distinct symbols present in ``vocab`` after learning.
    """

    def __init__(self):
        self.vocab = defaultdict(int)
        self.all_tokens = defaultdict(int)  # all tokens from first to last
        self.merge_rules = []
        self.final_tokens = []  # only the final tokens

    def merge_vocabulary(self, pair, vocab_old):
        """Record ``pair`` as a merge rule and apply it to a vocabulary.

        Args:
            pair: two-element tuple ``(left, right)`` of symbols to merge.
            vocab_old: mapping of space-separated-symbol words to counts.

        Returns:
            A new plain ``dict`` with the same counts, in which every
            adjacent occurrence of ``left right`` is replaced by the
            concatenated symbol.
        """
        self.merge_rules.append(pair)
        merged_symbol = ''.join(pair)
        # The look-arounds make the bigram match whole symbols only, never
        # the tail/head of a longer neighbouring symbol.
        pattern = re.compile(
            r'(?<!\S)' + re.escape(' '.join(pair)) + r'(?!\S)'
        )

        vocab_new = {}
        for word, freq in vocab_old.items():
            # A callable replacement is used so that backslashes inside a
            # symbol are inserted literally instead of being parsed as
            # regex replacement escapes.
            w_out = pattern.sub(lambda _match: merged_symbol, word)
            for symbol in w_out.split():
                self.all_tokens[symbol] += 1
            vocab_new[w_out] = freq
        return vocab_new

    def learn_vocabulary(self, corpus, num_merges):
        """Learn up to ``num_merges`` merge rules from ``corpus``.

        Args:
            corpus: training text; words are separated by whitespace.
            num_merges: maximum number of merge iterations. Learning stops
                early once no adjacent symbol pair is left to merge.

        Side effects:
            Updates ``vocab``, ``all_tokens``, ``merge_rules`` and
            ``final_tokens``.
        """
        # Seed the vocabulary with character-level words ending in '$'.
        for line in corpus.split('\n'):
            for word in line.split():
                symbols = list(word) + ['$']
                key = ' '.join(symbols)
                # ``vocab`` is a plain dict after the first merge, so do not
                # rely on defaultdict behaviour here.
                self.vocab[key] = self.vocab.get(key, 0) + 1
                for symbol in symbols:
                    self.all_tokens[symbol] += 1

        for _ in range(num_merges):
            # Frequency of every adjacent symbol pair, weighted by word count.
            pairs = defaultdict(int)
            for word, freq in self.vocab.items():
                symbols = word.split()
                for left, right in zip(symbols, symbols[1:]):
                    pairs[left, right] += freq

            # Nothing left to merge (empty corpus or every word is already a
            # single symbol): max() on an empty mapping would raise.
            if not pairs:
                break

            best_pair = max(pairs, key=pairs.get)
            self.vocab = self.merge_vocabulary(best_pair, self.vocab)

        # Distinct symbols of the final vocabulary, in first-seen order.
        self.final_tokens = list(dict.fromkeys(
            symbol for word in self.vocab for symbol in word.split()
        ))

    def _apply_merges(self, symbols):
        """Apply every learned merge rule, in order, to one word's symbols."""
        for rule in self.merge_rules:
            if len(symbols) < 2:
                break  # a single symbol cannot be merged any further
            left, right = rule[0], rule[1]
            merged = []
            i = 0
            while i < len(symbols):
                if (
                    i + 1 < len(symbols)
                    and symbols[i] == left
                    and symbols[i + 1] == right
                ):
                    merged.append(left + right)
                    i += 2
                else:
                    merged.append(symbols[i])
                    i += 1
            symbols = merged
        return symbols

    def tokenize(self, text_lists):
        """Tokenize each text with the learned merge rules.

        Merges are applied per word on whole symbols, mirroring how the
        rules were learned, so a rule can never fuse part of a longer symbol
        or symbols that belong to different words.

        Args:
            text_lists: iterable of strings to tokenize.

        Returns:
            A list with one list of token strings per input text.
        """
        cache = {}  # word -> tokens; repeated words are merged only once
        ans_list = []
        for text in text_lists:
            tokens = []
            for word in text.split():
                if word not in cache:
                    cache[word] = self._apply_merges(list(word) + ['$'])
                tokens.extend(cache[word])
            ans_list.append(tokens)
        return ans_list

    def write_to_file(self, root, text_list):
        """Write tokens, merge rules and tokenized samples under ``root``.

        Creates ``tokens.txt`` (one symbol of ``all_tokens`` per line),
        ``merge_rules.txt`` (``left,right`` per line) and
        ``tokenized_samples.txt`` (comma-joined tokens, one sample per
        line). Each sample line is also printed to stdout.

        Args:
            root: output directory; it is created if it does not exist.
            text_list: list of token lists, as returned by ``tokenize``.
        """
        if root:
            os.makedirs(root, exist_ok=True)

        token_path = os.path.join(root, "tokens.txt")
        rules_path = os.path.join(root, "merge_rules.txt")
        samples_path = os.path.join(root, "tokenized_samples.txt")

        # Context managers guarantee the files are closed even on error.
        with open(token_path, "w") as file:
            file.writelines(token + "\n" for token in self.all_tokens)

        with open(rules_path, "w") as file:
            for rule in self.merge_rules:
                file.write(rule[0] + "," + rule[1] + "\n")

        with open(samples_path, "w") as file:
            for tokens in text_list:
                s = ",".join(tokens) + "\n"
                print(s)
                file.write(s)

In [3]:
if __name__ == "__main__":
    # Upper bound on the number of merge iterations used for learning.
    num_merges = 500

    # Read the whole training corpus; the context manager closes the file
    # handle as soon as the text has been read.
    with open("./corpus/corpus.txt", "r") as file:
        corpus = file.read()

    # Learn the merge rules; `tokenizer` is reused by the following cell.
    tokenizer = Tokenizer()
    tokenizer.learn_vocabulary(corpus, num_merges)


In [5]:
if __name__ == "__main__":
    # Three sample sentences used to exercise the trained tokenizer.
    test_corpus = [
        (
            "Tokenization is the process of breaking down a sequence of text "
            "into smaller units called tokens, which can be words, phrases, "
            "or even individual characters."
        ),
        (
            "Tokenization is often the first step in natural languages processing "
            "tasks such as text classification, named entity recognition, "
            "and sentiment analysis."
        ),
        (
            "The resulting tokens are typically used as input to further processing "
            "steps, such as vectorization, where the tokens are converted into "
            "numerical representations for machine learning models to use."
        ),
    ]

    # Tokenize every sample, then write the tokens, the merge rules and the
    # tokenized samples to the output directory.
    text_list = tokenizer.tokenize(test_corpus)
    tokenizer.write_to_file("./merged_tokens/", text_list)

T,o,ken,i,z,ation$,is$,the$,proc,ess$,of$,brea,king$,down$,a$,se,qu,ence$,of$,tex,t$,into$,s,maller$,un,its$,called$,to,ken,s,,,$,which$,can$,be$,wor,d,s,,,$,p,h,rases,,,$,or$,even$,in,di,vi,du,al$,charac,ter,s,.,$

T,o,ken,i,z,ation$,is$,of,ten$,the$,fir,st$,step$,in$,n,atural$,l,angu,ages$,proc,es,sing$,tas,ks$,su,ch$,as$,tex,t$,cl,as,si,fication,,,$,n,amed$,en,tity$,re,co,g,n,ition,,,$,and$,sen,timent$,an,al,y,sis,.,$

T,he$,resul,ting$,to,ken,s$,are$,t,y,p,ically$,used$,as$,in,pu,t$,to$,fur,ther$,proc,es,sing$,step,s,,,$,su,ch$,as$,v,e,c,tori,z,ation,,,$,where$,the$,to,ken,s$,are$,con,ver,ted$,into$,n,u,merical$,represen,tations$,for$,machine$,lear,ning$,mo,del,s$,to$,use,.,$

