<a href="https://colab.research.google.com/github/aithaprasad/NLP_Celtic_Mutation/blob/main/Celtic_Mutation_Scratch_Approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### Reading the data from the respective files.

In [1]:
import csv
with open('celtic_train.tsv', encoding="utf-8") as file:
  f = csv.reader(file, delimiter="\t")
  all_data = []
  for line in f:
    all_data.append(line)

In [2]:
with open('celtic_test.tsv', encoding="utf-8") as test_file:
  f = csv.reader(test_file, delimiter="\t")
  all_test_data = []
  for line in f:
    all_test_data.append(line)

In [3]:
len(all_data)

5057059

In [4]:
len(all_test_data)

473951

##### Function to just split the data. Re-used from the first competition

In [5]:
#def train_test_split(data, percent = 0.1):
#    percent = int(percent * 10)

#    train = data[((len(data) // 10) * percent):]
#    test = data[:((len(data) // 10) * percent)]

#    return train, test

In [4]:
#train, test = train_test_split(all_data)

In [6]:
train, test = all_data, all_test_data

##### As we have five tags. The below code keeps the count of different tags on each unique word. I guess the names for the variables speak for what they actually are.

In [7]:
word_tag_count = dict()
N_count, S_count, U_count, T_count, H_count = 0, 0, 0, 0, 0 
unique_words = []
for word_tag in train:
  word, tag = word_tag[0], word_tag[1]
  
  if word == "<S>": continue
  
  if tag == 'N': N_count += 1
  elif tag == 'S': S_count += 1
  elif tag == 'T': T_count += 1
  elif tag == 'U': U_count += 1
  else: H_count += 1

  if word not in word_tag_count:
    word_tag_count[word] = {tag : 1}
    unique_words.append(word)
  else:
    if tag not in word_tag_count[word]: word_tag_count[word][tag] = 1
    else: word_tag_count[word][tag] += 1

In [8]:
print(len(word_tag_count))
print(len(unique_words))
print(N_count)
print(S_count)
print(T_count)
print(U_count)
print(H_count)

126547
126547
4139588
493102
17721
165818
40569


##### We are manipulating the above "word_tag_count" dictionary with the probabilities(each word's tag has the probability of that tag for that word). For example, the word "tug". it is a 'S' with 0.005895380673796454 probability.

In [9]:
total_count_per_tag = {'N' : N_count, 'S': S_count, 'T' : T_count, 'U': U_count, 'H' : H_count}
for word in unique_words:
  new_dict = {'N' : 0, 'S': 0, 'T' : 0, 'U': 0, 'H' : 0}
  tags = ['N', 'S', 'U', 'T', 'H']
  for tag in tags:
    if tag in word_tag_count[word]:
      new_dict[tag] = word_tag_count[word][tag] / total_count_per_tag[tag] 
  word_tag_count[word] = new_dict

In [10]:
total_count_per_tag

{'N': 4139588, 'S': 493102, 'T': 17721, 'U': 165818, 'H': 40569}

In [11]:
word_tag_count['tug']

{'N': 1.9325594721020546e-06,
 'S': 0.005877080198417366,
 'T': 0,
 'U': 0.00028344329324922506,
 'H': 0}

##### We are constructing the dictionary which stores the context for each sentence like how many times a 'N' has followed 'N' or basically, how many times what tag is followed by what tag. 

In [12]:
sentence_context_tag_count = {'<S>' : {'N': 0, 'S': 0, 'T': 0, 'U': 0, 'H': 0}, 'N' : {'N': 0, 'S': 0, 'T': 0, 'U': 0, 'H': 0}, 
                              'S' : {'N': 0, 'S': 0, 'T': 0, 'U': 0, 'H': 0}, 'T' : {'N': 0, 'S': 0, 'T': 0, 'U': 0, 'H': 0},
                              'U' : {'N': 0, 'S': 0, 'T': 0, 'U': 0, 'H': 0}, 'H' : {'N': 0, 'S': 0, 'T': 0, 'U': 0, 'H': 0}}

for i in range(1, len(train)):
  if train[i][0] == '<S>': continue
  word, tag = train[i][0], train[i][1]
  prev_word, prev_tag = train[i - 1][0], train[i - 1][1]
  if prev_word == '<S>': prev_tag = '<S>'
  sentence_context_tag_count[prev_tag][tag] += 1

In [13]:
sentence_context_tag_count

{'<S>': {'N': 180588, 'S': 19093, 'T': 14, 'U': 351, 'H': 214},
 'N': {'N': 3281024, 'S': 439585, 'T': 17657, 'U': 164746, 'H': 39313},
 'S': {'N': 464177, 'S': 25486, 'T': 30, 'U': 578, 'H': 773},
 'T': {'N': 16551, 'S': 1035, 'T': 3, 'U': 2, 'H': 53},
 'U': {'N': 158600, 'S': 6335, 'T': 12, 'U': 106, 'H': 120},
 'H': {'N': 38647, 'S': 1568, 'T': 5, 'U': 35, 'H': 96}}

##### We utilize the count and convert them to probabilities with 1-smoothing.

In [14]:
for tag in ['N', 'S', 'T', 'U', 'H', '<S>']:
  total_tag_sum = sum(sentence_context_tag_count[tag].values())
  for next_tag in ['N', 'S', 'T', 'U', 'H']:
    sentence_context_tag_count[tag][next_tag] = (1 + sentence_context_tag_count[tag][next_tag]) / total_tag_sum

In [15]:
sentence_context_tag_count

{'<S>': {'N': 0.9017726954958554,
  'S': 0.09534605013482472,
  'T': 7.490262658543893e-05,
  'U': 0.0017577149705383002,
  'H': 0.0010736043143912914},
 'N': {'N': 0.8322563461916509,
  'S': 0.11150425193255249,
  'T': 0.0044790827747585495,
  'U': 0.04178929946161212,
  'H': 0.009972287926540811},
 'S': {'N': 0.9452879986314872,
  'S': 0.051903699057518266,
  'T': 6.313079886934776e-05,
  'U': 0.0011791204046887856,
  'H': 0.0015762334943508118},
 'T': {'N': 0.9381092722738608,
  'S': 0.05871684425300385,
  'T': 0.00022670596236681024,
  'U': 0.00017002947177510768,
  'H': 0.0030605304919519385},
 'U': {'N': 0.9602114146985282,
  'S': 0.038359780351510234,
  'T': 7.870535741313653e-05,
  'U': 0.0006478056340927391,
  'H': 0.0007325652497684246},
 'H': {'N': 0.9577953458402518,
  'S': 0.03888379470149439,
  'T': 0.0001486951996233055,
  'U': 0.000892171197739833,
  'H': 0.0024039057272434387}}

##### This is basically the predict function which is Viterbi, where we utlize the above probabilities and assign the probability for the tag and store the max state(probability till that point) and at the end we return the sentence and the respective predicted tags.

In [16]:
def viterbi(sentence):
  state = []
  tags = ['N', 'S', 'U', 'T', 'H']
  for key, word in enumerate(sentence):
    p = []
    for tag in tags:
      emission_p = 0
      if key == 0 or word == '<S>': transmission_prob = sentence_context_tag_count['<S>'][tag]
      else: transmission_prob = sentence_context_tag_count[state[-1]][tag]
      if word in word_tag_count.keys(): emission_p = word_tag_count[word][tag]
  
      state_prop = emission_p * transmission_prob
      p.append(state_prop)
    max_value=max(p)
    value=tags[p.index(max_value)]
    state.append(value)
  return list(zip(sentence, state)), state

In [23]:
test_data = []
test_label = []
for data in test:
  if len(data) == 2:
    test_data.append(data[0])
    test_label.append(data[1])

In [25]:
predicted_pairs, pred_val = viterbi(test_data)

In [26]:
def predict_from_scratch(sentence_tag):
  
  correct = 0
  sentence, tags = [], []
  
  for word, tag in sentence_tag:
    sentence.append(word)
    tags.append(tag)
  
  pred_labels = viterbi(sentence)
  for i in range(len(tags)):
    if pred_labels[1][i] == tags[i]:
      correct += 1
  
  return correct


In [34]:
def evaluate():
  total = 0
  correct_from_scratch = 0
  testfile = open('test.tsv', 'r')
  sentence = []
  for line in testfile:
    total += 1
    pieces = line.rstrip("\n").split("\t") 
    if pieces[0]=='<S>':
      correct_from_scratch += predict_from_scratch(sentence)
      sentence = []
    else:
      sentence.append(pieces)
  correct_from_scratch += predict_from_scratch(sentence)
  return correct_from_scratch/total

In [35]:
evaluate()

0.870005

##References:

https://www.mygreatlearning.com/blog/pos-tagging/ 

I used this for some algo understanding purposes, but did not take any code from it.