# Error Suppression

## Part 0: Encoding and processing a file, helper methods, etc 

In [None]:
def encode_file(filename):
  """Read an M2 file and separate its sentence lines from its annotation lines.

  Lines whose first character is 'S' are collected in file order. Lines whose
  first character is 'A' are grouped under the zero-based position of the most
  recent 'S' line. Every other line is ignored. Lines are stored exactly as
  read, including their trailing newline.

  Args:
    filename: path of the M2 file; it is decoded as latin-1.

  Returns:
    A (sentences, annotations) tuple: the list of 'S' lines, and a dict mapping
    a sentence position to the list of its 'A' lines. A sentence without any
    'A' line has no entry; 'A' lines seen before the first 'S' line are stored
    under key -1.
  """
  sentences = []
  annotations = {}

  with open(filename, encoding="latin-1") as m2_file:
    for raw_line in m2_file:
      marker = raw_line[0]
      if marker == 'S':
        sentences.append(raw_line)
      elif marker == 'A':
        # Position of the latest sentence read so far (-1 when there is none).
        annotations.setdefault(len(sentences) - 1, []).append(raw_line)

  return sentences, annotations

In [None]:
# Load the training M2 file: `sentences` is the list of 'S' lines and
# `annotations` maps a sentence position to the list of its 'A' lines.
sentences, annotations = encode_file("/content/fce.train.lang.m2")

Encode all annotations in a dictionary with only the error types

In [None]:
def encode_errors_only(annotations):
  """Reduce each sentence's annotation lines to their error-type labels.

  The error type is the second '|||'-separated field of an annotation line.
  Annotations whose type is "noop" are dropped, so a position in a returned
  list is a position among the sentence's non-noop annotations.

  Args:
    annotations: dict mapping a sentence position to its annotation lines.

  Returns:
    An (errors, error_count) tuple: a dict with the same keys mapping to the
    list of kept error types, and the total number of kept error types.
  """
  errors = {}
  error_count = 0
  for sent_idx, annot_lines in annotations.items():
    labels = [annot_line.split("|||")[1] for annot_line in annot_lines]
    kept = [label for label in labels if label != "noop"]
    errors[sent_idx] = kept
    error_count += len(kept)

  return errors, error_count

In [None]:
# Per-sentence lists of non-noop error types, plus the total number of such errors.
errors_by_sentence, error_count = encode_errors_only(annotations)

In [None]:
def get_error_dist(errors):
  """Compute the relative frequency of every error type.

  Args:
    errors: dict mapping a sentence position to its list of error types.

  Returns:
    A dict mapping each error type to its share of all listed errors, with
    keys in order of first appearance. Empty when no errors are listed.
  """
  counts = {}
  for sentence_errors in errors.values():
    for label in sentence_errors:
      counts[label] = counts.get(label, 0) + 1

  total_count = sum(counts.values())
  return {label: seen * 1.0 / total_count for label, seen in counts.items()}

In [None]:
# Observed share of each error type across the whole file.
error_dist = get_error_dist(errors_by_sentence)
print(error_dist)

{'R:PREP': 0.06447606142728093, 'R:MORPH': 0.0228319783197832, 'R:NOUN': 0.039317976513098464, 'R:OTHER': 0.08949864498644987, 'R:VERB': 0.06043360433604336, 'U:ADV': 0.00539747064137308, 'M:PUNCT': 0.04905149051490515, 'R:DET': 0.026851851851851852, 'M:VERB': 0.010953026196928636, 'R:WO': 0.017728093947606143, 'M:PREP': 0.028794037940379404, 'M:DET': 0.056571815718157184, 'R:VERB:FORM': 0.0268970189701897, 'U:PREP': 0.021409214092140923, 'M:PRON': 0.015808491418247517, 'R:NOUN:NUM': 0.029426377597109303, 'U:DET': 0.025654923215898826, 'R:ORTH': 0.027416440831074976, 'UNK': 0.02994579945799458, 'R:VERB:TENSE': 0.04509936766034327, 'M:CONJ': 0.0038392050587172538, 'U:VERB:TENSE': 0.007362240289069557, 'U:PRON': 0.006097560975609756, 'R:ADV': 0.011946702800361336, 'R:SPELL': 0.10860433604336044, 'U:NOUN': 0.00481029810298103, 'U:PUNCT': 0.021183378500451672, 'R:PRON': 0.016576332429990966, 'R:VERB:SVA': 0.01461156278229449, 'R:PUNCT': 0.0244579945799458, 'M:NOUN:POSS': 0.0021906052393857

To construct a target distribution for this proof of concept, shuffle the values of error_dist across the error types to obtain goal_error_dist.

In [None]:
from random import shuffle

# Copy the observed shares and permute them in place; no seed is set in this
# cell, so the permutation may differ between runs.
values = list(error_dist.values())
shuffle(values)

# Re-pair the permuted shares with the error types, in error_dist's key order.
goal_error_dist = dict(zip(error_dist, values))

## Part 1: Identify what errors need to be fixed

In [None]:
# Proof of concept: reuse the same M2 file as if it were the one whose errors
# need to be suppressed.
sentences, annotations = encode_file("/content/fce.train.lang.m2")
errors_by_sentence, error_count = encode_errors_only(annotations)

In [None]:
# Turn each target share into a target number of errors for this file.
goal_error_dist_count = {}
for error, share in goal_error_dist.items():
  goal_error_dist_count[error] = share * error_count

In [None]:
def get_errors_to_suppress(goal_error_dist_count, errors_by_sentence):
  """Select, per sentence, the errors that exceed the per-type goal budget.

  Errors are visited sentence by sentence, in list order. Each occurrence
  consumes one unit of its type's budget; an occurrence is selected for
  suppression once that budget has dropped below zero, so a budget of n
  leaves the first n occurrences untouched. At most one Missing ('M...')
  error is selected per sentence; further over-budget Missing errors in the
  same sentence are left in place.

  Args:
    goal_error_dist_count: dict mapping an error type to the number of
      occurrences to keep (may be fractional). It is not modified. A type
      absent from it is treated as having a budget of zero.
    errors_by_sentence: dict mapping a sentence position to its list of
      error types.

  Returns:
    A dict with the same keys as errors_by_sentence, mapping each sentence
    position to the list of positions (within that sentence's error list)
    selected for suppression.
  """
  # Work on a copy so the caller's budget survives and repeated calls agree.
  remaining = dict(goal_error_dist_count)
  errors_to_suppress = {}
  for sent_idx, sentence_errors in errors_by_sentence.items():
    selected = []
    missing_suppressed = False  # has a Missing error already been selected here?
    for annot_idx, error in enumerate(sentence_errors):
      remaining[error] = remaining.get(error, 0) - 1
      if remaining[error] >= 0:
        # Still within budget: this occurrence stays in the data.
        continue
      if error.startswith('M'):
        if missing_suppressed:
          continue
        missing_suppressed = True
      selected.append(annot_idx)
    errors_to_suppress[sent_idx] = selected

  return errors_to_suppress

In [None]:
# Map each sentence position to the positions (within its error list) of the
# errors selected for suppression, then display the mapping.
errors_to_suppress = get_errors_to_suppress(goal_error_dist_count, errors_by_sentence)

print(errors_to_suppress)

{0: [], 1: [], 2: [], 3: [], 4: [], 5: [], 6: [], 7: [], 8: [], 9: [], 10: [], 11: [], 12: [], 13: [], 14: [], 15: [], 16: [], 17: [], 18: [], 19: [], 20: [], 21: [], 22: [], 23: [], 24: [], 25: [], 26: [], 27: [], 28: [], 29: [], 30: [], 31: [], 32: [], 33: [], 34: [], 35: [], 36: [], 37: [], 38: [], 39: [], 40: [], 41: [], 42: [], 43: [], 44: [], 45: [], 46: [], 47: [], 48: [], 49: [], 50: [], 51: [], 52: [], 53: [], 54: [], 55: [], 56: [], 57: [], 58: [], 59: [], 60: [], 61: [], 62: [], 63: [], 64: [], 65: [], 66: [], 67: [], 68: [], 69: [], 70: [], 71: [], 72: [], 73: [], 74: [], 75: [], 76: [], 77: [], 78: [], 79: [], 80: [], 81: [], 82: [], 83: [], 84: [], 85: [], 86: [], 87: [], 88: [], 89: [], 90: [], 91: [], 92: [], 93: [], 94: [], 95: [], 96: [], 97: [], 98: [], 99: [], 100: [], 101: [], 102: [], 103: [], 104: [], 105: [], 106: [], 107: [], 108: [], 109: [], 110: [], 111: [], 112: [], 113: [], 114: [], 115: [], 116: [], 117: [], 118: [], 119: [], 120: [], 121: [], 122: [], 12

## Part 2: Fix the errors in the file 

Types of errors:
- R: Replace
- M: Missing
- U: Unnecessary
- noop 

In [None]:
def supresser_method(sentence_dict):
    """Apply M2 edits to a source sentence and return the edited token list.

    Args:
        sentence_dict: mapping with two entries.
            'S': the source line, tokens separated by single spaces. When its
                first token is the M2 marker 'S', that marker is kept in the
                result and edit offsets are counted from the token after it.
                A trailing newline is kept on the last returned token.
            'A': the edits to apply, ordered left to right. Each edit is either
                a raw annotation line ('A start end|||type|||correction|||...')
                or that line already split on '|||'.

    Returns:
        The list of tokens after all edits have been applied. Edits of type
        'noop' (or with a negative start offset) change nothing.

    Raises:
        IndexError: if an edit has fewer fields than expected.
        ValueError: if an edit's offsets are not integers.
    """
    source = sentence_dict['S']
    # Keep the line terminator out of the token list so that an edit touching
    # the final token cannot drop it; it is re-attached before returning.
    body = source.rstrip('\n')
    line_end = source[len(body):]
    sentence_list = body.split(' ')

    # M2 offsets index the tokens that follow the 'S' marker.
    base = 1 if sentence_list[0] == 'S' else 0
    # Running shift caused by earlier edits that changed the sentence length.
    idx_change = 0

    for annotation in sentence_dict['A']:
        if isinstance(annotation, str):
            fields = annotation.rstrip('\n').split('|||')
        else:
            fields = annotation

        span = fields[0].split(' ')
        start, end = int(span[1]), int(span[2])
        if fields[1] == 'noop' or start < 0:
            continue

        # An empty correction means a pure deletion.
        replacement = fields[2].split(' ') if fields[2] else []

        str_idx = start + base + idx_change
        end_idx = end + base + idx_change
        # Slice assignment covers deletion, insertion and replacement alike.
        sentence_list[str_idx:end_idx] = replacement
        idx_change += len(replacement) - (end - start)

    if line_end:
        if sentence_list:
            sentence_list[-1] += line_end
        else:
            sentence_list.append(line_end)

    return sentence_list

In [None]:
# Smoke test for supresser_method: one hand-written sentence with three edits
# (a deletion, an insertion and a replacement), each pre-split on '|||'.
sentence_dict = {}
sentence_dict['S'] = 'S Firstly , I would like to tell you , that the group has been booked into Palace Hotel , which locates beside the congress house in the centre .'
sentence_dict['A'] = []
sentence_dict['A'].append('A 8 9|||U:PUNCT||||||REQUIRED|||-NONE-|||0'.split('|||'))
sentence_dict['A'].append('A 16 16|||M:DET|||the|||REQUIRED|||-NONE-|||0'.split('|||'))
sentence_dict['A'].append('A 20 21|||R:VERB:TENSE|||is located|||REQUIRED|||-NONE-|||0'.split('|||'))

# The function defined in the previous cell is supresser_method; no `supresser`
# is defined in the visible cells, so that call would raise NameError.
output_list = supresser_method(sentence_dict)
print(*output_list)

Firstly , I would like to tell you that the group has been booked into the Palace Hotel , which is located beside the congress house in the centre .


In [None]:
# Disabled: the definition below sits inside a string literal and is never executed.
'''
# return start_index, end_index, error_type, word(s)
def parse_annotation(annot): 
  a_split = annot.split("|||")

  # get the error type: 
  err_type = a_split[1]

  # get indices
  index_split = a_split[0].split(" ")
  start_index = int(index_split[1])
  end_index = int(index_split[2])

  # get words to replace/remove 
  words = a_split[2].split(" ")

  return start_index, end_index, err_type, words
'''

'\n# return start_index, end_index, error_type, word(s)\ndef parse_annotation(annot): \n  a_split = annot.split("|||")\n\n  # get the error type: \n  err_type = a_split[1]\n\n  # get indices\n  index_split = a_split[0].split(" ")\n  start_index = int(index_split[1])\n  end_index = int(index_split[2])\n\n  # get words to replace/remove \n  words = a_split[2].split(" ")\n\n  return start_index, end_index, err_type, words\n'

TODO: I don't think this works if the replacement phrase is longer than the span of the sentence it replaces. Also, this method doesn't fully work yet.

In [None]:
# Disabled: the definition below sits inside a string literal and is never executed.
'''
def replace(sent_split, start_index, end_index, words):
  try: 
    for i in range(end_index - start_index):
      if i >= len(words): 
        sent_split[start_index + i + 1] = "" 
      elif start_index + i + 1 >= len(sent_split): 
        sent_split.append(words[i])
      else: 
        sent_split[start_index + i + 1] = words[i]
  except: 
    print("replace failed") 
'''

'\ndef replace(sent_split, start_index, end_index, words):\n  try: \n    for i in range(end_index - start_index):\n      if i >= len(words): \n        sent_split[start_index + i + 1] = "" \n      elif start_index + i + 1 >= len(sent_split): \n        sent_split.append(words[i])\n      else: \n        sent_split[start_index + i + 1] = words[i]\n  except: \n    print("replace failed") \n'

In [None]:
# Disabled: the definition below sits inside a string literal and is never executed.
'''
def missing(sent_split, start_index, end_index, words): 
  for i in range(end_index - start_index): 
    sent_split.insert(start_index + i + 1, words[i]) 
'''

'\ndef missing(sent_split, start_index, end_index, words): \n  for i in range(end_index - start_index): \n    sent_split.insert(start_index + i + 1, words[i]) \n'

In [None]:
# Disabled: the definition below sits inside a string literal and is never executed.
'''
def unnecessary(sent_split, start_index, end_index, words): 
  for i in range(end_index - start_index): 
    # substitute with empty string to prevent indices from changing 
    if start_index + i + 1 < len(sent_split): 
      sent_split[start_index + i + 1] = "" 
'''

'\ndef unnecessary(sent_split, start_index, end_index, words): \n  for i in range(end_index - start_index): \n    # substitute with empty string to prevent indices from changing \n    if start_index + i + 1 < len(sent_split): \n      sent_split[start_index + i + 1] = "" \n'

In [None]:
# Disabled: the definition below sits inside a string literal and is never executed.
'''
def correct_error(sent, annot): 
  start_index, end_index, error_type, words = parse_annotation(annot) 
  sent_split = sent.split(" ")

  # shouldn't be needed but just in case, early termination condition 
  if error_type == "noop": 
    return sent 

  # get R, M, U error type
  err_subtype = error_type.split(":")[0]

  if err_subtype == "R": 
    replace(sent_split, start_index, end_index, words)

  # TODO: fix missing so that it is compatible with further edits 
  elif err_subtype == "M": 
    missing(sent_split, start_index, end_index, words)

  elif err_subtype == "U": 
    unnecessary(sent_split, start_index, end_index, words)

  correct_sent = ' '.join(sent_split) 
  correct_sent = correct_sent.replace("  ", " ")
  return correct_sent 
'''

'\ndef correct_error(sent, annot): \n  start_index, end_index, error_type, words = parse_annotation(annot) \n  sent_split = sent.split(" ")\n\n  # shouldn\'t be needed but just in case, early termination condition \n  if error_type == "noop": \n    return sent \n\n  # get R, M, U error type\n  err_subtype = error_type.split(":")[0]\n\n  if err_subtype == "R": \n    replace(sent_split, start_index, end_index, words)\n\n  # TODO: fix missing so that it is compatible with further edits \n  elif err_subtype == "M": \n    missing(sent_split, start_index, end_index, words)\n\n  elif err_subtype == "U": \n    unnecessary(sent_split, start_index, end_index, words)\n\n  correct_sent = \' \'.join(sent_split) \n  correct_sent = correct_sent.replace("  ", " ")\n  return correct_sent \n'

In [None]:
# Disabled: this example call sits inside a string literal and is never executed.
'''
sent = 'S I am writing in order to express my disappointment about your musical show " Over the Rainbow " .'
annot = 'A 9 10|||R:PREP|||with|||REQUIRED|||-NONE-|||0|||ca'
print(correct_error(sent, annot))
'''

'\nsent = \'S I am writing in order to express my disappointment about your musical show " Over the Rainbow " .\'\nannot = \'A 9 10|||R:PREP|||with|||REQUIRED|||-NONE-|||0|||ca\'\nprint(correct_error(sent, annot))\n'

In [None]:
# Disabled: this example call sits inside a string literal and is never executed.
'''
sent = 'S I am writing in order to express my disappointment about your musical show " Over the Rainbow " .'
annot = 'A 9 10|||R:PREP|||with|||REQUIRED|||-NONE-|||0|||ca'
print(correct_error(sent, annot))
'''

'\nsent = \'S I am writing in order to express my disappointment about your musical show " Over the Rainbow " .\'\nannot = \'A 9 10|||R:PREP|||with|||REQUIRED|||-NONE-|||0|||ca\'\nprint(correct_error(sent, annot))\n'

In [None]:
# Disabled: this example call sits inside a string literal and is never executed.
'''
sent = 'S Maybe you should make have next year ?'
annot = 'A 8 9|||R:PUNCT|||.|||REQUIRED|||-NONE-|||0|||ru'
print(correct_error(sent, annot))
'''

"\nsent = 'S Maybe you should make have next year ?'\nannot = 'A 8 9|||R:PUNCT|||.|||REQUIRED|||-NONE-|||0|||ru'\nprint(correct_error(sent, annot))\n"

In [None]:
# Rewrite every sentence that has errors selected for suppression by applying
# the corresponding corrections to its 'S' line.
for sent_idx, sentence in enumerate(sentences):
  # A sentence without an entry (it had no annotation lines) or with an empty
  # selection needs no change.
  selected = errors_to_suppress.get(sent_idx, [])
  if not selected:
    continue

  try:
    sentence_dict = {}
    sentence_dict['S'] = sentence
    # supresser_method indexes each edit as a list of '|||'-separated fields,
    # so the raw annotation lines are split before being handed over.
    sentence_dict['A'] = [
        annotations[sent_idx][annot_idx].split("|||") for annot_idx in selected
    ]
    corrected_sentence = supresser_method(sentence_dict)
  except (IndexError, KeyError, ValueError) as exc:
    # Report which sentence could not be processed and why, then move on.
    print("failed on sentence", sent_idx, repr(exc))
    continue

  sentences[sent_idx] = " ".join(corrected_sentence)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
failed
['S', 'Classrooms', ',', 'Lessons', 'and', 'Breaks', '.\n']
failed
['S', 'Obviously', 'it', 'will', 'be', 'interesting', 'to', 'find', 'out', 'what', 'methods', 'are', 'used', 'by', 'our', 'teachers', 'to', 'help', 'pupils', 'understand', 'their', 'subject', 'better', '.\n']
failed
failed
failed
['S', 'Outdoor', 'Activities\n']
failed
failed
['S', 'Conclusion\n']
failed
['S', 'I', 'hope', 'you', 'will', 'consider', 'this', 'information', '.\n']
['S', 'Dear', 'Mr', 'Robertson', ',\n']
failed
failed
['S', 'However', ',', 'there', 'is', 'one', 'point', 'that', 'we', 'would', 'like', 'to', 'change', 'in', 'the', 'programme', '.\n']
failed
failed
['S', 'Furthermore', 'is', 'totally', 'free', 'for', 'students', '.\n']
failed
failed
failed
['S', 'It', 'is', 'also', 'possible', 'to', 'go', 'to', 'the', 'show', 'in', 'the', 'morning', 'but', 'we', 'are', 'totally', 'agree', 'that', 'we', 'do', "n't", 'want', 'to', 'lose', '

## Part 3: Creating a new file with errors suppressed

In [None]:
# Write the rewritten sentences back out in M2 layout, omitting the annotation
# lines whose errors were suppressed. The file is encoded as latin-1, the same
# encoding encode_file decodes with, so non-ASCII bytes round-trip unchanged.
with open("/content/new_file.m2", "w", encoding="latin-1") as f:
  for sent_idx, sentence in enumerate(sentences):
    # Each 'S' line must end its own line, otherwise the first annotation
    # written after it would be glued onto it.
    f.write(sentence if sentence.endswith("\n") else sentence + "\n")

    # Sentences without annotation lines, or without a suppression entry,
    # simply contribute nothing here.
    suppressed = set(errors_to_suppress.get(sent_idx, ()))
    for annot_idx, annotation in enumerate(annotations.get(sent_idx, ())):
      if annot_idx not in suppressed:
        f.write(annotation)

    # Blank line separating this sentence block from the next one.
    f.write("\n")