# Import Libraries

In [208]:
import numpy as np
import random
from collections import defaultdict, Counter

# Configurations

In [209]:
# Path of the training corpus read by the data-loading cell.
training_filename = 'train.txt'
# Path of the test corpus. The data-loading cell reads `test_filename`, which
# was previously undefined on a fresh kernel (NameError on Restart & Run All).
# NOTE(review): 'test.txt' is an assumed name -- confirm the actual file name.
test_filename = 'test.txt'

# Tokenize and Process File

In [210]:
def process_file(file_name):
    """Read a tab-separated corpus stored as consecutive groups of three lines.

    Within each group the first line is split on tabs and appended to
    `sentences`, the second to `pos_tags` and the third to `ner_tags`.

    Blank lines left over at the very end of the file (a trailing remainder
    that does not form a full group of three) are ignored, so a file that
    ends with an extra newline no longer fails with an IndexError.

    Parameters
    ----------
    file_name : str
        Path of the file to read.

    Returns
    -------
    tuple of (list, list, list)
        `(sentences, pos_tags, ner_tags)`; each is a list with one list of
        tab-separated fields per group.

    Raises
    ------
    ValueError
        If the file ends with an incomplete group that contains non-blank
        text.
    """
    with open(file_name, 'r') as file:
        file_lines = file.read().splitlines()

    # Only a remainder (1 or 2 lines) can be dropped; complete groups are
    # always kept so well-formed files are parsed exactly as before.
    leftover = len(file_lines) % 3
    if leftover:
        tail = file_lines[-leftover:]
        if any(line.strip() for line in tail):
            raise ValueError(
                "%s: expected groups of 3 lines but found %d trailing line(s)"
                % (file_name, leftover)
            )
        file_lines = file_lines[:-leftover]

    sentences = []
    pos_tags = []
    ner_tags = []

    for i in range(0, len(file_lines), 3):
        sentences.append(file_lines[i].split('\t'))
        pos_tags.append(file_lines[i + 1].split('\t'))
        ner_tags.append(file_lines[i + 2].split('\t'))

    return sentences, pos_tags, ner_tags
           

# Create Training and Dev Set

In [211]:
# Load the raw training and test corpora.
sentences_tr, pos_tags_tr, ner_tags_tr = process_file(training_filename)
sentences_te, pos_tags_te, ner_tags_te = process_file(test_filename)

# Shuffle the three parallel training lists together with a fixed seed so the
# train/validation split is the same on every run.
nTr = len(sentences_tr)
random.seed(1)
c = list(zip(sentences_tr, pos_tags_tr, ner_tags_tr))
random.shuffle(c)
sentences_tr, pos_tags_tr, ner_tags_tr = zip(*c)

# The first 90% of the shuffled examples are kept for training.
split_point = int(0.9 * nTr)

# Validation set: everything from the split point onward.
sentences_val, pos_tags_val, ner_tags_val = [
    shuffled[split_point:]
    for shuffled in (sentences_tr, pos_tags_tr, ner_tags_tr)
]

# Training set: everything before the split point.
sentences_tr, pos_tags_tr, ner_tags_tr = [
    shuffled[:split_point]
    for shuffled in (sentences_tr, pos_tags_tr, ner_tags_tr)
]

# Evaluation Functions

In [212]:
def flatten_nested_lists(nested_lists):
    """Concatenate the inner sequences of `nested_lists` into one new flat list.

    The items keep their original order: all items of the first inner
    sequence, then all items of the second, and so on.
    """
    return [item for inner in nested_lists for item in inner]

def evaluate_model_span(predicted_seq, correct_seq):
    """Compute span-level precision, recall and F-score for BIO tag sequences.

    A span starts at a tag beginning with 'B-' and continues over the
    following 'I-' tags of the same type. A gold span counts as correctly
    predicted when the prediction has the same 'B-' tag at the same position
    and both sequences agree on every following position where either of
    them continues that span.

    Parameters
    ----------
    predicted_seq : sequence of str
        Predicted tags, one per token.
    correct_seq : sequence of str
        Gold tags, one per token; must have the same length as
        `predicted_seq`.

    Returns
    -------
    tuple of (float, float, float)
        `(precision, recall, f_score)`. A ratio whose denominator is zero
        (no predicted spans, no gold spans, or precision + recall == 0) is
        reported as 0.0 instead of raising ZeroDivisionError.

    Raises
    ------
    ValueError
        If the two sequences differ in length.
    """
    if len(predicted_seq) != len(correct_seq):
        raise ValueError(
            "predicted_seq and correct_seq must have the same length "
            "(got %d and %d)" % (len(predicted_seq), len(correct_seq))
        )

    T = len(correct_seq)

    # Every 'B-' tag opens exactly one span.
    num_spans_pred = sum(1 for tag in predicted_seq if tag.startswith('B-'))
    num_spans_ans = sum(1 for tag in correct_seq if tag.startswith('B-'))

    correct_pred = 0

    for i, gold_tag in enumerate(correct_seq):
        # Only positions where a gold span starts and the prediction opens
        # the same span can contribute a correct span.
        if not gold_tag.startswith('B-') or predicted_seq[i] != gold_tag:
            continue

        inside_tag = 'I-' + gold_tag[2:]
        span_matches = True
        j = i + 1
        # Walk forward while either sequence still continues this span; any
        # disagreement means the span boundaries differ.
        while j < T and (correct_seq[j] == inside_tag or predicted_seq[j] == inside_tag):
            if correct_seq[j] != predicted_seq[j]:
                span_matches = False
                break
            j += 1

        if span_matches:
            correct_pred += 1

    precision = correct_pred / num_spans_pred if num_spans_pred else 0.0
    recall = correct_pred / num_spans_ans if num_spans_ans else 0.0

    if precision + recall:
        f_score = (2 * precision * recall) / (precision + recall)
    else:
        f_score = 0.0

    return precision, recall, f_score


    

# Baseline Algorithm 1: Most Frequent Class

In [213]:
# training function
def most_frequent_class(words_tr, tags_tr):
    """Map every word to the tag it is paired with most often.

    `words_tr` and `tags_tr` are parallel sequences; position k of each forms
    one (word, tag) observation. When two tags are equally frequent for a
    word, the tag whose pair was seen first is kept.

    Returns a dict {word: tag}.
    """
    pair_counts = Counter(zip(words_tr, tags_tr))

    best_tag = {}
    best_count = {}
    for (word, tag), occurrences in pair_counts.items():
        # Strict comparison: a later tag replaces the current one only when
        # it is strictly more frequent.
        if occurrences > best_count.get(word, 0):
            best_count[word] = occurrences
            best_tag[word] = tag

    return best_tag

# predicting function
def baseline_predict(words_list, most_frequent_tags):
    """Tag each word with its entry in `most_frequent_tags`.

    Words that are not keys of `most_frequent_tags` are tagged 'O'.
    Returns a list of tags, one per word, in the order of `words_list`.
    """
    prediction = []
    for word in words_list:
        prediction.append(most_frequent_tags.get(word, 'O'))
    return prediction

In [214]:
# Flatten the per-sentence training lists into token-level sequences, then
# learn the most frequent NER tag and the most frequent POS tag of each word.
words_tr, words_pos_tr, words_ner_tr = [
    flatten_nested_lists(per_sentence)
    for per_sentence in (sentences_tr, pos_tags_tr, ner_tags_tr)
]
most_frequent_ner = most_frequent_class(words_tr=words_tr, tags_tr=words_ner_tr)
most_frequent_pos = most_frequent_class(words_tr=words_tr, tags_tr=words_pos_tr)

In [215]:
# Build the token-level gold NER tags and words of the validation set, then
# tag every validation word with the most-frequent-NER-tag baseline.
# `predciton` (sic) is the name the evaluation cell reads, so it is kept.
answer_key = flatten_nested_lists(ner_tags_val)
words_val = flatten_nested_lists(sentences_val)
predciton = baseline_predict(words_list=words_val, most_frequent_tags=most_frequent_ner)

In [216]:
# Span-level score of the baseline on the validation set, displayed as
# (precision, recall, f-score).
evaluate_model_span(predicted_seq=predciton, correct_seq=answer_key)

(0.7155452436194896, 0.6660907127429806, 0.6899328859060402)