## ML Project Part 1

### Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation)

In [None]:
"""
each line in the training file contains a word and a tag, separated by a tab.
sentences are separated by empty lines

to estimate the emission parameters from the training set, we need 2 things:
1. a count of how many times each word appears with a tag
2. a count of how many times each tag appears
"""

from collections import defaultdict

# function to read data
def read_training_data(file_path):
    """Read a tagged training file into a flat list of (word, tag) pairs.

    Each non-blank line is expected to hold a word followed by its tag,
    separated by whitespace. Blank lines (sentence separators) are skipped,
    so sentence boundaries are not preserved in the result.

    Args:
        file_path: Path to the training file.

    Returns:
        A list of (word, tag) tuples in file order.

    Raises:
        ValueError: If a non-blank line does not contain both a word and a tag.
    """
    data = []
    # Explicit encoding so non-ASCII tokens decode the same on every platform.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:  # sentence separator
                continue
            # Split once from the right: the tag is the last token, and the
            # word itself may contain whitespace.
            parts = stripped.rsplit(maxsplit=1)
            if len(parts) != 2:
                raise ValueError(
                    f"{file_path}:{line_no}: expected '<word> <tag>', "
                    f"got {stripped!r}"
                )
            word, tag = parts
            data.append((word, tag))
    return data

# function to estimate emission parameters
def estimate_emission_parameters(data):
    """Compute MLE emission probabilities e(x|y) from (word, tag) pairs.

    e(x|y) = count(y -> x) / count(y), where count(y -> x) is the number of
    times word x carries tag y and count(y) is the number of times tag y
    occurs in the data.

    Args:
        data: Iterable of (word, tag) pairs.

    Returns:
        A dict mapping each tag to a dict of {word: e(word|tag)}. Only
        (tag, word) combinations that occur in the data are present.
    """
    tag_totals = defaultdict(int)  # count(y)
    pair_totals = defaultdict(lambda: defaultdict(int))  # count(y -> x)

    for token, label in data:
        tag_totals[label] += 1
        pair_totals[label][token] += 1

    # Normalise each tag's word counts by that tag's total count.
    return {
        label: {
            token: occurrences / tag_totals[label]
            for token, occurrences in per_word.items()
        }
        for label, per_word in pair_totals.items()
    }

# example usage
# e("the" | "B-NP")

if __name__ == "__main__":
    # Estimate e(x|y) from the raw (unsmoothed) training set.
    training_data = read_training_data("EN/train")
    emission_probs = estimate_emission_parameters(training_data)

    # Print e("the" | "B-NP"). Fall back to 0 when either the tag or the word
    # was never observed, rather than raising KeyError on a missing tag.
    print(emission_probs.get("B-NP", {}).get("the", 0))

0.1639572983828348


### Handle Unknown Words with Smoothing

In [3]:
# function to replace words whose count is less than k with '#UNK#'
def replace_rare_words(data, word_counts, k=3):
    """Replace every word seen fewer than k times with the '#UNK#' token.

    Args:
        data: Iterable of (word, tag) pairs.
        word_counts: Mapping from word to its frequency. Words missing from
            the mapping are treated as having a count of 0.
        k: Minimum count for a word to be kept as-is.

    Returns:
        A new list of (word, tag) pairs in the same order, with rare words
        replaced by '#UNK#'. Tags are unchanged.
    """
    # .get() avoids inserting keys into a caller-supplied defaultdict and
    # avoids a KeyError when word_counts is a plain dict.
    return [
        (word if word_counts.get(word, 0) >= k else '#UNK#', tag)
        for word, tag in data
    ]

# function to count word frequencies
def count_word_frequencies(data):
    """Count how many times each word occurs, ignoring tags.

    Args:
        data: Iterable of (word, tag) pairs.

    Returns:
        A defaultdict(int) mapping each word to its number of occurrences.
    """
    frequencies = defaultdict(int)
    tokens = (token for token, _ in data)
    for token in tokens:
        frequencies[token] += 1
    return frequencies

# example usage
# e("#UNK#" | "B-NP")

if __name__ == "__main__":
    # Load the training pairs and replace words seen fewer than 3 times
    # with '#UNK#' before estimating the emission parameters.
    training_data = read_training_data("EN/train")
    word_counts = count_word_frequencies(training_data)
    smoothed_data = replace_rare_words(training_data, word_counts, 3)
    emission_probs = estimate_emission_parameters(smoothed_data)

    # e("#UNK#" | "B-NP"), or 0 when the tag or the token is absent.
    unk_given_b_np = emission_probs.get("B-NP", {}).get("#UNK#", 0)
    print(unk_given_b_np)

0.0825071345523729


### Implement a Baseline Tagger

In [None]:
# for each word, pick the tag y that has the highest emission probability e(x|y)

"""
for each word in the dev set:
if it's in the training vocab, find the tag with the highest emission probability
if not, replace with #UNK#, then do the same.
write predicted tags to a new file
"""

def _most_likely_tag(word_key, emission_probs, default_tag='O'):
    """Return the tag with the highest e(word_key|tag), or default_tag if none is positive."""
    best_tag = None
    best_score = 0
    for tag, word_probs in emission_probs.items():
        prob = word_probs.get(word_key, 0)
        # Strict '>' keeps the first tag in iteration order on ties.
        if prob > best_score:
            best_score = prob
            best_tag = tag
    return best_tag if best_tag else default_tag


def baseline_tagger(dev_path, output_path, emission_probs, known_words):
    """Tag each word of a file with its highest-emission-probability tag.

    Reads one word per line from dev_path and writes "<word> <tag>" lines to
    output_path. Blank input lines are written out as blank lines. A word not
    in known_words is scored as '#UNK#', but the original word is what gets
    written. When no tag has a positive probability for the word, the tag
    'O' is written.

    Args:
        dev_path: Path to the input file, one word per line.
        output_path: Path of the prediction file to write.
        emission_probs: Dict mapping tag -> {word: e(word|tag)}.
        known_words: Container of words considered seen in training.
    """
    # The best tag depends only on the word key, so compute it once per
    # distinct key instead of rescanning every tag for every token.
    tag_cache = {}

    # Explicit encoding so non-ASCII tokens round-trip on every platform.
    with open(dev_path, 'r', encoding='utf-8') as f_in, \
            open(output_path, 'w', encoding='utf-8') as f_out:
        for line in f_in:
            word = line.strip()
            if not word:
                f_out.write('\n')
                continue

            # Use word if seen, else use #UNK#
            word_key = word if word in known_words else '#UNK#'

            if word_key not in tag_cache:
                tag_cache[word_key] = _most_likely_tag(word_key, emission_probs)

            f_out.write(f"{word} {tag_cache[word_key]}\n")

# example usage
if __name__ == "__main__":
    # Input and output locations for the part 1 baseline run.
    train_path, dev_path, output_path = "EN/train", "EN/dev.in", "EN/dev.p1.out"

    # Build the smoothed training set: words seen fewer than 3 times
    # are replaced with '#UNK#'.
    data = read_training_data(train_path)
    word_counts = count_word_frequencies(data)
    smoothed_data = replace_rare_words(data, word_counts, k=3)

    # The vocabulary is taken from the smoothed data, so it contains '#UNK#'
    # in place of the rare words.
    known_words = {word for word, _ in smoothed_data}
    emission_probs = estimate_emission_parameters(smoothed_data)

    baseline_tagger(
        dev_path=dev_path,
        output_path=output_path,
        emission_probs=emission_probs,
        known_words=known_words,
    )

Running the above with the eval script, we get the following scores:

#Entity in gold data: 13179 
#Entity in prediction: 19406

#Correct Entity : 9152      
Entity  precision: 0.4716   
Entity  recall: 0.6944      
Entity  F: 0.5617

#Correct Sentiment : 7644   
Sentiment  precision: 0.3939
Sentiment  recall: 0.5800   
Sentiment  F: 0.4692

For a baseline system that uses only emission probabilities, this is not bad. However, the system currently lacks transition modelling and any notion of sequence structure!