# ML Project Part 1

### Write a function that estimates the emission parameters from the training set using MLE (maximum likelihood estimation)

In [1]:
def read_training_data(filename):
    """Reads training data and returns a list of (word, tag) pairs."""
    data = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == "":
                continue
            word, tag = line.split()
            data.append((word, tag))
    return data

def estimate_emission_parameters(data):
    """Estimates emission probabilities e(x|y) using MLE."""
    from collections import defaultdict

    # Count(y) - how many times a tag y appeared
    tag_counts = defaultdict(int)

    # Count(y -> x) - how many times a word x appeared with tag y
    emission_counts = defaultdict(int)

    for word, tag in data:
        tag_counts[tag] += 1
        emission_counts[(tag, word)] += 1

    # Compute emission probabilities
    emission_probs = {}
    for (tag, word), count in emission_counts.items():
        emission_probs[(tag, word)] = count / tag_counts[tag]

    return emission_probs, tag_counts

### Handle Unknown Words with Smoothing

In [2]:
def replace_rare_words(data, k=3):
    """Replace words appearing less than k times with #UNK#."""
    from collections import Counter

    word_counter = Counter([word for word, tag in data])
    updated_data = []

    for word, tag in data:
        if word_counter[word] < k:
            updated_data.append(("#UNK#", tag))
        else:
            updated_data.append((word, tag))

    return updated_data

def estimate_emission_parameters_with_smoothing(filename, k=3):
    """Reads training data, replaces rare words, and estimates emission probabilities."""
    data = read_training_data(filename)
    data = replace_rare_words(data, k)
    emission_probs, tag_counts = estimate_emission_parameters(data)
    return emission_probs, tag_counts

### Implement a Tagger

In [3]:
def read_dev_data(filename):
    """Reads the dev.in file and returns a list of sentences (each sentence is a list of words)."""
    sentences = []
    with open(filename, 'r', encoding='utf-8') as f:
        sentence = []
        for line in f:
            line = line.strip()
            if line == "":
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                sentence.append(line)
        if sentence:
            sentences.append(sentence)
    return sentences

def simple_tagging(emission_probs, tag_counts, dev_filename, output_filename):
    """Tags each word individually with the highest emission probability."""
    sentences = read_dev_data(dev_filename)

    all_tags = list(tag_counts.keys())

    with open(output_filename, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            for word in sentence:
                best_tag = None
                best_score = 0
                for tag in all_tags:
                    if (tag, word) in emission_probs:
                        score = emission_probs[(tag, word)]
                    elif (tag, "#UNK#") in emission_probs:
                        score = emission_probs[(tag, "#UNK#")]
                    else:
                        score = 0
                    if score > best_score:
                        best_score = score
                        best_tag = tag
                f.write(f"{word} {best_tag}\n")
            f.write("\n")

### Results
After running "py EvalScript/evalResult.py EN/dev.out EN/dev.p1.out":

#Entity in gold data: 13179<br>
#Entity in prediction: 17085

#Correct Entity : 9186<br>
Entity  precision: 0.5377<br>
Entity  recall: 0.6970<br>
Entity  F: 0.6071

#Correct Sentiment : 8261<br>
Sentiment  precision: 0.4835<br>
Sentiment  recall: 0.6268<br>
Sentiment  F: 0.5459

# ML Project Part 2

### Write a function that estimates the transition parameters

In [4]:
def read_training_sentences(filename):
    """Reads training file and returns list of sentences, each sentence is a list of (word, tag) pairs."""
    sentences = []
    sentence = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line == "":
                if sentence:
                    sentences.append(sentence)
                    sentence = []
            else:
                word, tag = line.split()
                sentence.append((word, tag))
        if sentence:
            sentences.append(sentence)
    return sentences

def estimate_transition_parameters(sentences):
    """Estimates transition probabilities q(y_i|y_{i-1}) using MLE."""
    from collections import defaultdict

    bigram_counts = defaultdict(int)
    unigram_counts = defaultdict(int)

    for sentence in sentences:
        tags = [tag for word, tag in sentence]
        tags = ["START"] + tags + ["STOP"]

        for i in range(len(tags) - 1):
            unigram_counts[tags[i]] += 1
            bigram_counts[(tags[i], tags[i+1])] += 1

    transition_probs = {}
    for (tag_prev, tag_curr), count in bigram_counts.items():
        transition_probs[(tag_prev, tag_curr)] = count / unigram_counts[tag_prev]

    return transition_probs

### Implement the Viterbi algorithm

In [7]:
import math
from collections import defaultdict

def safe_log(x):
    """Safe log: returns log(x) if x > 0, else -inf."""
    return math.log(x) if x > 0 else float('-inf')

def read_dev_sentences(dev_path):
    """Reads sentences from dev.in into a list of sentences."""
    sentences = []
    with open(dev_path, 'r') as f:
        sentence = []
        for line in f:
            if line.strip():
                sentence.append(line.strip())
            else:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
        if sentence:
            sentences.append(sentence)
    return sentences

def viterbi(dev_path, output_path, emission_probs, transition_probs, tag_set, known_words):
    """Viterbi decoding algorithm."""
    sentences = read_dev_sentences(dev_path)

    with open(output_path, 'w') as out:
        for sentence in sentences:
            n = len(sentence)
            V = defaultdict(lambda: defaultdict(lambda: -math.inf))
            backpointer = defaultdict(dict)

            V[0]['START'] = 0

            for i in range(1, n+1):
                word = sentence[i-1]
                word_key = word if word in known_words else '#UNK#'
                for curr_tag in tag_set:
                    emit = emission_probs[curr_tag].get(word_key, 0)
                    if emit == 0:
                        continue
                    for prev_tag in V[i-1]:
                        trans = transition_probs.get((prev_tag, curr_tag), 0)
                        if trans == 0:
                            continue
                        score = V[i-1][prev_tag] + safe_log(trans) + safe_log(emit)
                        if score > V[i][curr_tag]:
                            V[i][curr_tag] = score
                            backpointer[i][curr_tag] = prev_tag

            # Termination
            best_score = -math.inf
            best_last_tag = None
            for tag in V[n]:
                trans = transition_probs.get((tag, 'STOP'), 0)
                if trans == 0:
                    continue
                score = V[n][tag] + safe_log(trans)
                if score > best_score:
                    best_score = score
                    best_last_tag = tag

            # Backtrack
            if best_last_tag is None:
                best_path = ['O'] * n
            else:
                best_path = [best_last_tag]
                for i in range(n, 1, -1):
                    best_path.insert(0, backpointer[i][best_path[0]])

            # Write output
            for word, tag in zip(sentence, best_path):
                out.write(f"{word} {tag}\n")
            out.write("\n")

### Running Part 2's code

In [8]:
data = read_training_data('EN/train')
data = replace_rare_words(data, k=3)
emission_probs_raw, tag_counts = estimate_emission_parameters(data)

# Organize emission_probs into emission_probs[tag][word]
from collections import defaultdict
emission_probs = defaultdict(dict)
for (tag, word), prob in emission_probs_raw.items():
    emission_probs[tag][word] = prob

sentences = read_training_sentences('EN/train')
transition_probs = estimate_transition_parameters(sentences)
known_words = set(word for (word, tag) in data)
tag_set = set(tag_counts.keys())
viterbi('EN/dev.in', 'EN/dev.p2.out', emission_probs, transition_probs, tag_set, known_words)

### Results
After running "py EvalScript/evalResult.py EN/dev.out EN/dev.p2.out":

#Entity in gold data: 13179<br>
#Entity in prediction: 12492

#Correct Entity : 10627<br>
Entity  precision: 0.8507<br>
Entity  recall: 0.8064<br>
Entity  F: 0.8279

#Correct Sentiment : 10224<br>
Sentiment  precision: 0.8184<br>
Sentiment  recall: 0.7758<br>
Sentiment  F: 0.7965

# ML Project Part 3

### Code for 4th Best Viterbi Algorithm

In [9]:
import math
from collections import defaultdict

def safe_log(x):
    """Safe log: returns log(x) if x > 0, else -inf."""
    return math.log(x) if x > 0 else float('-inf')

def read_dev_sentences(dev_path):
    """Reads sentences from dev.in into a list of sentences."""
    sentences = []
    with open(dev_path, 'r') as f:
        sentence = []
        for line in f:
            if line.strip():
                sentence.append(line.strip())
            else:
                if sentence:
                    sentences.append(sentence)
                    sentence = []
        if sentence:
            sentences.append(sentence)
    return sentences

def viterbi_4th_best(dev_path, output_path, emission_probs, transition_probs, tag_set, known_words):
    """Viterbi decoding that finds the 4th best sequence."""
    sentences = read_dev_sentences(dev_path)

    with open(output_path, 'w') as out:
        for sentence in sentences:
            n = len(sentence)

            # dp[i][tag] = list of (score, previous_tag, rank) tuples, top 4 at each step
            dp = defaultdict(lambda: defaultdict(list))

            dp[0]['START'].append((0, None, 1))  # (log-prob, previous tag, rank)

            for i in range(1, n+1):
                word = sentence[i-1]
                word_key = word if word in known_words else '#UNK#'

                for curr_tag in tag_set:
                    emit_prob = emission_probs[curr_tag].get(word_key, 0)
                    if emit_prob == 0:
                        continue
                    log_emit = safe_log(emit_prob)

                    candidates = []
                    for prev_tag in dp[i-1]:
                        trans_prob = transition_probs.get((prev_tag, curr_tag), 0)
                        if trans_prob == 0:
                            continue
                        log_trans = safe_log(trans_prob)

                        for (prev_score, _, _) in dp[i-1][prev_tag]:
                            score = prev_score + log_trans + log_emit
                            candidates.append((score, prev_tag))

                    # Sort candidates and pick top-4
                    candidates = sorted(candidates, key=lambda x: x[0], reverse=True)[:4]
                    for rank, (score, prev_tag) in enumerate(candidates, start=1):
                        dp[i][curr_tag].append((score, prev_tag, rank))

            # Termination: go from last word to STOP
            final_candidates = []
            for tag in dp[n]:
                trans_prob = transition_probs.get((tag, 'STOP'), 0)
                if trans_prob == 0:
                    continue
                log_trans = safe_log(trans_prob)

                for (score, _, _) in dp[n][tag]:
                    final_score = score + log_trans
                    final_candidates.append((final_score, tag))

            if not final_candidates:
                # fallback: tag everything as 'O'
                best_path = ['O'] * n
            else:
                # Sort final candidates and pick the 4th-best
                final_candidates = sorted(final_candidates, key=lambda x: x[0], reverse=True)
                
                if len(final_candidates) < 4:
                    target_index = len(final_candidates) - 1
                else:
                    target_index = 3  # 0-based index, so 3 means 4th-best

                best_last_tag = final_candidates[target_index][1]

                # Backtrack
                best_path = []
                current_tag = best_last_tag
                for i in range(n, 0, -1):
                    candidates = dp[i][current_tag]
                    # Pick the best-ranked candidate
                    candidates_sorted = sorted(candidates, key=lambda x: x[0], reverse=True)
                    best_prev_tag = candidates_sorted[0][1]
                    best_path.append(current_tag)
                    current_tag = best_prev_tag
                best_path.reverse()

            # Write output
            for word, tag in zip(sentence, best_path):
                out.write(f"{word} {tag}\n")
            out.write("\n")

### Running Part 3's code

In [10]:
data = read_training_data('EN/train')
data = replace_rare_words(data, k=3)
emission_probs_raw, tag_counts = estimate_emission_parameters(data)

from collections import defaultdict
emission_probs = defaultdict(dict)
for (tag, word), prob in emission_probs_raw.items():
    emission_probs[tag][word] = prob

sentences = read_training_sentences('EN/train')
transition_probs = estimate_transition_parameters(sentences)

known_words = set(word for (word, tag) in data)
tag_set = set(tag_counts.keys())
viterbi_4th_best('EN/dev.in', 'EN/dev.p3.out', emission_probs, transition_probs, tag_set, known_words)

### Results
After running "py EvalScript/evalResult.py EN/dev.out EN/dev.p3.out":

#Entity in gold data: 13179<br>
#Entity in prediction: 12494

#Correct Entity : 10617<br>
Entity  precision: 0.8498<br>
Entity  recall: 0.8056<br>
Entity  F: 0.8271

#Correct Sentiment : 10212<br>
Sentiment  precision: 0.8174<br>
Sentiment  recall: 0.7749<br>
Sentiment  F: 0.7955

# ML Project Part 4

### Feature Extraction

In [22]:
def extract_features(word, prev_tag):
    """Extract features for a given word and previous tag."""
    features = {}
    features['bias'] = 1
    features['word.lower=' + word.lower()] = 1
    features['prev_tag=' + prev_tag] = 1
    if word[0].isupper():
        features['is_capitalized'] = 1
    if word.isdigit():
        features['is_digit'] = 1
    return features

### Training

In [23]:
from collections import defaultdict
import random

def train_perceptron(train_data, tag_set, num_epochs=20):
    """Train Perceptron sequence tagger."""
    weights = defaultdict(lambda: defaultdict(float))

    sentences = []
    sentence = []
    for word, tag in train_data:
        if word == "" and tag == "":
            if sentence:
                sentences.append(sentence)
                sentence = []
        else:
            sentence.append((word, tag))
    if sentence:
        sentences.append(sentence)

    for epoch in range(num_epochs):
        random.shuffle(sentences)
        for sentence in sentences:
            words, gold_tags = zip(*sentence)
            n = len(words)

            # Predict tags
            pred_tags = []
            prev_tag = 'START'
            for word in words:
                scores = {}
                for tag in tag_set:
                    score = 0
                    feats = extract_features(word, prev_tag)
                    for f, value in feats.items():
                        score += weights[f][tag] * value
                    scores[tag] = score
                best_tag = max(scores, key=scores.get)
                pred_tags.append(best_tag)
                prev_tag = best_tag

            # Update weights
            prev_tag = 'START'
            for i in range(n):
                word = words[i]
                true_tag = gold_tags[i]
                pred_tag = pred_tags[i]
                if true_tag != pred_tag:
                    true_feats = extract_features(word, prev_tag)
                    pred_feats = extract_features(word, prev_tag)
                    for f, value in true_feats.items():
                        weights[f][true_tag] += value
                        weights[f][pred_tag] -= value
                prev_tag = pred_tag

    return weights

### Tagging

In [24]:
def tag_sentence(sentence, weights, tag_set):
    """Tag a sentence using learned perceptron weights."""
    pred_tags = []
    prev_tag = 'START'
    for word in sentence:
        scores = {}
        for tag in tag_set:
            score = 0
            feats = extract_features(word, prev_tag)
            for f, value in feats.items():
                score += weights[f][tag] * value
            scores[tag] = score
        best_tag = max(scores, key=scores.get)
        pred_tags.append(best_tag)
        prev_tag = best_tag
    return pred_tags

### Main Function to Train & Predict

In [25]:
def perceptron_main(train_file, dev_in_file, dev_out_file):
    """Train and run Perceptron model."""
    # Read training data
    train_data = []
    with open(train_file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                word, tag = line.split()
                train_data.append((word, tag))
            else:
                train_data.append(("", ""))

    tag_set = set(tag for word, tag in train_data if tag)

    # Train
    weights = train_perceptron(train_data, tag_set, num_epochs=20)

    # Read dev.in
    sentences = read_dev_sentences(dev_in_file)

    # Predict and write output
    with open(dev_out_file, 'w', encoding='utf-8') as out:
        for sentence in sentences:
            pred_tags = tag_sentence(sentence, weights, tag_set)
            for word, tag in zip(sentence, pred_tags):
                out.write(f"{word} {tag}\n")
            out.write("\n")

### Running Part 4's code

In [26]:
perceptron_main('EN/train', 'EN/dev.in', 'EN/dev.p4.out')

### Results
After running "py EvalScript/evalResult.py EN/dev.out EN/dev.p4.out":

#Entity in gold data: 13179<br>
#Entity in prediction: 12494

#Correct Entity : 10617<br>
Entity  precision: 0.8498<br>
Entity  recall: 0.8056<br>
Entity  F: 0.8271

#Correct Sentiment : 10212<br>
Sentiment  precision: 0.8174<br>
Sentiment  recall: 0.7749<br>
Sentiment  F: 0.7955