# In [14]:
import math

def readFile(filepath):
    """Collect the counts needed to estimate the tagger from a training file.

    The file is read as UTF-8. Every non-blank line must look like
    ``<observation> <label>``; the split happens on the LAST space, so the
    observation itself may contain spaces. Blank lines separate sequences;
    runs of blank lines (and leading blank lines) are ignored rather than
    being counted as empty sequences.

    Args:
        filepath: Path of the training file.

    Returns:
        A 5-tuple ``(y_count, emission_count, transition_counts_lr,
        transition_counts_rl, training_observations_x)`` where

        * ``y_count[label]`` is the number of occurrences of ``label``;
          ``"START"`` is always present and, like ``"STOP"``, is counted
          once per non-empty sequence,
        * ``emission_count[(label, observation)]`` counts emissions,
        * ``transition_counts_lr[(prev, cur)]`` counts transitions keyed in
          reading order,
        * ``transition_counts_rl[(cur, prev)]`` holds the same transitions
          keyed in reverse order,
        * ``training_observations_x`` is the set of observed tokens.

    Raises:
        ValueError: If a non-blank line contains no space, i.e. it cannot be
            split into an observation and a label.
    """
    # "START" is inserted first so that it exists even for an empty file and
    # keeps its position as the first key (viterbi derives its state list
    # from the key order of this dict).
    y_count = {"START": 0}
    emission_count = {}
    transition_counts_lr = {}  # Left-to-right transitions
    transition_counts_rl = {}  # Right-to-left transitions
    training_observations_x = set()

    def bump(counter, key):
        counter[key] = counter.get(key, 0) + 1

    def close_sequence(last_label):
        # Record the transition from the last real label into "STOP".
        bump(transition_counts_lr, (last_label, "STOP"))
        bump(transition_counts_rl, ("STOP", last_label))
        bump(y_count, "STOP")

    # Explicit encoding: the platform default is locale dependent and would
    # mis-decode (or fail on) non-ASCII training data.
    with open(filepath, 'r', encoding='utf-8') as file:
        y_i = "START"
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Only a sequence that actually contains tokens is closed;
                # repeated blank lines must not create START -> STOP counts.
                if y_i != "START":
                    close_sequence(y_i)
                    y_i = "START"
                continue

            x, separator, y_j = line.rpartition(" ")
            if not separator:
                raise ValueError(
                    f"{filepath}:{line_number}: expected '<observation> <label>', got {line!r}"
                )

            if y_i == "START":
                # First token of a new sequence: count its START.
                bump(y_count, "START")

            bump(transition_counts_lr, (y_i, y_j))
            bump(transition_counts_rl, (y_j, y_i))
            bump(emission_count, (y_j, x))
            bump(y_count, y_j)
            training_observations_x.add(x)
            y_i = y_j

        # The file may end without a trailing blank line.
        if y_i != "START":
            close_sequence(y_i)

    return y_count, emission_count, transition_counts_lr, transition_counts_rl, training_observations_x


def readDevIn(filePath):
    """Read an unlabelled input file into a list of token sequences.

    The file is read as UTF-8 with one token per line (surrounding
    whitespace is stripped). Blank lines separate sequences; consecutive
    blank lines do not produce empty sequences, and a final sequence that is
    not followed by a blank line is still returned.

    Args:
        filePath: Path of the input file.

    Returns:
        A list of sequences, each a non-empty list of token strings.
    """
    x_sequences = []
    current_sequence = []
    # Explicit encoding: the platform default is locale dependent and would
    # mis-decode (or fail on) non-ASCII input.
    with open(filePath, 'r', encoding='utf-8') as file:
        for line in file:
            x = line.strip()
            if x:
                current_sequence.append(x)
            elif current_sequence:
                x_sequences.append(current_sequence)
                current_sequence = []
    # Flush the last sequence when the file has no trailing blank line.
    if current_sequence:
        x_sequences.append(current_sequence)
    return x_sequences

def write_seq_pairs_to_file(file_path, list_of_sequences):
    """Write tagged sequences to ``file_path`` as UTF-8 text.

    Each ``(x, y)`` pair becomes one ``"x y"`` line, and every sequence
    (including an empty one) is followed by a single blank line. An existing
    file is overwritten.

    Args:
        file_path: Destination path.
        list_of_sequences: Iterable of sequences of ``(x, y)`` pairs.
    """
    # Explicit encoding: with the locale default, non-ASCII tokens can raise
    # UnicodeEncodeError or be written in an encoding the reader does not
    # expect.
    with open(file_path, 'w', encoding='utf-8') as file:
        for seq_pairs in list_of_sequences:
            file.writelines(f"{x} {y}\n" for x, y in seq_pairs)
            file.write("\n")

def emission_parameters_updated(x_t, y_t, y_count, emission_count, training_observations_x, k=1):
    """Return the add-k smoothed log emission score of ``x_t`` given ``y_t``.

    The value is ``log(numerator / denominator)`` with

    * ``denominator = y_count[y_t] + k * len(training_observations_x)``
    * ``numerator = emission_count[(y_t, x_t)] + k`` when ``x_t`` is in
      ``training_observations_x`` (a missing pair counts as 0), and plain
      ``k`` otherwise.

    Args:
        x_t: Observation.
        y_t: Label; must be a key of ``y_count``.
        y_count: Mapping label -> occurrence count.
        emission_count: Mapping ``(label, observation)`` -> count.
        training_observations_x: Collection of observations seen in training.
        k: Smoothing constant.

    Returns:
        The natural logarithm of the smoothed ratio.
    """
    is_known = x_t in training_observations_x
    numerator = emission_count.get((y_t, x_t), 0) + k if is_known else k
    denominator = y_count[y_t] + k * len(training_observations_x)
    return math.log(numerator / denominator)


def transition_parameters(y_i, y_j, transition_counts_lr, transition_counts_rl, y_count):
    """Return the combined log transition score for ``y_i -> y_j``.

    The score is ``log((lr + rl) / y_count[y_i])`` where ``lr`` is
    ``transition_counts_lr[(y_i, y_j)]`` and ``rl`` is
    ``transition_counts_rl[(y_j, y_i)]`` (missing entries count as 0).
    This equals the sum of the two individual ratios, computed with a single
    logarithm instead of a log -> exp -> log round trip.

    NOTE: when both tables come from ``readFile`` they mirror each other
    (every transition is recorded once under each key order), so the value
    is ``log(2 * count / y_count[y_i])`` and is not a normalised
    log-probability.

    Args:
        y_i: Previous label; must be a key of ``y_count``.
        y_j: Next label.
        transition_counts_lr: Mapping ``(prev, cur)`` -> count.
        transition_counts_rl: Mapping ``(cur, prev)`` -> count.
        y_count: Mapping label -> occurrence count.

    Returns:
        The combined log score, or ``float('-inf')`` when the transition was
        never observed in either table.

    Raises:
        KeyError: If ``y_i`` is not a key of ``y_count``.
    """
    numerator_lr = transition_counts_lr.get((y_i, y_j), 0)
    numerator_rl = transition_counts_rl.get((y_j, y_i), 0)
    denominator = y_count[y_i]

    combined_count = numerator_lr + numerator_rl
    if combined_count == 0:
        return float('-inf')

    return math.log(combined_count / denominator)

def viterbi(y_count, emission_count, transition_counts_lr, transition_counts_rl, training_observations_x, x_input_seq, smoothing_k=0.1):
    """Decode the highest-scoring label sequence for ``x_input_seq``.

    Candidate states are the keys of ``y_count`` in their iteration order
    (this includes ``"START"`` and ``"STOP"`` when they are present).
    Scores are sums of log emission scores from
    ``emission_parameters_updated`` and log transition scores from
    ``transition_parameters``. Ties are resolved in favour of the state that
    comes first in ``y_count``.

    Args:
        y_count: Mapping label -> occurrence count.
        emission_count: Mapping ``(label, observation)`` -> count.
        transition_counts_lr: Mapping ``(prev, cur)`` -> count.
        transition_counts_rl: Mapping ``(cur, prev)`` -> count.
        training_observations_x: Collection of observations seen in training.
        x_input_seq: Sequence of observations to label.
        smoothing_k: Smoothing constant forwarded to
            ``emission_parameters_updated``.

    Returns:
        A tuple ``(predicted_labels, scores, parent_pointer)``:

        * ``predicted_labels`` has one label per observation. If no path
          with a finite score reaches ``"STOP"``, every position is given
          the most frequent label other than ``"START"``/``"STOP"`` so that
          the output stays aligned with the input; if no such label exists
          the list is empty.
        * ``scores[(position, state)]`` is the best log score of any path
          ending in ``state`` at ``position`` (0 is the start position,
          ``n + 1`` the ``"STOP"`` position).
        * ``parent_pointer[(position, state)]`` is the predecessor state on
          that best path, or ``None`` when no finite-score path exists.
    """
    n = len(x_input_seq)
    states = list(y_count.keys())
    neg_inf = float("-inf")

    scores = {(position, state): neg_inf for position in range(n + 1) for state in states}
    scores[(0, "START")] = 0.0
    parent_pointer = {}

    for position in range(1, n + 1):
        observation = x_input_seq[position - 1]
        for v in states:
            # The emission term does not depend on the previous state, so it
            # is computed once per (position, state).
            emission_prob = emission_parameters_updated(observation, v, y_count, emission_count, training_observations_x, smoothing_k)
            best_score = neg_inf
            best_parent = None
            for u in states:
                transition_prob = transition_parameters(u, v, transition_counts_lr, transition_counts_rl, y_count)
                candidate = scores[(position - 1, u)] + emission_prob + transition_prob
                if candidate > best_score:
                    best_score = candidate
                    best_parent = u
            scores[(position, v)] = best_score
            parent_pointer[(position, v)] = best_parent

    # Final transition into "STOP".
    max_final_transition_score = neg_inf
    stop_parent = None
    for v in states:
        score = scores[(n, v)] + transition_parameters(v, "STOP", transition_counts_lr, transition_counts_rl, y_count)
        if score > max_final_transition_score:
            max_final_transition_score = score
            stop_parent = v

    scores[(n + 1, "STOP")] = max_final_transition_score
    parent_pointer[(n + 1, "STOP")] = stop_parent

    if stop_parent is None:
        # No complete path has a finite score (some required transition was
        # never observed). Returning fewer labels than tokens would make the
        # caller's zip() silently drop tokens, so fall back to the most
        # frequent real label for every position.
        fallback = max(
            (state for state in states if state not in ("START", "STOP")),
            key=y_count.get,
            default=None,
        )
        predicted_labels = [] if fallback is None else [fallback] * n
        return predicted_labels, scores, parent_pointer

    # A finite final score guarantees a complete chain of parents back to
    # position 0, so the pointers can be followed without checks.
    predicted_labels = []
    current_label = stop_parent
    for position in range(n, 0, -1):
        predicted_labels.append(current_label)
        current_label = parent_pointer[(position, current_label)]
    predicted_labels.reverse()

    return predicted_labels, scores, parent_pointer

def buildModelNwrite(readDevInPath, y_count, emission_count, transition_count_lr, transition_count_rl, training_observations_x,writeFilePath):
    """Label every sequence of an input file and write the result.

    Sequences are read with ``readDevIn``, decoded one by one with
    ``viterbi`` using the supplied counts, paired token-by-token with their
    predicted labels and written with ``write_seq_pairs_to_file``.

    Note that ``zip`` stops at the shorter of the two lists, so tokens
    without a corresponding predicted label are not written.

    Args:
        readDevInPath: Path of the unlabelled input file.
        y_count: Mapping label -> occurrence count.
        emission_count: Mapping ``(label, observation)`` -> count.
        transition_count_lr: Mapping ``(prev, cur)`` -> count.
        transition_count_rl: Mapping ``(cur, prev)`` -> count.
        training_observations_x: Collection of observations seen in training.
        writeFilePath: Path of the output file.
    """
    def tag(tokens):
        labels, _scores, _parents = viterbi(
            y_count,
            emission_count,
            transition_count_lr,
            transition_count_rl,
            training_observations_x,
            tokens,
        )
        return list(zip(tokens, labels))

    input_sequences = readDevIn(readDevInPath)
    tagged_sequences = [tag(tokens) for tokens in input_sequences]
    write_seq_pairs_to_file(writeFilePath, tagged_sequences)



# RU: collect the training counts from ./Data/RU/train, then label
# ./Data/RU/dev.in and write the predictions to ./Data/RU/dev.p2.out.
# These statements run (and touch the file system) whenever the module/cell
# is executed.
y_count_RU, emission_count_RU, transition_count_RU_lr, transition_count_RU_rl, training_observations_x_RU = readFile("./Data/RU/train")
buildModelNwrite("./Data/RU/dev.in", y_count_RU, emission_count_RU, transition_count_RU_lr, transition_count_RU_rl, training_observations_x_RU, "./Data/RU/dev.p2.out")

# RU evaluation results (presumably for ./Data/RU/dev.p2.out against the gold labels):
#Entity in gold data: 389
#Entity in prediction: 484

#Correct Entity : 188
#Entity  precision: 0.3884
#Entity  recall: 0.4833
#Entity  F: 0.4307

#Correct Sentiment : 129
#Sentiment  precision: 0.2665
#Sentiment  recall: 0.3316
#Sentiment  F: 0.2955


# ES: collect the training counts from ./Data/ES/train, then label
# ./Data/ES/dev.in and write the predictions to ./Data/ES/dev.p2.out.
# These statements run (and touch the file system) whenever the module/cell
# is executed.
y_count_ES, emission_count_ES, transition_count_ES_lr, transition_count_ES_rl, training_observations_x_ES = readFile("./Data/ES/train")
buildModelNwrite("./Data/ES/dev.in", y_count_ES, emission_count_ES, transition_count_ES_lr, transition_count_ES_rl, training_observations_x_ES, "./Data/ES/dev.p2.out")

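# ES (superseded): earlier call that unpacks four values from readFile, which now returns five; kept for reference only.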
# y_count_ES, emission_count_ES, transition_count_ES,training_observations_x_ES = readFile("./Data/ES/train")
# buildModelNwrite("./Data/ES/dev.in", y_count_ES, emission_count_ES, transition_count_ES,training_observations_x_ES ,"./Data/ES/dev.p2.out")

# ES evaluation results (presumably for ./Data/ES/dev.p2.out against the gold labels):
#Entity in gold data: 229
#Entity in prediction: 542

#Correct Entity : 134
#Entity  precision: 0.2472
#Entity  recall: 0.5852
#Entity  F: 0.3476

#Correct Sentiment : 97
#Sentiment  precision: 0.1790
#Sentiment  recall: 0.4236
#Sentiment  F: 0.2516