<a href="https://colab.research.google.com/github/ammaarahmad1999/CS563-NLP-Lab/blob/main/LAB1/Assignment_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import json
import re
import string
from tqdm import tqdm
from nltk.util import ngrams
from tabulate import tabulate
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split

In [2]:
def generate_transition_matrix(y):
    
    trigram_tags = []
    for tag_list in y:
        tag_list = ["START", "START"] + tag_list + ["STOP"]
        trigram_tags.extend(ngrams(tag_list, 3))
    trigram_count = dict(Counter(trigram_tags))

    bigram_tags = []
    for tag_list in y:
        tag_list = ["START", "START"] + tag_list + ["STOP"]
        bigram_tags.extend(ngrams(tag_list, 2))
    bigram_count = dict(Counter(bigram_tags))

    transition_matrix = defaultdict(lambda: 0.0000000001)

    for trigram in trigram_count:
        first, second, third = trigram
        transition_matrix[trigram] = trigram_count[trigram] / bigram_count[(first, second)]

    return transition_matrix

In [3]:
def generate_emission_matrix(x, y):
    word_tag_count = defaultdict(lambda: 0)
    tag_count = defaultdict(lambda: 0)

    for line, tags in zip(x, y):
        for word, tag in zip(line.split(), tags):
            tag_count[tag] += 1
            word_tag_count[(word, tag)] += 1
    
    emission_matrix = defaultdict(lambda: 0.0000000001)
    
    for word_tag in word_tag_count.keys():
        word, tag = word_tag
        emission_matrix[word_tag] = word_tag_count[word_tag] / tag_count[tag]

    return emission_matrix

In [4]:
def collapse_to_4_tags(y_train, y_test):
    tag_map = defaultdict(lambda: 'O', {
        'JJ': 'A', 'JJR': 'A', 'JJS': 'A', 'RB': 'A', 'RBR': 'A', 'RBS': 'A', 
        'WRB': 'A', 'VB': 'V', 'VBD': 'V', 'VBG': 'V', 'VBN': 'V', 'VBP': 'V', 
        'VBZ': 'V', 'NN': 'N', 'NNS': 'N', 'NNP': 'N', 'NNPS': 'N'
    })
    y_train_new = [[tag_map[tag] for tag in tag_seq] for tag_seq in y_train]
    y_test_new = [[tag_map[tag] for tag in tag_seq] for tag_seq in y_test]

    return y_train_new, y_test_new 

In [5]:
def evaluate_accuracy_metrics(correct, total):
    accuracy = sum(x for x in correct.values()) / sum(x for x in total.values())
    
    classwise_accuracy = {}

    for tag in sorted(total.keys()):
        classwise_accuracy[tag] = correct[tag] / total[tag]
    
    print(f'\nHMM Model Accuracy = {accuracy}\n')
    print('Class-wise Accuracies \n')
    print(tabulate(zip(classwise_accuracy.keys(), classwise_accuracy.values()),
                   headers=['Class (Tag)', 'Accuracy'],
                   tablefmt='orgtbl'))
    
    df = pd.DataFrame.from_dict(classwise_accuracy, orient='index')
    
    return df

In [6]:
def test_and_evaluate(x, y, transition, emission, all_tags):
    correct_predictions = defaultdict(lambda: 0)
    tag_count = defaultdict(lambda: 0)

    print(f'Evaluating {len(x)} sentences.\n')

    for sentence, actual_tag_sequence in tqdm(zip(x, y), total=len(x)):
        pred_tag_sequence = viterbi_model(sentence, transition, emission, all_tags)
        for predicted, actual in zip(pred_tag_sequence, actual_tag_sequence):
            correct_predictions[actual] += predicted == actual
            tag_count[actual] += 1

    df = evaluate_accuracy_metrics(correct_predictions, tag_count)   

    return df

In [7]:
def kappa(position, all_tags):
    return all_tags if position not in [0, -1] else ['START']

In [8]:
def viterbi_model(sentence, transition, emission, all_tags):
    sentence = sentence.split()
    pi = defaultdict(lambda: 0)
    bp = defaultdict(lambda: "OTH")
    pi[(0, 'START', 'START')] = 1.0

    n = len(sentence)

    for k in range(1, n + 1):
        u_set = kappa(k - 1, all_tags)
        v_set = kappa(k, all_tags)
        w_set = kappa(k - 2, all_tags)

        for v in v_set:
            for u in u_set:
                for w in w_set:
                    reach_prob = pi[(k - 1, w, u)] * transition[(w, u, v)] * emission[(sentence[k - 1], v)]
                    if reach_prob > pi[(k, u, v)]:
                        pi[(k, u, v)] = reach_prob
                        bp[(k, u, v)] = w
    
    u_set = kappa(n - 1, all_tags)
    v_set = kappa(n, all_tags)
    result_tags = []
    for u in u_set:
        for v in v_set:
            if len(result_tags) == 0:
                result_tags = [v, u]
            if pi[(n, u, v)] * transition[(u, v, 'STOP')] > \
            pi[(n, result_tags[1], result_tags[0])] * transition[result_tags[1], result_tags[0], 'STOP']:
                result_tags = [v, u]
    
    for k in range(n - 2, 0, -1):
        result_tags.append(bp[(k + 2, result_tags[-1], result_tags[-2])])
    
    result_tags.reverse()

    return result_tags

In [9]:
def HMM(text, tags, num_tags = 36):

    x_train, x_test, y_train, y_test = train_test_split(text, tags, test_size=0.2, random_state = 40)

    if(num_tags == 4):
        y_train, y_test = collapse_to_4_tags(y_train, y_test)

    all_tags = ['START'] + list(set(tag for tag_list in y_train for tag in tag_list)) + ['STOP']
  
    emission_matrix = generate_emission_matrix(x_train, y_train)
    transition_matrix = generate_transition_matrix(y_train)

    df = test_and_evaluate(x_test, y_test, transition_matrix, emission_matrix, all_tags)

    return df

In [10]:
def main():

    df = pd.read_json("https://raw.githubusercontent.com/ammaarahmad1999/CS563-NLP-Lab/main/LAB1/penn-data.json")
    df.columns = ['text', 'tags']
    df['text'] = df['text'].str.replace('[^\w\s]', '', regex = True)
    text = df['text'].to_numpy(dtype = object)
    tags = df['tags'].to_numpy(dtype = object)

    # For all the tags
    print('-' * 80)
    print('HMM for 36 tags : ')
    df = HMM(text, tags, num_tags = 36)
    df.to_csv("HMM_36_results.csv")
    
    # For all the tags
    print('-' * 80)
    print('HMM for 4 tags : ')
    df = HMM(text, tags, num_tags = 4)
    df.to_csv("HMM_4_results.csv")
    print('-' * 80)

if __name__ == "__main__":
    main()

--------------------------------------------------------------------------------
HMM for 36 tags : 
Evaluating 783 sentences.



100%|██████████| 783/783 [16:22<00:00,  1.26s/it]



HMM Model Accuracy = 0.8360031915546554

Class-wise Accuracies 

| Class (Tag)   |   Accuracy |
|---------------+------------|
| #             |   0        |
| ''            |   0        |
| ,             |   0        |
| -LRB-         |   0.55     |
| -RRB-         |   0.473684 |
| :             |   0.037037 |
| CC            |   0.816495 |
| CD            |   0.841705 |
| DT            |   0.925174 |
| EX            |   0.588235 |
| FW            |   0        |
| IN            |   0.909812 |
| JJ            |   0.776578 |
| JJR           |   0.78481  |
| JJS           |   0.733333 |
| LS            |   0.666667 |
| MD            |   0.908046 |
| NN            |   0.849123 |
| NNP           |   0.841945 |
| NNPS          |   0.382979 |
| NNS           |   0.818031 |
| PDT           |   0.285714 |
| PRP           |   0.898734 |
| PRP$          |   0.913386 |
| RB            |   0.822547 |
| RBR           |   0.5      |
| RBS           |   0.4      |
| RP            |   0.5      |
| SY

100%|██████████| 783/783 [00:03<00:00, 238.43it/s]


HMM Model Accuracy = 0.8695145154360768

Class-wise Accuracies 

| Class (Tag)   |   Accuracy |
|---------------+------------|
| A             |   0.774813 |
| N             |   0.852632 |
| O             |   0.939556 |
| V             |   0.805544 |
--------------------------------------------------------------------------------





In [14]:
df = pd.read_csv("HMM_4_results.csv")
df.columns = ['POS', 'Probability']
df.to_csv("HMM_4_results.csv", index = None)
df

Unnamed: 0,POS,Probability
0,A,0.774813
1,N,0.852632
2,O,0.939556
3,V,0.805544


In [13]:
df = pd.read_csv("HMM_36_results.csv")
df.columns = ['POS', 'Probability']
df.to_csv("HMM_36_results.csv", index = None)
df

Unnamed: 0,POS,Probability
0,#,0.0
1,'',0.0
2,",",0.0
3,-LRB-,0.55
4,-RRB-,0.473684
5,:,0.037037
6,CC,0.816495
7,CD,0.841705
8,DT,0.925174
9,EX,0.588235
