# Importing Libraries

In [572]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
from nltk.corpus import stopwords
from nltk import word_tokenize
import string
import numpy as np
import random


# Variable Definition
To run this the code, user only need to know about one code segment titled as ## "Variable Definition"}. In this code segment, there are five variables that users can modify to see how these parameter influence the overall performance of the decision tree classifier. The description of the variables are as follows:

decision_list_rule_boundary: Number of top ranked rule boundary (For selecting top-10 rule, set the value `10`. To add more rule in decision list, increase the value. )

target_file_name: Target corpus name (In our case, it will be "bass" or "sake")

total_size: Total training data size from the actual training data (Range: 0 to 1)
  
mu: Percentage of data for training and validation set (Range: 0 to 1. `0.8` refers to 80\% of data will be treated as training set and 20\% will be treated as validation set.)

k: Length of context sentence or number of words in context sentence


For a given code segment, the output will be observed below the code segment titled as "Sense Disambiguation using Decision List". The result will contain Accuracy, Precision and Recall.

To use the model for a isolated sentence, a function titled "predict_sense" needs to be called. This function takes a sentence as input and return the sense as "1 or 2". Definitely, you need to define the decision list first.

In [573]:
# --------------------------------------Value Alteration Allowed Start--------------------------
# Number of top ranked rule boundary
decision_list_rule_boundary = 10

# Define the target file name
target_file_name = "sake"

# define total training data size from the actual training data
total_size = 0.7

# percentage of data for training data
mu = 0.8

# length of context sentence or number of words in context sentence
k = 11

# --------------------------------------Value Alteration Allowed End--------------------------


#-------------------------Altering these values are not recommended start------------------

# How many rules needs to be selected from each criteria for calculating log likelihood
top_rules = 5



# List for context sentences
contexts = []


# List of decisions
decisionList=[]

default_sense = 1

# Default value for alpha(Because the size of corpora is small)
alpha = 0.1
#-------------------------Altering these values are not recommended end------------------



# Text Preprocessing

In [574]:
target = target_file_name
lines = open(target_file_name+".trn","r").readlines()
testlines = open(target_file_name+".tst","r").readlines()

# set the size of the training data based on the value of total_size
lines = lines[:int(len(lines)*total_size)]


train_lines = lines[:int(len(lines)*mu)]
validation_lines = lines[int(len(lines)*mu):]



In [575]:
# Prcessing the text: to extract the text and corresponding sense from each line of the file
def process_text(line):
    splitLine = line.split("\t")
    splitLine[0] = splitLine[0].replace(":","")
    splitLine[1] = splitLine[1].lower()
    splitLine[1].translate(str.maketrans('', '', string.punctuation))
    return splitLine

In [576]:
# Unpacking the training corpora into two arrays, each containing text from two senses
def unpack_corpora():
    type1Text = []
    type2Text = []
    for line in train_lines:
        splitLine = process_text(line)
        if splitLine[0] == target:
            type1Text.append(splitLine[1])
        else:
            type2Text.append(splitLine[1])
     
    print("Length of Type 1 texts:",len(type1Text), "Length of Type 2 texts:", len(type2Text))
    return type1Text, type2Text

# Contextualization of the Text

In [577]:
def convert_lower_case(data):
    return np.char.lower(data)

def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

def lemmatizing(data):
    lemmatizer = WordNetLemmatizer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + lemmatizer.lemmatize(w)
    return new_text
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text


def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) #remove comma seperately
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = lemmatizing(data)
    data = remove_punctuation(data)
    data = convert_numbers(data)
    data = remove_punctuation(data) 
    data = remove_stop_words(data) #needed again as num2word is giving stop words 101 - one hundred and one
    return data

    
    

In [578]:
# Make context of each sentences after removing punctuation, some extraneous quotation mark from the text.
def context_dictionary():
    type1Text, type2Text = unpack_corpora()
    # This for loop is for sense 1
    for sentence in type1Text:

        # preprocess the sentence
        clean_sentence = preprocess(sentence)

        # tokenizing the words from the sentence
        words = word_tokenize(clean_sentence)

        # Pre-process the words 
        words = [word for word in words]

        for i in range(0,len(words)):
            if target == words[i]:
                left = max(i-int(k/2),0)
                right = min(i+int(k/2),len(words))

                context = words[left:right]
                dict = {
                    "sentence" : context,
                    "sense" : 1,
                    "position": i
                }


                contexts.append(dict)
    # This for loop is for sense 2
    for sentence in type2Text:
         # preprocess the sentence
        clean_sentence = preprocess(sentence)

        # tokenizing the words from the sentence
        words = word_tokenize(clean_sentence)

        # Pre-process the words 
        words = [word for word in words]

        for i in range(0,len(words)):
            if target == words[i]:
                left = max(i-int(k/2),0)
                right = min(i+int(k/2),len(words))

                context = words[left:right]
                dict = {
                    "sentence" : context,
                    "sense" : 2,
                    "position": i
                }

                contexts.append(dict)
    return contexts

# Check Collocation Distribution

In [579]:
# define rules

# if seed word is at K distance from the pattern word index
def k_closest(context, index_of_pattern, words):
    for index, w in enumerate(context):
        if w == words and (index < index_of_pattern - 1 or index > index_of_pattern + 1):
            return True
    return False

# if seed word is the next of the pattern word index
def right(context, index_of_pattern, words):
    if len(context) <= index_of_pattern + 1:
        return False
    else:
        return context[index_of_pattern + 1] == words

# if seed word is the prior of the pattern word index
def left(context, index_of_pattern, words):
    if index_of_pattern == 0:
        return False
    else:
        return context[index_of_pattern - 1] == words

# if seed words are the prior of the pattern word index
def two_left(context, index_of_pattern, words):
    if index_of_pattern < 2:
        return False
    else:
        return (context[index_of_pattern - 2], context[index_of_pattern - 1]) == words

# if seed words are around the pattern word index
def surround(context, index_of_pattern, words):
    if index_of_pattern >= len(context) - 1 or index_of_pattern == 0:
        return False
    else:
        return (context[index_of_pattern - 1], context[index_of_pattern + 1]) == words

# if seed words are the prior of the pattern word index
def two_right(context, index_of_pattern, words):
    if index_of_pattern >= len(context) - 2:
        return False
    else:
        return (context[index_of_pattern + 1], context[index_of_pattern + 2]) == words


RULES = {
    0: right,
    1: left,
    2: k_closest,
    3: two_left,
    4: surround,
    5: two_right
}

In [580]:
two_right(['stephan', 'weidner', 'composer', 'bass', 'player', 'boehse', 'onkelz'], 3, ('player','boehse'))

True

# Freq Distribution in Sense 1 and Sense 2
We will count the frequency of each word to derive which word to expect within the range(+/-k) of target word.

In [581]:
def unigram_count(contexts):
    freqSense1 = {}
    freqSense2 = {}
    # Freq Distribution in Sense 1 and Sense 2
    for context in contexts:
        for word in context['sentence']:
            if context['sense']==1 and word != target:
                if freqSense1.get(word):
                    freqSense1[word]=freqSense1[word]+1;
                else:
                    freqSense1[word]=1;
            if context['sense']==2 and word != target:
                if freqSense2.get(word):
                        freqSense2[word]=freqSense2[word]+1;
                else:
                    freqSense2[word]=1;

    freq_dist_type_1 = sorted(freqSense1.items(), key=lambda x: x[1], reverse=True)    


    freq_dist_type_2 = sorted(freqSense2.items(), key=lambda x: x[1], reverse=True)    
    return freq_dist_type_1, freq_dist_type_2


# Count Next Word in Sense 1 and Sense 2


In [582]:
def forward_one_count(contexts):
    # Count Next words in Sense 1 and Sense 2
    seed_forward_1 = {}
    seed_forward_2 = {}

    for context in contexts:
        if context['sense'] == 1:
            try:
                candidate = (target, context['sentence'][context['position']+1])
            except:
                continue
            if not seed_forward_1.get(candidate):
                seed_forward_1[candidate]=1
            else:
                seed_forward_1[candidate]=seed_forward_1[candidate]+1
        else:
            try:
                candidate = (target, context['sentence'][context['position']+1])
            except:
                continue
            if not seed_forward_2.get(candidate):
                seed_forward_2[candidate]=1
            else:
                seed_forward_2[candidate]=seed_forward_2[candidate]+1

    seed_forward_1 = sorted(seed_forward_1.items(), key=lambda x: x[1], reverse=True)    
    seed_forward_2 = sorted(seed_forward_2.items(), key=lambda x: x[1], reverse=True)    


#     print(seed_forward_1[:5],seed_forward_2[:5])
    return seed_forward_1, seed_forward_2


# Count Previous Word in Sense 1 and Sense 2

In [583]:
def backward_one_count(contexts):
    # Count Previous Words in Sense 1 and Sense 2
    seed_backward_1 = {}
    seed_backward_2 = {}

    for context in contexts:
        if context['sense'] == 1:
            try:
                candidate = (context['sentence'][context['position']-1], target) 
            except:
                continue
            if not seed_backward_1.get(candidate):
                seed_backward_1[candidate]=1
            else:
                seed_backward_1[candidate]=seed_backward_1[candidate]+1
        else:
            try:
                candidate = (context['sentence'][context['position']-1], target)
            except:
                continue
            if not seed_backward_2.get(candidate):
                seed_backward_2[candidate]=1
            else:
                seed_backward_2[candidate]=seed_backward_2[candidate]+1

    seed_backward_1 = sorted(seed_backward_1.items(), key=lambda x: x[1], reverse=True)    
    seed_backward_2 = sorted(seed_backward_2.items(), key=lambda x: x[1], reverse=True)    


#     print(seed_backward_1[:5],seed_backward_2[:5])
    return seed_backward_1, seed_backward_2

# Count Next two Words in Sense 1 and Sense 2

In [584]:
def forward_two_count(contexts):

    # Count Next two Words in Sense 1 and Sense 2
    seed_forward_2_1 = {}
    seed_forward_2_2 = {}

    for context in contexts:
        if context['sense'] == 1:
            try:
                candidate = (target, context['sentence'][context['position']+1], context['sentence'][context['position']+2])
            except:
                continue
            if not seed_forward_2_1.get(candidate):
                seed_forward_2_1[candidate]=1
            else:
                seed_forward_2_1[candidate]=seed_forward_2_1[candidate]+1
        else:
            try:
                candidate = (target, context['sentence'][context['position']+1], context['sentence'][context['position']+2])
            except:
                continue
            if not seed_forward_2_2.get(candidate):
                seed_forward_2_2[candidate]=1
            else:
                seed_forward_2_2[candidate]=seed_forward_2_2[candidate]+1

    seed_forward_2_1 = sorted(seed_forward_2_1.items(), key=lambda x: x[1], reverse=True)    
    seed_forward_2_2 = sorted(seed_forward_2_2.items(), key=lambda x: x[1], reverse=True)    
#     print(seed_forward_2_1[:5],seed_forward_2_2[:5])
    return seed_forward_2_1, seed_forward_2_2


# Count Previous Two Words in Sense 1 and Sense 2

In [585]:
def backward_two_count(contexts):

    # Count Previous two Words in Sense 1 and Sense 2
    seed_backward_2_1 = {}
    seed_backward_2_2 = {}

    for context in contexts:
        if context['sense'] == 1:
            try:
                candidate = (context['sentence'][context['position']-2], context['sentence'][context['position']-1], target)
            except:
                continue
            if not seed_backward_2_1.get(candidate):
                seed_backward_2_1[candidate]=1
            else:
                seed_backward_2_1[candidate]=seed_backward_2_1[candidate]+1
        else:
            try:
                candidate = (context['sentence'][context['position']-2], context['sentence'][context['position']-1], target)
            except:
                continue
            if not seed_backward_2_2.get(candidate):
                seed_backward_2_2[candidate]=1
            else:
                seed_backward_2_2[candidate]=seed_backward_2_2[candidate]+1

    seed_backward_2_1 = sorted(seed_backward_2_1.items(), key=lambda x: x[1], reverse=True)    
    seed_backward_2_2 = sorted(seed_backward_2_2.items(), key=lambda x: x[1], reverse=True)  
    
    return seed_backward_2_1, seed_backward_2_2


# Count Surrounding Two Words in Sense 1 and Sense 2

In [586]:
def surrounding_count(contexts):
    # Count Surrounding two Words in Sense 1 and Sense 2

    seed_surround_1 = {}
    seed_surround_2 = {}

    for context in contexts:
        if context['sense'] == 1:
            try:
                candidate = (context['sentence'][context['position']-1], target, context['sentence'][context['position']+1])
            except:
                continue
            if not seed_surround_1.get(candidate):
                seed_surround_1[candidate]=1
            else:
                seed_surround_1[candidate]=seed_surround_1[candidate]+1
        else:
            try:
                candidate = (context['sentence'][context['position']-1], target, context['sentence'][context['position']+1])
            except:
                continue
            if not seed_surround_2.get(candidate):
                seed_surround_2[candidate]=1
            else:
                seed_surround_2[candidate]=seed_surround_2[candidate]+1

    seed_surround_1 = sorted(seed_surround_1.items(), key=lambda x: x[1], reverse=True)    
    seed_surround_2 = sorted(seed_surround_2.items(), key=lambda x: x[1], reverse=True)    
#     print(seed_surround_1[:5],seed_surround_2[:5])
    return seed_surround_1, seed_surround_2
# Count words within the range

# Merging Rules from Each Collocation List

In [587]:
def construct_sense():
    contexts = context_dictionary()
    
    freq_dist_type_1, freq_dist_type_2 = unigram_count(contexts)
    
    seed_forward_1, seed_forward_2 = forward_one_count(contexts)
    seed_backward_1, seed_backward_2 = backward_one_count(contexts)
    
    seed_forward_2_1, seed_forward_2_2 = forward_two_count(contexts)
    seed_backward_2_1, seed_backward_2_2 = backward_two_count(contexts)
    
    seed_surround_1, seed_surround_2 = surrounding_count(contexts)
    
#     # Merging the rules into one list for sense 1
    seed_sense_1=  freq_dist_type_1[:top_rules] + seed_forward_1[:top_rules]+seed_backward_1[:top_rules]+seed_forward_2_1[:top_rules]+seed_backward_2_1[:top_rules]+seed_surround_1[:top_rules]
#     # Merging the rules into one list for sense 2
    seed_sense_2=  freq_dist_type_2[:top_rules] + seed_forward_2[:top_rules]+seed_backward_2[:top_rules]+seed_forward_2_2[:top_rules]+seed_backward_2_2[:top_rules]+seed_surround_2[:top_rules]
    # Merging the rules into one list for sense 1
#     seed_sense_1=  freq_dist_type_1 + seed_forward_1+seed_backward_1+seed_forward_2_1+seed_backward_2_1+seed_surround_1
    # Merging the rules into one list for sense 2
#     seed_sense_2=  freq_dist_type_2+ seed_forward_2+seed_backward_2+seed_forward_2_2+seed_backward_2_2+seed_surround_2
    
#     print(seed_sense_1)
#     print(seed_sense_2)
    return seed_sense_1, seed_sense_2
construct_sense()


Length of Type 1 texts: 484 Length of Type 2 texts: 20


([('said', 34),
  ('peace', 28),
  ('child', 21),
  ('country', 20),
  ('people', 15),
  (('sake', 'peace'), 21),
  (('sake', 'child'), 13),
  (('sake', 'nation'), 10),
  (('sake', 'country'), 7),
  (('sake', 'national'), 7),
  (('sake', 'sake'), 9),
  (('life', 'sake'), 6),
  (('country', 'sake'), 6),
  (('sacrifice', 'sake'), 5),
  (('said', 'sake'), 5),
  (('sake', 'national', 'interest'), 4),
  (('sake', 'peace', 'national'), 3),
  (('sake', 'national', 'unity'), 2),
  (('sake', 'peace', 'stability'), 2),
  (('sake', 'negotiation', 'help'), 2),
  (('country', 'law', 'sake'), 3),
  (('sake', 'sake', 'sake'), 3),
  (('art', 'art', 'sake'), 2),
  (('minded', 'take', 'sake'), 2),
  (('employment', 'society', 'sake'), 1),
  (('law', 'sake', 'enforcing'), 3),
  (('take', 'sake', 'survival'), 2),
  (('change', 'sake', 'change'), 2),
  (('negotiation', 'sake', 'negotiation'), 2),
  (('god', 'sake', 'dont'), 2)],
 [('cup', 3),
  ('needed', 2),
  ('cold', 2),
  ('undated', 2),
  ('secret', 2

# Appending rules to Decision List by computing Collocation Frequency of Sense 1 and Sense 2

In [588]:
def populate_sense_in_decision_list():
    seed_sense_1, seed_sense_2 = construct_sense()
    
    
    for key1,value1 in seed_sense_1:
        dicision_dict ={
            'collocation': key1,
            'sense1': value1,
            'sense2': 0,
            'sense' : 1
        }
        for key2, value2 in seed_sense_2:
            if key2 == key1:
                dicision_dict['sense2'] = value2
        decisionList.append(dicision_dict)
        
    for key1,value1 in seed_sense_2:

        dicision_dict ={
            'collocation': key1,
            'sense1': 0,
            'sense2': value1,
            'sense' : 2
        }
        for key2, value2 in seed_sense_1:
            if key2 == key1:
                dicision_dict['sense1'] = value2
        decisionList.append(dicision_dict)

    print(decisionList)
#     return decisionList
populate_sense_in_decision_list()

Length of Type 1 texts: 484 Length of Type 2 texts: 20
[{'collocation': 'said', 'sense1': 68, 'sense2': 0, 'sense': 1}, {'collocation': 'peace', 'sense1': 56, 'sense2': 0, 'sense': 1}, {'collocation': 'child', 'sense1': 42, 'sense2': 0, 'sense': 1}, {'collocation': 'country', 'sense1': 40, 'sense2': 0, 'sense': 1}, {'collocation': 'people', 'sense1': 30, 'sense2': 0, 'sense': 1}, {'collocation': ('sake', 'peace'), 'sense1': 42, 'sense2': 0, 'sense': 1}, {'collocation': ('sake', 'child'), 'sense1': 26, 'sense2': 0, 'sense': 1}, {'collocation': ('sake', 'nation'), 'sense1': 20, 'sense2': 0, 'sense': 1}, {'collocation': ('sake', 'country'), 'sense1': 14, 'sense2': 0, 'sense': 1}, {'collocation': ('sake', 'national'), 'sense1': 14, 'sense2': 0, 'sense': 1}, {'collocation': ('sake', 'sake'), 'sense1': 18, 'sense2': 0, 'sense': 1}, {'collocation': ('life', 'sake'), 'sense1': 12, 'sense2': 0, 'sense': 1}, {'collocation': ('country', 'sake'), 'sense1': 12, 'sense2': 0, 'sense': 1}, {'collocati

# Calculating and Sorting Log Likelihood
Laplace Smoothing: For this data, relatively small alpha (between 0.1 and 0.25) tended to be effective, while noisier training data warrant larger alpha.

In [589]:
def calculate_log_decision():
    logDecisionList_preliminary = {}
    for rule in decisionList:
        probability = abs(np.log10((rule['sense1']+alpha)/(rule['sense2']+2*alpha)))
        
        
        logDecisionList_preliminary[rule['collocation']] = (probability, rule['sense'])
    logDecisionList_preliminary = sorted(logDecisionList_preliminary.items(), key=lambda x: x[1], reverse=True)
    return logDecisionList_preliminary
calculate_log_decision()

[('said', (2.532117116248804, 1)),
 ('peace', (2.4479328655921804, 1)),
 ('child', (2.323252100171687, 1)),
 (('sake', 'peace'), (2.323252100171687, 1)),
 ('country', (2.302114376956201, 1)),
 ('people', (2.1775364999298623, 1)),
 (('sake', 'child'), (2.1156105116742996, 1)),
 (('sake', 'nation'), (2.002166061756508, 1)),
 (('sake', 'sake'), (1.9566485792052033, 1)),
 (('sake', 'country'), (1.8481891169913987, 1)),
 (('sake', 'national'), (1.8481891169913987, 1)),
 ('cup', (1.792391689498254, 2)),
 (('life', 'sake'), (1.7817553746524688, 1)),
 (('country', 'sake'), (1.7817553746524688, 1)),
 (('sacrifice', 'sake'), (1.7032913781186614, 1)),
 (('said', 'sake'), (1.7032913781186614, 1)),
 ('needed', (1.6232492903979006, 2)),
 ('cold', (1.6232492903979006, 2)),
 ('undated', (1.6232492903979006, 2)),
 ('secret', (1.6232492903979006, 2)),
 (('sake', 'cup'), (1.6232492903979006, 2)),
 (('japanese', 'sake'), (1.6232492903979006, 2)),
 (('sake', 'national', 'interest'), (1.6074550232146685, 1)

# Defining Default Sense

In [590]:
# This default sense is derived from the number of sense present in the conrpora. Default sense is mianly used in baseline.
def get_default_sense():
    countSense1 = 0
    countSense2 = 0 
    logDecisionList_sample = calculate_log_decision()
    for key,value in logDecisionList_sample[:decision_list_rule_boundary]:
    #     print(value)
        if value[1] == 1:
            countSense1 = countSense1+1
        else:
            countSense2 = countSense2+1
    default_sense = 1 if countSense1 > countSense2 else 2
    print(default_sense)
#     defaultSense
get_default_sense()

1


# Predict Sense of a Sentence

In [591]:
def predict_sense(sentence):
    # By default the sense will remain the default one(Like the baseline one.)
    sense = default_sense
    # preprocess the sentence
    clean_sentence = preprocess(sentence)

    # tokenizing the words from the sentence
    words = word_tokenize(clean_sentence)

    # Pre-process the words 
    words = [word for word in words]

    logDecisionList = calculate_log_decision()
    print(logDecisionList[:decision_list_rule_boundary])
    pattern_index = words.index(target)
    for decision in logDecisionList[:decision_list_rule_boundary]:
        sanitizedDecision = decision[0]
        
        if type(sanitizedDecision) != str:
            sanitizedDecision = [ele for ele in decision[0]]
            if target in sanitizedDecision: sanitizedDecision.remove(target)

            sanitizedDecision = tuple(sanitizedDecision)
        
        # Check whether the pattern index match with any of the rule defined in decision list 
        if (k_closest(words, pattern_index, sanitizedDecision) or right(words, pattern_index, sanitizedDecision) or
        left(words, pattern_index,sanitizedDecision) or
        two_right(words, pattern_index, sanitizedDecision) or
        two_left(words, pattern_index, sanitizedDecision) or
        surround(words, pattern_index, sanitizedDecision)):
            sense = decision[1][1]
    return sense

# predict_sense("I am a sake player")

# Computing Accuracy, Precision and Recall

In [592]:
def compute_metrics(lines, analysis_type=1):
    FP = 0
    FN = 0
    TP = 0
    TN = 0
    
    text = []
    count_true = 0
    for line in lines:
        splitLine = process_text(line)
        actual_sense = 1 if splitLine[0] == target else 2
        
        predicted_sense = predict_sense(splitLine[1])
#         default_sense = defaultSense
#         print(predicted_sense, splitLine[1])
        
        if (analysis_type==1 and predicted_sense == actual_sense) or (analysis_type==2 and default_sense == actual_sense):
            count_true = count_true + 1
            
        #######################Precision and Recall Start#############################
        if analysis_type==1:
            # count true positive 
            if actual_sense ==1 and predicted_sense==1:
                TP=TP+1
            # count true negative 
            if actual_sense ==2 and predicted_sense==2:
                
                TN=TN+1
            # count false positive 
            if actual_sense ==1 and predicted_sense==2:
                
                FP=FP+1
                
            # count false negative 
            if actual_sense ==2 and predicted_sense==1:
                FN=FN+1
        # Count for baseline  
        else:
            # count true positive 
            if actual_sense ==1 and default_sense==1:
                TP=TP+1
            # count true negative 
            if actual_sense ==2 and default_sense==2:
                TN=TN+1
            # count false positive 
            if actual_sense ==1 and default_sense==2:
                
                FP=FP+1
            # count false negative 
            if actual_sense ==2 and default_sense==1:
                FN=FN+1
        #######################Precision and Recall End#############################
    print(TP, TN, FP, FN)
        
    accuracy = count_true/len(lines)
    # Precision 
    precision = TP/(TP+FP)
    # Recall
    recall = TP/(TP+FN)
    return accuracy, precision, recall
#     print("Accuracy:", accuracy)

# Test on Validation Set

In [593]:
accuracy, precision, recall = compute_metrics(validation_lines)
print("Validation accuracy = %0.4f, precision= = %0.4f, recall= = %0.4f" %
      (accuracy, precision, recall))

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

# Sense Disambiguation using Decision List

In [594]:
accuracy, precision, recall = compute_metrics(testlines)
print("Decision list test accuracy = %0.4f, precision= = %0.4f, recall= = %0.4f" %
      (accuracy, precision, recall))

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

# Baseline Sense Disambiguation

In [595]:
accuracy, precision, recall = compute_metrics(testlines,2)
print("Baseline accuracy = %0.4f, precision= = %0.4f, recall= = %0.4f" %
      (accuracy, precision, recall))

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620

[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.302114376956201, 1)), ('people', (2.1775364999298623, 1)), (('sake', 'child'), (2.1156105116742996, 1)), (('sake', 'nation'), (2.002166061756508, 1)), (('sake', 'sake'), (1.9566485792052033, 1)), (('sake', 'country'), (1.8481891169913987, 1))]
[('said', (2.532117116248804, 1)), ('peace', (2.4479328655921804, 1)), ('child', (2.323252100171687, 1)), (('sake', 'peace'), (2.323252100171687, 1)), ('country', (2.30211437695620