In [None]:
# Importing all necessary files
from lxml import etree, objectify
import nltk
import string
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import pandas as pd 
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

In [None]:
!pip install swifter
import swifter

In [None]:
#Setting some global variables
global Tree
global TreeNew
global SenseLemmaDictionary
global SenseLemmaCorpusDictionary
SenseLemmaDictionary = {}
SenseLemmaCorpusDictionary = {}

In [None]:
# function to get senses from original dictionary: 
def get_senses(word, pos):
    item = Tree.xpath("//lexelt[@item='%s.%s']" % (word, pos))    
    senses = []
    if len(item) >= 1:
        for sense in item[0].getchildren():
            senses.append(dict(zip(sense.keys(), sense.values())))
    return senses

In [None]:
# functions to rename columns and ignore the index column
def rename_col(dataset):
    dataset_new = dataset.rename(columns = {0:"Target_Word", 1:"Sense_ID", 2:"Sentence"})
    dataset_new = dataset_new.reset_index(drop=True)
    return dataset_new

In [None]:
#convert sentence column to lower case, remove digits and punctuations
def clean_data(dataset, colname):
    stop = stopwords.words('english')
    string.punctuation = string.punctuation.replace('%', '')
    dataset["cleaned_data"] = dataset[colname].apply(lambda words: ' '.join(word.lower().translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))) for word in words.split()))
    dataset["cleaned_data"] = dataset["cleaned_data"].str.replace('\d+', '')
    return dataset

In [None]:
# retrieving pos tags for the words and lemmatize it
def get_pos_wordnet(sent):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    sent = ' '.join(word.lower().translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))) for word in sent.split())
    list_words = sent.split()
    final_list = []
    for i in range (len(list_words)):
        tag = nltk.pos_tag(list_words)[i][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        final_tag = tag_dict.get(tag, wordnet.NOUN)
        lemmatized_word = lemmatizer.lemmatize(list_words[i],final_tag)
        final_list.append([list_words[i],final_tag,lemmatized_word])
    return final_list

In [None]:
# function to remove stop words and words with length < 3
def remove_stopwords(pos_input_list):
    return_list = []
    stop = stopwords.words('english')
    for pos in pos_input_list:
        if (pos[2] not in stop and (len(pos[2])>2 or pos[2]=="%%")):
            return_list.append(pos)
    return return_list

In [None]:
# function to lemmatize words and add pos tags
def lemmatize_sentences(sent):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    sent = ' '.join(word.lower().translate(str.maketrans(string.punctuation, ' '*len(string.punctuation))) for word in sent.split())
    list_words = sent.split()
    lemmatize_words = ''
    for i in range (len(list_words)):
        tag = nltk.pos_tag(list_words)[i][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        final_tag = tag_dict.get(tag, wordnet.NOUN)
        lemmatize_words += " " + lemmatizer.lemmatize(list_words[i],final_tag)   
    return lemmatize_words.strip()

In [34]:
# function to lemmatize sense words
def convert_sense_to_lemma(word_pos, sense, corpus = False):
    word_pos = word_pos.strip()
    key = word_pos+ "_"+sense.get('id')
    sense_examples = ""
    if corpus:
        if (key in SenseLemmaCorpusDictionary):
            sense_examples = SenseLemmaCorpusDictionary.get(key)
        else:
            sense_examples = (
                lemmatize_sentences(sense.get('gloss').lower())
                + " | "
                + ('.'.join(lemmatize_sentences(sentence.lower()) for sentence in sense.get('examples').split(".")))
            )
            SenseLemmaCorpusDictionary[key] = sense_examples
    else:
        if (key in SenseLemmaDictionary):
            sense_examples = SenseLemmaDictionary.get(key)
        else:
            sense_examples = (
                lemmatize_sentences(sense.get('gloss').lower())
                + " | "
                + ('.'.join(lemmatize_sentences(sentence.lower()) for sentence in sense.get('examples').split(".")))
            )
            SenseLemmaDictionary[key] = sense_examples
    return sense_examples

In [None]:
# function for calculating accuracies 
def calculate_accuracy(dataframe, column_name):
    accuracy_number = 0
    i=0
    for index, row in dataframe.iterrows():
        if(int(row['Sense_ID'])==int(row[column_name])):
            accuracy_number += 1
        i += 1
    return ((accuracy_number/i)*100)


In [None]:
# function for exporting to CSV
def export_to_csv(input_data_frame, csv_path):
    tmp_df = input_data_frame.drop(['Sentence', 'cleaned_data', 'pos_data'], axis=1)
    tmp_df.to_csv(csv_path, index = False)

**SIMPLE LESK**

In [None]:
# Simple lesk Algorithm
def simple_lesk_algo(target_word, pos_data):
    target_data = target_word.split(".")
    #fetching senses of targetword from given dictionary
    senses = get_senses(target_data[0].strip(), target_data[1].strip()) # targetword, pos
    score_map = {}
    pos_sentence = []
    for pos_word in pos_data:
        # adds lemmatized words in pos_sentence
        pos_sentence.append(pos_word[2]) 
        
    for sense in senses:
        sense_score = 0
        #fetching lemmatized form of gloss+examples from dictionary sense passed
        sense_examples = convert_sense_to_lemma(target_word, sense)
        sense_example_words = sense_examples.split()
        #Overlapping words from dictionary(gloss+examples) and lemmatized test data sentence
        ovlap_words = set(sense_example_words).intersection( set(pos_sentence) )
        # total no.of overlapping words as score for that sense id
        score_map[sense.get('id')] = len(ovlap_words)
      

    max_sense = max(score_map, key=score_map.get)  # get the max sense of the score
    return max_sense

In [None]:
# read the dictionary file - original 
Parser = objectify.makeparser(recover=True)
Tree = objectify.fromstring(''.join(open('dictionary.xml').readlines()), Parser)

# read test data
train_data = pd.read_csv (r'train.data',header=None,delimiter = "|")
test_data = pd.read_csv (r'test.data',header=None,delimiter = "|")
validation_data = pd.read_csv (r'validate.data',header=None,delimiter = "|")

# rename columns for all the datasets
train_data_new = rename_col(train_data)
test_data_new = rename_col(test_data)
validation_data_new = rename_col(validation_data)


In [None]:
#Validation Data
simple_lesk_validation_df = validation_data_new
# adds a new column 'cleaned_data' with lowercase sentence
simple_lesk_validation_df = clean_data(simple_lesk_validation_df, 'Sentence') 
#get the pos data in a new column
simple_lesk_validation_df["pos_data"] = simple_lesk_validation_df['cleaned_data'].swifter.apply(lambda sentence: get_pos_wordnet(sentence)) 
# for each pos data remove stop words
simple_lesk_validation_df["pos_data"] = simple_lesk_validation_df["pos_data"].swifter.apply(lambda pos_data_list: remove_stopwords(pos_data_list))

#Simple Lesk
simple_lesk_validation_df['simple_lesk_sense_id'] = simple_lesk_validation_df.swifter.apply(lambda x: simple_lesk_algo(x['Target_Word'], x['pos_data']), axis=1)
# Calculating accuracies of validation data
print("Accuracy of validation data for simple_lesk: " + str(calculate_accuracy(simple_lesk_validation_df, "simple_lesk_sense_id"))) #49.08896034297964

# Export validation results to CSV
export_to_csv(simple_lesk_validation_df, r'validation_results_SimpleLesk.csv')

In [None]:
#Training Data
simple_lesk_train_df = train_data_new
simple_lesk_train_df = clean_data(simple_lesk_train_df, 'Sentence')
simple_lesk_train_df["pos_data"] = simple_lesk_train_df['cleaned_data'].swifter.apply(lambda sentence: get_pos_wordnet(sentence))
simple_lesk_train_df["pos_data"] = simple_lesk_train_df["pos_data"].swifter.apply(lambda pos_data_list: remove_stopwords(pos_data_list))


In [None]:
#Simple Lesk
simple_lesk_train_df['simple_lesk_sense_id'] = simple_lesk_train_df.swifter.apply(lambda x: simple_lesk_algo(x['Target_Word'], x['pos_data']), axis=1)
# Calculating accuracies of training data
print("Accuracy of training data for simple_lesk: " + str(calculate_accuracy(simple_lesk_train_df, "simple_lesk_sense_id"))) #47.65046452134105

# Export train results to CSV
export_to_csv(simple_lesk_train_df, r'train_results_SimpleLesk.csv')

**ORIGINAL LESK**

In [28]:
# funtion to return the lemmatized form of words from gloss and examples of the senses of context_sense passed to it (test data)
def get_lemma_from_context_sense(context_sense):
    dictionary_examples = ""
    for context_data in context_sense:
        for sense_data in context_data[2]:
            dictionary_examples += convert_sense_to_lemma(context_data[1], sense_data)
    return dictionary_examples

In [29]:
# function to return the target word, context_word.its pos tag, senses of the context word from dictionary if exists, index of context word 
def get_context_sense(target_data, pos_data, corpus=False):
    context_sense = []
    target_sense = []
    sentence = pos_data
    sentence_length = len(sentence)
    target_word = target_data.split(".")[0]
    target_pos = target_data.split(".")[1]
    for k in range(len(sentence)):
        if sentence[k][0] == "%%":
            target_index = k-1 # minus 1 bcz we will get k as the second occurrence of %%
            targetWord = sentence[target_index][0]
            break
    i = target_index-2 # index for word before %%
    j = target_index+2 # index for word after %%
    k = 0
    while((i>=0 or j<len(sentence)) and k<30):   
        # For previous words from target word in test data
        if(i>=0 and len(sentence[i][2].strip())>= 3 and sentence[i][2].strip() != target_word):
            context_word = sentence[i][2].strip()
            context_pos = sentence[i][1].strip()
            if(corpus):
                sense = get_new_senses(context_word,context_pos)
            else:
                sense = get_senses(context_word,context_pos)
            if len(sense) >= 1:
                context_sense.append([targetWord, context_word+"."+context_pos, sense, target_index-i]) 

        # For next words from target word in test data
        if(j<len(sentence) and len(sentence[j][2].strip())>= 3 and sentence[j][2].strip() != target_word):
            context_word = sentence[j][2].strip()
            context_pos = sentence[j][1].strip()
            if(corpus):
                sense = get_new_senses(context_word,context_pos)
            else:
                sense = get_senses(context_word,context_pos) 
            if len(sense) >= 1:
                context_sense.append([target_word,context_word+"."+context_pos,sense, j-target_index])     
        i = i-1
        j = j+1
        k = k+1
                
    return context_sense

In [30]:
# Orginal lesk Algorithm
def original_lesk_algo(target_word_pos, pos_without_stopwords):
    target_word_details = target_word_pos.split(".")
    # get senses from dictionary
    target_senses = get_senses(target_word_details[0].strip(), target_word_details[1].strip()) 
    score_map = {}
    bigram_score_map ={}

    # get context of the target word from test data and then get the context word's senses from dictionary and get their dictionary's gloss+examples lemmas
    context_sentence = get_lemma_from_context_sense(get_context_sense(target_word_pos, pos_without_stopwords)) 
    
    for sense in target_senses:      
        sense_examples = convert_sense_to_lemma(target_word_pos.strip(), sense)
        sense_example_words = sense_examples.split()
        context_example_words = context_sentence.split()

        #Fetching overlap between the meaning of context and the target words senses
        ovlap_words = set(sense_example_words).intersection( set(context_example_words) )
        context_score = len(ovlap_words)
        
        #Check for bigram overlaps
        bigrams_sense_examples = list(zip(*[sense_example_words[i:] for i in range(2)]))
        bigrams_context_examples = list(zip(*[context_example_words[i:] for i in range(2)]))
        bigram_ovlap = set(bigrams_sense_examples).intersection( set(bigrams_context_examples) )

        #Adding to the normal score but adding twice the value in order to give more weightage
        context_score = len(ovlap_words) + 2*len(bigram_ovlap)
        score_map[sense.get('id')] = context_score
        
    max_sense = max(score_map, key = score_map.get)
    return max_sense

In [None]:
#Validation Data
original_lesk_validation_df = simple_lesk_validation_df
original_lesk_validation_df['original_lesk_sense_id'] = original_lesk_validation_df.swifter.apply(lambda x: original_lesk_algo(x['Target_Word'], x['pos_data']), axis=1)

#Calculating accuracy
print("Accuracy of validation data for original_lesk: " + str(calculate_accuracy(original_lesk_validation_df, "original_lesk_sense_id"))) # 41.37191854233655 #41.264737406216504

# Export validation results to CSV
export_to_csv(original_lesk_validation_df, r'validation_results_OriginalLesk.csv')

Validation accuracy without consecutive overlapping score : 41.37% 

Validation accuracy with consecutive overlapping score : 41.26%

In [None]:
#Training Data
original_lesk_train_df = simple_lesk_train_df
original_lesk_train_df['original_lesk_sense_id'] = original_lesk_train_df.swifter.apply(lambda x: original_lesk_algo(x['Target_Word'], x['pos_data']), axis=1)

#Calculating accuracy
print("Accuracy of training data for original_lesk: " + str(calculate_accuracy(original_lesk_train_df, "original_lesk_sense_id"))) #40.025133521834746

**CORPUS LESK**

In [33]:
# function for making the new augemented dictionary by adding training data for corpus lesk
def new_dictionary():
    parser = objectify.makeparser(recover=True)
    tree = objectify.fromstring(''.join(open('dictionary.xml').readlines()), parser)
    train_data_new = rename_col(train_data)
    for index, row in train_data_new.iterrows():
        target_word = row['Target_Word'].strip()
        sense_id = str(row['Sense_ID'])
        sentence_to_add = row['Sentence']
        
        item = tree.xpath("//lexelt[@item='%s']" % (target_word))
        
        for item_sense in item[0].getchildren():
            if (str(item_sense.attrib['id']) == sense_id):
                item_sense.attrib['examples'] = item_sense.attrib['examples'] + sentence_to_add

    xml_new = etree.tostring(tree, pretty_print=True)
    # save your xml
    with open(r"new_dictionary.xml", "wb") as f:
        f.write(xml_new)


In [35]:
# function to get sense from new dictionary
def get_new_senses(word, pos):
    item = TreeNew.xpath("//lexelt[@item='%s.%s']" % (word, pos))    
    senses = []
    if len(item) >= 1:
        for sense in item[0].getchildren():
            senses.append(dict(zip(sense.keys(), sense.values())))
    return senses

In [36]:
# Corpus lesk algorithm
def corpus_lesk_algo(target_word, pos_data):
    print(target_word)
    target_data = target_word.split(".")
    senses = get_new_senses(target_data[0].strip(), target_data[1].strip())
    score_map = {}
    pos_sentence = []
    for pos_word in pos_data:
        pos_sentence.append(pos_word[2])
   
    for sense in senses:
        sense_score = 0
        sense_examples = convert_sense_to_lemma(target_word, sense, corpus=True)
        sense_example_words = sense_examples.split()
        ovlap_words = set(sense_example_words).intersection( set(pos_sentence) )
        score_map[sense.get('id')] = len(ovlap_words)
   
    max_sense = max(score_map, key=score_map.get)
    return max_sense

In [37]:
#create new augmented dictionary
new_dictionary()
ParserNew = objectify.makeparser(recover=True)
TreeNew = objectify.fromstring(''.join(open('new_dictionary.xml').readlines()), ParserNew)
    

In [None]:
# Validation Data
corpus_lesk_validation_df = simple_lesk_validation_df
corpus_lesk_validation_df['corpus_lesk_sense_id'] = corpus_lesk_validation_df.swifter.apply(lambda x: corpus_lesk_algo(x['Target_Word'], x['pos_data']), axis=1)

#Calculating accuracy
print("Accuracy of validation data for corpus_lesk: " + str(calculate_accuracy(corpus_lesk_validation_df, "corpus_lesk_sense_id"))) #83.38692390139335

# Export validation results to CSV
export_to_csv(corpus_lesk_validation_df, r'validation_results_CorpusLesk.csv')
    

In [None]:
# Training Data
corpus_lesk_train_df = simple_lesk_train_df
corpus_lesk_train_df['corpus_lesk_sense_id'] = corpus_lesk_train_df.swifter.apply(lambda x: corpus_lesk_algo(x['Target_Word'], x['pos_data']), axis=1)

# Calculating accuracy
print("Accuracy of training data for adv_original_lesk: " + str(calculate_accuracy(corpus_lesk_train_df, "corpus_lesk_sense_id"))) #98.69395449037296

# Export validation results to CSV
export_to_csv(corpus_lesk_train_df, r'training_data_results_CorpusLesk.csv')
    

We get the maximum accuracy with the Corpus Lesk, so we shall use the corpus lesk algorithm to predict the senses of the test data

In [None]:
# Test Data
corpus_lesk_test_df = test_data_new
corpus_lesk_test_df = clean_data(corpus_lesk_test_df, 'Sentence')
corpus_lesk_test_df['pos_data'] = corpus_lesk_test_df['cleaned_data'].swifter.apply(lambda sentence: get_pos_wordnet(sentence))
corpus_lesk_test_df['pos_data'] = corpus_lesk_test_df['pos_data'].swifter.apply(lambda pos_data_list: remove_stopwords(pos_data_list))

Pandas Apply:   0%|          | 0/3918 [00:00<?, ?it/s]

In [None]:
# Corpus lesk
corpus_lesk_test_df = corpus_lesk_test_df
corpus_lesk_test_df['corpus_lesk_sense_id'] = corpus_lesk_test_df.swifter.apply(lambda x: corpus_lesk_algo(x['Target_Word'], x['pos_data']), axis=1)

# Export validation results to CSV
export_to_csv(corpus_lesk_test_df, r'test_data_results_CorpusLesk.csv')
    