In [2]:
#Importing required libraries
import pandas as pd
import nltk
import string
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer  
from string import digits 
import numpy as np
import operator
from google.colab import files

In [None]:
!pip install swifter
import swifter

In [None]:
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

In [5]:
#Funtion to rename columns and remove the index column
def rename_cols(dataset):
    ds_new = dataset.rename(columns = {0:"Target_Word", 1:"Sense_ID", 2:"Sentence"})
    ds_new = ds_new.reset_index(drop=True)
    return ds_new


In [6]:
#Function to clean the given sentence like removing punctuation and digits and, convert to lowercase 
def clean(sent):
    to_remove = string.punctuation.replace('%', '')
    sent = sent.lower()
    sent = sent.translate(str.maketrans('', '', to_remove ))
    to_remove_digits = str.maketrans('', '', digits) 
    sent = sent.translate(to_remove_digits)
    return sent

In [7]:
#Function to return list of [pos, lemma of word] for each word in a given sentence
def get_pos_wordnet(sentence):
    lemmatizer = nltk.stem.WordNetLemmatizer()
    list_words = sentence
    return_list = []
    for i in range (len(list_words)):
        # get the POS tag of the word
        tag = nltk.pos_tag(list_words)[i][1][0].upper() 
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}
        final_tag = tag_dict.get(tag, wordnet.NOUN) # default is Noun
        lemma_word = lemmatizer.lemmatize(list_words[i],final_tag) # get lemma of the word
        return_list.append([final_tag,lemma_word]) #returns POS and lemma of word
    return return_list

In [8]:
#Fetching data from csv files
d_train = pd.read_csv (r'train.data',header=None,delimiter = "|")
d_train = d_train.reset_index()
d_test = pd.read_csv (r'test.data',header=None,delimiter = "|")
d_test = d_test.reset_index()
d_val = pd.read_csv (r'validate.data',header=None,delimiter = "|")
d_val = d_val.reset_index()

In [9]:
#rename columns of the dataframes
d_train = rename_cols(d_train)
d_test = rename_cols(d_test)
d_val = rename_cols(d_val)

In [10]:
#renaming the index column to unique ids
d_test = d_test.rename(columns = {'index':"UniqueIDs"})
d_val = d_val.rename(columns = {'index':"UniqueIDs"})

In [11]:
#remove trailing spaces from target word columns
d_train['Target_Word'] = d_train.Target_Word.str.replace(' ', '') 
d_test['Target_Word'] = d_test.Target_Word.str.replace(' ', '') 
d_val['Target_Word'] = d_val.Target_Word.str.replace(' ', '')

**Supervised WSD**

In [12]:
#set window size
#comment when computing for window size N=2
N=1
#uncomment when computing for window size N=2
#N=2

#Co-occrrence feature extraction
#Function for Feature selection based on window size specified above.
def get_features(sentence):
    
    #Cleaning the sentence
    sentence = clean(sentence)
    
    #splitting the words of the sentence
    listofwords = sentence.split()

    #Getting the POS for each word and its lemma
    listofwords= get_pos_wordnet(listofwords)
    
    #removing stop words
    stop = stopwords.words('english')
    listofwords = [x for x in listofwords if x[1] not in stop]   

    #fetching the index for first occurence of %%
    index=[x[1] for x in listofwords].index('%%')

    if index-N>0:
        listofwords = listofwords[index-N:index+N+3] 
        # will list 5 words starting from prev word, %%, target word, %% and next word for N=1
    else:
        listofwords = listofwords[:index+N+1]

    index=[x[1] for x in listofwords].index('%%')
    del listofwords[index:index+3]

    return listofwords # returns the [[pos, prev word], [pos, next word]]


In [13]:
#Function to train separate model per each target word using Prior probabilites and feature probabilities
def generate_train_model(d_train , target, smooth = False):

    #limiting the training data for words with instances more than 100
    d_train["num"] = d_train.groupby("Target_Word")["Target_Word"].cumcount()+1
    d_train = d_train.loc[d_train['num'] < 100]
    d_train = d_train

    new_df_train = d_train.loc[d_train['Target_Word'] == target]
    new_df_train = new_df_train.reset_index()
    
    # to get feature vectors in the form [[pos, prev word], [pos, next word]] and each feature vectors count
    new_df_train['FeatureVec'] = new_df_train.swifter.apply(lambda x : get_features(x['Sentence']),axis=1) 
    #new_df_train['CountofFV']  = new_df_train.swifter.apply(lambda x : len(x['FeatureVec']),axis=1) 

    #Calculate Prior Probabilities
    d_senses = pd.DataFrame(new_df_train.Sense_ID.value_counts())
    # each sense id count/total count of all senses for that target word   
    d_senses['PriorProb']= d_senses['Sense_ID']/sum(d_senses['Sense_ID'])     
    list1 = list(d_senses.index)
    list2 = list(d_senses.PriorProb)
    priorprob_dict = dict(zip(list1,list2)) # sense id and prior prob
    
    #number of times the target words appears in a particular sense
    prob_word = pd.DataFrame(new_df_train.groupby(by = 'Sense_ID').sum())
    prob_word = prob_word.reset_index()
    list1 = list(prob_word.Sense_ID)
    list2 = list(prob_word.index)
    count_target_words = dict(zip(list1,list2)) # sense id and target word count
    
    #Calculating the occurrences of feature words within a context and returning format pos-word-sense_id:count
    train_feature_corpus = new_df_train['FeatureVec'].tolist()
    senseid_features_list = list(zip(new_df_train['Sense_ID'].tolist(),train_feature_corpus))
    feature_occ_incontext = {}
    for temp in senseid_features_list:
        for j in temp[1]:
            key = j[0]+'-'+j[1]+'-'+str(temp[0])
            if key not in feature_occ_incontext:
                feature_occ_incontext[key] = 1
            else:
                feature_occ_incontext[key] = feature_occ_incontext[key]+1
    
    #Creatimg a dataframe with senseid, word, numerator and denominator for individual feature probability
    feature_prob_df = pd.DataFrame(columns=['Sense_ID','Word','Numerator','Denominator']) 
    for k in feature_occ_incontext.keys():
        sense_id = k.split('-')[-1]
        word = k.split('-')[:2]
        word = "-".join(word)

        to_append = [sense_id, word, feature_occ_incontext[k], count_target_words[int(sense_id)]]
        i = len(feature_prob_df)
        feature_prob_df.loc[i] = to_append
    
    #Calculating the feature probability
    if smooth:
      #Getting the vocabulary of words
      V = len(feature_occ_incontext.keys())
      #uncomment this line for lamda =1
      #lamda = 1
      #comment this line for lamda =1
      lamda = 0.001
      #lamda = 0.01
      feature_prob_df['Probability'] = (feature_prob_df['Numerator']+lamda) / (feature_prob_df['Denominator']+(V*lamda))
    else:
      feature_prob_df['Probability'] = (feature_prob_df['Numerator']) / (feature_prob_df['Denominator'])

    feature_prob_df['Word_Sense_ID'] = feature_prob_df['Word']+ '-' +feature_prob_df['Sense_ID']
    training_dict = dict(zip(feature_prob_df['Word_Sense_ID'], feature_prob_df['Probability']))
    return training_dict, priorprob_dict



In [14]:
#uncomment below when combine training and validation, to run the model on test dataset
#d_train = d_train.append(d_val)
#d_train = d_train.reset_index()
#d_train = d_train.rename(columns = {'index':"UniqueIDs"})

#uncomment below line when using the validation data for testing
d_test = d_val

target_words_unique = d_test['Target_Word'].value_counts()
target_words_unique = pd.DataFrame(target_words_unique)
target_words_unique = target_words_unique.reset_index()
target_words_unique.columns=['Words','Count']

In [15]:
target_words = target_words_unique.Words.tolist()
predictions = pd.DataFrame(columns=['PredictedSenseId','Sentence','TargetWord','UniqueIDs', 'ActualSenseId'])

In [None]:
counter=0
for target_word in target_words:
    print("Processed ", counter,"Target words out of ", len(target_words))    
    counter = counter+1

    #Fetching the test instances for the target word from test data
    test_inst = d_test.loc[d_test['Target_Word'] == target_word]
    test_inst = test_inst.reset_index()

    #Generate the model for every target word, along with its prior probability with add-lambda smoothing
    training_model , priorprob_dict = generate_train_model(d_train, target_word, smooth = True)

    #uncomment when we want to generate the model for every target word with no Smoothing
    #training_model , priorprob_dict = generate_train_model(df_train, target_word) # throws division by zero error

    #Fetching test instances unique ids and actual sense ids
    ids = test_inst['UniqueIDs'].tolist()
    actual_sense_id = test_inst['Sense_ID'].tolist()

    j=0
    for sent in test_inst['Sentence']:
        #get the feature vectors for each test sentences
        eachsent = sent
        eachsent_fv = get_features(eachsent) 
        
        results = {}
        #Fetching unique sense ids from training data
        sense_ids = priorprob_dict.keys() 

        for sense_id in sense_ids:
            prob_vals = []
            listoffv = []
            for fv in eachsent_fv:
                test_fv = fv[0]+'-'+fv[1]+'-'+str(sense_id) #pos-word-senseid from test sentence
                if test_fv in training_model:
                    prob_vals.append(training_model[test_fv]) #Individual probabilities for each features
                    listoffv.append(test_fv) #Word_Sense_id

            #Calculating probability for each sense for a target word
            results[sense_id] = priorprob_dict[sense_id] * np.prod(prob_vals)

        #Get the max probability sense id
        max_val_senseid = max(results.items(), key=operator.itemgetter(1))[0]
        to_append = [max_val_senseid, sent, target_word, ids[j], actual_sense_id[j]]
        j = j+1
        length = len(predictions)
        predictions.loc[length] = to_append


Without smoothing the process throws a division by zero error.

**For lambda = 1 & N=1**

Commented the lambda = 0.01 and made it to lambda = 1, for validation data with N=1 as window size

In [None]:
# Checking the accuracy
val_predictions_l1_N1 = predictions

val_predictions_l1_N1['Acc'] = val_predictions_l1_N1.ActualSenseId == val_predictions_l1_N1.PredictedSenseId
accuracy = val_predictions_l1_N1['Acc'].value_counts() / len(val_predictions_l1_N1['Acc'])
print("Validation Data accuracy is %s ",str(accuracy[1])) #53.59%

Validation Data accuracy is %s  0.5359056806002144


In [None]:
#Extracting data into pickle files
val_predictions_l1_N1.to_pickle('Final_Predictions_val_lambda1.pkl')
files.download('Final_Predictions_val_lambda1.pkl')

In [None]:
#Extracting into csv files
val_predictions_l1_N1.to_csv(r'Validation_predictions_lambda_1.csv')


**For lambda = 0.001 & N=1**

Predictons when lambda = 0.001 and N window for feature vector selection is 1

In [20]:
val_predictions_l0001_N1 = predictions

#Checking the accuracy 
val_predictions_l0001_N1['Acc'] = val_predictions_l0001_N1.ActualSenseId == val_predictions_l0001_N1.PredictedSenseId
accuracy = val_predictions_l0001_N1['Acc'].value_counts()/len(val_predictions_l0001_N1['Acc'])
print("Validation Data accuracy is %s ",str(accuracy[1])) # 81.78%

Validation Data accuracy is %s  0.8177920685959271


In [21]:
#Extracting data into pickle files
val_predictions_l0001_N1.to_pickle('Final_Predictions_val_lambda0001.pkl')
files.download('Final_Predictions_val_lambda0001.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
#Extracting data into csv
val_predictions_l0001_N1.to_csv(r'Validation_predictions_lambda_l0001_N1.csv')

**For lambda = 0.01 & N=1**

Predictons when lambda = 0.01 and N window for feature vector selection is 1

In [None]:
val_predictions_l001_N1 = predictions

#Checking the accuracy 
val_predictions_l001_N1['Acc'] = val_predictions_l001_N1.ActualSenseId == val_predictions_l001_N1.PredictedSenseId
accuracy = val_predictions_l001_N1['Acc'].value_counts()/len(val_predictions_l001_N1['Acc'])
print("Validation Data accuracy is %s ",str(accuracy[1])) # 80.70%

Validation Data accuracy is %s  0.8070739549839229


In [None]:
#Extracting data into pickle files
val_predictions_l001_N1.to_pickle('Final_Predictions_val_lambda001.pkl')
files.download('Final_Predictions_val_lambda001.pkl')

In [None]:
#Extracting data into csv
val_predictions_l001_N1.to_csv(r'Validation_predictions_lambda_001.csv')

**For lambda= 0.01 and N=2**

Lambda = 0.01 gives us better accuracy of 81% than lambda = 1 which gives an accuracy of 54% 

Before moving to test data, we will check if N =2 as window size for feature selection improves the accuracy or not. Commented N=1 in this case.

In [None]:
val_predictions_l001_N2 = predictions

#Checking the accuracy 
val_predictions_l001_N2['Acc']= val_predictions_l001_N2.ActualSenseId == val_predictions_l001_N2.PredictedSenseId
accuracy = val_predictions_l001_N2['Acc'].value_counts()/len(val_predictions_l001_N2['Acc'])
print("Validation Data accuracy is %s ",str(accuracy[1])) # 74.17%

Validation Data accuracy is %s  0.7416934619506966


In [None]:
#extracting into a pickle file
val_predictions_l001_N2.to_pickle('Final_Predictions_Val_lambda001_N2.pkl')
files.download('Final_Predictions_Val_lambda001_N2.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Extracting into a csv file
val_predictions_l001_N2.to_csv(r'Final_Predictions_Val_lambda001_N2.csv')


**For lambda=0.001 and N=1 for test data predictions**

So we will be using lambda = 0.001 with N=1 for predicting senses for test data. We will also be using the train+validation data as training data in this case.

In [17]:
test_predictions_l0001_N1 = predictions

In [19]:
#extract data into pickle file
test_predictions_l0001_N1.to_pickle('Final_Predictions_test.pkl')
files.download('Final_Predictions_test.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
#Extract into a csv file
test_predictions_l0001_N1.to_csv(r'Final_Predictions_test.csv')

**BASELINE MODEL**

In [None]:
#For computing baseline accuracy, we use the Validation data
d_baseline = d_val

In [None]:
d_baseline

Unnamed: 0,UniqueIDs,Target_Word,Sense_ID,Sentence
0,0,capital.n,1,"The firm 's capital , moreover , has n't grow..."
1,1,capital.n,1,This observation leads us to another piece of...
2,2,capital.n,1,`` It 's a problem that clearly has to be res...
3,3,capital.n,1,Drexel this year eliminated its retail or ind...
4,4,capital.n,1,Municipals Rebounding stocks and weaker Treas...
...,...,...,...,...
928,928,keep.v,1,"Mr. Kaye says he has paid more than $ 70,000 ..."
929,929,keep.v,4,"Although the report , which was released befo..."
930,930,keep.v,1,Fed Chairman Greenspan was surprised by both ...
931,931,maintain.v,1,"Overall , though , the South and West still o..."


In [None]:
#retrieve list of unique target words
target_words_unique = d_baseline['Target_Word'].value_counts()
target_words_unique = pd.DataFrame(target_words_unique)
target_words_unique = target_words_unique.reset_index()
target_words_unique.columns = ['Words','Count']
targetwordlist = target_words_unique.Words.tolist()

In [None]:
target_words_unique

In [None]:
### BASELINE MODEL ###
#creating a table to store predictions
predictions = pd.DataFrame(columns=['PredictedSenseId','Sentence','ActualSenseId','UniqueIDs'])
counter=0

# Baseline Model for each target word  --> Most frequent sense with highest P(s) => Random Guess
for target_word in targetwordlist:
    print("Processed ", counter,"Target words out of ", len(targetwordlist))    
    counter = counter+1

    #Fetching test instances from df_baseline for that target word            
    test_inst = d_baseline.loc[d_baseline['Target_Word'] == target_word]
    test_inst = test_inst.reset_index()
                    
    ids = test_inst['UniqueIDs'].tolist()
    actual_senses = test_inst['Sense_ID'].tolist()
                    
    #Fetching train instances from df_train for that target word
    train_inst = d_train.loc[d_train['Target_Word'] == target_word]

    #Fetching the different sense ids for this targte word from training data
    senses = pd.DataFrame(train_inst.Sense_ID.value_counts())
    senses = senses.reset_index()

    #assign the most frequent sense_id as a prediction of the target word
    pred = senses[senses.Sense_ID == senses.Sense_ID.max()]['index'].tolist()[0]  

    i=0
    for sent in test_inst['Sentence']:
        to_append = [pred, sent,actual_senses[i],ids[i]]
        i=i+1
        length = len(predictions)
        predictions.loc[length] = to_append



In [None]:
baseline_predictions = predictions
baseline_predictions

Unnamed: 0,PredictedSenseId,Sentence,ActualSenseId,UniqueIDs
0,1,Avions Marcel Dassault-Breguet Aviation S.A. ...,1,377
1,1,"Roger Rosenblatt , editor of U.S. News & Worl...",1,378
2,1,Asked to compare her visit to Mr. Mosbacher '...,1,379
3,1,"Pension funds , insurers and other behemoths ...",1,380
4,1,Freeport-McMoRan Inc. said a temporary cessat...,2,381
...,...,...,...,...
928,1,This may be true whether the farm is owned or...,2,288
929,1,"But a lawyer for Triland Investment Group , t...",1,112
930,1,`` I was buying at the close ( Friday ) and I...,1,516
931,1,The location was disclosed as the U.S. began ...,1,572


In [None]:
#Add a new column for measuring Accuracy
baseline_predictions['Accuracy'] = baseline_predictions.PredictedSenseId == baseline_predictions.ActualSenseId
acc = baseline_predictions['Accuracy'].value_counts()/len(baseline_predictions['Accuracy'])
print("Baseline accuracy is %s ",str(acc[1]))

Baseline accuracy is %s  0.8113612004287245


*Accuracy of Baseline Model = 81.13%*