# EECS 498 - Assignment 4 - Word Sense Disambiguation
### By: Alexander "AJ" Goldstein - uniquename: ajva

In [49]:
import csv
import sys
import pandas as pd
import scipy
import numpy as np
import math
from __future__ import division
import random
from random import shuffle
import matplotlib.pyplot as plt
import operator

## Pre-processing Functions:

### 1) establish possible senses for current word disambiguation

In [50]:
def establish_senses(dataFile):
    
    senses_dict = {}
    senses_features_dict = {}
    sense_counts = {}
    total_instances = 0
    total_senses = 0
    
    with open(dataFile) as data:
        
        # reset read file pointer
        data.seek(0)
        
        # for each line in the file...
        for line in data:
            
            # if it's the start of a new instance...
            if line.find("<instance") != -1:
                total_instances += 1
                sense = next(data).split(" ")[2].split('%')[1].strip("\"/>\n")
                
                # if it's a new sense...
                if sense not in senses_dict:
                    total_senses += 1
                    senses_dict[sense] = 1.0
                    senses_features_dict[sense] = {}
                    sense_counts[sense] = 1
                    
                # otherwise just index the sense count
                else:
                    sense_counts[sense] += 1
        
    return senses_dict, senses_features_dict, total_instances, total_senses, sense_counts

### 2) parse the training folds for data counts

In [51]:
def parse_datafile(dataFile, senses_dict, senses_features_dict, fold, numFolds = 5):
    
    stop_words_list = ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'ca', 'could', 'did', 'do', 'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'have', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'if', 'in', 'inc', 'indeed', 'into', 'is', 'it', 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'just', 'made', 'make', 'many', 'may', 'me', 'meanwhile', 'might', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'quite', 'rather', 're', 'really', 'regarding', 'same', 'say', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'under', 'until', 'up', 'unless', 'upon', 'us', 'used', 'using', 'various', 'very', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion', 'gajillion', 'bazillion', 'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eigth', 'ninth', 'tenth', 'eleventh', 'twelveth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'sventeenth', 'eighteenth', 'nineteenth', 'twentieth', '', 'thirtieth', 'fortieth', 'fiftieth', 'sixtieth', 'seventieth', 'eightieth', 'ninetieth', 'hundreth', 'thousandth', 'millionth', 'billionth', 'trillionth', 'quadrillionth', 'gajillionth', 'bazillionth']
    stripList = ['.', '(', ')', ',', '-', '!', '?']
    
    with open(dataFile) as data:
        train_count = 0
        instance_count = 0
        
        # reset read file pointer
        data.seek(0)
        
        # for each line in the file...
        for line in data:

            # if it's the start of a new instance...
            if line.find("<instance") != -1:
                instance_count += 1
                ID = line.split(" ")[1].split("=")[1].strip('\"')
                sense = next(data).split(" ")[2].split('%')[1].strip("\"/>\n")
                
                # if you're on a proper TRAINING line
                if instance_count%numFolds != fold:
                    train_count += 1
                    startContext = next(data)
                    
                    # capture the message (and it's features)
                    features = next(data).strip('. \n').split(" ")
                    for word in features:
                        
                        # strip excess puntuation
                        for stripItem in stripList:
                            word = word.strip(stripItem)
                        word = word.lower().strip()
                        
                        # store all new words as "present" features for each sense
                        if word.find("<head>") == -1 and word not in stop_words_list:
                            for sense in senses_dict:
                                if word not in senses_features_dict[sense]:
                                    senses_features_dict[sense][word] = 1.0
                                    
    return senses_dict, senses_features_dict, instance_count, train_count

### 3) make predictions on test fold instances

In [295]:
def predict_instances(dataFile, outputFile, senses_dict, senses_features_dict, sense_counts, fold, numFolds = 5):
    
    stop_words_list = ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'ca', 'could', 'did', 'do', 'does', 'doing', 'done', 'down', 'due', 'during', 'each', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'has', 'have', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'if', 'in', 'inc', 'indeed', 'into', 'is', 'it', 'its', 'itself', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'just', 'made', 'make', 'many', 'may', 'me', 'meanwhile', 'might', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'quite', 'rather', 're', 'really', 'regarding', 'same', 'say', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'take', 'ten', 'than', 'that', 'the', 'their', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'under', 'until', 'up', 'unless', 'upon', 'us', 'used', 'using', 'various', 'very', 'very', 'via', 'was', 'we', 'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves', 'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 'ten', 'eleven', 'twelve', 'thirteen', 'fourteen', 'fifteen', 'sixteen', 'seventeen', 'eighteen', 'nineteen', 'twenty', 'thirty', 'forty', 'fifty', 'sixty', 'seventy', 'eighty', 'ninety', 'hundred', 'thousand', 'million', 'billion', 'trillion', 'quadrillion', 'gajillion', 'bazillion', 'first', 'second', 'third', 'fourth', 'fifth', 'sixth', 'seventh', 'eigth', 'ninth', 'tenth', 'eleventh', 'twelveth', 'thirteenth', 'fourteenth', 'fifteenth', 'sixteenth', 'sventeenth', 'eighteenth', 'nineteenth', 'twentieth', '', 'thirtieth', 'fortieth', 'fiftieth', 'sixtieth', 'seventieth', 'eightieth', 'ninetieth', 'hundreth', 'thousandth', 'millionth', 'billionth', 'trillionth', 'quadrillionth', 'gajillionth', 'bazillionth']
    stripList = ['.', '(', ')', ',', '-', '!', '?']
    
    # start new fold print
    outputFile.write('Fold ' + str(fold+1) + '\n')
    
    with open(dataFile) as data:
        test_count = 0
        correct_count = 0
        instance_count = 0
        
        # reset read file pointer
        data.seek(0)
        
        # for each line in the file...
        for line in data:

            # if it's the start of a new instance...
            if line.find("<instance") > -1:
                instance_count += 1
                ID = line.split(" ")[1].split("=")[1].strip('\"')
                true_sense = next(data).split(" ")[2].split('%')[1].strip("\"/>\n")
                
                # if you're on a proper TESTING line
                if instance_count%numFolds == fold:
                    test_count += 1
                    startContext = next(data)
                    
                    # capture the message (and it's features)
                    features = next(data).strip('. \n').split(" ")
                    for word in features:
                        
                        # strip excess puntuation
                        for stripItem in stripList:
                            word = word.strip(stripItem)
                        word = word.lower().strip()
                        
                        # store all new TEST words as "absent" features for each sense
                        if word.find("<head>") == -1 and word not in stop_words_list:
                            for sense in senses_dict:
                                if word not in senses_features_dict[sense]:
                                    senses_features_dict[sense][word] = 0.0
                    
                    # conduct add-one smoothing
                    for sense in senses_dict:
                        for word in senses_features_dict[sense]:
                            senses_features_dict[sense][word] += 0.01
                    
                    # calculate the argmax (probabilities for each sense)
                    pred_probs = {}
                    for sense in senses_dict:
                        pred_probs[sense] = 1.0
                        
                        # factor in conditional probability for each word
                        for word in features:
                            
                            # strip excess puntuation
                            for stripItem in stripList:
                                word = word.strip(stripItem)
                            word = word.lower().strip()
                            
                            # calculate & factor-in feature probabilities
                            if word.find("<head>") == -1 and word not in stop_words_list:
                                feat_prob = senses_features_dict[sense][word]/sense_counts[sense]
                                pred_probs[sense] = pred_probs[sense] * feat_prob
                        
                        # calculate & factor-in sense probability
                        sense_prob = sense_counts[sense]/sum(sense_counts.values())
                        pred_probs[sense] = pred_probs[sense]*sense_prob
                        #print(sense, pred_probs[sense])
                    
                    # identify the sense with the highest probability
                    pred_sense = max(pred_probs.items(), key=operator.itemgetter(1))[0]
                    sense_probs = [count / sum(sense_counts.values()) for count in sense_counts.values()]
                    pred_sense = np.random.choice(sense_counts.keys(), p=sense_probs)
                    
                    # check if prediction is correct
                    if pred_sense == true_sense:
                        correct_count += 1

                    # output prediction
                    outputFile.write(ID + ' ' + ID.split('.')[0] + '%' + pred_sense + '\n')
                    
                # otherwise, if not a test instance    
                else:
                    next(data)
                    next(data)
                
                context_end = next(data)
                instance_close_line = next(data)
                empty_line = next(data)
    
    accuracy = float(correct_count/test_count)
    print('Fold ' + str(fold+1) + ' accuracy: ' + str(accuracy))
    outputFile.write('...'+'\n')
    
    return accuracy

## MAIN FUNCTION :

### 1) read in filename

In [296]:
# SCRIPT NOTE: switch out arguments

# read in file
dataFile = "WSD/tank.wsd"
#dataFile = sys.argv[1]

### 2) create output file

In [297]:
# create output file: <word>.wsd.out
outputName = dataFile + ".out"
outputFile = open(outputName, "w")

### 3) parse & predict for all 5 CV folds

In [298]:
accuracy_list = []
for fold in [0,1,2,3,4]:
    
    # establish senses
    senses, features, tot_inst, tot_sens, sense_counts = establish_senses(dataFile)
    
    # parse the training folds for counts needed
    senses, features, inst_count, train_count = parse_datafile(dataFile, senses, features, fold)
    
    # make predictions on test fold instances
    accuracy = predict_instances(dataFile, outputFile, senses, features, sense_counts, fold)
    accuracy_list.append(accuracy)

avg_accuracy = sum(accuracy_list)/len(accuracy_list)
print('Average Accuracy: ' + str(avg_accuracy))

outputFile.close()

Fold 1 accuracy: 0.375
Fold 2 accuracy: 0.512195121951
Fold 3 accuracy: 0.475
Fold 4 accuracy: 0.575
Fold 5 accuracy: 0.55
Average Accuracy: 0.49743902439
