In [253]:
import numpy as np
from itertools import groupby
import pandas as pd
import math

UNKNOWN_TOKEN = "#UNK#"

filepath = "ML Project/EN/train"
observationSequence = ['RT','RT', '@shaunproulx', ':', 'Encore', '!', '@bifnaked', 'for', 'the', 'whole', '@ShaunProulxShow', 'tonight', 'Intimate', 'convo', 'with', 'rock', 'star', '/', 'cancer', 'survivor', '-', '10:30', 'E', '...']
tagSequence = ['O','O','O','O','B-INTJ','O','O','B-PP','B-NP','I-NP','O','B-NP','O','B-NP','I-NP','B-PP','B-NP','I-NP','I-NP','I-NP','I-NP','I-NP','O','B-NP','I-NP','O']

In [254]:
def prepare_data(file):
    """
    Prepare the file, and returns a list of lists of "{observation} {label}"
    file : the name of the file to read
    """

    lines = [line for line in file]
    chunks = (list(g) for k, g in groupby(lines, key=lambda x: x != '\n') if k)
    return [[observation.rstrip('\n') for observation in chunk] for chunk in chunks]

file = open("ML Project/EN/train")
sequence = prepare_data(file)
# print (sequence)

[['RT O', '@shaunproulx O', ': O', 'Encore B-INTJ', '! O', '@bifnaked O', 'for B-PP', 'the B-NP', 'whole I-NP', '@ShaunProulxShow O', 'tonight B-NP', '! O', 'Intimate B-NP', 'convo I-NP', 'with B-PP', 'the B-NP', 'rock I-NP', 'star I-NP', '/ I-NP', 'cancer I-NP', 'survivor I-NP', '- O', '10:30 B-NP', 'E I-NP', '... O'], ['Pick B-VP', 'up B-PRT', 'here B-NP', '. O', 'Deliver B-VP', 'on B-PP', 'Dayton B-NP', 'NJ I-NP', 'tomorrow B-NP', '(@ B-PP', 'Ball B-NP', 'Metal I-NP', 'Container I-NP', ') O'], ['Cant B-VP', 'wait I-VP', 'for B-PP', 'the B-NP', 'ravens I-NP', 'game I-NP', 'tomorrow B-NP', '.... O', 'go B-VP', 'ray B-NP', 'rice I-NP', '!!!!!!! O'], ['@CELLY32 O', 'I B-NP', 'went B-VP', 'to B-PP', 'kolb B-NP', 'for B-PP', 'a B-NP', 'week I-NP', 'lol B-INTJ', 'that B-NP', 'shit I-NP', 'was B-VP', 'trash B-ADJP', 'lmao B-INTJ'], ['the B-NP', 'day I-NP', 'you B-NP', 'go B-VP', 'shirtless B-ADJP', 'in B-PP', 'school B-NP', 'is B-VP', 'the B-NP', 'day I-NP', 'I B-NP', 'give B-VP', 'birth B-

In [255]:
def get_observations(sequence,k=1):
    observationsDict = {}
    sequence = (item for sublist in sequence for item in sublist)
    for item in sequence:
        observation = item.rsplit(" ", 1)[0]
        if observation not in observationsDict:  
             observationsDict[observation] = 1
        else:  
             observationsDict[observation] += 1
        
    observationsList = list(observationsDict)
    for observation in observationsDict:
        if observationsDict[observation] < k:
            observationsList.remove(observation)
    
    observationsList.append(UNKNOWN_TOKEN)
    
    return observationsList,observationsDict
    
observationsList,observationsDict = get_observations(sequence)
# print (observationsList)

['RT', '@shaunproulx', ':', 'Encore', '!', '@bifnaked', 'for', 'the', 'whole', '@ShaunProulxShow', 'tonight', 'Intimate', 'convo', 'with', 'rock', 'star', '/', 'cancer', 'survivor', '-', '10:30', 'E', '...', 'Pick', 'up', 'here', '.', 'Deliver', 'on', 'Dayton', 'NJ', 'tomorrow', '(@', 'Ball', 'Metal', 'Container', ')', 'Cant', 'wait', 'ravens', 'game', '....', 'go', 'ray', 'rice', '!!!!!!!', '@CELLY32', 'I', 'went', 'to', 'kolb', 'a', 'week', 'lol', 'that', 'shit', 'was', 'trash', 'lmao', 'day', 'you', 'shirtless', 'in', 'school', 'is', 'give', 'birth', 'pig', '@MyssLidia', 'If', 'u', 'call', 'someone', '5x', "'s", 'and', 'they', 'do', "n't", 'answer', ',', 'not', 'get', 'hint', 'MAYBE', 'just', 'wanna', 'talk', '!!!!!', '@PhilHiPhy', 'when', 'r', 'an', 'upgrade', '?:', 'P', 'miss', 'being', 'able', 'run', 'dance', 'around', 'all', 'instead', 'of', 'having', 'catch', 'my', 'breath', 'every', 'five', 'minutes', 'life', '@BeeeestDJ', 'it', 'seems', 'like', 'i', 'gotta', 'ignore', 'blame'

In [256]:
def get_tags(sequence):
    tagsList = []
    sequence = (item for sublist in sequence for item in sublist)
    for item in sequence:
        tag = item.rsplit(" ", 1)[1]
        if tag not in tagsList: 
            tagsList.append(tag)
            
    return tagsList

tagsList = get_tags(sequence)
# print (tagsList)

['O', 'B-INTJ', 'B-PP', 'B-NP', 'I-NP', 'B-VP', 'B-PRT', 'I-VP', 'B-ADJP', 'B-SBAR', 'B-ADVP', 'I-INTJ', 'B-CONJP', 'I-CONJP', 'I-ADVP', 'I-ADJP', 'I-SBAR', 'I-PP']


In [249]:
def initialize_emission_weights(observationsList,tagsList):
    emissionWeights = {}
    for tag in tagsList:
        emissionWeights[tag] = {}
        for observation in observationsList:
            emissionWeights[tag][observation] = [0.0,0.0]
            
    return emissionWeights

def initialize_transition_weights(tagsList):
    transitionWeights = {}
    from_Y_List = ['START'] + list(tagsList)
    to_Y_List = list(tagsList) + ['STOP']
    for from_Y_Tag in from_Y_List:
        transitionWeights[from_Y_Tag] = {}
        for to_Y_Tag in to_Y_List:
            transitionWeights[from_Y_Tag][to_Y_Tag] = [0.0,0.0]
            
    return transitionWeights

emissionWeights = initialize_emission_weights(observationsList,tagsList)
transitionWeights = initialize_transition_weights(tagsList)
print (emissionWeights)

{'O': {'RT': [0.0, 0.0], '@shaunproulx': [0.0, 0.0], ':': [0.0, 0.0], 'Encore': [0.0, 0.0], '!': [0.0, 0.0], '@bifnaked': [0.0, 0.0], 'for': [0.0, 0.0], 'the': [0.0, 0.0], 'whole': [0.0, 0.0], '@ShaunProulxShow': [0.0, 0.0], 'tonight': [0.0, 0.0], 'Intimate': [0.0, 0.0], 'convo': [0.0, 0.0], 'with': [0.0, 0.0], 'rock': [0.0, 0.0], 'star': [0.0, 0.0], '/': [0.0, 0.0], 'cancer': [0.0, 0.0], 'survivor': [0.0, 0.0], '-': [0.0, 0.0], '10:30': [0.0, 0.0], 'E': [0.0, 0.0], '...': [0.0, 0.0], 'Pick': [0.0, 0.0], 'up': [0.0, 0.0], 'here': [0.0, 0.0], '.': [0.0, 0.0], 'Deliver': [0.0, 0.0], 'on': [0.0, 0.0], 'Dayton': [0.0, 0.0], 'NJ': [0.0, 0.0], 'tomorrow': [0.0, 0.0], '(@': [0.0, 0.0], 'Ball': [0.0, 0.0], 'Metal': [0.0, 0.0], 'Container': [0.0, 0.0], ')': [0.0, 0.0], 'Cant': [0.0, 0.0], 'wait': [0.0, 0.0], 'ravens': [0.0, 0.0], 'game': [0.0, 0.0], '....': [0.0, 0.0], 'go': [0.0, 0.0], 'ray': [0.0, 0.0], 'rice': [0.0, 0.0], '!!!!!!!': [0.0, 0.0], '@CELLY32': [0.0, 0.0], 'I': [0.0, 0.0], 'went'

In [250]:
print (transitionWeights)

{'START': {'O': [0.0, 0.0], 'B-INTJ': [0.0, 0.0], 'B-PP': [0.0, 0.0], 'B-NP': [0.0, 0.0], 'I-NP': [0.0, 0.0], 'B-VP': [0.0, 0.0], 'B-PRT': [0.0, 0.0], 'I-VP': [0.0, 0.0], 'B-ADJP': [0.0, 0.0], 'B-SBAR': [0.0, 0.0], 'B-ADVP': [0.0, 0.0], 'I-INTJ': [0.0, 0.0], 'B-CONJP': [0.0, 0.0], 'I-CONJP': [0.0, 0.0], 'I-ADVP': [0.0, 0.0], 'I-ADJP': [0.0, 0.0], 'I-SBAR': [0.0, 0.0], 'I-PP': [0.0, 0.0], 'STOP': [0.0, 0.0]}, 'O': {'O': [0.0, 0.0], 'B-INTJ': [0.0, 0.0], 'B-PP': [0.0, 0.0], 'B-NP': [0.0, 0.0], 'I-NP': [0.0, 0.0], 'B-VP': [0.0, 0.0], 'B-PRT': [0.0, 0.0], 'I-VP': [0.0, 0.0], 'B-ADJP': [0.0, 0.0], 'B-SBAR': [0.0, 0.0], 'B-ADVP': [0.0, 0.0], 'I-INTJ': [0.0, 0.0], 'B-CONJP': [0.0, 0.0], 'I-CONJP': [0.0, 0.0], 'I-ADVP': [0.0, 0.0], 'I-ADJP': [0.0, 0.0], 'I-SBAR': [0.0, 0.0], 'I-PP': [0.0, 0.0], 'STOP': [0.0, 0.0]}, 'B-INTJ': {'O': [0.0, 0.0], 'B-INTJ': [0.0, 0.0], 'B-PP': [0.0, 0.0], 'B-NP': [0.0, 0.0], 'I-NP': [0.0, 0.0], 'B-VP': [0.0, 0.0], 'B-PRT': [0.0, 0.0], 'I-VP': [0.0, 0.0], 'B-ADJP': 

In [251]:
def get_transition_parameters(tagsList,sequence):
    from_Y_List = ['START'] + tagsList
    to_Y_List = tagsList + ['STOP']
    length = len(from_Y_List)
    transitionDF = pd.DataFrame(np.zeros((length,length)),index = from_Y_List,columns = to_Y_List)
    
    sentenceVector = []
    for sentence in sequence:
        observationVector = []
        tagVector = []
        for word in sentence:
            observation = word.rsplit(" ", 1)[0]
            tag = word.rsplit(" ", 1)[1]
            observationVector.append(observation)
            tagVector.append(tag)
        sentenceVector.append([observationVector,tagVector])
        
    for sentence in sentenceVector:
        observationVector,tagVector = sentence
        length = len(observationVector)
        for i in range(length+1):
            if i == 0 :
                transitionDF.loc['START',tagVector[0]] += 1

            elif i == length:
                transitionDF.loc[tagVector[i-1],'STOP'] +=1

            else:
                transitionDF.loc[tagVector[i-1],tagVector[i]] += 1
    
    for i in range(len(transitionDF.index)):
        transitionDF.iloc[i,:] /= transitionDF.iloc[i,:].sum()
            
    return transitionDF

transitionParameters = get_transition_parameters(tagsList,sequence)
print (transitionParameters)

                O    B-INTJ      B-PP      B-NP      I-NP      B-VP     B-PRT  \
START    0.421053  0.054446  0.014519  0.346642  0.000000  0.110708  0.000000   
O        0.266089  0.062072  0.019626  0.300776  0.000913  0.104062  0.000456   
B-INTJ   0.287938  0.038911  0.019455  0.124514  0.000000  0.038911  0.000000   
B-PP     0.062849  0.002793  0.005587  0.870112  0.000000  0.033520  0.000000   
B-NP     0.156431  0.004697  0.052746  0.053829  0.429913  0.244581  0.004335   
I-NP     0.247374  0.016282  0.123950  0.076681  0.371849  0.107143  0.002101   
B-VP     0.059533  0.004521  0.089676  0.378297  0.002261  0.008289  0.032404   
B-PRT    0.223529  0.035294  0.270588  0.352941  0.000000  0.035294  0.000000   
I-VP     0.079077  0.006590  0.120264  0.332784  0.000000  0.008237  0.041186   
B-ADJP   0.245283  0.037736  0.182390  0.075472  0.006289  0.050314  0.000000   
B-SBAR   0.070423  0.000000  0.000000  0.859155  0.000000  0.000000  0.000000   
B-ADVP   0.217617  0.020725 

In [252]:
def get_emission_counts(sequence,observationsList,tagsList):
    sentenceVector = []
    for sentence in sequence:
        observationVector = []
        tagVector = []
        for word in sentence:
            observation = word.rsplit(" ", 1)[0]
            tag = word.rsplit(" ", 1)[1]
            observationVector.append(observation)
            tagVector.append(tag)
        sentenceVector.append([observationVector,tagVector])

    observationsLength = len(observationsList)
    tagsLength = len(tagsList)
    emissionDF = pd.DataFrame(np.zeros((observationsLength,tagsLength)),index = observationsList,columns = tagsList)
    emissionCount = pd.Series(np.zeros(tagsLength),index = tagsList)

    for sentence in sentenceVector:
        observationVector,tagVector = sentence
        for i in range(len(observationVector)):
            observation,tag = observationVector[i],tagVector[i]
            emissionDF.loc[observation,tag] += 1
            emissionCount[tag] += 1
        
    return emissionDF,emissionCount

def get_smooth_emission_counts(sequence,observationsList,tagsList,k=1):
    emissionDF,emissionCount = get_emission_counts(sequence,observationsList,tagsList)
    
    observationCount = emissionDF.sum(axis=1)
    fail = observationCount[observationCount < k]

    unknown = emissionDF.loc[fail.index].sum(axis=0)
    unknown.name = UNKNOWN_TOKEN
   
    smoothEmissionDF = emissionDF
    smoothEmissionDF = smoothEmissionDF.drop(fail.index, axis=0) 
    smoothEmissionDF.loc[UNKNOWN_TOKEN] = unknown    
    
    emissionParameters = smoothEmissionDF/emissionCount
    return emissionParameters

emissionParameters = get_smooth_emission_counts(sequence,observationsList,tagsList)
print (emissionParameters)

                             O    B-INTJ      B-PP      B-NP      I-NP  \
RT                    0.049749  0.000000  0.000000  0.000000  0.000000   
@shaunproulx          0.000456  0.000000  0.000000  0.000000  0.000000   
:                     0.073026  0.007782  0.000000  0.000000  0.000000   
Encore                0.000000  0.003891  0.000000  0.000000  0.000000   
!                     0.071200  0.000000  0.000000  0.000000  0.000000   
@bifnaked             0.000456  0.000000  0.000000  0.000000  0.000000   
for                   0.000000  0.000000  0.107542  0.000000  0.000525   
the                   0.000000  0.000000  0.000000  0.067919  0.003676   
whole                 0.000000  0.000000  0.000000  0.000000  0.002101   
@ShaunProulxShow      0.000456  0.000000  0.000000  0.000000  0.000000   
tonight               0.000000  0.000000  0.000000  0.014812  0.001050   
Intimate              0.000000  0.000000  0.000000  0.000361  0.000000   
convo                 0.000000  0.0000

In [288]:
def initialize_emission_weights(observationsList,tagsList,emissionParameters):
    emissionWeights = {}
    for tag in tagsList:
        emissionWeights[tag] = {}
        for observation in observationsList:
            emissionWeights[tag][observation] = [0.0,0.0]
    
    for tag in emissionWeights:
        for observation in emissionWeights[tag]:
            emissionWeights[tag][observation] = [emissionParameters[tag][observation],emissionParameters[tag][observation]]
    
    return emissionWeights

emissionWeights = initialize_emission_weights(observationsList,tagsList,emissionParameters)
print (emissionWeights)

{'O': {'RT': [0.04974897307165678, 0.04974897307165678], '@shaunproulx': [0.00045641259698767686, 0.00045641259698767686], ':': [0.0730260155180283, 0.0730260155180283], 'Encore': [0.0, 0.0], '!': [0.07120036513007759, 0.07120036513007759], '@bifnaked': [0.00045641259698767686, 0.00045641259698767686], 'for': [0.0, 0.0], 'the': [0.0, 0.0], 'whole': [0.0, 0.0], '@ShaunProulxShow': [0.00045641259698767686, 0.00045641259698767686], 'tonight': [0.0, 0.0], 'Intimate': [0.0, 0.0], 'convo': [0.0, 0.0], 'with': [0.0, 0.0], 'rock': [0.0, 0.0], 'star': [0.0, 0.0], '/': [0.0, 0.0], 'cancer': [0.0, 0.0], 'survivor': [0.0, 0.0], '-': [0.01597444089456869, 0.01597444089456869], '10:30': [0.0, 0.0], 'E': [0.0, 0.0], '...': [0.054313099041533544, 0.054313099041533544], 'Pick': [0.0, 0.0], 'up': [0.0, 0.0], 'here': [0.0, 0.0], '.': [0.14331355545413055, 0.14331355545413055], 'Deliver': [0.0, 0.0], 'on': [0.0, 0.0], 'Dayton': [0.0, 0.0], 'NJ': [0.0, 0.0], 'tomorrow': [0.0, 0.0], '(@': [0.0, 0.0], 'Ball'

In [287]:
def initialize_transition_weights(tagsList,transitionParameters):
    transitionWeights = {}
    from_Y_List = ['START'] + list(tagsList)
    to_Y_List = list(tagsList) + ['STOP']
    for from_Y_Tag in from_Y_List:
        transitionWeights[from_Y_Tag] = {}
        for to_Y_Tag in to_Y_List:
            transitionWeights[from_Y_Tag][to_Y_Tag] = [0.0,0.0]
            
    for from_Y_Tag in transitionWeights:
        for to_Y_Tag in transitionWeights[from_Y_Tag]:
            transitionWeights[from_Y_Tag][to_Y_Tag] = [transitionParameters[to_Y_Tag][from_Y_Tag],transitionParameters[to_Y_Tag][from_Y_Tag]]
    
    return transitionWeights

transitionWeights = initialize_transition_weights(tagsList,transitionParameters)
print (transitionWeights)

{'START': {'O': [0.42105263157894735, 0.42105263157894735], 'B-INTJ': [0.0544464609800363, 0.0544464609800363], 'B-PP': [0.014519056261343012, 0.014519056261343012], 'B-NP': [0.3466424682395644, 0.3466424682395644], 'I-NP': [0.0, 0.0], 'B-VP': [0.11070780399274047, 0.11070780399274047], 'B-PRT': [0.0, 0.0], 'I-VP': [0.0, 0.0], 'B-ADJP': [0.009074410163339383, 0.009074410163339383], 'B-SBAR': [0.003629764065335753, 0.003629764065335753], 'B-ADVP': [0.038112522686025406, 0.038112522686025406], 'I-INTJ': [0.0, 0.0], 'B-CONJP': [0.0018148820326678765, 0.0018148820326678765], 'I-CONJP': [0.0, 0.0], 'I-ADVP': [0.0, 0.0], 'I-ADJP': [0.0, 0.0], 'I-SBAR': [0.0, 0.0], 'I-PP': [0.0, 0.0], 'STOP': [0.0, 0.0]}, 'O': {'O': [0.2660885440438156, 0.2660885440438156], 'B-INTJ': [0.06207211319032405, 0.06207211319032405], 'B-PP': [0.019625741670470105, 0.019625741670470105], 'B-NP': [0.30077590141487903, 0.30077590141487903], 'I-NP': [0.0009128251939753537, 0.0009128251939753537], 'B-VP': [0.104062072113

In [295]:
def viterbi(observationSequence,observationsList,emissionWeights,transitionWeights,tagsList):
    pi = [{tag: [0.0, ''] for tag in tagsList} for o in observationSequence]

    # Initialization
    for tag in tagsList:
        score = 0.0
        # account for transition features
        score += transitionWeights['START'][tag][0]

        # account for emission features
        if observationSequence[0] in observationsList:  # if this word is not ##UNK##
            score += emissionWeights[tag][observationSequence[0]][0]
        else:  # if this word is ##UNK##
            score += emissionWeights[tag]['#UNK#'][0]

        pi[0][tag] = [score, 'START']

    # Recursive case
    for k in range(1, len(observationSequence)):  # pi[k][c_tag] = max(a(p_tag, c_tag)...)
        for from_Y_Tag in tagsList:
            for to_Y_Tag in tagsList:
                score = pi[k-1][to_Y_Tag][0]
                # account for transition features
                score += transitionWeights[to_Y_Tag][from_Y_Tag][0]

                if score > pi[k][from_Y_Tag][0]:
                    pi[k][from_Y_Tag] = [score, to_Y_Tag]

            # Since the emission score is not dependent on p_tag, we add it in outside the previous loop
            if observationSequence[k] in observationsList:  # if this word is not ##UNK##
                pi[k][from_Y_Tag][0] += emissionWeights[from_Y_Tag][observationSequence[k]][0]
            else:  # if this word is ##UNK##
                pi[k][from_Y_Tag][0] += emissionWeights[from_Y_Tag]['#UNK#'][0]

    # Finally
    result = [0.0, '']
    for to_Y_Tag in tagsList:
        # account for final transition to '##STOP##'
        score = pi[-1][to_Y_Tag][0] + transitionWeights[to_Y_Tag]['STOP'][0]

        if score > result[0]:
            result = [score, to_Y_Tag]

    print (pi)
    # Backtracking
    prediction = [result[1]]
    for k in reversed(range(len(observationSequence))):
        if k == 0: break  # skip START tag
        prediction.insert(0, pi[k][prediction[0]][1])

    return prediction

predictionTagSequence = viterbi(['RT', '@shaunproulx', ':', 'Encore', '!', '@bifnaked', 'for', 'the', 'whole', '@ShaunProulxShow', 'tonight', 'Intimate', 'convo', 'with', 'rock', 'star', '/', 'cancer', 'survivor', '-', '10:30', 'E', '...'],observationsList,emissionWeights,transitionWeights,tagsList)
print (predictionTagSequence)

[{'O': [0.4708016046506041, 'START'], 'B-INTJ': [0.0544464609800363, 'START'], 'B-PP': [0.014519056261343012, 'START'], 'B-NP': [0.3466424682395644, 'START'], 'I-NP': [0.0, 'START'], 'B-VP': [0.11070780399274047, 'START'], 'B-PRT': [0.0, 'START'], 'I-VP': [0.0, 'START'], 'B-ADJP': [0.009074410163339383, 'START'], 'B-SBAR': [0.003629764065335753, 'START'], 'B-ADVP': [0.038112522686025406, 'START'], 'I-INTJ': [0.0, 'START'], 'B-CONJP': [0.0018148820326678765, 'START'], 'I-CONJP': [0.0, 'START'], 'I-ADVP': [0.0, 'START'], 'I-ADJP': [0.0, 'START'], 'I-SBAR': [0.0, 'START'], 'I-PP': [0.0, 'START']}, {'O': [0.7373465612914074, 'O'], 'B-INTJ': [0.5328737178409282, 'O'], 'B-PP': [0.4904273463210742, 'O'], 'B-NP': [1.0, 'I-SBAR'], 'I-NP': [0.7765557630372523, 'B-NP'], 'B-VP': [1.0, 'I-CONJP'], 'B-PRT': [0.4712580172475918, 'O'], 'I-VP': [0.4708016046506041, 'O'], 'B-ADJP': [0.48312474476927136, 'O'], 'B-SBAR': [0.48038626918734534, 'O'], 'B-ADVP': [0.5191813399312979, 'O'], 'I-INTJ': [0.4712580

In [110]:
def updateWeights(observationSequence, goldTags, predictedTags, m_training, emissionFeatures, transitionFeatures):
    """ Helper function to update weights """
    goldTags.insert(0, 'START')
    goldTags.append('STOP')
    predictedTags.insert(0, 'START')
    predictedTags.append('STOP')
    observationSequence.insert(0, '')
    observationSequence.append('')
    
    for i in range(len(goldTags)):
        if goldTags[i] != predictedTags[i]:
            # Update weights for emission features
            if observationSequence[i] in m_training:  # if this word is not ##UNK##
                emissionFeatures[goldTags[i]][observationSequence[i]][0] += 1
                emissionFeatures[goldTags[i]][observationSequence[i]][1] += emissionFeatures[goldTags[i]][observationSequence[i]][0]

                emissionFeatures[predictedTags[i]][observationSequence[i]][0] -= 1
                emissionFeatures[predictedTags[i]][observationSequence[i]][1] += emissionFeatures[predictedTags[i]][observationSequence[i]][0]
            else:  # if this word is ##UNK##
                emissionFeatures[goldTags[i]]['#UNK#'][0] += 1
                emissionFeatures[goldTags[i]]['#UNK#'][1] += emissionFeatures[goldTags[i]]['#UNK#'][0]

                emissionFeatures[predictedTags[i]]['#UNK#'][0] -= 1
                emissionFeatures[predictedTags[i]]['#UNK#'][1] += emissionFeatures[predictedTags[i]]['#UNK#'][0]

            # Update weights for transition features
            transitionFeatures[goldTags[i-1]][goldTags[i]][0] += 1
            transitionFeatures[goldTags[i-1]][goldTags[i]][1] += transitionFeatures[goldTags[i-1]][goldTags[i]][0]
            transitionFeatures[goldTags[i]][goldTags[i+1]][0] += 1
            transitionFeatures[goldTags[i]][goldTags[i+1]][1] += transitionFeatures[goldTags[i]][goldTags[i+1]][0]

            transitionFeatures[predictedTags[i-1]][predictedTags[i]][0] -= 1
            transitionFeatures[predictedTags[i-1]][predictedTags[i]][1] += transitionFeatures[predictedTags[i-1]][predictedTags[i]][0]
            transitionFeatures[predictedTags[i]][predictedTags[i+1]][0] -= 1
            transitionFeatures[predictedTags[i]][predictedTags[i+1]][1] += transitionFeatures[predictedTags[i]][predictedTags[i+1]][0]

    return emissionFeatures, transitionFeatures

emissionDict, transitionDict = updateWeights(['RT', '@shaunproulx', ':', 'Encore', '!', '@bifnaked', 'for', 'the', 'whole', '@ShaunProulxShow', 'tonight', 'Intimate', 'convo', 'with', 'rock', 'star', '/', 'cancer', 'survivor', '-', '10:30', 'E', '...'],
                                             ['O','O','O','B-INTJ','O','O','B-PP','B-NP','I-NP','O','B-NP','O','B-NP','I-NP','B-PP','B-NP','I-NP','I-NP','I-NP','I-NP','I-NP','O','B-NP','I-NP','O'],
                                             predictionTagSequence, observationsList, emissionDict, transitionDict)

KeyError: ''