In [376]:
import numpy as np
from itertools import groupby
import pandas as pd
import math

# Placeholder vocabulary entry; get_observations appends it to the observation
# list and get_smooth_emission_counts stores the pooled rare-word counts under it.
UNKNOWN_TOKEN = "#UNK#"

# Path of the labelled training file (same location as ENtrain further down).
filepath = "ML Project/EN/train"

In [377]:
def prepare_data(file):
    """
    Split an iterable of lines into sentences of "{observation} {label}" strings.

    file : an open text file, or any iterable of lines. Sentences are separated
           by one or more blank lines.

    Returns a list of sentences, each a non-empty list of entries with trailing
    whitespace removed.

    Blank-line detection uses rstrip(), the same rule trainModel and
    sentimentAnalysis apply, so a whitespace-only line is a separator instead
    of being kept as a malformed entry that has no label.
    """
    sentences = []
    currentSentence = []
    for line in file:
        entry = line.rstrip()
        if entry:
            currentSentence.append(entry)
        elif currentSentence:
            # A separator closes the sentence in progress; repeated separators are ignored.
            sentences.append(currentSentence)
            currentSentence = []

    # The input may end without a trailing blank line.
    if currentSentence:
        sentences.append(currentSentence)

    return sentences

# prepare_data consumes the whole file before returning, so the handle can be
# closed as soon as the call finishes.
with open(filepath) as file:
    sequence = prepare_data(file)

In [378]:
def get_observations(sequence,k=1):
    """
    Count every observation in the training data and build the vocabulary.

    sequence : list of sentences, each a list of "{observation} {label}" strings
    k        : minimum number of occurrences an observation needs to stay in
               the vocabulary

    Returns (observationsList, observationsDict):
      observationsList : observations seen at least k times, in order of first
                         appearance, followed by UNKNOWN_TOKEN
      observationsDict : observation -> raw occurrence count (unfiltered)
    """
    observationsDict = {}
    for sentence in sequence:
        for item in sentence:
            observation = item.rsplit(" ", 1)[0]
            observationsDict[observation] = observationsDict.get(observation, 0) + 1

    # Filter in a single pass; calling list.remove() once per rare word is quadratic.
    observationsList = [
        observation
        for observation, count in observationsDict.items()
        if count >= k
    ]
    observationsList.append(UNKNOWN_TOKEN)

    return observationsList,observationsDict
    
# Vocabulary (ending with the unknown-word token) and raw word counts for the training data.
observationsList,observationsDict = get_observations(sequence)

In [379]:
def get_tags(sequence):
    """
    Collect the distinct labels used in the training data.

    sequence : list of sentences, each a list of "{observation} {label}" strings

    Returns the labels as a list, ordered by first appearance.
    """
    entries = (entry for sentence in sequence for entry in sentence)
    # dict keys keep first-insertion order, which gives an order-preserving de-duplication.
    return list(dict.fromkeys(entry.rsplit(" ", 1)[1] for entry in entries))

# Distinct labels of the training data, in order of first appearance.
tagsList = get_tags(sequence)

In [380]:
def get_transition_parameters(tagsList,sequence):
    """
    Estimate tag-to-tag transition probabilities from the training data.

    tagsList : the labels to build the table for
    sequence : list of sentences, each a list of "{observation} {label}" strings

    Returns a DataFrame indexed by the "from" tag ('START' + tagsList) with one
    column per "to" tag (tagsList + 'STOP'). Each row is normalised by its own
    total count. A row with no outgoing transitions is left at 0.0 instead of
    turning into NaN through a 0/0 division.

    Raises KeyError if a sentence uses a label that is not in tagsList.
    """
    from_Y_List = ['START'] + list(tagsList)
    to_Y_List = list(tagsList) + ['STOP']
    length = len(from_Y_List)
    transitionDF = pd.DataFrame(np.zeros((length,length)),index = from_Y_List,columns = to_Y_List)

    # Tally transitions in a plain dict first so the DataFrame is written once
    # per distinct transition rather than once per token.
    transitionCounts = {}
    for sentence in sequence:
        tagPath = [word.rsplit(" ", 1)[1] for word in sentence]
        if not tagPath:
            # An empty sentence has no START/STOP transition to count.
            continue
        tagPath = ['START'] + tagPath + ['STOP']
        for transition in zip(tagPath, tagPath[1:]):
            transitionCounts[transition] = transitionCounts.get(transition, 0) + 1

    for (from_Y_Tag, to_Y_Tag), count in transitionCounts.items():
        # Read-modify-write so that an unknown label raises KeyError instead of
        # silently enlarging the table.
        transitionDF.loc[from_Y_Tag, to_Y_Tag] += count

    rowTotals = transitionDF.sum(axis=1)
    # Divide empty rows by 1 so they stay all-zero.
    transitionDF = transitionDF.div(rowTotals.where(rowTotals > 0, 1.0), axis=0)

    return transitionDF

# Row-normalised transition table: rows are 'START' + tags, columns are tags + 'STOP'.
transitionParameters = get_transition_parameters(tagsList,sequence)

In [381]:
def get_emission_counts(sequence,observationsList,tagsList):
    """
    Count how often each observation is emitted by each tag.

    sequence         : list of sentences, each a list of "{observation} {label}" strings
    observationsList : row labels of the count table
    tagsList         : column labels of the count table

    Returns (emissionDF, emissionCount):
      emissionDF    : DataFrame of counts, observations x tags
      emissionCount : Series with the total number of tokens seen per tag
    """
    # Split every entry into an (observation, tag) pair before any counting starts.
    pairedSentences = []
    for sentence in sequence:
        pairs = []
        for word in sentence:
            pieces = word.rsplit(" ", 1)
            pairs.append((pieces[0], pieces[1]))
        pairedSentences.append(pairs)

    emissionDF = pd.DataFrame(
        np.zeros((len(observationsList), len(tagsList))),
        index=observationsList,
        columns=tagsList,
    )
    emissionCount = pd.Series(np.zeros(len(tagsList)), index=tagsList)

    for pairs in pairedSentences:
        for observation, tag in pairs:
            emissionDF.loc[observation, tag] += 1
            emissionCount[tag] += 1

    return emissionDF,emissionCount

def get_smooth_emission_counts(sequence,observationsList,tagsList,k=1):
    """
    Build emission parameters with rare observations pooled into UNKNOWN_TOKEN.

    sequence         : list of sentences, each a list of "{observation} {label}" strings
    observationsList : row labels passed on to get_emission_counts
    tagsList         : column labels passed on to get_emission_counts
    k                : observations whose total count is below k are pooled

    Returns a DataFrame (observations x tags) in which each column has been
    divided by the total token count of its tag.
    """
    counts, tagTotals = get_emission_counts(sequence,observationsList,tagsList)

    # Rows whose total count over all tags is below the threshold.
    perObservation = counts.sum(axis=1)
    rareLabels = perObservation[perObservation < k].index

    # Fold the rare rows into a single row named after the unknown token.
    unknownRow = counts.loc[rareLabels].sum(axis=0)
    unknownRow.name = UNKNOWN_TOKEN

    smoothed = counts.drop(rareLabels, axis=0)
    smoothed.loc[UNKNOWN_TOKEN] = unknownRow

    return smoothed/tagTotals

# Per-tag emission table with the default threshold k=1.
emissionParameters = get_smooth_emission_counts(sequence,observationsList,tagsList)

In [382]:
def initialize_emission_weights(observationsList,tagsList,emissionParameters):
    """
    Seed the emission weights from the emission parameters.

    observationsList  : observations to create weights for
    tagsList          : tags to create weights for
    emissionParameters: table read as emissionParameters[tag][observation]

    Returns a nested dict weights[tag][observation] = [value, value]. Every
    pair is a fresh two-element list holding the same parameter value twice.
    """
    emissionWeights = {}
    for tag in tagsList:
        perObservation = {}
        for observation in observationsList:
            parameter = emissionParameters[tag][observation]
            perObservation[observation] = [parameter, parameter]
        emissionWeights[tag] = perObservation

    return emissionWeights

# Nested dict emissionWeights[tag][observation] = [value, value], seeded from emissionParameters.
emissionWeights = initialize_emission_weights(observationsList,tagsList,emissionParameters)

In [383]:
def initialize_transition_weights(tagsList,transitionParameters):
    """
    Seed the transition weights from the transition parameters.

    tagsList            : the tags; 'START' is added as a source and 'STOP'
                          as a target
    transitionParameters: table read as transitionParameters[to_tag][from_tag]

    Returns a nested dict weights[from_tag][to_tag] = [value, value]. Every
    pair is a fresh two-element list holding the same parameter value twice.
    """
    tags = list(tagsList)
    sources = ['START'] + tags
    targets = tags + ['STOP']

    transitionWeights = {}
    for source in sources:
        outgoing = {}
        for target in targets:
            parameter = transitionParameters[target][source]
            outgoing[target] = [parameter, parameter]
        transitionWeights[source] = outgoing

    return transitionWeights

# Nested dict transitionWeights[from_tag][to_tag] = [value, value], seeded from transitionParameters.
transitionWeights = initialize_transition_weights(tagsList,transitionParameters)

In [384]:
def viterbi(observationSequence,observationsList,emissionWeights,transitionWeights,tagsList):
    """
    Find the highest-scoring tag sequence for one sentence.

    A path's score is the sum of the current weights (element [0] of each
    weight pair) of its START, tag-to-tag, emission and STOP features.

    observationSequence : list of observations (the sentence)
    observationsList    : known vocabulary; anything else is scored as '#UNK#'
    emissionWeights     : emissionWeights[tag][observation] = [current, accumulated]
    transitionWeights   : transitionWeights[from_tag][to_tag] = [current, accumulated]
    tagsList            : candidate tags

    Returns a list with one tag per observation, or [] for an empty sentence.

    The best candidate is tracked explicitly instead of being compared against
    a 0.0 baseline. Weights become negative during perceptron training, and
    with a 0.0 baseline no candidate would win, leaving an empty backpointer
    that breaks backtracking. Ties go to the tag that comes first in tagsList.
    """
    if not observationSequence:
        return []

    def emission_key(observation):
        # The vocabulary scan is done once per position rather than once per tag.
        return observation if observation in observationsList else '#UNK#'

    # pi[k][tag] = [best score of a path ending in tag at position k, previous tag]
    pi = [{tag: [float('-inf'), ''] for tag in tagsList} for _ in observationSequence]

    firstKey = emission_key(observationSequence[0])
    for tag in tagsList:
        score = transitionWeights['START'][tag][0] + emissionWeights[tag][firstKey][0]
        pi[0][tag] = [score, 'START']

    for k in range(1, len(observationSequence)):
        currentKey = emission_key(observationSequence[k])
        for currentTag in tagsList:
            bestScore, bestPreviousTag = None, ''
            for previousTag in tagsList:
                score = pi[k-1][previousTag][0] + transitionWeights[previousTag][currentTag][0]
                if bestScore is None or score > bestScore:
                    bestScore, bestPreviousTag = score, previousTag

            # The emission does not depend on the previous tag, so it is added after the max.
            pi[k][currentTag] = [bestScore + emissionWeights[currentTag][currentKey][0], bestPreviousTag]

    # Close the path with the transition into STOP.
    bestScore, bestLastTag = None, ''
    for lastTag in tagsList:
        score = pi[-1][lastTag][0] + transitionWeights[lastTag]['STOP'][0]
        if bestScore is None or score > bestScore:
            bestScore, bestLastTag = score, lastTag

    # Follow the backpointers from the last position to the first.
    prediction = [bestLastTag]
    for k in range(len(observationSequence) - 1, 0, -1):
        prediction.insert(0, pi[k][prediction[0]][1])

    return prediction

# predictedTagSequence = viterbi(observationSequence,observationsList,emissionWeights,transitionWeights,tagsList)

In [385]:
def updateWeights(observationSequence,tagSequence,predictedTagSequence,observationsList,emissionWeights,transitionWeights):
    """
    Apply one perceptron-style update for a single sentence.

    At every position where the gold tag and the predicted tag differ, the
    emission weight and the two adjacent transition weights of the gold tag
    are raised by 1 and those of the predicted tag are lowered by 1. Each time
    a weight pair changes, its new current value ([0]) is added to its
    accumulator ([1]).

    observationSequence  : observations of the sentence
    tagSequence          : gold tags, one per observation
    predictedTagSequence : predicted tags, one per observation
    observationsList     : known vocabulary; anything else updates '#UNK#'
    emissionWeights      : emissionWeights[tag][observation] = [current, accumulated]
    transitionWeights    : transitionWeights[from_tag][to_tag] = [current, accumulated]

    Returns (emissionWeights, transitionWeights), the same dict objects that
    were passed in, updated in place. The three sequence arguments are not
    modified: the START/STOP padding is applied to private copies.
    """
    goldTags = ['START'] + list(tagSequence) + ['STOP']
    predictedTags = ['START'] + list(predictedTagSequence) + ['STOP']
    observations = [''] + list(observationSequence) + ['']

    def bump(weightPair, delta):
        # Shift the current weight, then fold the new value into the accumulator.
        weightPair[0] += delta
        weightPair[1] += weightPair[0]

    boundaryTags = ('START', 'STOP')
    for i in range(len(goldTags)):
        goldTag, predictedTag = goldTags[i], predictedTags[i]
        if goldTag == predictedTag:
            continue
        if goldTag in boundaryTags or predictedTag in boundaryTags:
            continue

        observation = observations[i] if observations[i] in observationsList else '#UNK#'

        # The order of the updates is kept fixed: when two of them hit the same
        # weight pair, the accumulator depends on which one runs first.
        bump(emissionWeights[goldTag][observation], 1)
        bump(emissionWeights[predictedTag][observation], -1)

        bump(transitionWeights[goldTags[i-1]][goldTag], 1)
        bump(transitionWeights[goldTag][goldTags[i+1]], 1)

        bump(transitionWeights[predictedTags[i-1]][predictedTag], -1)
        bump(transitionWeights[predictedTag][predictedTags[i+1]], -1)

    return emissionWeights, transitionWeights

# emissionWeights, transitionWeights = updateWeights(observationSequence,tagSequence,predictedTagSequence, observationsList, emissionWeights, transitionWeights)

In [386]:
def trainModel(filePath,observationsList,emissionWeights,transitionWeights,numberOfIterations=4,tagsList=None):
    """
    Train the weights on a labelled file with repeated Viterbi decode + update passes.

    filePath           : training file with one "{observation} {label}" entry
                         per line and blank lines between sentences
    observationsList   : known vocabulary
    emissionWeights    : emissionWeights[tag][observation] = [current, accumulated]
    transitionWeights  : transitionWeights[from_tag][to_tag] = [current, accumulated]
    numberOfIterations : number of passes over the file
    tagsList           : candidate tags for decoding; defaults to the keys of
                         emissionWeights, so the function no longer depends on
                         a notebook-level global

    Returns (emissionWeights, transitionWeights). Both are updated in place.
    After training, every accumulated value ([1]) is divided by the number of
    processed sentences plus one.

    Empty sentences (repeated blank lines) are skipped, and a final sentence
    that is not followed by a blank line is still trained on.
    """
    if tagsList is None:
        tagsList = list(emissionWeights)

    n = 0  # number of sentences processed over all iterations

    for t in range(numberOfIterations):
        print ('Training model',t+1,'times')
        observationSequence = []
        tagSequence = []

        # The context manager closes the file at the end of every pass.
        with open(filePath, 'r') as trainingFile:
            for line in trainingFile:
                strippedLine = line.rstrip()
                if strippedLine:
                    pieces = strippedLine.rsplit(' ', 1)
                    observationSequence.append(pieces[0])
                    tagSequence.append(pieces[1])
                elif observationSequence:
                    predictedTagSequence = viterbi(observationSequence,observationsList,emissionWeights,transitionWeights,tagsList)
                    emissionWeights, transitionWeights = updateWeights(observationSequence,tagSequence,predictedTagSequence, observationsList, emissionWeights, transitionWeights)

                    n += 1
                    observationSequence = []
                    tagSequence = []

        # Flush the last sentence when the file does not end with a blank line.
        if observationSequence:
            predictedTagSequence = viterbi(observationSequence,observationsList,emissionWeights,transitionWeights,tagsList)
            emissionWeights, transitionWeights = updateWeights(observationSequence,tagSequence,predictedTagSequence, observationsList, emissionWeights, transitionWeights)
            n += 1

    for tag in list(emissionWeights):
        for observation in emissionWeights[tag]:
            emissionWeights[tag][observation][1] /= (n+1)

    for from_Y_Tag in list(transitionWeights):
        for to_Y_Tag in transitionWeights[from_Y_Tag]:
            transitionWeights[from_Y_Tag][to_Y_Tag][1] /= (n+1)

    return emissionWeights, transitionWeights

# trainModel(filepath,observationsList,emissionWeights,transitionWeights,numberOfIterations=4)

In [392]:
def sentimentAnalysis(inputPath,observationsList,emissionWeights,transitionWeights,tagsList,outputPath):
    """
    Tag every sentence of an unlabelled file and write the predictions to disk.

    inputPath         : file with one observation per line and blank lines
                        between sentences
    observationsList  : known vocabulary
    emissionWeights   : emissionWeights[tag][observation] = [current, accumulated]
    transitionWeights : transitionWeights[from_tag][to_tag] = [current, accumulated]
    tagsList          : candidate tags for decoding
    outputPath        : file to write; each sentence becomes
                        "{observation} {tag}" lines followed by a blank line

    Returns None.

    Both files are closed even if decoding fails. A final sentence without a
    trailing blank line is still written. A blank line with no pending
    sentence is copied through as a blank line rather than decoded.
    """
    def write_sentence(outputFile, observationSequence):
        # Decode one sentence and emit it in the output format.
        predictedTagSequence = viterbi(observationSequence,observationsList,emissionWeights,transitionWeights,tagsList)
        for i in range(len(observationSequence)):
            outputFile.write('%s %s\n' % (observationSequence[i], predictedTagSequence[i]))
        outputFile.write('\n')

    # The input is opened first so that a missing input does not truncate an existing output.
    with open(inputPath, 'r') as inputFile, open(outputPath, 'w') as outputFile:
        observationSequence = []
        for line in inputFile:
            observation = line.rstrip()
            if observation:
                observationSequence.append(observation)
            elif observationSequence:
                write_sentence(outputFile, observationSequence)
                observationSequence = []
            else:
                outputFile.write('\n')

        if observationSequence:
            write_sentence(outputFile, observationSequence)

    print ('Finished writing to file %s' % (outputPath))
    return None


# Dataset locations. Only the EN paths are used by the run below.
ENtrain = "ML Project/EN/train"
FRtrain = "ML Project/FR/train"
ENinput = 'ML Project/EN/test.in'
FRinput = 'ML Project/FR/test.in'
ENoutput = 'ML Project/EN/test.p5.out'
FRoutput = 'ML Project/FR/test.p5.out'

# prepare_data reads the whole file before returning, so the handle is closed
# as soon as the sentences are loaded.
with open(ENtrain) as file:
    sequence = prepare_data(file)

# Rebuild the vocabulary, tag set, parameter tables and initial weights.
observationsList,observationsDict = get_observations(sequence)
tagsList = get_tags(sequence)
transitionParameters = get_transition_parameters(tagsList,sequence)
emissionParameters = get_smooth_emission_counts(sequence,observationsList,tagsList)
emissionWeights = initialize_emission_weights(observationsList,tagsList,emissionParameters)
transitionWeights = initialize_transition_weights(tagsList,transitionParameters)

# trainModel updates both weight dicts in place, so its return value is not needed here.
trainModel(ENtrain,observationsList,emissionWeights,transitionWeights,numberOfIterations=4)
sentimentAnalysis(ENinput,observationsList,emissionWeights,transitionWeights,tagsList,ENoutput)

Training model 1 times
Training model 2 times
Training model 3 times
Training model 4 times
Finished writing to file ML Project/EN/test.p5.out
