# Sentiment Analysis System using HMM (Continued)

In [1]:
import numpy as np 
import pandas as pd

We now have a new function for processing unlabelled data (test data for our predictions).

In [2]:
def get_data(filename):
    """
    Read a labelled dataset and split it into sentences.

    The file is expected to hold one "<word> <label>" pair per line, with a
    blank line terminating each sentence.  A final sentence that is not
    followed by a blank line is still returned.

    filename: path of the UTF-8 encoded training file

    returns a dict with keys
        data  : list of [x_vector, y_vector] pairs, one per sentence, where
                x_vector is the list of words and y_vector the list of labels
        x_set : frozenset of every word seen
        y_set : frozenset of every label seen
    """
    # The context manager guarantees the handle is released even on error.
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()

    sentences = []
    current = []
    for line in lines:
        if line == '\n':
            # A blank line closes the sentence collected so far.
            sentences.append(current)
            current = []
        else:
            current.append(tuple(line.replace('\n', '').split(' ')))
    # Keep the last sentence when the file does not end with a blank line.
    if current:
        sentences.append(current)

    # The first token of a line is the word, the second is its label.
    datas = [
        [[pair[0] for pair in sentence], [pair[1] for pair in sentence]]
        for sentence in sentences
    ]

    x_set = frozenset(x for x_vector, _ in datas for x in x_vector)
    y_set = frozenset(y for _, y_vector in datas for y in y_vector)

    return dict(data=datas, x_set=x_set, y_set=y_set)

def get_unlabelled_data(filename):
    """
    Read an unlabelled dataset and split it into sentences.

    The file is expected to hold one word per line, with a blank line
    terminating each sentence.  A final sentence that is not followed by a
    blank line is still returned.

    filename: path of the UTF-8 encoded input file

    returns a list of sentences, each a list of words (newline stripped)
    """
    # The context manager guarantees the handle is released even on error.
    with open(filename, 'r', encoding="utf8") as f:
        lines = f.readlines()

    datas = []
    sentence = []
    for line in lines:
        if line == '\n':
            # A blank line closes the sentence collected so far.
            datas.append(sentence)
            sentence = []
        else:
            sentence.append(line.replace('\n', ''))
    # Keep the last sentence when the file does not end with a blank line.
    if sentence:
        datas.append(sentence)

    return datas


### (5 pts) Write a function that estimates the transition parameters from the training set using MLE

    q(y_i|y_i-1) = Count(y_i-1, y_i) / Count(y_i-1)
    
*The following special cases are also considered: q(STOP|y_n) and q(y_1|START).*

In [3]:
# Load the labelled EN training set (sentences plus word and label vocabularies).
data_dict = get_data('EN/train')

In [4]:
def get_transmission_params(data_dict):
    """
    Estimate the transition probabilities q(y_i | y_i-1) by MLE.

        q(y_i | y_i-1) = Count(y_i-1, y_i) / Count(y_i-1)

    Every sentence contributes START -> y_1, y_i-1 -> y_i and y_n -> STOP.
    Sentences with no labels are skipped.

    data_dict: dict as produced by get_data; uses data_dict['y_set'] (the
               labels) and data_dict['data'] (list of [x_vector, y_vector])

    returns a DataFrame with index ['START'] + labels (the "from" state) and
    columns labels + ['STOP'] (the "to" state); each row is normalised by its
    own total count
    """
    labels = list(data_dict['y_set'])
    from_y = ['START'] + labels
    to_y = labels + ['STOP']

    # Count in a plain numpy array; per-token DataFrame updates are very slow.
    row_of = {y: r for r, y in enumerate(from_y)}
    col_of = {y: c for c, y in enumerate(to_y)}
    counts = np.zeros((len(from_y), len(to_y)))

    for _, y_vector in data_dict['data']:
        if len(y_vector) == 0:
            # Nothing to count; avoids indexing y_vector[0] on an empty list.
            continue
        path = ['START'] + list(y_vector) + ['STOP']
        for prev_y, next_y in zip(path[:-1], path[1:]):
            counts[row_of[prev_y], col_of[next_y]] += 1

    transmission_count = pd.DataFrame(counts, index=from_y, columns=to_y)
    # Divide each row by the number of times its "from" state was left.
    return transmission_count.div(transmission_count.sum(axis=1), axis=0)


The following shows the transition parameters obtained with the above function:

In [5]:
# Estimate the transition table from the training data and display it
# (rows are the "from" state, columns the "to" state).
trans_params = get_transmission_params(data_dict)
trans_params

Unnamed: 0,I-positive,B-neutral,B-positive,I-negative,O,I-neutral,B-negative,STOP
START,0.0,0.005339,0.04378,0.0,0.940203,0.0,0.010678,0.0
I-positive,0.406919,0.0,0.0,0.0,0.584843,0.0,0.0,0.008237
B-neutral,0.0,0.0,0.0,0.0,0.784615,0.2,0.0,0.015385
B-positive,0.298013,0.0,0.0,0.0,0.688742,0.0,0.0,0.013245
I-negative,0.0,0.0,0.0,0.398496,0.601504,0.0,0.0,0.0
O,0.0,0.002269,0.046448,0.0,0.860119,0.0,0.014933,0.076231
I-neutral,0.0,0.0,0.0,0.0,0.565217,0.434783,0.0,0.0
B-negative,0.0,0.0,0.0,0.209424,0.782723,0.0,0.0,0.007853


In [6]:
# Sanity check: each row was normalised by its own total, so it should sum to 1.
trans_params.sum(axis=1)

START         1.0
I-positive    1.0
B-neutral     1.0
B-positive    1.0
I-negative    1.0
O             1.0
I-neutral     1.0
B-negative    1.0
dtype: float64

### (15 pts) Using the estimated transition and emission parameters, implement the Viterbi algorithm to compute the following (for a sentence with n words):

$$y_1^*, \dots, y_n^* = \arg\max_{y_1, \dots, y_n} \; p(x_1, \dots, x_n, y_1, \dots, y_n)$$

In [7]:
def get_emission_counts(data_dict):
    """
    Count how often each label emits each word.

    data_dict: dict as produced by get_data; uses data_dict['data'] (list of
               [x_vector, y_vector]), data_dict['x_set'] and data_dict['y_set']

    returns (DataFrame, Series)
        an emission count (y->x) DataFrame indexed by word with one column
        per label, and a Series with the total count of each label
    """
    # Materialise the sets as lists so the axes get a definite order.
    x_labels = list(data_dict['x_set'])
    y_labels = list(data_dict['y_set'])
    x_pos = {x: r for r, x in enumerate(x_labels)}
    y_pos = {y: c for c, y in enumerate(y_labels)}

    # Count in numpy arrays; per-token DataFrame updates are very slow.
    em_counts = np.zeros((len(x_labels), len(y_labels)))
    y_counts = np.zeros(len(y_labels))

    for x_vector, y_vector in data_dict['data']:
        for x, y in zip(x_vector, y_vector):
            em_counts[x_pos[x], y_pos[y]] += 1
            y_counts[y_pos[y]] += 1

    count_em_df = pd.DataFrame(em_counts, index=x_labels, columns=y_labels)
    count_y = pd.Series(y_counts, index=y_labels)
    return count_em_df, count_y

def get_modified_counts(data_dict,k):
    """
    Emission counts with rare words pooled into a single '#UNK#' row.

    data_dict: dict as produced by get_data
    k: words whose total count is below k are treated as rare

    returns (DataFrame, Series)
        the emission count DataFrame with the rare-word rows removed and one
        '#UNK#' row appended holding their summed counts, and the unchanged
        label count Series from get_emission_counts
    """
    count_em_df,count_y = get_emission_counts(data_dict)

    counts_x = count_em_df.sum(axis=1)
    rare_words = counts_x[counts_x<k].index

    # Per-label total of all rare-word emissions becomes the '#UNK#' row.
    unk = count_em_df.loc[rare_words].sum(axis=0)
    unk.name = '#UNK#'

    # DataFrame.append was removed in pandas 2.0; pd.concat works everywhere.
    modified_df = pd.concat([count_em_df.drop(rare_words, axis=0), unk.to_frame().T])

    return modified_df,count_y


def get_modified_emission_params(data_dict,k=3):
    """
    returns DataFrame of emission estimates: for every (word, label) cell,
    the modified emission count (y->x) divided by the count of label y.
    Rows are words (including '#UNK#'), columns are labels.

    data_dict: dict as produced by get_data
    k: rare-word threshold forwarded to get_modified_counts
    """
    emission_counts, label_totals = get_modified_counts(data_dict, k)
    # Column-wise division: each label column is scaled by that label's total.
    return emission_counts.div(label_totals, axis='columns')


In [8]:
def vertibi(x_vector,trans_params,em_params):
    """
    Viterbi decoding: find the most probable state sequence for x_vector.

    x_vector: a list of string which represent the sequence of observations
    trans_params: a DataFrame with index from_states, columns to_states
                  containing the transmission probabilities (including the
                  'START' row and the 'STOP' column)
    em_params: a DataFrame with index observations, columns states which contains
               the emission probabilities

    returns a list of state names, one per observation ([] for empty input)

    Scores are accumulated as log-probabilities so that long sentences do not
    underflow to zero.  Ties are resolved in favour of the earliest state in
    the ordering used below, which puts 'O' first.
    """
    n_obs = len(x_vector)
    if n_obs == 0:
        return []

    # Hidden states, with 'O' moved to the front so it wins ties (argmax
    # returns the first maximum).
    states = [s for s in trans_params.index.tolist() if s != 'START']
    if 'O' in states:
        states.remove('O')
        states = ['O'] + states

    # log(0) = -inf is the intended score of an impossible step.
    with np.errstate(divide='ignore'):
        log_start = np.log(trans_params.loc['START', states].to_numpy(dtype=float))
        log_trans = np.log(trans_params.loc[states, states].to_numpy(dtype=float))
        log_stop = np.log(trans_params.loc[states, 'STOP'].to_numpy(dtype=float))
        # Row i holds the emission scores of observation i for every state.
        log_em = np.log(em_params.loc[list(x_vector), states].to_numpy(dtype=float))

    # score[j]: best log-probability of any path ending in state j so far.
    score = log_start + log_em[0]
    # backptr[i, j]: best predecessor of state j at position i.
    backptr = np.zeros((n_obs, len(states)), dtype=int)

    for i in range(1, n_obs):
        # candidate[k, j]: come from state k, move to state j, emit x_i.
        candidate = score[:, None] + log_trans + log_em[i][None, :]
        backptr[i] = candidate.argmax(axis=0)
        score = candidate.max(axis=0)

    # Account for the final transition into STOP.
    score = score + log_stop

    # Follow the back-pointers from the best final state.
    best = int(score.argmax())
    path = [best]
    for i in range(n_obs - 1, 0, -1):
        best = int(backptr[i, best])
        path.append(best)
    path.reverse()

    return [states[j] for j in path]
    
    
def decode(fin,fout,trans_params,em_params):
    """
    Label every sentence of an unlabelled file and write the predictions.

    fin: path of the unlabelled input file (read with get_unlabelled_data)
    fout: path of the output file; one "<word> <label>" line per token and a
          blank line after each sentence
    trans_params: transition probabilities passed through to vertibi
    em_params: emission probabilities passed through to vertibi; its index
               is the known vocabulary

    Words that are not in the index of em_params are decoded as '#UNK#', but
    the original word is what gets written to the output file.
    """
    # A set makes the per-token vocabulary test constant time.
    known_words = set(em_params.index)
    unlabelled_datas = get_unlabelled_data(fin)

    results = []
    for obs_vector in unlabelled_datas:
        normalised = [word if word in known_words else '#UNK#' for word in obs_vector]
        result = vertibi(normalised,trans_params,em_params)
        assert len(result) == len(obs_vector)
        results.append(result)

    # The context manager closes the file; the original `fout.close` lacked
    # the call parentheses and therefore never closed it.
    with open(fout, 'w', encoding="utf8") as out_file:
        for obs_vector, result in zip(unlabelled_datas, results):
            for x, y in zip(obs_vector, result):
                out_file.write('{} {}\n'.format(x,y))
            out_file.write('\n')
    print("vertibi decoding complete")


## Training and Decoding on EN data Results

In [9]:
# EN: estimate transition (a) and emission (b, rare-word threshold k=3)
# parameters from the training set, then decode EN/dev.in into EN/dev.p3.out.
data_dict = get_data('EN/train')
a = get_transmission_params(data_dict)
b = get_modified_emission_params(data_dict,k=3)
decode('EN/dev.in','EN/dev.p3.out',a,b)

vertibi decoding complete


```
>python3 evalResult.py EN/dev.out EN/dev.p3.out

#Entity in gold data: 226
#Entity in prediction: 162

#Correct Entity : 104
Entity  precision: 0.6420
Entity  recall: 0.4602
Entity  F: 0.5361

#Correct Sentiment : 64
Sentiment  precision: 0.3951
Sentiment  recall: 0.2832
Sentiment  F: 0.3299
```

## Training and Decoding on CN data Results

In [10]:
# CN: estimate transition (a) and emission (b, rare-word threshold k=3)
# parameters from the training set, then decode CN/dev.in into CN/dev.p3.out.
data_dict = get_data('CN/train')
a = get_transmission_params(data_dict)
b = get_modified_emission_params(data_dict,k=3)
decode('CN/dev.in','CN/dev.p3.out',a,b)

vertibi decoding complete


```
>python3 evalResult.py CN/dev.out CN/dev.p3.out

#Entity in gold data: 362
#Entity in prediction: 158

#Correct Entity : 64
Entity  precision: 0.4051
Entity  recall: 0.1768
Entity  F: 0.2462

#Correct Sentiment : 47
Sentiment  precision: 0.2975
Sentiment  recall: 0.1298
Sentiment  F: 0.1808
```

## Training and Decoding on FR data Results

In [11]:
# FR: estimate transition (a) and emission (b, rare-word threshold k=3)
# parameters from the training set, then decode FR/dev.in into FR/dev.p3.out.
data_dict = get_data('FR/train')
a = get_transmission_params(data_dict)
b = get_modified_emission_params(data_dict,k=3)
decode('FR/dev.in','FR/dev.p3.out',a,b)

vertibi decoding complete


```
>python3 evalResult.py FR/dev.out FR/dev.p3.out

#Entity in gold data: 223
#Entity in prediction: 166

#Correct Entity : 112
Entity  precision: 0.6747
Entity  recall: 0.5022
Entity  F: 0.5758

#Correct Sentiment : 72
Sentiment  precision: 0.4337
Sentiment  recall: 0.3229
Sentiment  F: 0.3702
```

## Training and Decoding on SG data Results

In [12]:
# SG: estimate transition (a) and emission (b, rare-word threshold k=3)
# parameters from the training set, then decode SG/dev.in into SG/dev.p3.out.
data_dict = get_data('SG/train')
a = get_transmission_params(data_dict)
b = get_modified_emission_params(data_dict,k=3)
decode('SG/dev.in','SG/dev.p3.out',a,b)

vertibi decoding complete


```
>python3 evalResult.py SG/dev.out SG/dev.p3.out

#Entity in gold data: 1382
#Entity in prediction: 723

#Correct Entity : 386
Entity  precision: 0.5339
Entity  recall: 0.2793
Entity  F: 0.3667

#Correct Sentiment : 244
Sentiment  precision: 0.3375
Sentiment  recall: 0.1766
Sentiment  F: 0.2318
```