In [1]:
import numpy as np 
import pandas as pd

## Loading training data

In [2]:
df=pd.read_csv('trainwords.txt', sep='\n', names = ['headlines'], header = 0) 
df.head() 

Unnamed: 0,headlines
0,Peter_B-PER Blackburn_I-PER
1,BRUSSELS_B-LOC 1996-08-22_O
2,The_O European_B-ORG Commission_I-ORG said_O o...
3,Germany_B-LOC 's_O representative_O to_O the_O...
4,_O We_O do_O n't_O support_O any_O such_O reco...


In [3]:
f = np.genfromtxt("trainwords.txt", delimiter= '\n',dtype= None, unpack = True) 

## Assigning index to words and tags

In [4]:
f_ind_X = np.genfromtxt("index_to_word.txt",delimiter= '\n',dtype= None, unpack = True) 
f_ind_X

array(['*OOV*', '"', '$', ..., 'yr', 'yuan', 'zinc'],
      dtype='|S24')

In [5]:
f_ind_Y = np.genfromtxt("index_to_tag.txt",delimiter= '\n',dtype= None, unpack = True) 
f_ind_Y

array(['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG',
       'I-PER', 'O'],
      dtype='|S6')

In [6]:
#Using dictionaries to store the index of words and tags
d1 = {} 
d2 = {} 
d3 = {} 
d4 = {} 

for i in range(len(f_ind_X)):
    d1[f_ind_X[i]] = i+1
    d3[i+1] = f_ind_X[i]
    
for i in range(len(f_ind_Y)):    
    d2[f_ind_Y[i]] = i+1
    d4[i+1] = f_ind_Y[i]

## Separating the words and tags

In [7]:
lw = []
for j in range(len(f)):
    words1 = f[j].split()
    lw.append(len(words1))
           
X = np.zeros((len(f),max(lw)))
Y = np.zeros((len(f),max(lw)))

for j in range(len(f)):
    words1 = f[j].split()
    for i in range(len(words1)): 
        words2 = np.array(words1[i].split('_'))
        X[j][i] = d1.get(words2[0])
        Y[j][i] = d2.get(words2[1])

## Calculating the initialization probabilities

In [8]:
pi = np.zeros(len(d2))

Np = []
for i in range(len(d2)):
    Np.append(Y[:,0].tolist().count(i+1) + 1)

for i in range(len(d2)):
    pi[i] = Np[i]/sum(Np)

## Calculating emission and transition probabilities probabilities 

In [9]:
Nb = np.zeros((len(d2),len(d1)))

Na = np.zeros((len(d2),len(d2)))

for j in range(len(f)):
    words1 = f[j].split()
    for i in range(len(words1)): 
        words2 = np.array(words1[i].split('_'))
        X[j][i] = d1.get(words2[0])
        Y[j][i] = d2.get(words2[1])
        Nb[int(Y[j][i])-1][int(X[j][i])-1] = Nb[int(Y[j][i])-1][int(X[j][i])-1] + 1 
        if(i<len(words1)-1):
            Na[int(Y[j][i])-1][int(Y[j][i+1])-1] = Na[int(Y[j][i])-1][int(Y[j][i+1])-1] + 1
        else:
            continue 
        
Nb = Nb + np.ones((len(d2),len(d1))) 

b = np.zeros((len(d2),len(d1))) 

b = (Nb.T / sum(Nb.T)).T

Na = Na + np.ones((len(d2),len(d2)))

a = np.zeros((len(d2),len(d2))) 

a = (Na.T / sum(Na.T)).T

A = a   #Transition probabilites
B = b   #Emission probabilities 
PI = pi #Initialization probabilities

## Getting predictions on test data

In [10]:
def prediction(pi,a,b,i):
    f_test = np.genfromtxt("testwords.txt",delimiter= '\n',dtype= None, unpack = True)    
    f_test = f_test[i]
    f_test = f_test.tolist()
    words1_ = f_test.split()
    X1 = np.zeros((1,len(words1_)))
    Y1 = np.zeros((1,len(words1_)))
    
    words20 = [] 
    for i in range(len(words1_)):
        words2_ = words1_[i].split('_')
        X1[0][i] = d1.get(words2_[0])
        Y1[0][i] = d2.get(words2_[1])
        words20.append(words2_[0])       
    alpha = pi*b[:,int(X1[0][0])-1] 
    alpha = np.matrix(alpha) 
    alpha = alpha.T 
    a = np.matrix(a) 
    b = np.matrix(b) 
    ALPHA = np.array(alpha) 
    for i in range(1,len(words1_)): 
        alpha = np.multiply(b[:,int(X1[0][i]-1)],(a.T*alpha))
        ALPHA = np.hstack((ALPHA,np.array(alpha)))
        
    beta = np.ones((len(d2),1)) 
    
    beta = np.matrix(beta) 
    
    a = np.matrix(a) 
    
    BETA = np.array(beta)
    for i in range(1,len(words1_)):
        beta = a*np.multiply(b[:,int(X1[0][len(words1_)-i]-1)],beta)
        BETA = np.hstack((BETA,np.array(beta)))
    BETA = np.flip(BETA,1)                                   
    prob = ALPHA*BETA
        
    pred = []
    
    for i in range(len(words1_)):
        a = np.argmax(prob[:,i])
        pred1 = d4[a+1]
        join_list = [words20[i],pred1]
        pred.append('_'.join(join_list))
        #pred.append(d4[np.argmax(prob[:,i])+1])
    return pred 

In [11]:
f_test1 = np.genfromtxt("testwords.txt",delimiter= '\n',dtype= None, unpack = True)
output = open("output1.txt",'w')  
for i in range(10):
    arrp = prediction(PI,A,B,i) 
    for j in range(len(arrp)): 
        ant = arrp[j] 
        if(j == len(arrp)-1): 
            #print('{}\n'.format(ant)) 
            output.write('{}\n'. format(ant))  
        else:
            #print('{}'.format(ant)) 
            output.write('{} '. format(ant)) 
output.close()    

## Predictions for first 10 data from test file

In [12]:
predictions = np.genfromtxt("output.txt",delimiter= '\n',dtype= None, unpack = True) 
predictions

array([ 'CRICKET_O -_O *OOV*_O *OOV*_O *OOV*_O AT_O TOP_O AFTER_O *OOV*_O *OOV*_O ._O',
       'LONDON_B-LOC 1996-08-30_O',
       'West_B-LOC Indian_I-ORG all-rounder_O Phil_B-PER Simmons_I-PER took_O four_O for_O 38_O on_O Friday_O as_O Leicestershire_B-ORG beat_O Somerset_B-ORG by_O an_O innings_O and_O 39_O runs_O in_O two_O days_O to_O take_O over_O at_O the_O head_O of_O the_O county_O championship_O ._O',
       'Their_O stay_O on_O top_O ,_O though_O ,_O may_O be_O *OOV*_O as_O title_O rivals_O Essex_B-ORG ,_O Derbyshire_B-ORG and_O Surrey_B-ORG all_O closed_O in_O on_O victory_O while_O Kent_B-ORG made_O up_O for_O lost_O time_O in_O their_O *OOV*_O match_O against_O Nottinghamshire_B-ORG ._O',
       'After_O bowling_O Somerset_B-ORG out_O for_O 83_O on_O the_O opening_O morning_O at_O *OOV*_O Road_O ,_O Leicestershire_B-ORG extended_O their_O first_O innings_O by_O 94_O runs_O before_O being_O bowled_O out_O for_O *OOV*_O with_O England_B-LOC *OOV*_O Andy_B-PER *OOV*_I-PER t