Load Training Data File

In [135]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
# Read the tab-separated training file (no header row), keeping every field as text.
# pandas' default NA parsing would turn literal words such as "nan", "null" or "NA"
# into missing values, and the dropna() in the cleaning step would then silently
# discard those rows; here only genuinely empty fields count as missing.
data=pd.read_csv('train.tsv', sep='\t', header=None, dtype=str,
                 keep_default_na=False, na_values=[''])


Data Cleaning

In [136]:
# Remove, in place, every row that has a missing value in any column.
data.dropna(inplace=True)
# Renumber the remaining rows so the index is contiguous again (the old index is discarded).
data.reset_index(drop=True, inplace=True)

In [137]:
# Display the cleaned frame (the rendered output is truncated for long frames).
data

Unnamed: 0,0,1
0,konsiltan,k-on-s-i-l-t-an
1,depotwa,d-e-p-o-t-w-a
2,sosyopwofesyonèl,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l
3,vejetal,v-e-j-e-t-a-l
4,repibliye,r-e-p-i-b-l-i-y-e
...,...,...
12806,Remi,R-e-m-i
12807,diskriminatwa,d-i-s-k-r-i-m-i-n-a-t-w-a
12808,rejè,r-e-j-è
12809,manzè,m-an-z-è


Add separate columns to the original dataframe for data manipulation

In [138]:
# Name the two raw columns: the written word and its hyphen-separated grapheme labels.
data.columns=['words','labels']
# Placeholder columns for the derived values. Assigning a scalar broadcasts it to
# every row directly, instead of calling a Python lambda once per row via apply().
data['tagged_labels'] = ''
data['train_words'] = ''


Function to convert grapheme labels into B/I tag sequences (B = first letter of a grapheme, I = continuation letter)

In [139]:
def conv_to_tags(word):
  """Convert a hyphen-separated grapheme string into a hyphen-separated B/I tag string.

  Every grapheme contributes one "B" for its first letter followed by one "I"
  for each further letter, so the result has exactly one tag per letter and
  lines up with the letter-by-letter form of the word.

  Args:
    word: Grapheme segmentation such as "m-a-s-on".

  Returns:
    The tag string, e.g. "B-B-B-B-I" for "m-a-s-on".
  """
  tags=[]
  for grapheme in word.split("-"):
    # An empty piece (empty input or two adjacent hyphens) still yields a single "B".
    tags.append("B")
    # One "I" per remaining letter, whatever the grapheme length; graphemes longer
    # than three letters used to collapse to a lone "B" and break the alignment.
    tags.extend("I"*(len(grapheme)-1))
  return "-".join(tags)


Function to convert each input word into individual letters

In [140]:
def words_conv(word):
  """Return `word` with a hyphen inserted between every pair of adjacent characters.

  Args:
    word: The word to split, e.g. "mason".

  Returns:
    The hyphen-separated characters, e.g. "m-a-s-o-n". An empty string is
    returned unchanged instead of raising IndexError.
  """
  return '-'.join(word)

In [141]:
# Quick check of words_conv on a sample word.
word='mason'
words_conv(word)

'm-a-s-o-n'

In [142]:
# Quick check of conv_to_tags: the two-letter grapheme "on" should come out as B-I.
word='m-a-s-on'
print(conv_to_tags(word))

B-B-B-B-I


Add updated columns to original dataframe

In [143]:
# Derive both columns with Series.map and assign each whole column at once.
# Element-wise chained assignment (data[col][x] = value) triggers
# SettingWithCopyWarning and writes to a temporary copy under pandas
# Copy-on-Write, leaving the frame unchanged; it also only works with a 0..n-1 index.
data['tagged_labels'] = data['labels'].map(conv_to_tags)
data['train_words'] = data['words'].map(words_conv)

In [144]:
data

Unnamed: 0,words,labels,tagged_labels,train_words
0,konsiltan,k-on-s-i-l-t-an,B-B-I-B-B-B-B-B-I,k-o-n-s-i-l-t-a-n
1,depotwa,d-e-p-o-t-w-a,B-B-B-B-B-B-B,d-e-p-o-t-w-a
2,sosyopwofesyonèl,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l,B-B-B-B-B-B-B-B-B-B-B-B-B-B-B-B,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l
3,vejetal,v-e-j-e-t-a-l,B-B-B-B-B-B-B,v-e-j-e-t-a-l
4,repibliye,r-e-p-i-b-l-i-y-e,B-B-B-B-B-B-B-B-B,r-e-p-i-b-l-i-y-e
...,...,...,...,...
12806,Remi,R-e-m-i,B-B-B-B,R-e-m-i
12807,diskriminatwa,d-i-s-k-r-i-m-i-n-a-t-w-a,B-B-B-B-B-B-B-B-B-B-B-B-B,d-i-s-k-r-i-m-i-n-a-t-w-a
12808,rejè,r-e-j-è,B-B-B-B,r-e-j-è
12809,manzè,m-an-z-è,B-B-I-B-B,m-a-n-z-è


In [145]:
# Keep only the two derived columns: the letter-by-letter words and their tag strings.
updated_data=data[['train_words','tagged_labels']]
updated_data

Unnamed: 0,train_words,tagged_labels
0,k-o-n-s-i-l-t-a-n,B-B-I-B-B-B-B-B-I
1,d-e-p-o-t-w-a,B-B-B-B-B-B-B
2,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l,B-B-B-B-B-B-B-B-B-B-B-B-B-B-B-B
3,v-e-j-e-t-a-l,B-B-B-B-B-B-B
4,r-e-p-i-b-l-i-y-e,B-B-B-B-B-B-B-B-B
...,...,...
12806,R-e-m-i,B-B-B-B
12807,d-i-s-k-r-i-m-i-n-a-t-w-a,B-B-B-B-B-B-B-B-B-B-B-B-B
12808,r-e-j-è,B-B-B-B
12809,m-a-n-z-è,B-B-I-B-B


In [146]:
def data_to_lis_conv(data):
  """Turn a collection of sequences into a list of element lists.

  Used to split each hyphen-separated string into its individual characters
  (hyphens included), e.g. "k-o" -> ['k', '-', 'o'].

  Args:
    data: An iterable of sequences, such as a list or pandas Series of strings.

  Returns:
    A list with one inner list per item of `data`, in the same order.
  """
  # Iterate over the values themselves rather than looking items up as data[i]:
  # on a pandas Series data[i] is a label lookup, which fails or returns the
  # wrong row whenever the index is not exactly 0..n-1.
  return [list(item) for item in data]


In [147]:
# Pull out the letter strings and the tag strings as two separate columns.
features_data, labels_data = updated_data['train_words'], updated_data['tagged_labels']

# Split every string into a list of single characters for the tagger.
features, labels = data_to_lis_conv(features_data), data_to_lis_conv(labels_data)

In [148]:
# Hold out 10% of the words for evaluation. A fixed random_state keeps the split,
# and therefore the reported scores, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, random_state=42)

In [149]:
def train_test_tuples(X_train,X_test,train_labels=None,test_labels=None):
  """Pair every character with its tag, producing tagged sequences for training and testing.

  Args:
    X_train: List of character lists for the training words.
    X_test: List of character lists for the test words.
    train_labels: Tag lists aligned with `X_train`. When omitted, the
      notebook-level `y_train` is used, as this function always did.
    test_labels: Tag lists aligned with `X_test`. When omitted, the
      notebook-level `y_test` is used.

  Returns:
    A `(train, test)` tuple; each element is a list of sequences, and each
    sequence is a list of `(character, tag)` tuples.

  Raises:
    IndexError: If a tag list is shorter than its character list.
  """
  # Backward compatibility: existing two-argument calls keep reading the globals.
  if train_labels is None:
    train_labels = y_train
  if test_labels is None:
    test_labels = y_test

  def _pair(sequences, tag_sequences):
    # Index-based pairing on purpose: a tag list that is too short raises
    # IndexError instead of being silently truncated.
    return [
      [(sequences[i][j], tag_sequences[i][j]) for j in range(len(sequences[i]))]
      for i in range(len(sequences))
    ]

  return _pair(X_train, train_labels), _pair(X_test, test_labels)

In [None]:
# Build the (character, tag) sequences in this cell so it also runs on a fresh
# kernel; `train` and `test` were otherwise only assigned by a later cell.
train,test=train_test_tuples(X_train,X_test)
# Show one training sequence and one test sequence.
print(train[0])
print(test[0])

[('k', 'B'), ('-', '-'), ('e', 'B'), ('-', '-'), ('n', 'I'), ('-', '-'), ('z', 'B')]
[('e', 'B'), ('-', '-'), ('s', 'B'), ('-', '-'), ('p', 'B'), ('-', '-'), ('i', 'B'), ('-', '-'), ('r', 'B'), ('-', '-'), ('i', 'B'), ('-', '-'), ('t', 'B'), ('-', '-'), ('i', 'B'), ('-', '-'), ('s', 'B')]


In [150]:
# Build the (character, tag) sequences for the training and test portions.
train,test=train_test_tuples(X_train,X_test)
# Train NLTK's hidden Markov model tagger on the training sequences.
tagger = nltk.HiddenMarkovModelTagger.train(train)    

In [156]:
# Per-tag precision/recall/F-measure on the held-out sequences. The '-' row is the
# hyphen separator, which appears in the sequences as both the token and its own tag.
print(tagger.evaluate_per_tag(test))

Tag | Prec.  | Recall | F-measure
----+--------+--------+-----------
  - | 1.0000 | 1.0000 | 1.0000
  B | 0.9995 | 0.9713 | 0.9852
  I | 0.7635 | 0.9947 | 0.8639

