Load Training Data File

In [57]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
data=pd.read_csv('train.tsv', sep='\t',header=None)


Data Cleaning

In [58]:
data.dropna(inplace=True)
data.reset_index(drop=True, inplace=True)

In [59]:
data

Unnamed: 0,0,1
0,konsiltan,k-on-s-i-l-t-an
1,depotwa,d-e-p-o-t-w-a
2,sosyopwofesyonèl,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l
3,vejetal,v-e-j-e-t-a-l
4,repibliye,r-e-p-i-b-l-i-y-e
...,...,...
12806,Remi,R-e-m-i
12807,diskriminatwa,d-i-s-k-r-i-m-i-n-a-t-w-a
12808,rejè,r-e-j-è
12809,manzè,m-an-z-è


Add separate columns to the orignal dataframe for data manipulation

In [60]:
data.columns=['words','labels']
data['tagged_labels'] = data.apply(lambda _: '', axis=1)
data['train_words']=data.apply(lambda _: '', axis=1)


Function to convert labels into tagged labels 

In [61]:
def conv_to_tags(word):
  lis_grapheme=list(word.split("-"))
  label_lis=[]
  for x in lis_grapheme:
    if len(x)==2:
      label_lis.append(x[0].replace(x[0],"B"))
      label_lis.append(x[1].replace(x[1],"I"))
    elif len(x)==3:
      label_lis.append(x[0].replace(x[0],"B"))
      label_lis.append(x[1].replace(x[1],"I"))
      label_lis.append(x[2].replace(x[2],"I"))
    else:
      label_lis.append("B")

  string_lis=''.join(label_lis)    
  string=''+string_lis[0]
  for x in range(1,len(string_lis)):
    if string_lis[x]=="B":
      string+=string_lis[x].replace(string_lis[x],'-B')
    else:
      string+=string_lis[x].replace(string_lis[x],'-I')   
      #print(string_lis[x])
  return string


Function to convert each input word into individual letters

In [62]:
def words_conv(word):
  string=''+word[0]
  for x in range(1,len(word)):
    string+=word[x].replace(word[x],'-'+word[x])
  return string

In [63]:
word='mason'
words_conv(word)

'm-a-s-o-n'

In [64]:
word='m-a-s-on'
print(conv_to_tags(word))

B-B-B-B-I


Add updated columns to orignal dataframe

In [65]:
for x in range(len(data['labels'])):
  data['tagged_labels'][x]=conv_to_tags(data['labels'][x])
for x in range(len(data['words'])):
  data['train_words'][x]=words_conv(data['words'][x]) 

In [66]:
data

Unnamed: 0,words,labels,tagged_labels,train_words
0,konsiltan,k-on-s-i-l-t-an,B-B-I-B-B-B-B-B-I,k-o-n-s-i-l-t-a-n
1,depotwa,d-e-p-o-t-w-a,B-B-B-B-B-B-B,d-e-p-o-t-w-a
2,sosyopwofesyonèl,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l,B-B-B-B-B-B-B-B-B-B-B-B-B-B-B-B,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l
3,vejetal,v-e-j-e-t-a-l,B-B-B-B-B-B-B,v-e-j-e-t-a-l
4,repibliye,r-e-p-i-b-l-i-y-e,B-B-B-B-B-B-B-B-B,r-e-p-i-b-l-i-y-e
...,...,...,...,...
12806,Remi,R-e-m-i,B-B-B-B,R-e-m-i
12807,diskriminatwa,d-i-s-k-r-i-m-i-n-a-t-w-a,B-B-B-B-B-B-B-B-B-B-B-B-B,d-i-s-k-r-i-m-i-n-a-t-w-a
12808,rejè,r-e-j-è,B-B-B-B,r-e-j-è
12809,manzè,m-an-z-è,B-B-I-B-B,m-a-n-z-è


In [67]:
updated_data=data[['train_words','tagged_labels']]
updated_data

Unnamed: 0,train_words,tagged_labels
0,k-o-n-s-i-l-t-a-n,B-B-I-B-B-B-B-B-I
1,d-e-p-o-t-w-a,B-B-B-B-B-B-B
2,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l,B-B-B-B-B-B-B-B-B-B-B-B-B-B-B-B
3,v-e-j-e-t-a-l,B-B-B-B-B-B-B
4,r-e-p-i-b-l-i-y-e,B-B-B-B-B-B-B-B-B
...,...,...
12806,R-e-m-i,B-B-B-B
12807,d-i-s-k-r-i-m-i-n-a-t-w-a,B-B-B-B-B-B-B-B-B-B-B-B-B
12808,r-e-j-è,B-B-B-B
12809,m-a-n-z-è,B-B-I-B-B


In [68]:
def data_to_lis_conv(data):
  data_lis=[]
  for i in range(len(data)):
    temp=[]
    for j in range(len(data[i])):
      temp.append(data[i][j])
    data_lis.append(temp)
  return data_lis


In [69]:
features_data=updated_data['train_words']
labels_data=updated_data['tagged_labels']

features=data_to_lis_conv(features_data)
labels=data_to_lis_conv(labels_data)

In [109]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1)

In [113]:
def train_test_tuples(X_train,X_test):
  train = []
  for i in range(len(X_train)):
      sent = []
      for j in range(len(X_train[i])):
          sent.append((X_train[i][j], y_train[i][j]))
      train.append(sent)
  test = []
  for i in range(len(X_test)):
      sent = []
      for j in range(len(X_test[i])):
          sent.append((X_test[i][j], y_test[i][j]))
      test.append(sent)
  return train,test

In [114]:
train,test=train_test_tuples(X_train,X_test)
tagger = nltk.HiddenMarkovModelTagger.train(train)    

In [124]:
trainer = nltk.HiddenMarkovModelTrainer()
tagger_new = trainer.train_supervised(train)

In [122]:
print(train[0])
print(test[0])

[('k', 'B'), ('-', '-'), ('e', 'B'), ('-', '-'), ('n', 'I'), ('-', '-'), ('z', 'B')]
[('e', 'B'), ('-', '-'), ('s', 'B'), ('-', '-'), ('p', 'B'), ('-', '-'), ('i', 'B'), ('-', '-'), ('r', 'B'), ('-', '-'), ('i', 'B'), ('-', '-'), ('t', 'B'), ('-', '-'), ('i', 'B'), ('-', '-'), ('s', 'B')]


In [125]:
print("Accuracy on test data : ",round(tagger_new.evaluate(test),3)*100,'%')


  Function evaluate() has been deprecated.  Use accuracy(gold)
  instead.
  """Entry point for launching an IPython kernel.


Accuracy on test data :  98.4 %


#Testing

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
trans_labels = le.fit_transform(updated_data['tagged_labels'])
#data['labels'] = le.fit_transform(data['labels'])

In [None]:
trans_labels[0]

195

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

count = CountVectorizer(max_features=1500)
input_text = count.fit_transform(updated_data['train_words'].values.astype('U'))

#x_train_cv,x_test_cv,y_train_cv,y_test_cv = train_test_split(input_text,updated_data['tagged_labels'],test_size=0.1)
x_train_cv,x_test_cv,y_train_cv,y_test_cv = train_test_split(input_text,trans_labels,test_size=0.1)

In [None]:
type(count)

TypeError: ignored

In [None]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier()
tree.fit(x_train_cv.reshape(1,-1),y_train_cv.reshape(-1,1))

from sklearn.metrics import accuracy_score
pred = tree.predict(x_test_cv)
accuracy_score(y_test_cv,pred)

ValueError: ignored

LSTM

In [None]:
import nltk
import keras
import tensorflow
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize

In [None]:
from sklearn.model_selection import train_test_split
x = updated_data['words']
y = updated_data['tagged_labels']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1, random_state=41)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, Embedding, Input, LSTM, Conv1D, MaxPool1D, Bidirectional, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

In [None]:
vocab = data['words'].tolist()

In [None]:
number_words = len(list(set(vocab)))
number_words

12812

In [None]:
x_test.head()

1410         Jewòm
9973      sepandan
10817    egzateman
2932      filalang
5563         Gawou
Name: words, dtype: object

In [None]:
# tokenizing all tweets 
tokenizer = Tokenizer(num_words=number_words)
tokenizer.fit_on_texts(x_train)

train_sequence = tokenizer.texts_to_sequences(x_train)

#test_sequence = tokenizer.texts_to_sequences(x_test)

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_train = pad_sequences(train_sequence,maxlen=2, padding = 'post')

In [None]:
y_train_cat = to_categorical(y_train,5)

ValueError: ignored