In [2]:
import pandas as pd
data = pd.read_csv('/content/train.tsv', sep = '\t',header= None)
data.columns = ['words','segmentation']
data 

Unnamed: 0,words,segmentation
0,konsiltan,k-on-s-i-l-t-an
1,depotwa,d-e-p-o-t-w-a
2,sosyopwofesyonèl,s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l
3,vejetal,v-e-j-e-t-a-l
4,repibliye,r-e-p-i-b-l-i-y-e
...,...,...
12807,Remi,R-e-m-i
12808,diskriminatwa,d-i-s-k-r-i-m-i-n-a-t-w-a
12809,rejè,r-e-j-è
12810,manzè,m-an-z-è


In [3]:
#data['segmentation'] = data['segmentation'].str.replace('-','  ')
data['segmentation'] = data['segmentation'].str.lower()

In [4]:
data_list_division = data['segmentation'].to_list()
res = []
for word_division in data_list_division:
  split_word = word_division.split('-')
  temp = []
  for letter in split_word:
    temp.append((letter[0], 'B'))
    if len(letter) > 1:
      for item in letter[1:]:
        temp.append((item, 'I'))
  res.append(temp)

In [5]:
vocab = list(set([w for sent in res for (w,t) in sent]))
vocab.append('<PAD>')
print(len(vocab))
tags = list(set([t for sent in res for (w,t) in sent]))
tags.append('<PAD>')
print(tags)
print(res[0]) 

28
['B', 'I', '<PAD>']
[('k', 'B'), ('o', 'B'), ('n', 'I'), ('s', 'B'), ('i', 'B'), ('l', 'B'), ('t', 'B'), ('a', 'B'), ('n', 'I')]


In [6]:
max_len = 0
for i in res:
  if max_len < len(i):
      max_len = len(i)

max_len

18

In [7]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np

In [8]:
word2index = {w: i for i, w in enumerate(vocab)}
tag2index = {t: i for i, t in enumerate(tags)}
onehot = [[word2index[w[0]] for w in s] for s in res]
x = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=len(vocab)-1)

In [9]:
onehot_y = [[tag2index[w[1]] for w in s] for s in res]
y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post", value=tag2index["<PAD>"])
y = [to_categorical(i, num_classes=len(tags)) for i in y]
y = np.asarray(y)

In [10]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.1, random_state=41)

In [11]:
#LSTM model using keras library

model = Sequential()
model.add(tf.keras.Input(shape=(x_train.shape[-1]), name="word_input"))
model.add(Embedding(input_dim = len(vocab), output_dim=10,input_length=max_len))

model.add((LSTM(256, return_sequences=True)))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(tf.keras.layers.TimeDistributed(Dense(len(tags),activation='softmax')))

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy',metrics = ['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 18, 10)            280       
                                                                 
 lstm (LSTM)                 (None, 18, 256)           273408    
                                                                 
 dense (Dense)               (None, 18, 64)            16448     
                                                                 
 dropout (Dropout)           (None, 18, 64)            0         
                                                                 
 time_distributed (TimeDistr  (None, 18, 3)            195       
 ibuted)                                                         
                                                                 
Total params: 290,331
Trainable params: 290,331
Non-trainable params: 0
__________________________________________________

In [18]:
history = model.fit(x_train, y_train, batch_size=16, epochs=2, validation_split=0.1)

Epoch 1/2
Epoch 2/2


In [19]:
model.evaluate(x_test,y_test)



[0.011390814557671547, 0.9958831667900085]

In [20]:
y_true, y_pred = [], []
for i in range(len(x_test)):
  pred = model.predict(np.array([x_test[i]]))
  p = np.argmax(pred, axis=-1)
  p_actual = np.argmax(y_test[i],axis=-1)
  for i in range(len(p_actual)):
    if p_actual[i] == 2: break
    y_true.append(p_actual[i])
    y_pred.append(p[0][i])

In [21]:
pred = model.predict(np.array([x_test[5]]))
p = np.argmax(pred,axis=-1)
p_actual = np.argmax(y_test[5],axis=-1)
print(p)
print(p_actual)

[[0 1 0 0 0 0 1 0 2 2 2 2 2 2 2 2 2 2]]
[0 1 0 0 0 0 1 0 2 2 2 2 2 2 2 2 2 2]


In [22]:
from sklearn.metrics import accuracy_score
accuracy_score(y_true,y_pred)

0.9891030052764396

In [23]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_true, y_pred, average=None, labels=[0, 1, 2])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(array([0.99859658, 0.90454545, 0.        ]),
 array([0.98938187, 0.98636927, 0.        ]),
 array([0.99396787, 0.94368702, 0.        ]),
 array([7911,  807,    0]))