<a href="https://colab.research.google.com/github/aithaprasad/NLP_Kreyol_Segmentation/blob/main/Kreyol_Segmentation_NLP_Supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional 
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import numpy as np

In [2]:
data = pd.read_csv("kreyol_segmentation_train.tsv",delimiter="\t",on_bad_lines='skip')

In [3]:
data.columns=['word','division']

In [4]:
data[0:5]['word']

0             depotwa
1    sosyopwofesyonèl
2             vejetal
3           repibliye
4               mason
Name: word, dtype: object

In [5]:
data[0:5]['division']

0                      d-e-p-o-t-w-a
1    s-o-s-y-o-p-w-o-f-e-s-y-o-n-è-l
2                      v-e-j-e-t-a-l
3                  r-e-p-i-b-l-i-y-e
4                           m-a-s-on
Name: division, dtype: object

In [6]:
data.shape

(12811, 2)

In [7]:
data_list_word = data['word'].to_list()

In [8]:
data_list_division = data['division'].to_list()

In [9]:
assert len(data_list_word) == len(data_list_division) == 12811

In [10]:
res = []
for word_division in data_list_division:
  split_word = word_division.split('-')
  temp = []
  for letter in split_word:
    temp.append((letter[0], 'B'))
    if len(letter) > 1:
      for item in letter[1:]:
        temp.append((item, 'I'))
  res.append(temp)

In [11]:
vocab = list(set([w for sent in res for (w,t) in sent]))
vocab.append('<PAD>')
print(len(vocab))
tags = list(set([t for sent in res for (w,t) in sent]))
tags.append('<PAD>')
print(tags)
print(res[0]) 

52
['I', 'B', '<PAD>']
[('d', 'B'), ('e', 'B'), ('p', 'B'), ('o', 'B'), ('t', 'B'), ('w', 'B'), ('a', 'B')]


In [12]:
max_len = 0
for lst in res:
  max_len = max(max_len, len(lst))
max_len

18

In [13]:
word2index = {w: i for i, w in enumerate(vocab)}
tag2index = {t: i for i, t in enumerate(tags)}
onehot = [[word2index[w[0]] for w in s] for s in res]
X = pad_sequences(maxlen=max_len, sequences=onehot, padding="post", value=len(vocab)-1)

In [14]:
onehot_y = [[tag2index[w[1]] for w in s] for s in res]
y = pad_sequences(maxlen=max_len, sequences=onehot_y, padding="post", value=tag2index["<PAD>"])
y = [to_categorical(i, num_classes=len(tags)) for i in y]

In [15]:
y = np.asarray(y)

In [16]:
len(X)

12811

In [17]:
X_train, y_train, X_test, y_test = X[:11000], y[:11000], X[11000:], y[11000:]

In [18]:
model = Sequential()
model.add(Embedding(input_dim=len(vocab), output_dim=50, input_length=max_len))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1)))
model.add(TimeDistributed(Dense(len(tags), activation="softmax")))
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(X_train, y_train, batch_size=5, epochs=2, validation_split=0.1, verbose=1)

Epoch 1/2
Epoch 2/2


In [19]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 18, 50)            2600      
                                                                 
 bidirectional (Bidirectiona  (None, 18, 200)          120800    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 18, 3)            603       
 ibuted)                                                         
                                                                 
Total params: 124,003
Trainable params: 124,003
Non-trainable params: 0
_________________________________________________________________


In [22]:
test_sent = 3
pred = model.predict(np.array([X_test[test_sent]]))
#print(pred)
p = np.argmax(pred, axis=-1)
p_actual = np.argmax(y_test[test_sent],axis=-1)
print(p[0])
print(p_actual)

[1 1 1 1 1 0 2 2 2 2 2 2 2 2 2 2 2 2]
[1 1 1 1 1 0 2 2 2 2 2 2 2 2 2 2 2 2]


In [23]:
y_true, y_pred = [], []
for test_sent in range(len(X_test)):
  pred = model.predict(np.array([X_test[test_sent]]))
  p = np.argmax(pred, axis=-1)
  p_actual = np.argmax(y_test[test_sent],axis=-1)
  for i in range(len(p_actual)):
    if p_actual[i] == 2: break
    y_true.append(p_actual[i])
    y_pred.append(p[0][i])

In [24]:
from sklearn.metrics import f1_score
f1_score(y_true, y_pred, average='weighted')

0.9984917536637214

In [25]:
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)

0.9984901462174189

In [26]:
from sklearn.metrics import precision_recall_fscore_support
precision_recall_fscore_support(y_true, y_pred, average=None, labels=[0, 1, 2])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


(array([0.98876404, 0.99939215, 0.        ]),
 array([0.99341486, 0.99895842, 0.        ]),
 array([0.991084  , 0.99917524, 0.        ]),
 array([ 1063, 11521,     0]))