In [None]:
!pip uninstall tensorflow
!pip uninstall keras
!pip install keras==2.2.4
!pip install tensorflow==1.15.0
!pip install git+https://www.github.com/keras-team/keras-contrib.git
!pip install h5py
!pip install pickle5

In [None]:
import numpy as np
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
import pickle5 as pickle
import platform
import os
import pandas as pd
from future.utils import iteritems
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [None]:
from google.colab import drive
drive.mount("content/")

Mounted at content/


In [None]:
def get_path(*args):
    spec_path = os.sep.join(args)
    return os.path.join(os.getcwd(), spec_path)

In [None]:
sentences_path = get_path("content","MyDrive","sentences.txt")
labels_path = get_path("content","MyDrive","labels.txt")

if sentences_path:
  print(f"Sentences exist -> {sentences_path}")
if labels_path:
  print(f"Labels exist -> {labels_path}")

Sentences exist -> /content/content/MyDrive/sentences.txt
Labels exist -> /content/content/MyDrive/labels.txt


In [None]:
words = ["ali","veli","bla","bla"]
set_words = set(words)
print(set_words)

{'ali', 'veli', 'bla'}


In [None]:
# load and process data

def parse_data(will_parsed_row_of_corpus):

  file_path = get_path("content","MyDrive","ner_data_set.DUMP") 

  f = open(file_path, "r")

  line_count = 0

  data = dict()
  data["sentence_id"] = list()
  data["tag"] = list()
  data["word"] = list()

  sentence_id_arr = data["sentence_id"]
  tag_arr = data["tag"]
  word_arr = data["word"]


  for line in f.readlines():

    if line_count > will_parsed_row_of_corpus:
      break

    line_count += 1
    # each line seperated by ht (horizontal tabs)
    splitted = line.split("\t")
    
    if len(splitted) ==3:
      tag_split = splitted[1].split(" ")
      word_split = splitted[2].split(" ")
    
      for tag, word in zip(tag_split, word_split):
          word = word.lower()
          word = word.strip()
          if word[len(word)-1] == "\n":
              word = word[:-1]
          
          sentence_id_arr.append(line_count)
          tag_arr.append(tag)
          word_arr.append(word)

  df = pd.DataFrame(data, columns=["sentence_id", "tag", "word"])

  print(df.head())
  print(f"data shape -> {df.shape[0]}")

  return df

class SentenceGetter(object):
    
    def __init__(self, dataset):
        self.n_sent = 1
        self.dataset = dataset
        self.empty = False
        agg_func = lambda s: [(w, t) for w,t in zip(s["word"].values.tolist(),
                                                        s["tag"].values.tolist())]
        self.grouped = self.dataset.groupby("sentence_id").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except:
            return None



def analyze_and_preprocess(df, test_size=0.2):
  global max_len_seq, vocabulary, number_of_words, number_of_tags, tags, word_to_ids, tag_to_ids, ids_to_tags
  
  # creating sentence sequences with tags from data (preprocess)
  sentences = SentenceGetter(df).sentences
  print(f"Number of sentences in data set {len(sentences)}")
  print(sentences[10])

  # it should be static because when we split for just testing max length might be different
  # and it will produce error while padding and test will not be finish correctly 
  # I set it 100 because in TBMM corpus there might be longer sentences than this corpus includes.
  max_len_seq = 200

  vocabulary = list(set(df["word"].values))
  vocabulary.append("ENDPAD")

  number_of_words = len(vocabulary) 

  tags = list(set(df["tag"].values))
  print(tags)
  number_of_tags = len(tags)


  # creating index dictionaries for words and tags
  word_to_ids = {w: i for i, w in enumerate(vocabulary)}
  tag_to_ids = {t: i for i, t in enumerate(tags)}
  ids_to_tags = {v: k for k, v in iteritems(tag_to_ids)}

  # fill with padding value (ENDPAD) until being equal all sequence size with longest sequence
  X = [[word_to_ids[w[0]] for w in s] for s in sentences]
  X = pad_sequences(maxlen=max_len_seq, sequences=X, padding="post",value=number_of_words - 1)

  # fill with padding value (O) until being equal all sequence size with longest sequence
  y_id = [[tag_to_ids[w[1]] for w in s] for s in sentences]
  y = pad_sequences(maxlen=max_len_seq, sequences=y_id, padding="post", value= -1)

  y = [to_categorical(i, num_classes=number_of_tags) for i in y]

  # save initial config data
  with open('config.pkl', 'wb') as outp:
      pickle.dump((vocabulary, tags), outp)

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)

  return X_train, X_test, y_train, y_test, vocabulary, tags

def load_data(will_parsed_row):
  df = parse_data(will_parsed_row)
  # data preprocess and splitting

  return analyze_and_preprocess(df)

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Bidirectional, LSTM
from keras_contrib.layers import CRF
import pickle

EMBED_DIM = 200
BiRNN_UNITS = 200


def create_model(will_parsed_row, train=True):

    X_train=None
    y_train=None 
    X_test=None 
    y_test=None
    vocabulary=None
    tags=None

    if train:
        X_train, X_test, y_train, y_test, vocabulary, tags = load_data(will_parsed_row)
    else:
        with open('config.pkl', 'rb') as inp:
            (vocabulary, tags) = pickle.load(inp)

    model = Sequential()
    model.add(Embedding(len(vocabulary), EMBED_DIM, mask_zero=True))  # Random embedding
    model.add(Bidirectional(LSTM(BiRNN_UNITS // 2, return_sequences=True)))
    model.add(LSTM(units=BiRNN_UNITS // 2, return_sequences=True ))
    crf = CRF(len(tags), sparse_target=True)
    model.add(crf)
    model.summary()
    model.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])

    if train:
        return model,X_train, X_test, y_train, y_test
    else:
        return model, vocabulary, tags

In [None]:
EPOCHS = 10
model, X_train, X_test, y_train, y_test = create_model(1000)
# train model
model.fit( X_train, np.array(y_train), batch_size=16,epochs=EPOCHS, validation_split=0.2)
model.save("ner_model.h5")

   sentence_id         tag      word
0            1    B-PERSON    corina
1            1    I-PERSON  casanova
2            1           O         ,
3            1  B-LOCATION  i̇sviçre
4            1           O   federal
data shape -> 16966
Number of sentences in data set 1001
[('denton', 'B-LOCATION'), (',', 'O'), ('amerika', 'B-LOCATION'), ('birleşik', 'I-LOCATION'), ("devletleri'nde", 'I-LOCATION'), ('teksas', 'B-LOCATION'), ('eyaletinin', 'O'), ('denton', 'B-LOCATION'), ('bölgesindeki', 'O'), ('bir', 'O'), ('şehirdir', 'O'), ('.', 'O')]
['I-ORGANIZATION', 'I-MISC', 'I-PERSON', 'B-LOCATION', 'B-ORGANIZATION', 'B-PERSON', 'O', 'B-MISC', 'I-LOCATION']
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, None, 200)         1351600   
_________________________________________________________________
bidirectional_17 (Bidirectio (None, None, 200)         240800    
________



Train on 640 samples, validate on 160 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
def process_test_data(data, vocabulary, tags, maxlen=200):
  print(vocabulary)
  word_to_ids = {w: i for i, w in enumerate(vocabulary)}
  tag_to_ids = {t: i for i, t in enumerate(tags)}
  ids_to_tags = {v: k for k, v in iteritems(tag_to_ids)}
  X = [[word_to_ids[w.lower()] for w in s.split()] for s in data]
  X = pad_sequences(maxlen=max_len_seq, sequences=X, padding="post",value=number_of_words - 1)
  return X, tag_to_ids, ids_to_tags

In [None]:
# prepare test
model, vocab, tags = create_model(0,train=False)
print(tags)
predict_text = ["etmeye , rafael kazanmış buyin öğrenci filmidir kaçırdı"]
X, tag_to_ids, ids_to_tags = process_test_data(predict_text, vocab, tags)
print(X)
print(tag_to_ids)
print(ids_to_tags)

model.load_weights("ner_model.h5")
raw = model.predict(np.array(X))

 # acumulate the scores by tag
for i, sentence in enumerate(X):
  predictions = np.argmax(raw[i], axis=-1)
  print(predictions)
  for id, (pred, word) in enumerate(zip(predictions, predict_text[0].split())):
    print(f"{word} -> {tags[pred]}")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, None, 200)         1351600   
_________________________________________________________________
bidirectional_16 (Bidirectio (None, None, 200)         240800    
_________________________________________________________________
crf_16 (CRF)                 (None, None, 9)           1908      
Total params: 1,594,308
Trainable params: 1,594,308
Non-trainable params: 0
_________________________________________________________________




['I-ORGANIZATION', 'I-MISC', 'I-PERSON', 'B-LOCATION', 'B-ORGANIZATION', 'B-PERSON', 'O', 'B-MISC', 'I-LOCATION']
["vivian'ı", 'conan', 'etmeye', 'rafael', 'kazanmış', 'buyin', 'öğrenci', 'filmidir', 'kaçırdı', 'is', 'uzerindedir', 'talep', 'stewart', 'oklahoma', 'raguşa', 'eylem', 'ölçüde', 'pek', 'metal', 'itibaren', 'yürürlüğe', 'varoalr', "1995'de", 'ülkeleriyle', 'televizyonda', "aversa'da", "fransa'ya", 'adımlarla', 'iii', 'tahminen', 'masal', 'gemisi', 'türkmenistan', 'suprema', 'süren', 'çıktılar', 'sergileyip', 'kizi', 'saltanattan', 'uzaklardan', 'günümüzde', 'peşindeki', "i̇talya'daki", 'hooker', 'muazzam', 'çizgi', 'varisi', 'porticus', 'kuzeyde', 'krallığını', 'hisatsu', 'as', "i̇talya'nın", '4400', "mulroney'in", 'transfer', 'silahçı', "i̇stanbul'da", 'orta', 'kilisesi', 'görülebilecek', 'korumasında', 'hall', 'hissetti', 'halk', 'disiplinli', 'yükselmiştir', 'd.', 'göçmüş', "çekoslovakya'nın", 'elektrik', 'didier', 'futbol', 'istediği', 'teminat', 'sebeplerle', 'topluluğ