# Imports

In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm, tnrange


import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from collections import Counter
from gensim import models
import copy
import os
from io import StringIO

In [3]:
###### Keras module is only used for PREPROCESSING not TRAINING ######

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [4]:
# Global variables

# n gram model  = 2 * C + 1
C = 1
word_vector_dim = 300

# Loading Data

In [5]:
!pwd
!ls


/media/ayushjain1144/New Linux/NER
activations.py		  initialization.py	README.md	    vocab.npy
Conll.ipynb		  loss.py		test_features.npy
dataset			  NER_NN_network.ipynb	train_features.npy
initial_experiment.ipynb  nn.py			val_features.npy


In [6]:
dataset_base_dir = './dataset/'
train_data_file = os.path.join(dataset_base_dir, 'train.txt')
val_data_file = os.path.join(dataset_base_dir, 'valid.txt')
test_data_file = os.path.join(dataset_base_dir, 'test.txt')

train_data = open(train_data_file, 'r').read().lower()
test_data = open(test_data_file, 'r').read().lower()
val_data = open(val_data_file, 'r').read().lower()

In [7]:
!head -20 dataset/train.txt

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP B-NP B-LOC
1996-08-22 CD I-NP O

The DT B-NP O
European NNP I-NP B-ORG


In [8]:
!head -20 dataset/test.txt

-DOCSTART- -X- -X- O

SOCCER NN B-NP O
- : O O
JAPAN NNP B-NP B-LOC
GET VB B-VP O
LUCKY NNP B-NP O
WIN NNP I-NP O
, , O O
CHINA NNP B-NP B-PER
IN IN B-PP O
SURPRISE DT B-NP O
DEFEAT NN I-NP O
. . O O

Nadim NNP B-NP B-PER
Ladki NNP I-NP I-PER

AL-AIN NNP B-NP B-LOC
, , O O


In [9]:
!head -20 dataset/valid.txt

-DOCSTART- -X- -X- O

CRICKET NNP B-NP O
- : O O
LEICESTERSHIRE NNP B-NP B-ORG
TAKE NNP I-NP O
OVER IN B-PP O
AT NNP B-NP O
TOP NNP I-NP O
AFTER NNP I-NP O
INNINGS NNP I-NP O
VICTORY NN I-NP O
. . O O

LONDON NNP B-NP B-LOC
1996-08-30 CD I-NP O

West NNP B-NP B-MISC
Indian NNP I-NP I-MISC
all-rounder NN I-NP O


In [12]:
TRAINDATA = StringIO(train_data)

train_df = pd.read_csv(TRAINDATA, sep=" ", header=None)
train_df.columns = ["word", "pos_tag", "chunk_tag", "NER_tag"]
train_df = train_df[1:]

TESTDATA = StringIO(test_data)
test_df = pd.read_csv(TESTDATA, sep=" ", header=None)
test_df.columns = ["word", "pos_tag", "chunk_tag", "NER_tag"]
test_df = test_df[1:]

VALDATA = StringIO(val_data)
val_df = pd.read_csv(VALDATA, sep=" ", header=None)
val_df.columns = ["word", "pos_tag", "chunk_tag", "NER_tag"]
val_df = val_df[1:]

In [13]:
train_df[250:300]

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
251,farm,nn,i-np,o
252,ministers,nns,i-np,o
253,',pos,b-np,o
254,meeting,nn,i-np,o
255,of,in,b-pp,o
256,causing,vbg,b-vp,o
257,unjustified,jj,b-adjp,o
258,alarm,nn,b-np,o
259,through,in,b-pp,o
260,,o,o,


In [14]:
test_df.head()

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
1,soccer,nn,b-np,o
2,-,:,o,o
3,japan,nnp,b-np,b-loc
4,get,vb,b-vp,o
5,lucky,nnp,b-np,o


In [15]:
val_df.head()

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
1,cricket,nnp,b-np,o
2,-,:,o,o
3,leicestershire,nnp,b-np,b-org
4,take,nnp,i-np,o
5,over,in,b-pp,o


In [16]:
# This means that our model needs to predict NULL as named entity recognition

val_df[val_df.isnull().any(axis=1)]["pos_tag"].head()

1138    o
1160    o
1164    o
1192    o
1239    o
Name: pos_tag, dtype: object

In [17]:
train_df[train_df.isnull().any(axis=1)]["pos_tag"].head()

75     o
93     o
260    o
264    o
366    o
Name: pos_tag, dtype: object

In [18]:
train_df["NER_tag"].fillna("no_tag", inplace=True)
test_df["NER_tag"].fillna("no_tag", inplace=True)
val_df["NER_tag"].fillna("no_tag", inplace=True)
# train_df[train_df['NER_tag'] == 'no_tag']

In [19]:
train_df.dropna()
test_df.dropna()
val_df.dropna()

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
1,cricket,nnp,b-np,o
2,-,:,o,o
3,leicestershire,nnp,b-np,b-org
4,take,nnp,i-np,o
5,over,in,b-pp,o
...,...,...,...,...
51573,.,.,o,o
51574,--,:,o,o
51575,dhaka,nnp,b-np,b-org
51576,newsroom,nnp,i-np,i-org


In [20]:
train_df.shape[0]

204566

# Vocabulary

In [21]:
num_train = train_df.shape[0]
num_val = val_df.shape[0]
num_test = test_df.shape[0]

train_word_set = set(train_df["word"].to_list())
test_word_set = set(test_df["word"].to_list())
val_word_set = set(val_df["word"].to_list())

word_set = train_word_set.union(test_word_set, val_word_set)
word_list = list(word_set)
word_list.extend(['start_tk', 'end_tk'])
print(f"Total unique words: {len(word_list)}")

ner_tags_list = list(set(train_df['NER_tag'].to_list()))
print(f"Unique Ner Tags: {ner_tags_list}, number: {len(ner_tags_list)}")

num_words = len(word_list)
num_tags = len(ner_tags_list)

Total unique words: 26872
Unique Ner Tags: ['no_tag', 'i-per', 'b-per', 'i-org', 'i-misc', 'i-loc', 'b-loc', 'b-org', 'o', 'b-misc'], number: 10


In [0]:
# convering the string data to indices dictionary

word2idx = {w: i for i, w in enumerate(word_list)}
tag2idx = {t: i for i, t in enumerate(ner_tags_list)}

In [0]:
tag2idx

In [0]:
word2idx

# Forming train and test sentences

In [0]:
def get_tagged_sentences(df):
  tagged_list = [(w, t) for w, t in zip(df["word"], df["NER_tag"])]
  final = []

  for ele in tagged_list:

    if not final:  # if list is empty
      final.append([ele])
    
    elif final[-1][-1][0] == '.': # if the last tuple of last list is ('.', ..), form new list
      final.append([ele])

    else:       # add it to running list
      final[-1].append(ele) 

  return final
   



In [0]:
train_sentences = get_tagged_sentences(train_df)
test_sentences = get_tagged_sentences(test_df)
val_sentences = get_tagged_sentences(val_df)

In [0]:
val_sentences[:2]

In [0]:
max_len_train = len(max(train_sentences, key=len))
max_len_train

# Feature Extraction

In [0]:
MAX_LEN = 512

# converting into indices
X_train = [[word2idx[w[0]] for w in s] for s in train_sentences]
X_val = [[word2idx[w[0]] for w in s] for s in val_sentences]
X_test = [[word2idx[w[0]] for w in s] for s in test_sentences]

# padding with Max len = 512
X_train = pad_sequences(maxlen=MAX_LEN, sequences=X_train, padding="post", value=MAX_LEN + 1)
X_val = pad_sequences(maxlen=MAX_LEN, sequences=X_val, padding="post", value=MAX_LEN + 1)
X_test = pad_sequences(maxlen=MAX_LEN, sequences=X_test, padding="post", value=MAX_LEN + 1)

# converting tags to indices
y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
y_val = [[tag2idx[w[1]] for w in s] for s in val_sentences]
y_test = [[tag2idx[w[1]] for w in s] for s in test_sentences]

# padding with Max len = 512
y_train = pad_sequences(maxlen=MAX_LEN, sequences=y_train, padding="post", value=tag2idx["no_tag"])
y_val = pad_sequences(maxlen=MAX_LEN, sequences=y_val, padding="post", value=tag2idx["no_tag"])
y_test = pad_sequences(maxlen=MAX_LEN, sequences=y_test, padding="post", value=tag2idx["no_tag"])


# Making labels to one hot encoded

y_train = [to_categorical(i, num_classes=num_tags) for i in y_train]
y_val = [to_categorical(i, num_classes=num_tags) for i in y_val]
y_test = [to_categorical(i, num_classes=num_tags) for i in y_test]




In [0]:
y_train[0]

In [0]:
X_train[0]

# Word2Vec

In [0]:
# Google's pretrained word2vec model

word2vec_model = models.KeyedVectors.load_word2vec_format('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=10 ** 5)


In [0]:
a = word2vec_model["computer"]
a

In [0]:
# np.random.rand(*a.shape).shape

In [0]:
def get_word2vec_indices(vocabulary):
  """returns wordvec index list and the number of oov words"""

  vocab_matrix  = np.zeros(shape=(len(word_list), word_vector_dim))
  oov = 0

  for s, idx in vocabulary.items():

    try:
      vocab_matrix[idx] = word2vec_model[s]

    except:
      if s == 'start_tk':
        n = np.zeros_like(a)
        n[0] = 1
        vocab_matrix[idx] = n
      elif s == 'end_tk':
        n = np.zeros_like(a)
        n[1] = 1
        vocab_matrix[idx] = n
      else:
        oov += 1
        vocab_matrix[idx] = np.random.randn(*a.shape)
  return vocab_matrix, oov

In [0]:
vocabulary_matrix, oov = get_word2vec_indices(word2idx)

In [0]:
vocabulary_matrix.shape

In [0]:
n = np.zeros_like(a)
n[0] = 1
n

In [0]:
# window size = 2 * c + 1; 
def append_start_end_marker(sentences, C=1):
  start_set = [('start_tk', 'no_tag') for i in range(C)]
  end_set = [('end_tk', 'no_tag') for i in range(C)]

  mod_sentences = []
  n_gram = []
  labels = []
  for sent in sentences:
    mod_sentences = [ *start_set,  *sent,  *end_set]

    for i in range(len(mod_sentences) - (2 * C)):
      n_gram.append([word[0] for word in mod_sentences[i: i+ 2 * C + 1]])
      labels.append(mod_sentences[i + C ][1])
      

  return n_gram, labels

train_n_grams, train_labels = append_start_end_marker(train_sentences, C)
test_n_grams, test_labels = append_start_end_marker(test_sentences, C)
val_n_grams, val_labels = append_start_end_marker(val_sentences, C)


In [0]:
train_n_grams[0]

In [0]:
train_labels[:20]

In [0]:
train_sentences[0]

In [0]:
vocabulary_matrix[word2idx['start_tk']]

In [0]:

def get_wordvec_features(n_grams):

  features = np.zeros(shape=(len(n_grams), (2 * C + 1) * word_vector_dim))

  for i in range(len(n_grams)):
    vec = np.array([vocabulary_matrix[word2idx[w]] for w in n_grams[i]]).flatten()
    features[i] = vec
  
  return features

################### run it only once, load it from pickle files ##################
# train_features = get_wordvec_features(train_n_grams)
# test_features = get_wordvec_features(test_n_grams)
# val_features = get_wordvec_features(val_n_grams)


In [0]:
np.save('train_features.npy', train_features)
np.save('test_features.npy', test_features)
np.save('val_features.npy', val_features)

np.save('vocab.npy', vocabulary_matrix)

# Load saved features

In [0]:
train_features = np.load('train_features.npy')
test_features = np.load('test_features.npy')
val_features = np.load('val_features.npy')

In [0]:
train_features.shape

In [0]:
MAX_LEN = 32


# converting into indices
X_train, oov_train = get_word2vec_indices(train_n_grams)
X_val, oov_val = get_word2vec_indices(val_n_grams)
X_test, oov_test = get_word2vec_indices(test_n_grams)

# # padding with Max len = 512
# X_train = pad_sequences(maxlen=MAX_LEN, sequences=X_train, padding="post", value=MAX_LEN + 1)
# X_val = pad_sequences(maxlen=MAX_LEN, sequences=X_val, padding="post", value=MAX_LEN + 1)
# X_test = pad_sequences(maxlen=MAX_LEN, sequences=X_test, padding="post", value=MAX_LEN + 1)

# converting tags to indices
y_train = [tag2idx[w] for w in train_labels] 
y_val = [tag2idx[w] for w in val_labels]
y_test = [tag2idx[w] for w in test_labels]


# # padding with Max len = 512
# y_train = pad_sequences(maxlen=MAX_LEN, sequences=y_train, padding="post", value=tag2idx["o"])
# y_val = pad_sequences(maxlen=MAX_LEN, sequences=y_val, padding="post", value=tag2idx["o"])
# y_test = pad_sequences(maxlen=MAX_LEN, sequences=y_test, padding="post", value=tag2idx["o"])


# Making labels to one hot encoded

y_train = [to_categorical(i, num_classes=num_tags) for i in y_train]
y_val = [to_categorical(i, num_classes=num_tags) for i in y_val]
y_test = [to_categorical(i, num_classes=num_tags) for i in y_test]

### Number of OOV words

In [0]:
print(f"% of OOV words in train set = {oov_train/num_train}")
print(f"% of OOV words in val set = {oov_val/num_val}")
print(f"% of OOV words in test set = {oov_test/num_test}")