# Imports

In [0]:
import numpy as np
import pandas as pd
from tqdm import tqdm, tnrange


import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
from collections import Counter
from gensim import models
import copy
import os
from io import StringIO

In [0]:
###### Keras module is only used for PREPROCESSING not TRAINING ######

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

# Loading Data

In [10]:
!pwd
!ls


/media/ayushjain1144/New Linux/NER
activations.py	dataset  file.json  initial_experiment.ipynb  README.md


In [0]:
dataset_base_dir = './dataset/'
train_data_file = os.path.join(dataset_base_dir, 'train.txt')
val_data_file = os.path.join(dataset_base_dir, 'valid.txt')
test_data_file = os.path.join(dataset_base_dir, 'test.txt')

train_data = open(train_data_file, 'r').read().lower()
test_data = open(test_data_file, 'r').read().lower()
val_data = open(val_data_file, 'r').read().lower()

In [221]:
!head -20 dataset/train.txt

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP B-NP B-LOC
1996-08-22 CD I-NP O

The DT B-NP O
European NNP I-NP B-ORG


In [222]:
!head -20 dataset/test.txt

-DOCSTART- -X- -X- O

SOCCER NN B-NP O
- : O O
JAPAN NNP B-NP B-LOC
GET VB B-VP O
LUCKY NNP B-NP O
WIN NNP I-NP O
, , O O
CHINA NNP B-NP B-PER
IN IN B-PP O
SURPRISE DT B-NP O
DEFEAT NN I-NP O
. . O O

Nadim NNP B-NP B-PER
Ladki NNP I-NP I-PER

AL-AIN NNP B-NP B-LOC
, , O O


In [223]:
!head -20 dataset/valid.txt

-DOCSTART- -X- -X- O

CRICKET NNP B-NP O
- : O O
LEICESTERSHIRE NNP B-NP B-ORG
TAKE NNP I-NP O
OVER IN B-PP O
AT NNP B-NP O
TOP NNP I-NP O
AFTER NNP I-NP O
INNINGS NNP I-NP O
VICTORY NN I-NP O
. . O O

LONDON NNP B-NP B-LOC
1996-08-30 CD I-NP O

West NNP B-NP B-MISC
Indian NNP I-NP I-MISC
all-rounder NN I-NP O


In [224]:
train_data



In [0]:
TRAINDATA = StringIO(train_data)

train_df = pd.read_csv(TRAINDATA, sep=" ", header=None)
train_df.columns = ["word", "pos_tag", "chunk_tag", "NER_tag"]
train_df = train_df[1:]

TESTDATA = StringIO(test_data)
test_df = pd.read_csv(TESTDATA, sep=" ", header=None)
test_df.columns = ["word", "pos_tag", "chunk_tag", "NER_tag"]
test_df = test_df[1:]

VALDATA = StringIO(val_data)
val_df = pd.read_csv(VALDATA, sep=" ", header=None)
val_df.columns = ["word", "pos_tag", "chunk_tag", "NER_tag"]
val_df = val_df[1:]

In [226]:
train_df[250:300]

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
251,farm,nn,i-np,o
252,ministers,nns,i-np,o
253,',pos,b-np,o
254,meeting,nn,i-np,o
255,of,in,b-pp,o
256,causing,vbg,b-vp,o
257,unjustified,jj,b-adjp,o
258,alarm,nn,b-np,o
259,through,in,b-pp,o
260,,o,o,


In [227]:
test_df.head()

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
1,soccer,nn,b-np,o
2,-,:,o,o
3,japan,nnp,b-np,b-loc
4,get,vb,b-vp,o
5,lucky,nnp,b-np,o


In [229]:
val_df.head()

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
1,cricket,nnp,b-np,o
2,-,:,o,o
3,leicestershire,nnp,b-np,b-org
4,take,nnp,i-np,o
5,over,in,b-pp,o


In [230]:
# This means that our model needs to predict NULL as named entity recognition

val_df[train_df.isnull().any(axis=1)]["pos_tag"].head()

  """Entry point for launching an IPython kernel.


75      rp
93      dt
260     dt
264    nnp
366    nnp
Name: pos_tag, dtype: object

In [0]:
train_df["NER_tag"].fillna("no_tag", inplace=True)
test_df["NER_tag"].fillna("no_tag", inplace=True)
val_df["NER_tag"].fillna("no_tag", inplace=True)
# train_df[train_df['NER_tag'] == 'no_tag']

In [232]:
train_df[train_df.isnull().any(axis=1)]["NER_tag"].head()

38240    b-org
Name: NER_tag, dtype: object

In [264]:
train_df.shape[0]

204566

# Vocabulary

In [265]:
num_train = train_df.shape[0]
num_val = val_df.shape[0]
num_test = test_df.shape[0]

train_word_set = set(train_df["word"].to_list())
test_word_set = set(test_df["word"].to_list())
val_word_set = set(val_df["word"].to_list())

word_set = train_word_set.union(test_word_set, val_word_set)
word_list = list(word_set)
print(f"Total unique words: {len(word_list)}")

ner_tags_list = list(set(train_df['NER_tag'].to_list()))
print(f"Unique Ner Tags: {ner_tags_list}, number: {len(ner_tags_list)}")

num_words = len(word_list)
num_tags = len(ner_tags_list)

Total unique words: 26870
Unique Ner Tags: ['no_tag', 'i-misc', 'b-misc', 'b-org', 'b-loc', 'i-per', 'b-per', 'o', 'i-loc', 'i-org'], number: 10


In [0]:
# convering the string data to indices dictionary

word2idx = {w: i for i, w in enumerate(word_list)}
tag2idx = {t: i for i, t in enumerate(ner_tags_list)}

In [235]:
tag2idx

{'no_tag': 0,
 'i-misc': 1,
 'b-misc': 2,
 'b-org': 3,
 'b-loc': 4,
 'i-per': 5,
 'b-per': 6,
 'o': 7,
 'i-loc': 8,
 'i-org': 9}

In [0]:
word2idx

# Forming train and test sentences

In [0]:
def get_tagged_sentences(df):
  tagged_list = [(w, t) for w, t in zip(df["word"], df["NER_tag"])]
  final = []

  for ele in tagged_list:

    if not final:  # if list is empty
      final.append([ele])
    
    elif final[-1][-1][0] == '.': # if the last tuple of last list is ('.', ..), form new list
      final.append([ele])

    else:       # add it to running list
      final[-1].append(ele) 

  return final
   



In [0]:
train_sentences = get_tagged_sentences(train_df)
test_sentences = get_tagged_sentences(test_df)
val_sentences = get_tagged_sentences(val_df)

In [238]:
val_sentences[:2]

[[('cricket', 'o'),
  ('-', 'o'),
  ('leicestershire', 'b-org'),
  ('take', 'o'),
  ('over', 'o'),
  ('at', 'o'),
  ('top', 'o'),
  ('after', 'o'),
  ('innings', 'o'),
  ('victory', 'o'),
  ('.', 'o')],
 [('london', 'b-loc'),
  ('1996-08-30', 'o'),
  ('west', 'b-misc'),
  ('indian', 'i-misc'),
  ('all-rounder', 'o'),
  ('phil', 'b-per'),
  ('simmons', 'i-per'),
  ('took', 'o'),
  ('four', 'o'),
  ('for', 'o'),
  ('38', 'o'),
  ('on', 'o'),
  ('friday', 'o'),
  ('as', 'o'),
  ('leicestershire', 'b-org'),
  ('beat', 'o'),
  ('somerset', 'b-org'),
  ('by', 'o'),
  ('an', 'o'),
  ('innings', 'o'),
  ('and', 'o'),
  ('39', 'o'),
  ('runs', 'o'),
  ('in', 'o'),
  ('two', 'o'),
  ('days', 'o'),
  ('to', 'o'),
  ('take', 'o'),
  ('over', 'o'),
  ('at', 'o'),
  ('the', 'o'),
  ('head', 'o'),
  ('of', 'o'),
  ('the', 'o'),
  ('county', 'o'),
  ('championship', 'o'),
  ('.', 'o')]]

In [239]:
max_len_train = len(max(train_sentences, key=len))
max_len_train

1232

# Feature Extraction

In [0]:
MAX_LEN = 512

# converting into indices
X_train = [[word2idx[w[0]] for w in s] for s in train_sentences]
X_val = [[word2idx[w[0]] for w in s] for s in val_sentences]
X_test = [[word2idx[w[0]] for w in s] for s in test_sentences]

# padding with Max len = 512
X_train = pad_sequences(maxlen=MAX_LEN, sequences=X_train, padding="post", value=MAX_LEN + 1)
X_val = pad_sequences(maxlen=MAX_LEN, sequences=X_val, padding="post", value=MAX_LEN + 1)
X_test = pad_sequences(maxlen=MAX_LEN, sequences=X_test, padding="post", value=MAX_LEN + 1)

# converting tags to indices
y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
y_val = [[tag2idx[w[1]] for w in s] for s in val_sentences]
y_test = [[tag2idx[w[1]] for w in s] for s in test_sentences]

# padding with Max len = 512
y_train = pad_sequences(maxlen=MAX_LEN, sequences=y_train, padding="post", value=tag2idx["O"])
y_val = pad_sequences(maxlen=MAX_LEN, sequences=y_val, padding="post", value=tag2idx["O"])
y_test = pad_sequences(maxlen=MAX_LEN, sequences=y_test, padding="post", value=tag2idx["O"])


# Making labels to one hot encoded

y_train = [to_categorical(i, num_classes=num_tags) for i in y_train]
y_val = [to_categorical(i, num_classes=num_tags) for i in y_val]
y_test = [to_categorical(i, num_classes=num_tags) for i in y_test]




In [198]:
y_train[0]

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [201]:
X_train[0]

array([ 8325,  1841, 14976,  1413,  9367, 10684,  8958,  2851, 14197,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,   513,   513,   513,   513,   513,   513,   513,
         513,   513,

# Word2Vec

In [0]:
# Google's pretrained word2vec model

word2vec_model = models.KeyedVectors.load_word2vec_format('https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=10 ** 5)


In [250]:
a = word2vec_model["computer"]
a

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [251]:
np.random.rand(*a.shape).shape

(300,)

In [0]:
def get_word2vec_indices(sentences):
  """returns wordvec index list and the number of oov words"""
  X = []
  oov = 0

  for s in sentences:
    temp = []
    for w in s:
      try:
        temp.append(word2vec_model[w[0]])
      except:
        oov += 1
        temp.append(np.random.rand(*a.shape))
    X.append(temp)
  return X, oov

In [0]:
MAX_LEN = 32


# converting into indices
X_train, oov_train = get_word2vec_indices(train_sentences)
X_val, oov_val = get_word2vec_indices(val_sentences)
X_test, oov_test = get_word2vec_indices(test_sentences)

# padding with Max len = 512
X_train = pad_sequences(maxlen=MAX_LEN, sequences=X_train, padding="post", value=MAX_LEN + 1)
X_val = pad_sequences(maxlen=MAX_LEN, sequences=X_val, padding="post", value=MAX_LEN + 1)
X_test = pad_sequences(maxlen=MAX_LEN, sequences=X_test, padding="post", value=MAX_LEN + 1)

# converting tags to indices
y_train = [[tag2idx[w[1]] for w in s] for s in train_sentences]
y_val = [[tag2idx[w[1]] for w in s] for s in val_sentences]
y_test = [[tag2idx[w[1]] for w in s] for s in test_sentences]

# padding with Max len = 512
y_train = pad_sequences(maxlen=MAX_LEN, sequences=y_train, padding="post", value=tag2idx["o"])
y_val = pad_sequences(maxlen=MAX_LEN, sequences=y_val, padding="post", value=tag2idx["o"])
y_test = pad_sequences(maxlen=MAX_LEN, sequences=y_test, padding="post", value=tag2idx["o"])


# Making labels to one hot encoded

y_train = [to_categorical(i, num_classes=num_tags) for i in y_train]
y_val = [to_categorical(i, num_classes=num_tags) for i in y_val]
y_test = [to_categorical(i, num_classes=num_tags) for i in y_test]

### Number of OOV words

In [268]:
print(f"% of OOV words in train set = {oov_train/num_train}")
print(f"% of OOV words in val set = {oov_val/num_val}")
print(f"% of OOV words in test set = {oov_test/num_test}")

% of OOV words in train set = 0.41284964265811525
% of OOV words in val set = 0.40737150280163636
% of OOV words in test set = 0.4406514518375656
