<a href="https://colab.research.google.com/github/ayushjain1144/NER/blob/master/Conll.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
from tqdm import tqdm, tnrange


import matplotlib.pyplot as plt
%matplotlib inline

In [0]:
from collections import Counter
import gensim.models.word2vec as word2vec
import copy
import os
from io import StringIO

# Loading Data

In [10]:
!pwd
!ls


/media/ayushjain1144/New Linux/NER
activations.py	dataset  file.json  initial_experiment.ipynb  README.md


In [0]:
dataset_base_dir = './dataset/'
train_data_file = os.path.join(dataset_base_dir, 'train.txt')
val_data_file = os.path.join(dataset_base_dir, 'valid.txt')
test_data_file = os.path.join(dataset_base_dir, 'test.txt')

train_data = open(train_data_file, 'r').read().upper()
test_data = open(test_data_file, 'r').read().upper()
val_data = open(val_data_file, 'r').read().upper()

In [39]:
!head -20 dataset/train.txt

-DOCSTART- -X- -X- O

EU NNP B-NP B-ORG
rejects VBZ B-VP O
German JJ B-NP B-MISC
call NN I-NP O
to TO B-VP O
boycott VB I-VP O
British JJ B-NP B-MISC
lamb NN I-NP O
. . O O

Peter NNP B-NP B-PER
Blackburn NNP I-NP I-PER

BRUSSELS NNP B-NP B-LOC
1996-08-22 CD I-NP O

The DT B-NP O
European NNP I-NP B-ORG


In [40]:
!head -20 dataset/test.txt

-DOCSTART- -X- -X- O

SOCCER NN B-NP O
- : O O
JAPAN NNP B-NP B-LOC
GET VB B-VP O
LUCKY NNP B-NP O
WIN NNP I-NP O
, , O O
CHINA NNP B-NP B-PER
IN IN B-PP O
SURPRISE DT B-NP O
DEFEAT NN I-NP O
. . O O

Nadim NNP B-NP B-PER
Ladki NNP I-NP I-PER

AL-AIN NNP B-NP B-LOC
, , O O


In [41]:
!head -20 dataset/valid.txt

-DOCSTART- -X- -X- O

CRICKET NNP B-NP O
- : O O
LEICESTERSHIRE NNP B-NP B-ORG
TAKE NNP I-NP O
OVER IN B-PP O
AT NNP B-NP O
TOP NNP I-NP O
AFTER NNP I-NP O
INNINGS NNP I-NP O
VICTORY NN I-NP O
. . O O

LONDON NNP B-NP B-LOC
1996-08-30 CD I-NP O

West NNP B-NP B-MISC
Indian NNP I-NP I-MISC
all-rounder NN I-NP O


In [70]:
train_data



In [0]:
TRAINDATA = StringIO(train_data)

train_df = pd.read_csv(TRAINDATA, sep=" ", header=None)
train_df.columns = ["word", "pos_tag", "chunk_tag", "NER_tag"]
train_df = train_df[1:]

TESTDATA = StringIO(test_data)
test_df = pd.read_csv(TESTDATA, sep=" ", header=None)
test_df.columns = ["word", "pos_tag", "chunk_tag", "NER_tag"]
test_df = test_df[1:]

VALDATA = StringIO(val_data)
val_df = pd.read_csv(VALDATA, sep=" ", header=None)
val_df.columns = ["word", "pos_tag", "chunk_tag", "NER_tag"]
val_df = val_df[1:]

In [127]:
train_df[250:300]

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
251,FARM,NN,I-NP,O
252,MINISTERS,NNS,I-NP,O
253,',POS,B-NP,O
254,MEETING,NN,I-NP,O
255,OF,IN,B-PP,O
256,CAUSING,VBG,B-VP,O
257,UNJUSTIFIED,JJ,B-ADJP,O
258,ALARM,NN,B-NP,O
259,THROUGH,IN,B-PP,O
260,,O,O,


In [108]:
test_df.head()

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
1,SOCCER,NN,B-NP,O
2,-,:,O,O
3,JAPAN,NNP,B-NP,B-LOC
4,GET,VB,B-VP,O
5,LUCKY,NNP,B-NP,O


In [94]:
(val_df.head()

Unnamed: 0,word,pos_tag,chunk_tag,NER_tag
1,CRICKET,NNP,B-NP,O
2,-,:,O,O
3,LEICESTERSHIRE,NNP,B-NP,B-ORG
4,TAKE,NNP,I-NP,O
5,OVER,IN,B-PP,O


In [121]:
# This means that our model needs to predict NULL as named entity recognition

val_df[train_df.isnull().any(axis=1)]["pos_tag"].head()

  """Entry point for launching an IPython kernel.


75      RP
93      DT
260     DT
264    NNP
366    NNP
Name: pos_tag, dtype: object

In [0]:
train_df["NER_tag"].fillna("no_tag", inplace=True)
test_df["NER_tag"].fillna("no_tag", inplace=True)
val_df["NER_tag"].fillna("no_tag", inplace=True)
# train_df[train_df['NER_tag'] == 'no_tag']

In [148]:
train_df[train_df.isnull().any(axis=1)]["NER_tag"].head()

105612    O
105614    O
105616    O
Name: NER_tag, dtype: object

# Vocabulary

In [149]:
train_word_set = set(train_df["word"].to_list())
test_word_set = set(test_df["word"].to_list())
val_word_set = set(val_df["word"].to_list())

word_set = train_word_set.union(test_word_set, val_word_set)
word_list = list(word_set)
print(f"Total unique words: {len(word_list)}")

ner_tags_list = list(set(train_df['NER_tag'].to_list()))
print(f"Unique Ner Tags: {ner_tags_list}, number: {len(ner_tags_list)}")

Total unique words: 26870
Unique Ner Tags: ['I-LOC', 'no_tag', 'B-PER', 'O', 'B-MISC', 'B-LOC', 'I-PER', 'B-ORG', 'I-ORG', 'I-MISC'], number: 10


In [0]:
# convering the string data to indices dictionary

word2idx = {w: i for i, w in enumerate(word_list)}
tag2idx = {t: i for i, t in enumerate(ner_tags_list)}

In [151]:
tag2idx

{'I-LOC': 0,
 'no_tag': 1,
 'B-PER': 2,
 'O': 3,
 'B-MISC': 4,
 'B-LOC': 5,
 'I-PER': 6,
 'B-ORG': 7,
 'I-ORG': 8,
 'I-MISC': 9}

In [0]:
word2idx