# NAMED ENTITY RECOGNITION:

1. The named entities are pre-defined categories chosen according to the use case such as names of people, organizations, places, codes, time notations, monetary values, etc.


## Importing Package

In [102]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from simpletransformers.ner import NERModel,NERArgs
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
data = pd.read_csv("ner_dataset.csv",encoding="latin1" )

In [103]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [104]:
data =data.fillna(method ="ffill")

In [105]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [106]:
data["Sentence #"] = LabelEncoder().fit_transform(data["Sentence #"] )

In [107]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,0,Thousands,NNS,O
1,0,of,IN,O
2,0,demonstrators,NNS,O
3,0,have,VBP,O
4,0,marched,VBN,O


In [108]:
data.rename(columns={"Sentence #":"sentence_id","Word":"words","Tag":"labels"}, inplace =True)

In [109]:
data["labels"] = data["labels"].str.upper()

In [110]:
X= data[["sentence_id","words"]]
Y =data["labels"]

In [111]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size =0.2)

In [112]:
train_data = pd.DataFrame({"sentence_id":x_train["sentence_id"],"words":x_train["words"],"labels":y_train})
test_data = pd.DataFrame({"sentence_id":x_test["sentence_id"],"words":x_test["words"],"labels":y_test})

# Model Training


In [113]:
label = data["labels"].unique().tolist()
label

['O',
 'B-GEO',
 'B-GPE',
 'B-PER',
 'I-GEO',
 'B-ORG',
 'I-ORG',
 'B-TIM',
 'B-ART',
 'I-ART',
 'I-PER',
 'I-GPE',
 'I-TIM',
 'B-NAT',
 'B-EVE',
 'I-EVE',
 'I-NAT']

In [114]:
args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir =True
args.train_batch_size = 32
args.eval_batch_size = 32


In [115]:
model = NERModel('bert', 'bert-base-cased',labels=label,args =args)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

In [116]:
model.train_model(train_data,eval_data = test_data,acc=accuracy_score)

  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/1499 [00:00<?, ?it/s]

  model.parameters(), args.max_grad_norm


(1499, 0.1903092280537228)

In [117]:
result, model_outputs, preds_list = model.eval_model(test_data)

  0%|          | 0/3 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/1461 [00:00<?, ?it/s]

In [119]:
result

{'eval_loss': 0.17258920787892915,
 'precision': 0.8304138460464792,
 'recall': 0.7589373983480563,
 'f1_score': 0.793068399566775}

In [120]:
stop_words = stopwords.words("english")
wordnet = WordNetLemmatizer()

def clean_data(x):
  x = ' '.join([word for word in x.split(' ') if word not in stop_words])
  x = x.encode('ascii', 'ignore').decode()
  x = re.sub(r'https*\S+', ' ', x)
  x = re.sub(r'@\S+', ' ', x)
  x = re.sub(r'#\S+', ' ', x)
  x = re.sub(r'\'\w+', '', x)
  x = re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
  x = re.sub(r'\w*\d+\w*', '', x)
  x = re.sub(r'\s{2,}', ' ', x)
  return x


In [121]:
user = input()
user_clean_data = clean_data(user)

American Airlines said it would launch a direct flight to Bengaluru from Seattle :D, home to Amazon and Microsoft https:xyz.com.


In [122]:
prediction, model_output = model.predict([user_clean_data])

  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

In [123]:
prediction

[[{'American': 'B-ORG'},
  {'Airlines': 'I-ORG'},
  {'said': 'O'},
  {'would': 'O'},
  {'launch': 'O'},
  {'direct': 'O'},
  {'flight': 'O'},
  {'Bengaluru': 'B-GEO'},
  {'Seattle': 'B-GEO'},
  {'D': 'O'},
  {'home': 'O'},
  {'Amazon': 'B-ORG'},
  {'Microsoft': 'B-ORG'}]]

In [124]:
dataframe = pd.DataFrame([[user,user_clean_data,prediction]], columns=["text", "clean","extracted"])

In [125]:
dataframe

Unnamed: 0,text,clean,extracted
0,American Airlines said it would launch a direc...,American Airlines said would launch direct fli...,"[[{'American': 'B-ORG'}, {'Airlines': 'I-ORG'}..."
