# Named Entity Recognition

In [1]:
!pip install parsivar



In [2]:
# Loading NLTk
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

ex = 'my name is Ali and I live in Iran'
entities = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(entities)

(S
  my/PRP$
  name/NN
  is/VBZ
  (PERSON Ali/NNP)
  and/CC
  I/PRP
  live/VBP
  in/IN
  (GPE Iran/NNP))


As we see, nltk can't recognize named entity in persian.

In [6]:
ex = 'من علی هستم و در ایران زندگی می کنم'
entities = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(entities)

(S
  من/JJ
  (ORGANIZATION علی/NNP)
  هستم/NNP
  و/NNP
  در/NNP
  ایران/NNP
  زندگی/NNP
  می/NNP
  کنم/NN)


## NER with lookup tables

In [7]:
import pandas as pd
country = pd.read_csv('/content/drive/MyDrive/AI-Internship/country_list.csv', header = None)

In [8]:
country.head()

Unnamed: 0,0
0,1 'AFG' 'افغانستان'
1,2 'ALA' 'جزایر آلند'
2,3 'ALB' 'آلبانی'
3,4 'DZA' 'الجزایر'
4,5 'ASM' 'ساموای آمریکا'


In [9]:
print(f'country shape : {country.shape}')

country shape : (244, 1)


In [10]:
country.columns

Int64Index([0], dtype='int64')

In [11]:
print(country[0][0])

1 'AFG' 'افغانستان'


In [12]:
country[0][0].split()

['1', "'AFG'", "'افغانستان'"]

In [16]:
country[0][0].split()[2]

"'افغانستان'"

because we want to remove the first and end quotation we slice this string and use [1:-1]

In [15]:
country[0][0].split()[2][1:-1]

'افغانستان'

In [17]:
en = []
fa = []

for i in range(country.shape[0]):
  en.append(country[0][i].split()[1][1:-1])
  fa.append(country[0][i].split()[2][1:-1])

print(f'10 elements of en : {en[:10]}')
print(f'10 elements of fa : {fa[:10]}')

10 elements of en : ['AFG', 'ALA', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA', 'ATA', 'ATG']
10 elements of fa : ['افغانستان', 'جزای', 'آلبانی', 'الجزایر', 'ساموا', 'آندورا', 'آنگولا', 'آنگویلا', 'جنوبگان', 'آنتیگو']


Let's insert two columns to the country dataframe.One for english name of countrie and another for persian name of them.

In [18]:
country.insert(1, "en", en)
country.insert(2, "fa", fa)

In [19]:
country.head()

Unnamed: 0,0,en,fa
0,1 'AFG' 'افغانستان',AFG,افغانستان
1,2 'ALA' 'جزایر آلند',ALA,جزای
2,3 'ALB' 'آلبانی',ALB,آلبانی
3,4 'DZA' 'الجزایر',DZA,الجزایر
4,5 'ASM' 'ساموای آمریکا',ASM,ساموا


In [20]:
country.drop([0], axis=1, inplace=True)

In [21]:
country.head()

Unnamed: 0,en,fa
0,AFG,افغانستان
1,ALA,جزای
2,ALB,آلبانی
3,DZA,الجزایر
4,ASM,ساموا


In [22]:
# #Save dataframe
# country.to_pickle('/content/drive/MyDrive/AI-Internship/country_lookup')

In [None]:
#load the preprocessing dataframe
country = pd.read_pickle(r'/content/drive/MyDrive/AI-Internship/country_lookup')

In [23]:
country.shape

(244, 2)

Let's develop NER with lookup tables.

In [26]:
def lookup_model(sentence):
  words = sentence.split()
  for word in words:
    if word in country.fa.values:
      return ['GRE', word, "NNP"] 


In [25]:
'ایران' in country.fa.values

True

In [27]:
print(lookup_model('من علی هستم و در ایران زندکی می کنم'))

['GRE', 'ایران', 'NNP']
