# Named Entity Recognition

In [1]:
!pip install parsivar

Collecting parsivar
  Downloading parsivar-0.2.3.tar.gz (36.2 MB)
[K     |████████████████████████████████| 36.2 MB 66 kB/s 
[?25hCollecting nltk==3.4.5
  Downloading nltk-3.4.5.zip (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 43.0 MB/s 
Building wheels for collected packages: parsivar, nltk
  Building wheel for parsivar (setup.py) ... [?25l[?25hdone
  Created wheel for parsivar: filename=parsivar-0.2.3-py3-none-any.whl size=36492971 sha256=c11c14e5d4458c9258923835d20afd6de0e3040d198c6d72ff7a25eb1f64da52
  Stored in directory: /root/.cache/pip/wheels/ae/67/7a/49cbf08f64d3f76a26eceaf0e481a40e233f05d4356875cbed
  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Created wheel for nltk: filename=nltk-3.4.5-py3-none-any.whl size=1449921 sha256=d6cc82a2f1169ef6bea91130fc15df0aab1fcb78a45906b913dd7fc193a52c73
  Stored in directory: /root/.cache/pip/wheels/48/8b/7f/473521e0c731c6566d631b281f323842bbda9bd819eb9a3ead
Successfully built parsivar nltk
Installing collecte

In [2]:
# Load NLTK and fetch the data packages needed by word_tokenize, pos_tag and ne_chunk.
import nltk

for resource in ('punkt', 'averaged_perceptron_tagger', 'maxent_ne_chunker'):
    nltk.download(resource)
nltk.download('words')  # kept as the last expression so the cell still displays its result

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [3]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

ex = 'my name is Ali and I live in Iran'
entities = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(entities)

(S
  my/PRP$
  name/NN
  is/VBZ
  (PERSON Ali/NNP)
  and/CC
  I/PRP
  live/VBP
  in/IN
  (GPE Iran/NNP))


As the next cell shows, NLTK's pre-trained English models cannot recognize named entities in Persian text.

In [None]:
ex = 'من علی هستم و در ایران زندگی می کنم'
entities = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(entities)

(S
  من/JJ
  (ORGANIZATION علی/NNP)
  هستم/NNP
  و/NNP
  در/NNP
  ایران/NNP
  زندگی/NNP
  می/NNP
  کنم/NN)


## NER with lookup tables

Let's do some exploratory analysis on our dataset.

In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/AI-Internship/country_list.csv', header = None)

As we will see below, our country list has a single column. Each row holds a running number, the three-letter country code, and the Persian name of the country, all packed into one string. This dataset is not in our desired form, so we should split each row into its parts, insert them into two new columns, and delete the current column.

In [None]:
df.head()

Unnamed: 0,0
0,1 'AFG' 'افغانستان'
1,2 'ALA' 'جزایر آلند'
2,3 'ALB' 'آلبانی'
3,4 'DZA' 'الجزایر'
4,5 'ASM' 'ساموای آمریکا'


In [None]:
print(f'dataset shape : {df.shape}')

dataset shape : (244, 1)


In [None]:
df.columns

Int64Index([0], dtype='int64')

**Hint:** `df[0][0]` means the first row (index label 0) of the column whose name is `0`.

In [None]:
print(df[0][0])

1 'AFG' 'افغانستان'


In [None]:
df[0][0].split()

['1', "'AFG'", "'افغانستان'"]

Let's look at another row, one whose Persian name has more than one word.

In [None]:
print(f'df[0][27] : {df[0][27]}')
lst = df[0][27].split()[2:]
print(f'country name : {lst}')

df[0][27] : 28 'BIH' 'بوسنی و هرزگوین'
country name : ["'بوسنی", 'و', "هرزگوین'"]


Let's combine the parts of this country name.

In [None]:
lst = ' '.join(i for i in lst)
lst

"'بوسنی و هرزگوین'"

Because we want to remove the opening and closing quotation marks, we slice this string with `[1:-1]`.

In [None]:
lst[1:-1]

'بوسنی و هرزگوین'

In [None]:
# Every raw row looks like "<number> '<code>' '<persian name>'".
# Tokenise each row once, then derive both lists from the token lists.
rows = [df[0][pos].split() for pos in range(len(df))]

# Second token is the quoted country code; [1:-1] removes the first and last character.
en = [tokens[1][1:-1] for tokens in rows]

# The Persian name may span several tokens: glue them back together with single
# spaces, then remove the first and last character of the joined string.
fa = [' '.join(tokens[2:])[1:-1] for tokens in rows]

print(f'10 elements of en : {en[:10]}')
print(f'10 elements of fa : {fa[:10]}')

10 elements of en : ['AFG', 'ALA', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA', 'ATA', 'ATG']
10 elements of fa : ['افغانستان', 'جزایر آلند', 'آلبانی', 'الجزایر', 'ساموای آمریکا', 'آندورا', 'آنگولا', 'آنگویلا', 'جنوبگان', 'آنتیگوا و باربودا']


Let's add two columns to the country dataframe: one for the English (three-letter) code of each country and another for its Persian name.

In [None]:
country = df.copy()

In [None]:
# Plain column assignment appends "en" and "fa" after the raw column, which is the
# same layout as inserting at positions 1 and 2. Unlike DataFrame.insert, it can be
# re-run without raising "cannot insert en, already exists".
country["en"] = en
country["fa"] = fa

In [None]:
country.head()

Unnamed: 0,0,en,fa
0,1 'AFG' 'افغانستان',AFG,افغانستان
1,2 'ALA' 'جزایر آلند',ALA,جزایر آلند
2,3 'ALB' 'آلبانی',ALB,آلبانی
3,4 'DZA' 'الجزایر',DZA,الجزایر
4,5 'ASM' 'ساموای آمریکا',ASM,ساموای آمریکا


In [None]:
# Drop the raw combined column. Reassigning instead of inplace=True, together with
# errors='ignore', keeps this cell re-runnable once column 0 is already gone.
country = country.drop(columns=[0], errors='ignore')

In [None]:
country.head()

Unnamed: 0,en,fa
0,AFG,افغانستان
1,ALA,جزایر آلند
2,ALB,آلبانی
3,DZA,الجزایر
4,ASM,ساموای آمریکا


In [None]:
country.tail()

Unnamed: 0,en,fa
239,WLF,والیس و فوتونا
240,ESH,صحرای غربی
241,YEM,یمن
242,ZMB,زامبیا
243,ZWE,زیمبابوه'


As we see, the last row still ends with a stray quotation mark, so we should correct it.

In [None]:
# Remove quote characters left at either end of a name (the last row keeps a trailing
# one after the [1:-1] slicing). Stripping is idempotent, so re-running the cell
# cannot cut off real letters, and it avoids chained-indexing assignment on a
# hard-coded row label.
country['fa'] = country['fa'].str.strip("'")

In [None]:
country.tail()

Unnamed: 0,en,fa
239,WLF,والیس و فوتونا
240,ESH,صحرای غربی
241,YEM,یمن
242,ZMB,زامبیا
243,ZWE,زیمبابوه


In [None]:
# #Save dataframe
# country.to_pickle('/content/drive/MyDrive/AI-Internship/country_lookup')

In [6]:
#load the preprocessing dataframe
country = pd.read_pickle(r'/content/drive/MyDrive/AI-Internship/country_lookup')

In [7]:
country.shape

(244, 2)

In [8]:
country.isnull().sum()

en    0
fa    0
dtype: int64

In [9]:
country.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   en      244 non-null    object
 1   fa      244 non-null    object
dtypes: object(2)
memory usage: 3.9+ KB


### Create model

Let's develop NER with lookup tables.
If our sentence contains any of the entries in the lookup table, this model recognizes it and assigns it to the predefined category.

In [None]:
def lookup_model(sentence, names=None, label='GRE'):
  """Tag every lookup-table entry that occurs in ``sentence``.

  Matching is a plain substring test, so an entry is also reported when it is
  only part of a longer word (for example 'ایران' inside 'ایرانی').

  Args:
    sentence: Text to scan.
    names: Optional iterable of entity names to look for. When omitted, the
      Persian names of the notebook-level ``country`` DataFrame
      (``country.fa.values``) are used, which is the original behaviour.
    label: Category attached to every hit; defaults to 'GRE'.

  Returns:
    A list of ``(label, name)`` tuples in lookup-table order (not in order of
    appearance in the sentence). Empty when nothing matches.
  """
  if names is None:
    # Fall back to the global lookup table only when the caller supplies none,
    # so the function can also be used and tested without notebook state.
    names = country.fa.values
  return [(label, name) for name in names if name in sentence]


Let's perform this model on some examples.

In [None]:
print(lookup_model('من علی هستم و در ایران زندکی می کنم'))

[('GRE', 'ایران')]


In [None]:
print(lookup_model('.من برای تعطیلات به جزیره هرد و جزایر مک زیبا خواهم رفت'))

[('GRE', 'جزیره هرد و جزایر مک')]


In [None]:
print(lookup_model('تعداد دانشجویان ایرانی  که به کانادا مهاجرت می کنند روز به روز بیشتر می شود.'))

[('GRE', 'کانادا'), ('GRE', 'ایران')]


## NER With Regex 

### Create model

If the sentence contains a specific trigger word such as "کشور" (country), this model tags the word that follows it as a country (or another related category).

In [None]:
import re

def regex_model(sentence):
  """Tag the word that follows each standalone occurrence of "کشور".

  Args:
    sentence: Text to scan.

  Returns:
    A list of ``('GRE', word)`` tuples, one per keyword occurrence that is
    followed by a word, in order of appearance. ``word`` is the text between
    the character right after the keyword and the next space, so trailing
    punctuation is kept and multi-word names are cut to their first word.
  """
  result = []
  # \b prevents inflected forms such as "کشورم" from matching.
  for match in re.finditer(r"\bکشور\b", sentence):
    # Skip the single separator character after the keyword, then read up to
    # the next space. partition() yields the whole remainder when no further
    # space exists; the previous find()-based slice returned '' in that case
    # (find() == -1), losing a country name at the end of the sentence.
    candidate = sentence[match.end(0) + 1:].partition(" ")[0]
    if candidate:  # nothing follows the keyword: there is no entity to report
      result.append(('GRE', candidate))
  return result


In [None]:
sentence = 'من در کشور ایران زندگی میکنم. من کشور خود را دوست دارم. من کشورم ایران را دوست دارم. او برای سفر به کشور جمهوری چک می رود.'

In [None]:
print(regex_model(sentence))

[('GRE', 'ایران'), ('GRE', 'خود'), ('GRE', 'جمهوری')]


In [None]:
## Regex look behind
# ind = re.search('(?<=کشور )(\w+)', sentence).groups()

In [None]:
# ???????????? lookup and regex

## NER With Stanford NER and NLTK 

### Implementing NER with Stanford NER / NLTK

Because Stanford NER tagger is written in Java, you are going to need a proper Java Virtual Machine to be installed on your computer.

In [5]:
import os       #importing os to set environment variable
# Installs the headless OpenJDK 8 package with apt, points JAVA_HOME at its
# directory, and prints the version of the `java` command.
# NOTE(review): the recorded output of `java -version` reports OpenJDK 11, not 8.
# Presumably only JAVA_HOME is changed while the `java` found on PATH stays the
# pre-installed one; confirm which runtime the tagger actually uses.
def install_java():
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     #set environment variable
  !java -version       #check java version
install_java()

openjdk version "11.0.11" 2021-04-20
OpenJDK Runtime Environment (build 11.0.11+9-Ubuntu-0ubuntu2.18.04)
OpenJDK 64-Bit Server VM (build 11.0.11+9-Ubuntu-0ubuntu2.18.04, mixed mode, sharing)


Let's download Stanford NER model from https://nlp.stanford.edu/software/CRF-NER.html and unzip it.

In [None]:
# import zipfile
# # unzip image file
# local_zip = '/content/drive/MyDrive/AI-Internship/stanford-ner-4.2.0.zip'
# zip_ref = zipfile.ZipFile(local_zip, 'r')
# zip_ref.extractall('/content/drive/MyDrive/AI-Internship/stanford-ner-4.2.0')
# zip_ref.close()

After unzipping this file, we need two things from this folder: `stanford-ner-4.2.0.jar` and our model, `english.all.3class.distsim.crf.ser.gz`.

First we should create a folder named 'stanford-ner-tagger' in our drive, then put the .jar and .gz files in this folder.


In [9]:
import nltk
from nltk.tag.stanford import StanfordNERTagger

# Drive locations of the Stanford NER jar and the English 3-class model file.
jar = '/content/drive/MyDrive/AI-Internship/stanford-ner-tagger/stanford-ner-4.2.0.jar'
model = '/content/drive/MyDrive/AI-Internship/stanford-ner-tagger/english.all.3class.distsim.crf.ser.gz'

# Sample English text; the adjacent literals are concatenated into one string.
sentence = (
    "Twenty miles east of Reno, Nev., "
    "where packs of wild mustangs roam free through "
    "the parched landscape, Tesla Gigafactory 1 "
    "sprawls near Interstate 80."
)

# Build the tagger: the model path comes first, then the jar path.
ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

# Break the text into word tokens.
words = nltk.word_tokenize(sentence)

# Tag the tokens and print the list of (token, tag) pairs.
print(ner_tagger.tag(words))


[('Twenty', 'O'), ('miles', 'O'), ('east', 'O'), ('of', 'O'), ('Reno', 'LOCATION'), (',', 'O'), ('Nev.', 'LOCATION'), (',', 'O'), ('where', 'O'), ('packs', 'O'), ('of', 'O'), ('wild', 'O'), ('mustangs', 'O'), ('roam', 'O'), ('free', 'O'), ('through', 'O'), ('the', 'O'), ('parched', 'O'), ('landscape', 'O'), (',', 'O'), ('Tesla', 'ORGANIZATION'), ('Gigafactory', 'ORGANIZATION'), ('1', 'O'), ('sprawls', 'O'), ('near', 'O'), ('Interstate', 'LOCATION'), ('80', 'LOCATION'), ('.', 'O')]


As we can see, the model is not bad: it recognizes some of the named entities. But this model was trained only on an English corpus, so we should train our own model on the desired language.

### Training our own (Persian) model

In this section, we should train our own model with our dataset.

Our dataset should be in the proper format: each row holds exactly one word and its role, with the word in the first column and its role in the second.

We should create 'train' folder in 'stanford-ner-tagger' folder. Then we put our dataset in it.

Now we should write some properties, such as the location of our model, the location of our training file, and so on, in a text file named 'prop.txt', and put it in the 'train' folder.

The format of our prop.txt file is as follows:

In [15]:
# trainFile = train/persian NER/train.txt
# serializeTo = dummy-ner-model-persian.ser.gz

# #structure of your training file; this tells the classifier
# #that the word is in column 0 and the correct answer is in
# #column 1

# map = word=0,answer=1

# useClassFeature=true
# useWord=true
# useNGrams=true
# noMidNGrams=true
# maxNGramLeng=6
# usePrev=true
# useNext=true
# useSequences=true
# usePrevSequences=true
# maxLeft=1
# useTypeSeqs=true
# useTypeSeqs2=true
# useTypeySequences=true
# wordShape=chris2useLC
# useDisjunctive=true

Train it, using:

In [26]:
%cd "/content/drive/MyDrive/AI-Internship/stanford-ner-tagger/"
!java -cp "stanford-ner-4.2.0.jar:lib/*" -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop train/prop.txt

/content/drive/MyDrive/AI-Internship/stanford-ner-tagger
Invoked on Sun Sep 19 09:01:13 UTC 2021 with arguments: -prop train/prop.txt
useTypeSeqs2=true
noMidNGrams=true
trainFile=train/train.tsv
maxNGramLeng=6
maxLeft=1
serializeTo=dummy-ner-model-persian.ser.gz
wordShape=chris2useLC
useDisjunctive=true
useClassFeature=true
useNGrams=true
useNext=true
usePrev=true
useTypeySequences=true
usePrevSequences=true
qnSize=10
useTypeSeqs=true
useSequences=true
map=word=0,answer=1
useWord=true
numFeatures = 1232256
Time to convert docs to feature indices: 18.3 seconds
Current memory used: 497m
numClasses: 13 [0=O,1=B-pers,2=B-event,3=I-event,4=I-pers,5=B-loc,6=I-loc,7=B-pro,8=I-pro,9=B-fac,10=I-fac,11=B-org,12=I-org]
numDocuments: 10241
numDatums: 334734
numFeatures: 1232256
Time to convert docs to data/labels: 9.1 seconds
Current memory used: 701m
Running gradient on 2 threads
numWeights: 71140524
QNMinimizer called on double function of 71140524 variables, using M = 10.
Exception in thread "m

In [22]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         108G   45G   64G  41% /
tmpfs            64M     0   64M   0% /dev
tmpfs           6.4G     0  6.4G   0% /sys/fs/cgroup
shm             5.9G     0  5.9G   0% /dev/shm
tmpfs           6.4G   28K  6.4G   1% /var/colab
/dev/sda1        81G   49G   33G  60% /etc/hosts
tmpfs           6.4G     0  6.4G   0% /proc/acpi
tmpfs           6.4G     0  6.4G   0% /proc/scsi
tmpfs           6.4G     0  6.4G   0% /sys/firmware
drive            15G   11G  4.1G  73% /content/drive
