# Named Entity Recognition

## NER with NLTK

In [1]:
!pip install parsivar



In [2]:
# Loading NLTk
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

ex = 'my name is Ali and I live in Iran'
entities = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(entities)

(S
  my/PRP$
  name/NN
  is/VBZ
  (PERSON Ali/NNP)
  and/CC
  I/PRP
  live/VBP
  in/IN
  (GPE Iran/NNP))


As we will see, NLTK's default chunker cannot recognize named entities in Persian text.

In [13]:
ex = 'من علی هستم و در ایران زندگی می کنم'
entities = nltk.ne_chunk(pos_tag(word_tokenize(ex)))
print(entities)

(S
  من/JJ
  (ORGANIZATION علی/NNP)
  هستم/NNP
  و/NNP
  در/NNP
  ایران/NNP
  زندگی/NNP
  می/NNP
  کنم/NN)


## NER with lookup tables

Let's do some exploratory analysis on our dataset.

In [None]:
import pandas as pd
df = pd.read_csv('/content/drive/MyDrive/AI-Internship/country_list.csv', header = None)

As we see, our country list has a single column. Each row consists of a row number, the three-letter English code of the country, and the Persian name of the country. This dataset is not in our desired form, so we should split each row into its parts, insert them into two new columns, and delete the original column.

In [None]:
df.head()

Unnamed: 0,0
0,1 'AFG' 'افغانستان'
1,2 'ALA' 'جزایر آلند'
2,3 'ALB' 'آلبانی'
3,4 'DZA' 'الجزایر'
4,5 'ASM' 'ساموای آمریکا'


In [None]:
print(f'dataset shape : {df.shape}')

dataset shape : (244, 1)


In [None]:
df.columns

Int64Index([0], dtype='int64')

**Hint:** `df[0][0]` means the first row (index label 0) of the column whose name is `0`.

In [None]:
print(df[0][0])

1 'AFG' 'افغانستان'


In [None]:
df[0][0].split()

['1', "'AFG'", "'افغانستان'"]

Let's look at another row, one whose Persian name has more than one word.

In [None]:
print(f'df[0][27] : {df[0][27]}')
lst = df[0][27].split()[2:]
print(f'country name : {lst}')

df[0][27] : 28 'BIH' 'بوسنی و هرزگوین'
country name : ["'بوسنی", 'و', "هرزگوین'"]


Let's combine the parts of this country name.

In [None]:
lst = ' '.join(i for i in lst)
lst

"'بوسنی و هرزگوین'"

Because we want to remove the leading and trailing quotation marks, we slice this string with `[1:-1]`.

In [None]:
lst[1:-1]

'بوسنی و هرزگوین'

In [None]:
# Every raw row looks like "<n> '<code>' '<Persian name>'". Split each row on
# whitespace once, then build both lists from the resulting tokens.
tokenized_rows = [df[0][idx].split() for idx in range(df.shape[0])]

# Token 1 is the quoted code; [1:-1] drops the surrounding quotes.
en = [tokens[1][1:-1] for tokens in tokenized_rows]

# Tokens 2.. are the (possibly multi-word) quoted Persian name: re-join the
# words with single spaces, then drop the first and last character.
fa = [' '.join(tokens[2:])[1:-1] for tokens in tokenized_rows]

print(f'10 elements of en : {en[:10]}')
print(f'10 elements of fa : {fa[:10]}')

10 elements of en : ['AFG', 'ALA', 'ALB', 'DZA', 'ASM', 'AND', 'AGO', 'AIA', 'ATA', 'ATG']
10 elements of fa : ['افغانستان', 'جزایر آلند', 'آلبانی', 'الجزایر', 'ساموای آمریکا', 'آندورا', 'آنگولا', 'آنگویلا', 'جنوبگان', 'آنتیگوا و باربودا']


Let's insert two columns into the country dataframe: one for the English code of each country and another for its Persian name.

In [None]:
country = df.copy()

In [None]:
country.insert(1, "en", en)
country.insert(2, "fa", fa)

In [None]:
country.head()

Unnamed: 0,0,en,fa
0,1 'AFG' 'افغانستان',AFG,افغانستان
1,2 'ALA' 'جزایر آلند',ALA,جزایر آلند
2,3 'ALB' 'آلبانی',ALB,آلبانی
3,4 'DZA' 'الجزایر',DZA,الجزایر
4,5 'ASM' 'ساموای آمریکا',ASM,ساموای آمریکا


In [None]:
country.drop([0], axis=1, inplace=True)

In [None]:
country.head()

Unnamed: 0,en,fa
0,AFG,افغانستان
1,ALA,جزایر آلند
2,ALB,آلبانی
3,DZA,الجزایر
4,ASM,ساموای آمریکا


In [None]:
country.tail()

Unnamed: 0,en,fa
239,WLF,والیس و فوتونا
240,ESH,صحرای غربی
241,YEM,یمن
242,ZMB,زامبیا
243,ZWE,زیمبابوه'


As we see, the last row still ends with a stray quotation mark, so we should correct it.

In [None]:
# The last row kept a trailing quote ("زیمبابوه'"). Write through .loc instead
# of chained indexing (country.fa[243] = ...), and use rstrip so that
# re-running this cell cannot chop off a real character of the name.
country.loc[243, 'fa'] = country.loc[243, 'fa'].rstrip("'")

In [None]:
country.tail()

Unnamed: 0,en,fa
239,WLF,والیس و فوتونا
240,ESH,صحرای غربی
241,YEM,یمن
242,ZMB,زامبیا
243,ZWE,زیمبابوه


In [None]:
# #Save dataframe
# country.to_pickle('/content/drive/MyDrive/AI-Internship/country_lookup')

In [14]:
#load the preprocessing dataframe
country = pd.read_pickle(r'/content/drive/MyDrive/AI-Internship/country_lookup')

In [15]:
country.shape

(244, 2)

In [16]:
country.isnull().sum()

en    0
fa    0
dtype: int64

In [17]:
country.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   en      244 non-null    object
 1   fa      244 non-null    object
dtypes: object(2)
memory usage: 3.9+ KB


### Create model

Let's develop NER with lookup tables.
If our sentence contains any of the names in the lookup table, this model recognizes it and assigns it to the predefined category.

In [18]:
def lookup_model(sentence, names=None):
  """Tag every lookup-table name that occurs in `sentence`.

  Args:
    sentence: Text to scan.
    names: Optional iterable of names to look for. When omitted, the Persian
      names in the module-level `country.fa` column are used, so existing
      one-argument calls behave exactly as before.

  Returns:
    A list of ('GRE', name) tuples, one per matching name, in the order the
    names appear in the lookup list.
  """
  if names is None:
    names = country.fa.values
  # Plain substring test: a name also matches inside a longer word
  # (e.g. 'ایران' is found in 'ایرانی').
  return [('GRE', name) for name in names if name in sentence]


Let's perform this model on some examples.

In [19]:
print(lookup_model('من علی هستم و در ایران زندکی می کنم'))

[('GRE', 'ایران')]


In [20]:
print(lookup_model('.من برای تعطیلات به جزیره هرد و جزایر مک زیبا خواهم رفت'))

[('GRE', 'جزیره هرد و جزایر مک')]


In [21]:
print(lookup_model('تعداد دانشجویان ایرانی  که به کانادا مهاجرت می کنند روز به روز بیشتر می شود.'))

[('GRE', 'کانادا'), ('GRE', 'ایران')]


## NER With Regex 

### Create model

If the sentence contains a specific keyword such as "کشور", this model recognizes the word right after that keyword as a country (or another related category).

In [22]:
import re

def regex_model(sentence):
  """Tag the token that follows each standalone 'کشور' in `sentence`.

  For every whole-word match of the keyword, the single character right after
  it is skipped and the text up to the next space (or the end of the
  sentence) is taken as the entity. Trailing punctuation is stripped and
  empty candidates are dropped.

  Args:
    sentence: Text to scan.

  Returns:
    A list of ('GRE', token) tuples in order of appearance.
  """
  result = []
  for match in re.finditer(r"\bکشور\b", sentence):
    start = match.end(0) + 1          # skip the separator after the keyword
    stop = sentence.find(" ", start)
    if stop == -1:
      # No further space: the entity is the last token. The original slice
      # collapsed to '' here and emitted ('GRE', '').
      stop = len(sentence)
    candidate = sentence[start:stop].rstrip(".,:;!?،؛؟")
    if candidate:
      result.append(('GRE', candidate))
  return result


In [23]:
sentence = 'من در کشور ایران زندگی میکنم. من کشور خود را دوست دارم. من کشورم ایران را دوست دارم. او برای سفر به کشور جمهوری چک می رود.'

In [24]:
print(regex_model(sentence))

[('GRE', 'ایران'), ('GRE', 'خود'), ('GRE', 'جمهوری')]


As we can see, this function does not work well (for example, it recognizes 'خود' as a country).

In [None]:
## Regex look behind
# ind = re.search('(?<=کشور )(\w+)', sentence).groups()

## NER with lookup table and Regex

In this function we want to use both of the lookup table approach and the regex approach to recognize named entities.

In [27]:
def lookup_regex_model(sentence):
  """Combine the lookup-table and regex models and drop duplicate hits.

  Args:
    sentence: Text to scan.

  Returns:
    A list of unique ('GRE', name) tuples: lookup-table hits first, then
    regex hits, each in the order its model produced it.
  """
  combined = lookup_model(sentence) + regex_model(sentence)
  # dict.fromkeys removes duplicates while keeping first-seen order. The
  # previous list(set(...)) returned the tuples in hash order, which is not
  # stable between interpreter runs for strings.
  return list(dict.fromkeys(combined))


In [28]:
sentence = 'من در ایران زندگی میکنم. او در کشور جمهوری چک زندگی می کند.'
print(lookup_regex_model(sentence))

[('GRE', 'ایران'), ('GRE', 'جمهوری'), ('GRE', 'جمهوری چک')]


As we can see, this function reports both 'جمهوری' (from the regex rule) and 'جمهوری چک' (from the lookup table) as if they were two different countries. Still, this approach combines the benefits of the regex and lookup-table models.

## NER With Stanford NER and NLTK 

helpful link :https://medium.com/sicara/train-ner-model-with-nltk-stanford-tagger-english-french-german-6d90573a9486

### Implementing NER with Stanford NER / NLTK

Because Stanford NER tagger is written in Java, you are going to need a proper Java Virtual Machine to be installed on your computer.

In [5]:
import os       # used to set the JAVA_HOME environment variable
def install_java():
  """Install headless OpenJDK 8 with apt-get and point JAVA_HOME at it.

  Uses IPython `!` shell escapes, so it only runs inside a notebook kernel.
  The install output is discarded; the active Java version is printed last.

  NOTE(review): the recorded output of this cell reports OpenJDK 11.0.11,
  not 8, so the `java` found on PATH is presumably not the JDK installed
  here -- setting JAVA_HOME alone does not change PATH. Confirm which JVM
  the later `!java ...` cells actually use.
  """
  !apt-get install -y openjdk-8-jdk-headless -qq > /dev/null      #install openjdk
  os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"     # set environment variable
  !java -version       #check java version
install_java()

openjdk version "11.0.11" 2021-04-20
OpenJDK Runtime Environment (build 11.0.11+9-Ubuntu-0ubuntu2.18.04)
OpenJDK 64-Bit Server VM (build 11.0.11+9-Ubuntu-0ubuntu2.18.04, mixed mode, sharing)


Let's download Stanford NER model from https://nlp.stanford.edu/software/CRF-NER.html and unzip it.

In [None]:
# import zipfile
# # unzip image file
# local_zip = '/content/drive/MyDrive/AI-Internship/stanford-ner-4.2.0.zip'
# zip_ref = zipfile.ZipFile(local_zip, 'r')
# zip_ref.extractall('/content/drive/MyDrive/AI-Internship/stanford-ner-4.2.0')
# zip_ref.close()

After unzipping this file, we need two things from this folder: the NER tagger engine (stanford-ner-4.2.0.jar) and our model (english.all.3class.distsim.crf.ser.gz).

First we should create a folder in our drive, whose name is 'stanford-ner-tagger'. Then put .jar and .gz in this folder.


In [6]:
import nltk
from nltk.tag.stanford import StanfordNERTagger

# Paths (on the mounted Drive) to the Stanford NER engine jar and to the
# pre-trained English 3-class model.
jar = '/content/drive/MyDrive/AI-Internship/stanford-ner-tagger/stanford-ner-4.2.0.jar'
model = '/content/drive/MyDrive/AI-Internship/stanford-ner-tagger/english.all.3class.distsim.crf.ser.gz'

# English example sentence used to try out the tagger.
sentence = u"Twenty miles east of Reno, Nev., " \
    "where packs of wild mustangs roam free through " \
    "the parched landscape, Tesla Gigafactory 1 " \
    "sprawls near Interstate 80."

# Prepare the NER tagger with the English model.
ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

# Tokenize: split the sentence into words.
words = nltk.word_tokenize(sentence)

# Run the NER tagger on the words and print the (token, label) pairs.
print(ner_tagger.tag(words))


[('Twenty', 'O'), ('miles', 'O'), ('east', 'O'), ('of', 'O'), ('Reno', 'LOCATION'), (',', 'O'), ('Nev.', 'LOCATION'), (',', 'O'), ('where', 'O'), ('packs', 'O'), ('of', 'O'), ('wild', 'O'), ('mustangs', 'O'), ('roam', 'O'), ('free', 'O'), ('through', 'O'), ('the', 'O'), ('parched', 'O'), ('landscape', 'O'), (',', 'O'), ('Tesla', 'ORGANIZATION'), ('Gigafactory', 'ORGANIZATION'), ('1', 'O'), ('sprawls', 'O'), ('near', 'O'), ('Interstate', 'LOCATION'), ('80', 'LOCATION'), ('.', 'O')]


As we can see, this model is not bad: it recognizes some of the named entities. But this model was trained only on an English corpus, so we should train our own model on the desired language.

### Training our own (Persian) model

In this section, we should train our own model with our dataset.

Our dataset should be in a proper format: each row holds exactly one word and its label, with the word in the first column and the label in the second column.

We should create 'train' folder in 'stanford-ner-tagger' folder. Then we put our dataset in it.

Now we should write some properties (such as our model path, our training file path, and so on) in a text file named 'prop.txt', and put it in the 'train' folder.

The format of our prop.txt file is as follow:

In [None]:
## location of the training file
# trainFile = train/train.tsv

## location where you would like to save (serialize) your
## classifier; adding .gz at the end automatically gzips the file,
# #making it smaller, and faster to load
# serializeTo = dummy-ner-model-persian.ser.gz

## structure of your training file; this tells the classifier that
## the word is in column 0 and the correct answer is in column 1
# map = word=0,answer=1

## these are the features we'd like to train with
## some are discussed below, the rest can be
## understood by looking at NERFeatureFactory
# useClassFeature=true
# useWord=true

## This specifies the order of the CRF: order 1 means that features
## apply at most to a class pair of previous class and current class
## or current class and next class.
# maxLeft=1

## the last 4 properties deal with word shape features
# useTypeSeqs=true
# useTypeSeqs2=true
# useTypeySequences=true
# wordShape=chris2useLC

## these features are set to use less memory (if we use Ngrams or don't set the following features,
## it gives us an 'Exception in thread "main" java.lang.OutOfMemoryError: Java heap space' error in the output)
# qnSize=2
# saveFeatureIndexToDisk = true
# useObservedSequencesOnly=true
# featureDiffThresh=0.05

 In training, CRFClassifier will train one model, drop all the features with weight (absolute value) beneath the given threshold, and then train a second model. Training thus takes longer, but the resulting model is smaller and faster at runtime, and usually has very similar performance for a reasonable threshold such as 0.05.(helpful link : https://nlp.stanford.edu/software/crf-faq.shtml#a)

#### Train the model

Train it, using:

This section takes about 30 minutes.

In [7]:
%cd "/content/drive/MyDrive/AI-Internship/stanford-ner-tagger/"
!java -cp "stanford-ner-4.2.0.jar:lib/*" -mx4g edu.stanford.nlp.ie.crf.CRFClassifier -prop train/prop.txt

/content/drive/MyDrive/AI-Internship/stanford-ner-tagger
Invoked on Mon Sep 20 05:14:30 UTC 2021 with arguments: -prop train/prop.txt
useTypeSeqs2=true
trainFile=train/train.tsv
useObservedSequencesOnly=true
saveFeatureIndexToDisk=true
maxLeft=1
wordShape=chris2useLC
serializeTo=dummy-ner-model-persian.ser.gz
useClassFeature=true
useTypeySequences=true
featureDiffThresh=0.05
qnSize=2
useTypeSeqs=true
map=word=0,answer=1
useWord=true
numFeatures = 917330
Time to convert docs to feature indices: 10.8 seconds
Current memory used: 389m
numClasses: 13 [0=O,1=B-pers,2=B-event,3=I-event,4=I-pers,5=B-loc,6=I-loc,7=B-pro,8=I-pro,9=B-fac,10=I-fac,11=B-org,12=I-org]
numDocuments: 10241
numDatums: 334734
numFeatures: 917330
Time to convert docs to data/labels: 4.7 seconds
Current memory used: 460m
Writing feature index to temporary file.
Running gradient on 2 threads
numWeights: 19684244
QNMinimizer called on double function of 19684244 variables, using M = 2.
               An explanation of the 

As we can see in the output we have 13 classes for named entities (numClasses: 13 [0=O,1=B-pers,2=B-event,3=I-event,4=I-pers,5=B-loc,6=I-loc,7=B-pro,8=I-pro,9=B-fac,10=I-fac,11=B-org,12=I-org]).

This code should output 'dummy-ner-model-persian.ser.gz' file as a trained model.

Let's use this model on persian sentences.

In [8]:
import nltk
from nltk.tag.stanford import StanfordNERTagger

sentence = 'من علی هستم و در ایران زندگی می کنم.' 

jar = '/content/drive/MyDrive/AI-Internship/stanford-ner-tagger/stanford-ner-4.2.0.jar'
model = '/content/drive/MyDrive/AI-Internship/stanford-ner-tagger/dummy-ner-model-persian.ser.gz'

ner_tagger = StanfordNERTagger(model, jar, encoding='utf8')

words = nltk.word_tokenize(sentence)
print(ner_tagger.tag(words))

[('من', 'O'), ('علی', 'O'), ('هستم', 'O'), ('و', 'O'), ('در', 'O'), ('ایران', 'B-loc'), ('زندگی', 'O'), ('می', 'O'), ('کنم', 'O'), ('.', 'O')]


As we can see, this model is not bad. It recognizes 'ایران' as a location but can't recognize 'علی' as a person. But in comparison with the NLTK default function for NER, it works better.

Let's try this model on a sentence from train.tsv.

In [11]:
sentence2 = 'دکتر اصغری دبیر چهارمین همایش انجمن زمین شناسی ایران در این زمینه گفت.'

words = nltk.word_tokenize(sentence2)
print(ner_tagger.tag(words))

[('دکتر', 'O'), ('اصغری', 'B-pers'), ('دبیر', 'O'), ('چهارمین', 'B-event'), ('همایش', 'I-event'), ('انجمن', 'I-event'), ('زمین', 'I-event'), ('شناسی', 'I-event'), ('ایران', 'I-event'), ('در', 'O'), ('این', 'O'), ('زمینه', 'O'), ('گفت', 'O'), ('.', 'O')]


It works well on this sentence because this sentence is in our dataset and model trained on it.

#### Evaluate the model

Let's test the model on the validation (dev) file, `train/valid.tsv`.
The output should be in the following format :

*   column 0 : tokens
*   column 1 : actual label
*   column 2 : predicted label by the trained model

At the end of the output, we can see F1 score and other accuracy metric for each class.




In [10]:
%cd "/content/drive/MyDrive/AI-Internship/stanford-ner-tagger/"
!java -cp "stanford-ner-4.2.0.jar" edu.stanford.nlp.ie.crf.CRFClassifier -loadClassifier dummy-ner-model-persian.ser.gz -testFile train/valid.tsv

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
خدمات	O	O
استاد	O	O
لطفی	B-pers	B-pers
در	O	O
معرفی	O	O
فلسفه	O	O
غرب	O	O
در	O	O
0	O	O
سال	O	O
اخیر	O	O
را	O	O
تمامی	O	O
اهل	O	O
فلسفه	O	O
و	O	O
فرهنگ	O	O
ارج	O	O
می‌نهند	O	O
و	O	O
لازم	O	O
است	O	O
که	O	O
حق‌شناسی	O	O
و	O	O
احترام	O	O
خود	O	O
را	O	O
به	O	O
این	O	O
انسان	O	O
شریف	O	O
که	O	O
در	O	O
گوشه	O	O
تنهائی	O	O
به	O	O
ترجمه	O	O
این‌همه	O	O
آثار	O	O
مهم	O	O
همت	O	O
گماشته	O	O
است	O	O
،	O	O
اظهار	O	O
کنیم	O	O
و	O	O
سپاسگزار	O	O
زحمات	O	O
او	O	O
باشیم	O	O
.	O	O

دوره	O	O
اول	O	O
را	O	O
ترجمه	O	O
آثار	O	O
افلاطون	B-pers	B-pers
و	O	O
فلوطین	B-pers	B-pers
بیان	O	O
کرد	O	O
و	O	O
ترجمه	O	O
آثار	O	O
یاسپرس	B-pers	B-pers
،	O	O
فیلسوف	O	O
آلمانی	O	O
را	O	O
دوره	O	O
دوم	O	O
کارهای	O	O
دکتر	O	O
لطفی	B-pers	B-pers
دانست	O	O
.	O	O

سپس	O	O
استاد	O	O
محمدحسن	B-pers	B-pers
لطفی	I-pers	I-pers
با	O	O
سپاسگزاری	O	O
از	O	O
بانیان	O	O
این	O	O
نشست	O	O
،	O	O
به	O	O
توجیه	O	O
این	O	O
نکته	O	O
پرداخت	O	O
که	O	O
چرا	O	O
سه	O	O
کتاب	B-pro	B-pr

As we can see, our model has an F1 score of 0.9925 on the validation (dev) file.

**Important note:** the train and dev files should be in .tsv (tab-separated) format.

## NER with Polyglot

helpful link : https://www.geeksforgeeks.org/natural-language-processing-using-polyglot-introduction/

In [1]:
pip install polyglot

Collecting polyglot
  Downloading polyglot-16.7.4.tar.gz (126 kB)
[?25l[K     |██▋                             | 10 kB 24.6 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 29.4 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 30.9 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 32.9 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 35.7 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 27.4 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 24.8 MB/s eta 0:00:01[K     |████████████████████▊           | 81 kB 25.8 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 27.5 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 29.1 MB/s eta 0:00:01[K     |████████████████████████████▌   | 112 kB 29.1 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 29.1 MB/s eta 0:00:01[K     |████████████████████████████████| 126 kB 29.1 MB/s 
[?25hBuild

In [5]:
# installing dependency packages
!pip install pyicu 

Collecting pyicu
  Downloading PyICU-2.7.4.tar.gz (298 kB)
[?25l[K     |█                               | 10 kB 21.1 MB/s eta 0:00:01[K     |██▏                             | 20 kB 23.2 MB/s eta 0:00:01[K     |███▎                            | 30 kB 26.0 MB/s eta 0:00:01[K     |████▍                           | 40 kB 29.8 MB/s eta 0:00:01[K     |█████▌                          | 51 kB 32.8 MB/s eta 0:00:01[K     |██████▋                         | 61 kB 34.3 MB/s eta 0:00:01[K     |███████▊                        | 71 kB 26.6 MB/s eta 0:00:01[K     |████████▉                       | 81 kB 28.0 MB/s eta 0:00:01[K     |█████████▉                      | 92 kB 27.1 MB/s eta 0:00:01[K     |███████████                     | 102 kB 28.7 MB/s eta 0:00:01[K     |████████████                    | 112 kB 28.7 MB/s eta 0:00:01[K     |█████████████▏                  | 122 kB 28.7 MB/s eta 0:00:01[K     |██████████████▎                 | 133 kB 28.7 MB/s eta 0:00:01[K    

In [6]:
# installing dependency packages
!pip install Morfessor
!pip install pycld2 

Collecting Morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6
Collecting pycld2
  Downloading pycld2-0.41.tar.gz (41.4 MB)
[K     |████████████████████████████████| 41.4 MB 38 kB/s 
[?25hBuilding wheels for collected packages: pycld2
  Building wheel for pycld2 (setup.py) ... [?25l[?25hdone
  Created wheel for pycld2: filename=pycld2-0.41-cp37-cp37m-linux_x86_64.whl size=9834277 sha256=735e32a3cddf62033208b80a85d1cb3b93e5eaa7567439be1024aed6408885b1
  Stored in directory: /root/.cache/pip/wheels/ed/e4/58/ed2e9f43c07d617cc81fe7aff0fc6e42b16c9cf6afe960b614
Successfully built pycld2
Installing collected packages: pycld2
Successfully installed pycld2-0.41


In [7]:
%%bash
polyglot download ner2.en    # downloading model ner

[polyglot_data] Downloading package ner2.en to /root/polyglot_data...


In [8]:
%%bash
polyglot download pos2.en    # downloading model pos

[polyglot_data] Downloading package pos2.en to /root/polyglot_data...


### Language detection

In [10]:
from polyglot.detect import Detector

persian_text = 'من علی هستم و در ایران زندگی می کنم.'
detector = Detector(persian_text)
print(detector.language)

name: Persian     code: fa       confidence:  98.0 read bytes:  1089


### Tokenization

In [16]:
# Import Text from the polyglot library.
from polyglot.text import Text

sentence = 'من علی هستم و در ایران زندگی می کنم. علی امسال به دانشگاه می رود'

# Wrap the raw string in a polyglot Text object.
text = Text(sentence)

# Split the text into words.
print(f'tokens : {text.words}')
print('\n')

# Split the text into sentences.
print(f'senteces : {text.sentences}')

tokens : ['من', 'علی', 'هستم', 'و', 'در', 'ایران', 'زندگی', 'می', 'کنم', '.', 'علی', 'امسال', 'به', 'دانشگاه', 'می', 'رود']


senteces : [Sentence("من علی هستم و در ایران زندگی می کنم."), Sentence("علی امسال به دانشگاه می رود")]


### Named Entity Recognition

Polyglot recognizes three categories of entities:


1.   Location
2.   Organization
3.   Persons



In [21]:
%%bash
polyglot download embeddings2.fa   

[polyglot_data] Downloading package embeddings2.fa to
[polyglot_data]     /root/polyglot_data...


In [23]:
%%bash
polyglot download ner2.fa

[polyglot_data] Downloading package ner2.fa to /root/polyglot_data...


In [24]:
from polyglot.text import Text

sentence = 'من علی هستم و در ایران زندگی می کنم. علی امسال به دانشگاه می رود'

text = Text(sentence, hint_language_code='fa')
print(text.entities)

[I-PER(['علی']), I-LOC(['ایران']), I-PER(['علی']), I-ORG(['دانشگاه'])]


### Sentiment Analysis

In [30]:
%%bash
polyglot download sentiment2.fa  # downloading model sentimen

[polyglot_data] Downloading package sentiment2.fa to
[polyglot_data]     /root/polyglot_data...


In [31]:
print(text.polarity)

1.0
