In [30]:
import sys
from os import path as p
from __future__ import print_function

In [2]:
# Directory to put at the front of the module search path
# (presumably where the polyglot sources live -- confirm for your machine).
polyglot_dir = '/data/polyglot/'

# Guarded so that re-running this cell never adds a second copy of the entry.
if sys.path.count(polyglot_dir) == 0:
    sys.path[:0] = [polyglot_dir]

# Named Entity Extraction

The named entity extraction task aims to extract phrases from plain text that correspond to entities.
Polyglot recognizes 3 categories of entities:

- Locations (Tag: `I-LOC`): cities, countries, regions, continents, neighborhoods, administrative divisions ...
- Organizations (Tag: `I-ORG`): sports teams, newspapers, banks, universities, schools, non-profits, companies, ...
- Persons (Tag: `I-PER`): politicians, scientists, artists, athletes ...

## Languages Coverage

The models were trained on datasets extracted automatically from Wikipedia.
Polyglot currently supports 40 major languages.

In [7]:
from polyglot.downloader import downloader

# Ask the downloader which languages it lists for the "ner2" task and
# show them on a single comma-separated line.
ner_languages = downloader.supported_languages("ner2")
print(", ".join(ner_languages))

Polish, Turkish, Russian, Czech, Arabic, Korean, Catalan; Valencian, Indonesian, Vietnamese, Thai, Romanian, Moldavian, Moldovan, Tagalog, Danish, Finnish, German, Persian, Latvian, Chinese, French, Portuguese, Slovak, Hebrew (modern), Malay, Slovene, Bulgarian, Hindi, Japanese, Hungarian, Croatian, Ukrainian, Serbian, Lithuanian, Norwegian, Dutch, Swedish, English, Greek, Modern, Spanish; Castilian, Italian, Estonian


## Library Interface

Entities inside a text object or a sentence are represented as chunks.
Each chunk identifies the start and the end indices of the word subsequence within the text.

In [16]:
from polyglot.text import Text

In [32]:
# Sample input: a single English sentence. The triple-quoted literal also
# carries a leading and a trailing newline around the sentence.
blob = """
The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world".
"""
# Wrap the raw string in a polyglot Text object; the next cell iterates over
# its .sentences and reads each sentence's .entities.
text = Text(blob)

In [33]:
# Walk the sentences of `text` and print each one followed by the entity
# chunks attached to it, with newline strings between and after them.
for sent in text.sentences:
    found = sent.entities
    print(sent, "\n", found, "\n")

The Israeli Prime Minister Benjamin Netanyahu has warned that Iran poses a "threat to the entire world". 
 [I-ORG([u'Israeli']), I-PER([u'Benjamin', u'Netanyahu']), I-LOC([u'Iran'])] 



## Command Line Interface

In [34]:
%%bash
# Fetch the English "embeddings2.en" package with the polyglot command-line tool.
# NOTE(review): entity extraction presumably also needs the matching NER model
# package (likely "ner2.en") -- confirm and add it to this command if missing.
polyglot download embeddings2.en

[polyglot_data] Downloading package embeddings2.en to
[polyglot_data]     /home/rmyeid/polyglot_data...
[polyglot_data]   Package embeddings2.en is already up-to-date!




### Citation

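This work is a direct implementation of the research described in the following paper.
The authors of this library strongly encourage you to cite it if you use this software:

Rami Al-Rfou, Vivek Kulkarni, Bryan Perozzi, and Steven Skiena. *Polyglot-NER: Massive Multilingual Named Entity Recognition.* Proceedings of the 2015 SIAM International Conference on Data Mining (SDM), 2015.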

## References

- [Polyglot-NER project page](https://bit.ly/polyglot-ner).
- [Wikipedia on NER](http://en.wikipedia.org/wiki/Named-entity_recognition).