# Configuring ES multi-lingual index with RLI
Below we run through the following steps:
1. Install RBL Plugin.
2. Create an empty index.
3. Add custom analysis chains and dynamic template mapping that use RBL's tokenization and normalization.
4. Compare analysis of default and Rosette analysis.
5. Add a handful of records in various languages.
6. View dynamically created fields. Note correct analyzers.

----
### 1) Install RBL Plugin

In [None]:
# Simple Plugin Installation
# ./bin/plugin --install analysis-rbl-je --url file:///path/rbl-je-elasticsearch-1.0.0.zip

# Add your license file
# cp /path/rlp-license.xml plugins/analysis-rbl-je/rbl-je-7.12.1/licenses/ 

----
### 2) Create an empty index

In [None]:
# Python client for Elasticsearch. Install the python binding first if needed:
#   pip install elasticsearch
import json
from elasticsearch import Elasticsearch

# Global settings for Elasticsearch, shared by the cells below
es_host = "_enter_your_elastic_hostname_here_"  # e.g. "localhost"
es_port = "9200"
index_name = "language_analysis_demo"
document_type = "customers"

# Open the connection to the Elasticsearch server
es = Elasticsearch(host=es_host, port=es_port)

In [None]:
# Start from a clean slate: delete any existing index with this name
# (comment this line out to keep an existing index).
# ignore=[404] suppresses the IndexMissingException raised when there is nothing to delete.
print("delete %s %s" % (index_name, es.indices.delete(index=index_name, ignore=[404])))

# Create the index. ignore=[400] suppresses IndexAlreadyExistsException.
print("create %s %s" % (index_name, es.indices.create(index=index_name, ignore=[400])))

In [None]:
# Show the settings of the newly created index -- there shouldn't be much here yet.
settings = es.indices.get_settings(index=index_name)
print(json.dumps(settings, indent=2))

In [None]:
# Show the mappings of the newly created index -- there shouldn't be much here yet.
mapping = es.indices.get_mapping(index=index_name)
print(json.dumps(mapping, indent=2))

----
### 3) Add custom analysis chains that use RBL's tokenization and normalization
 - Close the index to make edits
 - Add custom analysis chains and dynamic template mapping that use RBL's tokenization and normalization.
 - Add a document type with dynamic template mapping that automatically associates new fields with the correct analysis chains based on a 3-letter language code suffix on the field name.
 - Reopen the index and inspect the new settings.

In [None]:
# Close the index so that its settings can be edited
print("close %s %s" % (index_name, es.indices.close(index=index_name)))

In [None]:
# Custom analysis chains for ENGLISH, GERMAN, and JAPANESE. Each language gets an RBL
# tokenizer, an RBL token filter, and a custom analyzer that wires the two together.
# Additional options can be set here; see section 4 of the RBL for Elasticsearch guide.
rbl_tokenizers = {
    "rbl_eng_tokenizer": {"type": "rbl", "language": "eng"},
    "rbl_deu_tokenizer": {"type": "rbl", "language": "deu"},
    "rbl_jpn_tokenizer": {"type": "rbl", "language": "jpn"},
}

rbl_filters = {
    "rbl_eng_filter": {"type": "rbl", "language": "eng", "addLemmaTokens": "true"},
    "rbl_deu_filter": {"type": "rbl", "language": "deu", "addLemmaTokens": "true"},
    "rbl_jpn_filter": {"type": "rbl", "language": "jpn", "addLemmaTokens": "true"},
}

rbl_analyzers = {
    "rbl_eng_analyzer": {
        "type": "custom",
        "tokenizer": "rbl_eng_tokenizer",
        "filter": ["rbl_eng_filter"],
    },
    "rbl_deu_analyzer": {
        "type": "custom",
        "tokenizer": "rbl_deu_tokenizer",
        "filter": ["rbl_deu_filter"],
    },
    "rbl_jpn_analyzer": {
        "type": "custom",
        "tokenizer": "rbl_jpn_tokenizer",
        "filter": ["rbl_jpn_filter"],
    },
}

# Kept as the final bare expression so the notebook still displays the server response.
es.indices.put_settings(index=index_name, body={
    "analysis": {
        "analyzer": rbl_analyzers,
        "filter": rbl_filters,
        "tokenizer": rbl_tokenizers,
    }
})

In [None]:
# Create a dynamic template mapping so that new string fields whose names end in a
# 3-letter language code suffix (*_jpn, *_deu, *_eng) use the matching RBL analyzer.
es.indices.put_mapping(index=index_name, doc_type=document_type, body={
    "dynamic_templates": [
        {"japanese_text": {
            "match": "*_jpn",
            "match_mapping_type": "string",
            "mapping": {
                "type": "string",
                "analyzer": "rbl_jpn_analyzer"
            }
        }},
        {"german_text": {
            "match": "*_deu",
            "match_mapping_type": "string",
            "mapping": {
                "type": "string",
                # Must be the German analyzer name from the index settings
                # (was the non-existent "deu_jpn_analyzer").
                "analyzer": "rbl_deu_analyzer"
            }
        }},
        {"en": {
            "match": "*_eng",
            "match_mapping_type": "string",
            "mapping": {
                "type": "string",
                "analyzer": "rbl_eng_analyzer"
            }
        }}
    ]
})

In [None]:
# Reopen the index to make it searchable again
print("open %s %s" % (index_name, es.indices.open(index=index_name)))

In [None]:
# Show the settings of the updated index
settings = es.indices.get_settings(index=index_name)
print(json.dumps(settings, indent=2))

In [None]:
# View the mapping of the updated index
mapping = es.indices.get_mapping(index=index_name)
print(json.dumps(mapping, indent=2))

----
### 4) Compare analysis of default and Rosette analysis
 - German decompounding example
 - Japanese Tokenization example

In [None]:
# German compound word run through the Elasticsearch "standard" analyzer
tokens = es.indices.analyze(index=index_name, body="Migrationsforscher", analyzer="standard")
print(json.dumps(tokens, indent=2))

In [None]:
# The same German compound word run through the custom RBL German analyzer (rbl_deu_analyzer)
tokens = es.indices.analyze(index=index_name, body="Migrationsforscher", analyzer="rbl_deu_analyzer")
print(json.dumps(tokens, indent=2))

**Japanese Tokenization example**
 - 水泳の世界選手権第
 - 9 Characters
 - Translates to roughly "The first swimming world championships"

In [None]:
# Japanese sample run through the naive "whitespace" analyzer
tokens = es.indices.analyze(index=index_name, body="水泳の世界選手権第", analyzer="whitespace")
print(json.dumps(tokens, indent=2))

In [None]:
# Japanese sample run through the Elasticsearch "standard" analyzer
tokens = es.indices.analyze(index=index_name, body="水泳の世界選手権第", analyzer="standard")
print(json.dumps(tokens, indent=2))

In [None]:
# The same Japanese sample run through the custom RBL Japanese analyzer (rbl_jpn_analyzer)
tokens = es.indices.analyze(index=index_name, body="水泳の世界選手権第", analyzer="rbl_jpn_analyzer")
print(json.dumps(tokens, indent=2))

----
### 5) Add a handful of records in various languages

In [None]:
# Add 6 sample documents: four Japanese (body_jpn), one English (body_eng), and one whose
# field suffix (body_xxx) matches none of the *_jpn / *_deu / *_eng dynamic templates.
# The field-name suffix is what selects the analyzer for each dynamically created field.
# NOTE(review): the leading "T" in document 1 looks like a stray character -- confirm.
sample_documents = [
    (1, {"body_jpn": "T水泳の世界選手権第１１日は２７日、豪州・メルボルンで行われ、女子百メートル背泳ぎ決勝で中村礼子（東京ＳＣ）が１分０秒４０の日本人として大会初の銅メダルを獲得した。"}),
    (2, {"body_jpn": "優勝は５９秒４４の世界新をマークしたアメリカ人のナタリー・コーグリン。"}),
    (3, {"body_jpn": "女子千五百メートル自由形決勝では、柴田亜衣（チームアリーナ）が１５分５８秒５５をマークし、２日続けて同種目の日本記録を更新し銅メダル。"}),
    (4, {"body_jpn": "男子百メートル背泳ぎ決勝は、アーロン・ピアソル（米）が５２秒９８の世界新で優勝し、森田智己（セントラルスポーツ）は８位。"}),
    (5, {"body_eng": "Record heat wave expect across most of Europe this week."}),
    (6, {"body_xxx": "鑑於薱朲蘱傢庭葰烠宬員啇懙笙椇婡旳繜嚴忣祺鮃等啇啝bú迻嘚權利ㄖㄅ承認，迺湜卋琾臫凷、㊣礒與龢鮃啇基礎，"}),
]

for doc_id, doc_body in sample_documents:
    response = es.index(index=index_name, doc_type=document_type, id=doc_id, body=doc_body)
    print("add document %s %s" % (index_name, response))

In [None]:
# Refresh to make the documents available for search
print("refresh %s %s" % (index_name, es.indices.refresh(index=index_name)))

# Now the documents can be counted. A single formatted string is printed because
# print(a, b) would display a tuple under Python 2.
print("%s documents in index" % es.count(index=index_name)['count'])

----
### 6) View the mapping again
 - Note the dynamically created fields

In [None]:
# Display the mapping of the updated index. The dynamically created fields and their
# analyzers appear in the mapping, not in the index settings.
mapping = es.indices.get_mapping(index=index_name)
print(json.dumps(mapping, indent=2))