In [1]:
import sys
import os
import json
!conda install --yes --prefix {sys.prefix} elasticsearch
!conda install --yes --prefix {sys.prefix} elasticsearch-dsl
from elasticsearch import Elasticsearch
from elasticsearch import helpers
from elasticsearch_dsl import Search

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



In [2]:
def load_json_file(file):
    data = []
    if os.path.exists(file):
        for line in open(file, 'r'):
            data.append(json.loads(line))
        return data
    else:
        return("File does not exist")

In [3]:
def get_list_by_chunk_size(original_list, batch_size):
    # looping till length equals batch_size
    for i in range(0, len(original_list), batch_size):  
        yield original_list[i:i + batch_size] 

In [4]:
def bulk_insert_data_to_es(elasticsearch_connection, data, index, bulk_size=100):
    try:
        batch_data = get_list_by_chunk_size(data, bulk_size)
        for batch in batch_data:
            count = 0
            actions = []
            while count <= len(batch) - 1:
                action = {
                    "_index": index,
                    "_source": {}
                }
                action["_source"] = batch[count]
                actions.append(action)
                count = count + 1
            helpers.bulk(elasticsearch_connection, actions)
        return True
    except:
        e = sys.exc_info()
        print("Bulk insertion job failed")
        print(e)
        return False

In [5]:
def get_stopwords():
    stop_words = []
    with open(os.getcwd() + "/util" + "/sinhala_stopwords.txt", 'r') as f:
        for word in f:
            stop_words.append(word.strip())
    return stop_words

In [6]:
es_connection = Elasticsearch('http://localhost:9200')
es_connection.indices.delete(index="artists-*")



{'acknowledged': True}

In [7]:
stopwords = get_stopwords()
mapping = {
    "settings" : {
        "analysis" : {
            "analyzer" : {
                "sinhala_analyzer" : {
                    "type": "standard",
                    "stopwords": stopwords
                }
            }
        } 
    },
    "aliases": { 
        "artists": { 
            "is_write_index": True 
        }
    },
    "mappings" : {
        "properties" : {
            "awards" : {
                "properties" : {
                    "instituition" : {
                        "type" : "text",
                        "analyzer" : "sinhala_analyzer",
                        "fields" : {
                            "keyword" : {
                                "type" : "keyword",
                                "ignore_above" : 256
                            }
                        }
                    },
                    "title" : {
                        "type" : "text",
                        "analyzer" : "sinhala_analyzer",
                        "fields" : {
                            "keyword" : {
                                "type" : "keyword",
                                "ignore_above" : 256
                            }
                        }
                    }
                }
            },
            "bio" : {
                "type" : "text",
                "analyzer" : "sinhala_analyzer"
            },
            "birth" : {
                "type" : "text",
                "analyzer" : "sinhala_analyzer",
            },
            "death" : {
                "type" : "text",
                "analyzer" : "sinhala_analyzer",
            },
            "films" : {
                "properties" : {
                    "role" : {
                        "type" : "text",
                        "analyzer" : "sinhala_analyzer",
                        "fields" : {
                            "keyword" : {
                                "type" : "keyword",
                                "ignore_above" : 256
                            }
                        }
                    },
                    "title" : {
                        "type" : "text",
                        "analyzer" : "sinhala_analyzer",
                        "fields" : {
                            "keyword" : {
                                "type" : "keyword",
                                "ignore_above" : 256
                            }
                        }
                    },
                    "year" : {
                      "type" : "long"
                    }
                }
            },
            "id" : {
                "type" : "long"
            },
            "name" : {
                "type" : "text",
                "analyzer" : "sinhala_analyzer",
                "fields" : {
                    "keyword" : {
                        "type" : "keyword",
                        "ignore_above" : 256
                    }
                }
            },
            "real_name" : {
                "type" : "text",
                "analyzer" : "sinhala_analyzer",
                "fields" : {
                    "keyword" : {
                      "type" : "keyword",
                      "ignore_above" : 256
                    }
                }
            }
      }
    }
}

In [8]:
es_connection.indices.create(index="artists-000001", body=mapping)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'artists-000001'}

In [9]:
artist_data = load_json_file(os.getcwd() + '/corpus/artists.json')
bulk_insert_data_to_es(es_connection, artist_data, "artists")

Bulk insertion job failed
(<class 'elasticsearch.helpers.errors.BulkIndexError'>, BulkIndexError('19 document(s) failed to index.', [{'index': {'_index': 'artists-000001', '_type': '_doc', '_id': 'dEmqMX0ByuuP2Z3ngdJx', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'not_x_content_exception', 'reason': 'Compressor detection can only be called on some xcontent bytes or compressed xcontent bytes'}}, 'data': 'F'}}, {'index': {'_index': 'artists-000001', '_type': '_doc', '_id': 'dUmqMX0ByuuP2Z3ngdJz', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to parse', 'caused_by': {'type': 'not_x_content_exception', 'reason': 'Compressor detection can only be called on some xcontent bytes or compressed xcontent bytes'}}, 'data': 'i'}}, {'index': {'_index': 'artists-000001', '_type': '_doc', '_id': 'dkmqMX0ByuuP2Z3ngdJz', 'status': 400, 'error': {'type': 'mapper_parsing_exception', 'reason': 'failed to pa

False