In [1]:
import pandas as pd
import json
import boto3
import numpy as np
from requests_aws4auth import AWS4Auth
from datetime import datetime
from elasticsearch import Elasticsearch, RequestsHttpConnection
from elasticsearch.exceptions import TransportError
from elasticsearch.helpers import streaming_bulk
from elasticsearch_dsl import Search

In [2]:
# Amazon Elasticsearch Service domain that the cells below read from and write to.
endpoint = 'https://search-insightprojecttest-qcbe6ffjwxsdscktijg6uragwi.us-west-1.es.amazonaws.com'
# The client is built from the endpoint URL alone; no auth object is passed here.
es = Elasticsearch(endpoint)

In [3]:
def load_mapping(j):
    """Resolve ``j`` into an index definition.

    Args:
        j: Either a path to a JSON file or an already-built object
            (for example a dict).

    Returns:
        * ``{}`` when ``j`` is falsy (``None``, ``''``, ``{}``, ...).
        * The parsed JSON content when ``j`` can be opened as a file.
        * ``j`` itself, unchanged, when opening it raises
          ``FileNotFoundError`` (no such path) or ``TypeError`` (``j`` is not
          something ``open`` accepts, e.g. a dict).

    Raises:
        json.JSONDecodeError: if the file exists but is not valid JSON.
    """
    if not j:
        return {}

    try:
        with open(j) as handle:
            parsed = json.load(handle)
    except (FileNotFoundError, TypeError):
        # Not a readable path: hand the value back as it was given.
        return j
    return parsed
    
def create_index(client, index, mapping_all=None, mappings=None,
                 settings=None, delete_index=False):
    """Create ``index`` on ``client``, optionally dropping it first.

    Args:
        client: Elasticsearch client; ``client.indices.create`` and
            ``client.indices.delete`` are called on it.
        index: Name of the index to create.
        mapping_all: Complete index-creation body (dict or path to a JSON
            file). When truthy it is used as-is and ``mappings`` /
            ``settings`` are ignored.
        mappings: ``mappings`` section (dict or JSON file path). Defaults
            to ``{}``.
        settings: ``settings`` section (dict or JSON file path). Defaults to
            one shard and zero replicas.
        delete_index: When true, delete any existing index of that name
            before creating it.

    Returns:
        None.

    Raises:
        TransportError: for any failure other than the two expected ones,
            which are the index being absent when deleting and the index
            already existing when ``delete_index`` is false.
    """
    if mapping_all:
        create_index_json = load_mapping(mapping_all)
    else:
        mappings = load_mapping(mappings) if mappings else {}
        if settings:
            settings = load_mapping(settings)
        else:
            settings = {
                'number_of_shards': 1,
                'number_of_replicas': 0
            }
        create_index_json = {
            'settings': settings,
            'mappings': mappings
        }

    if delete_index:
        try:
            client.indices.delete(index=index)
        except TransportError as e:
            # A missing index is fine (nothing to delete). Anything else
            # (auth, connectivity, ...) must not be swallowed silently.
            if e.error != 'index_not_found_exception':
                raise
        # The index was just removed, so a failure here is a real error.
        client.indices.create(index=index, body=create_index_json)
        return

    try:
        client.indices.create(index=index, body=create_index_json)
    except TransportError as e:
        # Elasticsearch < 6.0 reports 'index_already_exists_exception';
        # 6.0+ reports 'resource_already_exists_exception'. Both mean the
        # index is already there, which is acceptable when not recreating.
        already_exists = (
            'index_already_exists_exception',
            'resource_already_exists_exception',
        )
        if e.error not in already_exists:
            raise
            
def load_items(client,
               index,
               parse_function,
               doc_type='doc',
               verbose=False):
    """Bulk-index the documents produced by ``parse_function`` and tally results.

    Args:
        client: Elasticsearch client handed to ``streaming_bulk``.
        index: Target index name.
        parse_function: Zero-argument callable returning an iterable of
            documents/actions to index.
        doc_type: Document type passed to ``streaming_bulk``.
        verbose: When true, each failed item is also collected in the
            returned error list. Failed items are printed regardless.

    Returns:
        ``(stats, errors)`` where ``stats`` is
        ``{'success': int, 'fail': int}`` and ``errors`` is a list of
        ``(action, result)`` tuples for failed items (empty unless
        ``verbose`` is true).

    Note:
        ``raise_on_error=False`` is passed so that per-document failures are
        yielded with ``ok == False`` and counted here. With the helper's
        default, failed items are never yielded and a ``BulkIndexError`` is
        raised instead, which left the failure branch unreachable.
        Transport-level exceptions are unaffected and still propagate.
    """
    stats = {
        'success': 0,
        'fail': 0
    }
    errors = []
    outcomes = streaming_bulk(
        client,
        parse_function(),
        index=index,
        doc_type=doc_type,
        chunk_size=50,
        raise_on_error=False
    )
    for ok, item in outcomes:
        # Each item is a single-entry dict: {action_name: response_details}.
        action, detail = item.popitem()
        if ok:
            stats['success'] += 1
            continue
        stats['fail'] += 1
        if verbose:
            errors.append((action, detail))
        print(action, detail)
    return stats, errors

In [30]:
ind = []  # not referenced in the cells shown here; kept in case other cells use it


def parse_ingredient(file_name='ingredients_bigram_3.json'):
    """Yield one decoded JSON document per non-blank line of ``file_name``.

    Args:
        file_name: Path to a JSON-lines file (one JSON value per line).

    Yields:
        The object decoded from each line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file_name) as f:
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) are not documents;
                # passing them to json.loads would raise.
                continue
            yield json.loads(line)
        
# Drop and recreate 'ingredient_2' (no explicit mapping or settings passed),
# then bulk-load the documents yielded by parse_ingredient.
create_index(es, 'ingredient_2', delete_index=True)
load_items(es, 'ingredient_2', parse_ingredient)

DELETE https://search-insightprojecttest-qcbe6ffjwxsdscktijg6uragwi.us-west-1.es.amazonaws.com:443/ingredient_2 [status:404 request:0.279s]


In [25]:
def parse_product(file_name='data/sephora.jl'):
    """Yield product records that have ingredients, with reviews removed.

    Args:
        file_name: Path to a JSON-lines file with one product object per line.

    Yields:
        Each decoded product dict whose ``'ingredients'`` value is present
        and truthy. A ``'reviews'`` key, if any, is removed before yielding.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(file_name) as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines instead of failing in json.loads.
                continue
            product = json.loads(line)
            # .get() treats a missing 'ingredients' key like an empty value
            # instead of raising KeyError.
            if not product.get('ingredients'):
                continue
            product.pop('reviews', None)
            yield product

# Drop and recreate 'product' (no explicit mapping or settings passed),
# then bulk-load the records yielded by parse_product.
create_index(es, 'product', delete_index=True)
load_items(es, 'product', parse_product)

In [43]:
# Single-result lookup of an ingredient by name. The three `should` clauses
# carry decreasing boosts: term on name.keyword (100), match_phrase on
# name (50), then match on name with operator "and" (20).
query = {
    'from': 0,
    'size': 1,
    'query': {
        'bool': {
            'should': [
                {'term': {'name.keyword': {'value': 'ferula assa foetida', 'boost': 100}}},
                {'match_phrase': {'name': {'query': 'ferula assa foetida', 'boost': 50}}},
                {'match': {'name': {'query': 'ferula assa foetida', 'operator': 'and', 'boost': 20}}},
            ]
        }
    },
}

In [45]:
# Display the stored document (_source) of the first hit returned for `query`.
es.search(index='ingredient_2', body=query)['hits']['hits'][0]['_source']

{'About': '',
 'Allergies & immunotoxicity': 0.0,
 'Allergies/immunotoxicity Concerns': [],
 'Allergies/immunotoxicity Reasons': [],
 'Biochemical or cellular level changes Concerns': [],
 'Biochemical or cellular level changes Reasons': [],
 'Cancer': 0.0,
 'Cancer Concerns': [],
 'Cancer Reasons': [],
 'Chemical release concerns Concerns': [],
 'Chemical release concerns Reasons': [],
 'Contamination concerns Concerns': [],
 'Contamination concerns Reasons': [],
 'Data gaps Concerns': ['7 studies in PubMed science library may include information on the toxicity of this chemical see search results ->'],
 'Data gaps Reasons': ['NLM PubMed'],
 'Decreased skin absorption Concerns': [],
 'Decreased skin absorption Reasons': [],
 'Developmental & reproductive toxicity': 0.0,
 'Developmental/reproductive toxicity Concerns': [],
 'Developmental/reproductive toxicity Reasons': [],
 'Ecotoxicology Concerns': [],
 'Ecotoxicology Reasons': [],
 'Endocrine disruption Concerns': [],
 'Endocrine di