# ElasticSearch prototype

Now that I've gone through all my notes on `Elasticsearch: The Definitive Guide`, I have a much better idea of how to set up the store and the related queries.

In [1]:
import elasticsearch
from elasticsearch import helpers
import json

def p(doc, max_lines=25):
    """Pretty-print ``doc`` as indented, key-sorted JSON, eliding the middle of long output.

    Args:
        doc: Any JSON-serializable object (typically an Elasticsearch response).
        max_lines: Maximum number of JSON lines to show.  When the rendering is
            longer, the first ``max_lines // 2`` lines and the remaining lines
            from the end are printed with a ``...`` line between them.
            Pass ``None`` (or ``0``) to print everything.

    Returns:
        None; the text is written to standard output.
    """
    lines = json.dumps(doc, indent=2, sort_keys=True).split('\n')
    if not max_lines or len(lines) <= max_lines:
        print('\n'.join(lines))
        return

    # Floor division keeps the slice bounds integral under both Python 2 and
    # Python 3 (plain `/` yields a float on Python 3, which cannot be used as a
    # slice index).  The tail receives the extra line when max_lines is odd,
    # which matches what Python 2's integer division of -max_lines produced.
    head = max_lines // 2
    tail = max_lines - head
    print('\n'.join(lines[:head]) + '\n...\n' + '\n'.join(lines[-tail:]))

In [2]:
# Client for the Elasticsearch node listening on localhost:9200
es = elasticsearch.Elasticsearch(
    hosts=[{'host': 'localhost', 'port': 9200}],
)

In [8]:
es.index?

In [5]:
# Does the prototype index exist?  The index name is passed by keyword, like
# the other `es.indices.*` calls in this notebook.
es.indices.exists(index='cbit')

True

In [3]:
# Clear the prototype's index only.  Deleting `*` would wipe every index on the
# cluster, including ones unrelated to this notebook.  A 404 is ignored so that
# the cell also works on a fresh cluster where `cbit` does not exist yet.
es.indices.delete(index='cbit', ignore=[404])

{u'acknowledged': True}

In [4]:
# Set up index (v1)
# "study" and "sample" need to live in the same index to set up a parent-child relationship
#
# Overview of the request body:
#   * `settings.analysis` defines the `autocomplete` analyzer (standard tokenizer,
#     lowercase filter, edge n-grams of 1 to 20 characters)
#   * for both doc types, dynamically-added fields are mapped as `not_analyzed`
#     strings (for studies, upper-case names are mapped as objects instead);
#     full-text search goes through `_all`, which is indexed with `autocomplete`
#     and searched with `standard`
es.indices.create(index='cbit', body={
        "settings": {
            # Set up indexing to support efficient search-as-you-type
            # (see https://www.elastic.co/guide/en/elasticsearch/guide/current/_index_time_search_as_you_type.html)
            "analysis": {
                "filter": {
                    "autocomplete_filter": { 
                        "type":     "edge_ngram",
                        "min_gram": 1,
                        "max_gram": 20
                    }
                },
                "analyzer": {
                    "autocomplete": {
                        "type":      "custom",
                        "tokenizer": "standard",
                        "filter": [
                            "lowercase",
                            "autocomplete_filter" 
                        ]
                    }
                }
            }            
        },
        
        "mappings": {
            "study": {
                # Prevent creation of dynamic fields
                # (when adding studies with new fields, these should be presented to the
                #  user for explicit typing)
                #"dynamic": "strict",
                # TODO: For the moment, though, allow dynamic mapping, just ensure that 
                # everything is mapped as a string below
                "dynamic": True,

                # NOTE(review): the templates below rely on Elasticsearch trying dynamic
                # templates in order and using the first match -- confirm against the docs
                "dynamic_templates": [
                    {
                        # Some fields are objects (names are all in uppercase):
                        # the regex matches paths made only of capital letters and spaces
                        "object_fields": {
                            "match_pattern": "regex",
                            "path_match": r"^([A-Z ]+)$",
                            "mapping": {
                                "type": "object"
                            }
                        }
                    },
                    
                    {    
                        # Some fields should have full-text search enabled:
                        # any path ending in ".<one of the names listed below>"
                        "fts_fields": {
                            "match_pattern": "regex",
                            "path_match": r"^.*\.({0})$".format("|".join([
                                        "Study Researchers Involved",
                                        "Study PubMed ID",
                                        "Study Publication Author List",
                                    ])),
                            "mapping": {
                                # Everything is a string for now (improve this during Sprint #3)
                                "type": "string",

                                # Make sure that all matches are done by exact content
                                # (full-text search is done against the _all field, which *is* analyzed)
                                "index": "not_analyzed",

                                "include_in_all": True
                            }
                        }
                    },
                    
                    {
                        # Everything else is excluded from full-text search
                        "default": {
                            "match": "*",
                            "mapping": {
                                # Everything is a string for now (improve this during Sprint #3)
                                "type": "string",

                                # Make sure that all matches are done by exact content
                                # (full-text search is done against the _all field, which *is* analyzed)
                                "index": "not_analyzed",

                                "include_in_all": False
                            }
                        }
                    }
                ],

                "_all": {
                    # Do analyze everything in the study metadata that can be searched by
                    # full-text search.  Use autocomplete analyzer above to split words on
                    # word boundaries, lowercase everything and produce edge n-grams in index.
                    # But don't produce edge n-grams during searching
                    "index": "analyzed",
                    "analyzer": "autocomplete",
                    "search_analyzer": "standard"
                },
            },
            
            "sample": {
                
                # Set up parent-child relationship with `study`
                "_parent": {
                    "type": "study"
                },
                
                # Prevent creation of dynamic fields
                # (when adding samples with new fields, these should be presented to the
                #  user for explicit typing)
                #"dynamic": "strict",
                # TODO: For the moment, though, allow dynamic mapping, just ensure that 
                # everything is mapped as a string below
                "dynamic": True,

                "dynamic_templates": [
                    {
                        # All sample metadata is full-text searchable
                        "default": {
                            "match": "*",
                            "mapping": {
                                # Everything is a string for now (improve this during Sprint #3)
                                "type": "string",

                                # Make sure that all matches are done by exact content
                                # (full-text search is done against the _all field, which *is* analyzed)
                                "index": "not_analyzed",

                                "include_in_all": True
                            }
                        }
                    }
                ],

                "_all": {
                    # Do analyze everything in the sample metadata that can be searched by
                    # full-text search.  Use autocomplete analyzer above to split words on
                    # word boundaries, lowercase everything and produce edge n-grams in index.
                    # But don't produce edge n-grams during searching
                    "index": "analyzed",
                    "analyzer": "autocomplete",
                    "search_analyzer": "standard"
                },
            }
        }
    })

{u'acknowledged': True}

In [5]:
# Make an alias to study_v1
#es.indices.put_alias(name='study', index='study_v1')

In [6]:
# Check: dump the settings and mappings Elasticsearch stored for `cbit`,
# without truncating the output
p(es.indices.get(index='cbit'), max_lines=None)

{
  "cbit": {
    "aliases": {}, 
    "mappings": {
      "sample": {
        "_all": {
          "analyzer": "autocomplete", 
          "search_analyzer": "standard"
        }, 
        "_parent": {
          "type": "study"
        }, 
        "_routing": {
          "required": true
        }, 
        "dynamic": "true", 
        "dynamic_templates": [
          {
            "default": {
              "mapping": {
                "include_in_all": true, 
                "index": "not_analyzed", 
                "type": "string"
              }, 
              "match": "*"
            }
          }
        ]
      }, 
      "study": {
        "_all": {
          "analyzer": "autocomplete", 
          "search_analyzer": "standard"
        }, 
        "dynamic": "true", 
        "dynamic_templates": [
          {
            "object_fields": {
              "mapping": {
                "type": "object"
              }, 
              "match_pattern": "regex", 
              "path_matc

In [7]:
# Set up samples index (v1)
#
# NOTE(review): this cell is deliberately disabled with `if False:` and never runs.
# It describes a layout with a separate index for samples; the `cbit` index
# created earlier instead keeps "study" and "sample" in one index, because the
# parent-child relationship requires it.  Kept for reference only -- consider deleting.
if False:
    es.indices.create(index='sample_v1', body={
            "settings": {
                # Set up indexing to support efficient search-as-you-type
                # (see https://www.elastic.co/guide/en/elasticsearch/guide/current/_index_time_search_as_you_type.html)
                "analysis": {
                    "filter": {
                        "autocomplete_filter": { 
                            "type":     "edge_ngram",
                            "min_gram": 1,
                            "max_gram": 20
                        }
                    },
                    "analyzer": {
                        "autocomplete": {
                            "type":      "custom",
                            "tokenizer": "standard",
                            "filter": [
                                "lowercase",
                                "autocomplete_filter" 
                            ]
                        }
                    }
                }            
            },

            "mappings": {
                "sample": {  # Only one doc_type

                    # Set up parent-child relationship with `study`
                    # NOTE(review): `study_v1` looks like the name of a separate study
                    # index rather than a doc type inside `sample_v1` -- presumably this
                    # is why the layout was abandoned; confirm before reviving this cell
                    "_parent": {
                        "type": "study_v1"
                    },

                    # Prevent creation of dynamic fields
                    # (when adding samples with new fields, these should be presented to the
                    #  user for explicit typing)
                    #"dynamic": "strict",
                    # TODO: For the moment, though, allow dynamic mapping, just ensure that 
                    # everything is mapped as a string below
                    "dynamic": True,

                    "dynamic_templates": [
                        {
                            # All sample metadata is full-text searchable
                            "default": {
                                "match": "*",
                                "mapping": {
                                    # Everything is a string for now (improve this during Sprint #3)
                                    "type": "string",

                                    # Make sure that all matches are done by exact content
                                    # (full-text search is done against the _all field, which *is* analyzed)
                                    "index": "not_analyzed",

                                    "include_in_all": True
                                }
                            }
                        }
                    ],

                    "_all": {
                        # Do analyze everything in the sample metadata that can be searched by
                        # full-text search.  Use autocomplete analyzer above to split words on
                        # word boundaries, lowercase everything and produce edge n-grams in index.
                        # But don't produce edge n-grams during searching
                        "index": "analyzed",
                        "analyzer": "autocomplete",
                        "search_analyzer": "standard"
                    },
                }
            }
        })

In [8]:
# Make an alias to sample_v1
#es.indices.put_alias(name='sample', index='sample_v1')

In [9]:
# Check
#p(es.indices.get(index='sample'), None)

In [10]:
import reader, config, json
cfg = config.Config()

# Parse the ISA investigation file of study 1 and reshape it for indexing.
# The file is opened in a `with` block so the handle is always closed; the whole
# reader pipeline runs inside the block in case `reader` consumes the file lazily.
with open('../../data/new_ISAcreatorArchives/StudyID_01_archive/i_Investigation.txt', 'r') as investigation_file:
    i = reader.read_investigation(cfg, investigation_file)
    result = reader.conform_investigation_to_schema(
                reader.remove_isa_name_prefixes(
                  reader.remove_empty_values_in_dict(
                    reader.flatten_investigation(
                      i
                    )
                  )
                )
              )

# Index it as a `study` document.  No id is supplied, so the id is taken from
# the response and kept to serve as the `_parent` of this study's samples.
response = es.index(index='cbit', doc_type='study', body=result)
study1_id = response['_id']
response

{u'_id': u'AVhJhfxO8X1My4__bYgD',
 u'_index': u'cbit',
 u'_shards': {u'failed': 0, u'successful': 1, u'total': 2},
 u'_type': u'study',
 u'_version': 1,
 u'created': True}

In [12]:
# Parse the ISA investigation file of study 2 and reshape it for indexing.
# The file is opened in a `with` block so the handle is always closed; the whole
# reader pipeline runs inside the block in case `reader` consumes the file lazily.
with open('../../data/new_ISAcreatorArchives/StudyID_02_archive/i_Investigation.txt', 'r') as investigation_file:
    i = reader.read_investigation(cfg, investigation_file)
    result = reader.conform_investigation_to_schema(
                   reader.remove_isa_name_prefixes(
                     reader.remove_empty_values_in_dict(
                       reader.flatten_investigation(
                         i
                       )
                     )
                   )
                 )

# Index it as a `study` document and keep the id returned in the response:
# it is the `_parent` of this study's samples.
response = es.index(index='cbit', doc_type='study', body=result)
study2_id = response['_id']
response

{u'_id': u'AVhJhnMu8X1My4__bYgF',
 u'_index': u'cbit',
 u'_shards': {u'failed': 0, u'successful': 1, u'total': 2},
 u'_type': u'study',
 u'_version': 1,
 u'created': True}

In [13]:
# Check dynamic mappings for studies: show the full `study` mapping as it
# stands after the documents above were indexed (no truncation)
p(es.indices.get_mapping(index='cbit', doc_type='study'), max_lines=None)

{
  "cbit": {
    "mappings": {
      "study": {
        "_all": {
          "analyzer": "autocomplete", 
          "search_analyzer": "standard"
        }, 
        "dynamic": "true", 
        "dynamic_templates": [
          {
            "object_fields": {
              "mapping": {
                "type": "object"
              }, 
              "match_pattern": "regex", 
              "path_match": "^([A-Z ]+)$"
            }
          }, 
          {
            "fts_fields": {
              "mapping": {
                "include_in_all": true, 
                "index": "not_analyzed", 
                "type": "string"
              }, 
              "match_pattern": "regex", 
              "path_match": "^.*\\.(Study Researchers Involved|Study PubMed ID|Study Publication Author List)$"
            }
          }, 
          {
            "default": {
              "mapping": {
                "include_in_all": false, 
                "index": "not_analyzed", 
                "type

In [14]:
# Full-text search by PubMed ID: phrase-match the complete ID against the
# `_all` field of `study` documents, returning only title and researchers
p(es.search(
    index='cbit',
    doc_type='study',
    body={
        "query": {"match_phrase": {"_all": "22646480"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved'],
    },
))

{
  "_shards": {
    "failed": 0, 
    "successful": 5, 
    "total": 5
  }, 
  "hits": {
    "hits": [
      {
        "_id": "AVhJhfxO8X1My4__bYgD", 
        "_index": "cbit", 
        "_score": 0.047945753, 
...
            "Study Researchers Involved": "Doorn J, Leusink M, Groen N, van de Peppel J, van Leeuwen JP, van Blitterswijk CA, de Boer J", 
            "Study Title": "Diverse effects of cyclic AMP variants on osteogenic and adipogenic differentiation of human mesenchymal stromal cells"
          }
        }, 
        "_type": "study"
      }
    ], 
    "max_score": 0.047945753, 
    "total": 1
  }, 
  "timed_out": false, 
  "took": 66
}


In [15]:
# Full-text search by PubMed ID prefix: only the first three digits of
# "22646480" are given; `_all` is indexed with the edge n-gram `autocomplete`
# analyzer, so a prefix is expected to match
p(es.search(
    index='cbit',
    doc_type='study',
    body={
        "query": {"match_phrase": {"_all": "226"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved'],
    },
))

{
  "_shards": {
    "failed": 0, 
    "successful": 5, 
    "total": 5
  }, 
  "hits": {
    "hits": [
      {
        "_id": "AVhJhfxO8X1My4__bYgD", 
        "_index": "cbit", 
        "_score": 0.047945753, 
...
            "Study Researchers Involved": "Doorn J, Leusink M, Groen N, van de Peppel J, van Leeuwen JP, van Blitterswijk CA, de Boer J", 
            "Study Title": "Diverse effects of cyclic AMP variants on osteogenic and adipogenic differentiation of human mesenchymal stromal cells"
          }
        }, 
        "_type": "study"
      }
    ], 
    "max_score": 0.047945753, 
    "total": 1
  }, 
  "timed_out": false, 
  "took": 5
}


In [16]:
# Full-text search by researcher last name (complete name, original casing)
p(es.search(
    index='cbit',
    doc_type='study',
    body={
        "query": {"match_phrase": {"_all": "Doorn"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved'],
    },
))

{
  "_shards": {
    "failed": 0, 
    "successful": 5, 
    "total": 5
  }, 
  "hits": {
    "hits": [
      {
        "_id": "AVhJhfxO8X1My4__bYgD", 
        "_index": "cbit", 
        "_score": 0.06780553, 
...
            "Study Researchers Involved": "Doorn J, Leusink M, Groen N, van de Peppel J, van Leeuwen JP, van Blitterswijk CA, de Boer J", 
            "Study Title": "Diverse effects of cyclic AMP variants on osteogenic and adipogenic differentiation of human mesenchymal stromal cells"
          }
        }, 
        "_type": "study"
      }
    ], 
    "max_score": 0.06780553, 
    "total": 1
  }, 
  "timed_out": false, 
  "took": 4
}


In [17]:
# Full-text search by researcher last name (partial): a lower-case prefix
# of "Doorn"
p(es.search(
    index='cbit',
    doc_type='study',
    body={
        "query": {"match_phrase": {"_all": "door"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved'],
    },
))

{
  "_shards": {
    "failed": 0, 
    "successful": 5, 
    "total": 5
  }, 
  "hits": {
    "hits": [
      {
        "_id": "AVhJhfxO8X1My4__bYgD", 
        "_index": "cbit", 
        "_score": 0.06780553, 
...
            "Study Researchers Involved": "Doorn J, Leusink M, Groen N, van de Peppel J, van Leeuwen JP, van Blitterswijk CA, de Boer J", 
            "Study Title": "Diverse effects of cyclic AMP variants on osteogenic and adipogenic differentiation of human mesenchymal stromal cells"
          }
        }, 
        "_type": "study"
      }
    ], 
    "max_score": 0.06780553, 
    "total": 1
  }, 
  "timed_out": false, 
  "took": 4
}


In [18]:
# Full-text search by researcher last name (partial, two words): "de" followed
# by a prefix of "Boer", matched as a phrase
p(es.search(
    index='cbit',
    doc_type='study',
    body={
        "query": {"match_phrase": {"_all": "de boe"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved'],
    },
))

{
  "_shards": {
    "failed": 0, 
    "successful": 5, 
    "total": 5
  }, 
  "hits": {
    "hits": [
      {
        "_id": "AVhJhgFs8X1My4__bYgE", 
        "_index": "cbit", 
        "_score": 0.13561106, 
...
            "Study Researchers Involved": "Groen N, van de Peppel J, Yuan H, van Leeuwen JP, van Blitterswijk CA, de Boer J", 
            "Study Title": "Bioinformatics-based selection of a model cell type for in vitro biomaterial testing"
          }
        }, 
        "_type": "study"
      }
    ], 
    "max_score": 0.13561106, 
    "total": 3
  }, 
  "timed_out": false, 
  "took": 17
}


In [19]:
# Full-text search by researcher last names given in an order that does not
# occur in the text: expected to return no hits, because `match_phrase` is
# sensitive to word order
p(es.search(
    index='cbit',
    doc_type='study',
    body={
        "query": {"match_phrase": {"_all": "de boer doorn"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved'],
    },
))

{
  "_shards": {
    "failed": 0, 
    "successful": 5, 
    "total": 5
  }, 
  "hits": {
    "hits": [], 
    "max_score": null, 
    "total": 0
  }, 
  "timed_out": false, 
  "took": 4
}


In [20]:
# Load all samples
import reader, config, json
cfg = config.Config()

# Study 1: read the assay and study-sample tables and join them per sample.
# Both files are closed when the `with` block exits; the join runs inside the
# block in case `reader` consumes the files lazily.
with open('../../data/new_ISAcreatorArchives/StudyID_01_archive/a_transcription_micro_1.txt', 'r') as assay_file, \
     open('../../data/new_ISAcreatorArchives/StudyID_01_archive/s_study_sample.txt', 'r') as sample_file:
    a = reader.read_assay(cfg, assay_file)
    s = reader.read_study_sample(cfg, sample_file)
    d = reader.join_study_sample_and_assay(reader.clean_up_study_samples(s), reader.clean_up_assay(a))
    d = reader.apply_special_treatments_to_study_sample(d)

# Study 2: same pipeline
with open('../../data/new_ISAcreatorArchives/StudyID_02_archive/a_transcription_micro_1.txt', 'r') as assay_file, \
     open('../../data/new_ISAcreatorArchives/StudyID_02_archive/s_study_sample.txt', 'r') as sample_file:
    a2 = reader.read_assay(cfg, assay_file)
    s2 = reader.read_study_sample(cfg, sample_file)
    d2 = reader.join_study_sample_and_assay(reader.clean_up_study_samples(s2), reader.clean_up_assay(a2))
    d2 = reader.apply_special_treatments_to_study_sample(d2)

# Build one bulk action per sample.  Each action is a copy of the sample's
# metadata plus its name and a `_parent` entry holding the id of the study
# document it belongs to (the `sample` mapping declares `study` as its parent).
# `.items()` is used instead of `.iteritems()` so the cell runs on Python 2 and 3.
result = []
for samples, study_id in ((d, study1_id), (d2, study2_id)):
    for k, v in samples.items():
        vv = v.copy()
        vv['Sample Name'] = k
        vv['_parent'] = study_id
        result.append(vv)

helpers.bulk(es, index='cbit', doc_type='sample', actions=result)

(72, [])

In [20]:
# Full-text search everywhere: no `doc_type` is given, so studies and samples
# in `cbit` are both searched.  Query: the one-letter prefix "d" (up to 100 hits).
result = es.search(
    index='cbit',
    body={
        "size": 100,
        "query": {"match_phrase": {"_all": "d"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, then one label per hit: the sample name when present,
# otherwise the study title ('?' when neither is available)
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])

2
[u'Diverse effects of cyclic AMP variants on osteogenic and adipogenic differentiation of human mesenchymal stromal cells', u'Bioinformatics-based selection of a model cell type for in vitro biomaterial testing']


In [21]:
# Full-text search everywhere (studies and samples alike, since no `doc_type`
# is given).  Query: "db" (up to 100 hits).
result = es.search(
    index='cbit',
    body={
        "size": 100,
        "query": {"match_phrase": {"_all": "db"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, then one label per hit: the sample name when present,
# otherwise the study title ('?' when neither is available)
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])

6
[u'5342595019_E', u'5342595019_B', u'5342595019_H', u'5342595028_H', u'5342595028_E', u'5342595028_B']


In [22]:
# Full-text search everywhere (studies and samples alike, since no `doc_type`
# is given).  Query: "db-camp" (up to 100 hits); presumably the search analyzer
# splits this into the phrase "db" "camp" -- TODO confirm.
result = es.search(
    index='cbit',
    body={
        "size": 100,
        "query": {"match_phrase": {"_all": "db-camp"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, then one label per hit: the sample name when present,
# otherwise the study title ('?' when neither is available)
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])

6
[u'5342595019_E', u'5342595019_B', u'5342595019_H', u'5342595028_H', u'5342595028_E', u'5342595028_B']


In [23]:
# Full-text search everywhere (studies and samples alike, since no `doc_type`
# is given).  Query: the one-letter prefix "c" (up to 100 hits).
result = es.search(
    index='cbit',
    body={
        "size": 100,
        "query": {"match_phrase": {"_all": "c"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, then one label per hit: the sample name when present,
# otherwise the study title ('?' when neither is available)
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])

74
[u'5937337021_A', u'5937337049_H', u'5937337059_I', u'5937337061_A', u'5937337044_K', u'5937337061_B', u'5937337021_F', u'5937337059_C', u'5937337059_F', u'5937337044_I', u'5937337044_C', u'5937337044_F', u'5342595019_E', u'5342595019_F', u'5342595019_B', u'5342595019_C', u'5342595019_H', u'5342595019_I', u'5342595028_H', u'5342595028_I', u'5342595028_E', u'5342595028_F', u'5342595028_B', u'5342595028_C', u'5937337021_G', u'5937337059_G', u'5937337061_D', u'5937337021_E', u'5937337059_E', u'5937337021_B', u'5937337021_L', u'5937337044_J', u'5937337044_A', u'5937337021_I', u'5937337059_B', u'5937337049_L', u'5937337059_J', u'5937337021_H', u'5937337021_J', u'5937337044_E', u'5937337059_D', u'5937337049_K', u'5937337049_I', u'5937337049_A', u'5937337061_E', u'5937337049_D', u'5937337044_H', u'5937337049_F', u'5937337049_C', u'5937337061_C', u'5937337061_F', u'5342595019_D', u'5342595019_G', u'5342595019_A', u'5342595028_D', u'5342595028_G', u'5342595028_A', u'Bioinformatics-based sele

In [24]:
# Full-text search everywhere (studies and samples alike, since no `doc_type`
# is given).  Query: the two-letter prefix "ca" (up to 100 hits).
result = es.search(
    index='cbit',
    body={
        "size": 100,
        "query": {"match_phrase": {"_all": "ca"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, then one label per hit: the sample name when present,
# otherwise the study title ('?' when neither is available)
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])

32
[u'Bioinformatics-based selection of a model cell type for in vitro biomaterial testing', u'5937337044_I', u'5937337044_C', u'5937337044_F', u'5937337049_D', u'5937337021_I', u'5937337021_G', u'5937337021_F', u'5937337059_B', u'5937337059_C', u'5937337044_H', u'5937337049_L', u'5937337059_F', u'5937337049_F', u'5937337059_G', u'5937337049_C', u'5937337061_C', u'5937337061_D', u'5937337061_F', u'Diverse effects of cyclic AMP variants on osteogenic and adipogenic differentiation of human mesenchymal stromal cells', u'5342595019_E', u'5342595019_F', u'5342595019_B', u'5342595019_C', u'5342595019_H', u'5342595019_I', u'5342595028_H', u'5342595028_I', u'5342595028_E', u'5342595028_F', u'5342595028_B', u'5342595028_C']


In [25]:
# Full-text search everywhere (studies and samples alike, since no `doc_type`
# is given).  Query: the three-letter prefix "cam" (up to 100 hits).
result = es.search(
    index='cbit',
    body={
        "size": 100,
        "query": {"match_phrase": {"_all": "cam"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, then one label per hit: the sample name when present,
# otherwise the study title ('?' when neither is available)
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])

12
[u'5342595019_E', u'5342595019_F', u'5342595019_B', u'5342595019_C', u'5342595019_H', u'5342595019_I', u'5342595028_H', u'5342595028_I', u'5342595028_E', u'5342595028_F', u'5342595028_B', u'5342595028_C']


In [26]:
# Full-text search everywhere (studies and samples alike, since no `doc_type`
# is given).  Query: "camp" (up to 100 hits).
result = es.search(
    index='cbit',
    body={
        "size": 100,
        "query": {"match_phrase": {"_all": "camp"}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, then one label per hit: the sample name when present,
# otherwise the study title ('?' when neither is available)
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])

12
[u'5342595019_E', u'5342595019_F', u'5342595019_B', u'5342595019_C', u'5342595019_H', u'5342595019_I', u'5342595028_H', u'5342595028_I', u'5342595028_E', u'5342595028_F', u'5342595028_B', u'5342595028_C']


In [27]:
# Full-text search over samples in which a hit on the parent study counts as a
# hit on every one of that study's samples
query_text = "Door"  # partial for "Doorn"
result = es.search(
    index='cbit',
    doc_type='sample',
    body={
        "size": 100,
        "query": {
            "bool": {
                # A "bool" query with "should" clauses but no "must" clause is
                # how ES expresses 'A or B': either the sample itself matches,
                # or its parent study does
                "should": [
                    {"match_phrase": {"_all": query_text}},
                    {"has_parent": {
                        "type": "study",
                        "query": {"match_phrase": {"_all": query_text}},
                    }},
                ],
            },
        },
        # Count the matching samples per parent study
        "aggs": {
            "studies": {"terms": {"field": "_parent"}},
        },
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, one label per hit, then the per-study buckets
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])
p(result['aggregations'])

18
[u'5342595019_D', u'5342595019_E', u'5342595019_F', u'5342595019_G', u'5342595019_A', u'5342595019_B', u'5342595019_C', u'5342595019_H', u'5342595019_I', u'5342595028_H', u'5342595028_I', u'5342595028_D', u'5342595028_E', u'5342595028_F', u'5342595028_G', u'5342595028_A', u'5342595028_B', u'5342595028_C']
{
  "studies": {
    "buckets": [
      {
        "doc_count": 18, 
        "key": "AVhDnx1kcZts01DBflFT"
      }
    ], 
    "doc_count_error_upper_bound": 0, 
    "sum_other_doc_count": 0
  }
}


In [28]:
# Full-text search over samples in which a hit on the parent study counts as a
# hit on every one of that study's samples
query_text = "strom"  # partial for "stromal cell"
result = es.search(
    index='cbit',
    doc_type='sample',
    body={
        "size": 100,
        "query": {
            "bool": {
                # A "bool" query with "should" clauses but no "must" clause is
                # how ES expresses 'A or B': either the sample itself matches,
                # or its parent study does
                "should": [
                    {"match_phrase": {"_all": query_text}},
                    {"has_parent": {
                        "type": "study",
                        "query": {"match_phrase": {"_all": query_text}},
                    }},
                ],
            },
        },
        # Count the matching samples per parent study
        "aggs": {
            "studies": {"terms": {"field": "_parent"}},
        },
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, one label per hit, then the per-study buckets
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])
p(result['aggregations'])

27
[u'5937337021_E', u'5937337059_E', u'5937337021_B', u'5937337021_L', u'5937337044_J', u'5937337044_A', u'5937337021_G', u'5937337059_G', u'5937337061_D', u'5342595019_D', u'5342595019_G', u'5342595019_A', u'5342595028_D', u'5342595028_G', u'5342595028_A', u'5342595019_E', u'5342595019_F', u'5342595019_B', u'5342595019_C', u'5342595019_H', u'5342595019_I', u'5342595028_H', u'5342595028_I', u'5342595028_E', u'5342595028_F', u'5342595028_B', u'5342595028_C']
{
  "studies": {
    "buckets": [
      {
        "doc_count": 18, 
        "key": "AVhDnx1kcZts01DBflFT"
      }, 
      {
        "doc_count": 9, 
        "key": "AVhDnyLlcZts01DBflFU"
      }
    ], 
    "doc_count_error_upper_bound": 0, 
    "sum_other_doc_count": 0
  }
}


---

Awesome!  Full-text search now works!  Now for filtering

---

In [29]:
# Full-text search over samples in which a hit on the parent study counts as a
# hit on every one of that study's samples, now with a filter: samples whose
# `*Compound` is exactly "8-br-cAMP - 8-bromo-cAMP" are excluded
query_text = "strom"  # partial for "stromal cell"
result = es.search(
    index='cbit',
    doc_type='sample',
    body={
        "size": 100,
        "query": {
            "bool": {
                # A "bool" query with "should" clauses but no "must" clause is
                # how ES expresses 'A or B': either the sample itself matches,
                # or its parent study does
                "should": [
                    {"match_phrase": {"_all": query_text}},
                    {"has_parent": {
                        "type": "study",
                        "query": {"match_phrase": {"_all": query_text}},
                    }},
                ],
                # Exact-value exclusion on the sample metadata
                "must_not": [
                    {"term": {"*Compound": "8-br-cAMP - 8-bromo-cAMP"}},
                ],
            },
        },
        # Count the remaining samples per parent study
        "aggs": {
            "studies": {"terms": {"field": "_parent"}},
        },
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID'],
    },
)
# Total hit count, one label per hit, then the per-study buckets
print(result['hits']['total'])
print([
    hit['_source'].get(
        'Sample Name',
        hit['_source'].get('STUDY', {}).get('Study Title', '?'))
    for hit in result['hits']['hits']
])
p(result['aggregations'])

21
[u'5937337021_E', u'5937337059_E', u'5937337021_B', u'5937337021_L', u'5937337044_J', u'5937337044_A', u'5937337021_G', u'5937337059_G', u'5937337061_D', u'5342595019_D', u'5342595019_G', u'5342595019_A', u'5342595028_D', u'5342595028_G', u'5342595028_A', u'5342595019_E', u'5342595019_B', u'5342595019_H', u'5342595028_H', u'5342595028_E', u'5342595028_B']
{
  "studies": {
    "buckets": [
      {
        "doc_count": 12, 
        "key": "AVhDnx1kcZts01DBflFT"
      }, 
      {
        "doc_count": 9, 
        "key": "AVhDnyLlcZts01DBflFU"
      }
    ], 
    "doc_count_error_upper_bound": 0, 
    "sum_other_doc_count": 0
  }
}


---

AWESOME!  This is going to work...

Let's go for aggregations.  Each aggregation needs to be computed with either all of the sample metadata exclusion filters applied,
or with all of them except those on the field being aggregated (so that the counts shown in the sidebar for that field are correct).

---

In [30]:
# Faceted full-text search: matching a study makes all of its samples match,
# and one compound is excluded from the returned hits.
#
# The "all_filters" facets are counted with every metadata exclusion applied;
# the "*Compound" facet is counted with every exclusion except those on
# "*Compound" itself (none are left in this example).
#
# (In the real app, make sizes of `terms` aggregation large enough to not matter too much, but
#  degrade gracefully by ordering by document count in the query, then resorting
#  alphabetically client-side.  That way, the filters will show values alphabetically, but
#  only for the most common filters.  Should show a warning if there's truncation)
#
query_text = "strom"  # partial for "stromal cell"
all_metadata_exclusions = [
    {"term": {"*Compound": "8-br-cAMP - 8-bromo-cAMP"}},
]
all_but_compound_metadata_exclusions = []


def facet_terms(field_name):
    """Build a `terms` aggregation over `field_name`, ordered by term ascending.

    Documents without the field are counted under the "<None>" key and at
    most 100 buckets are requested.
    """
    return {
        "terms": {
            "field": field_name,
            "missing": "<None>",
            "size": 100,   # Return first 100 field values
            "order": {"_term": "asc"},
        }
    }


text_clause = {"match_phrase": {"_all": query_text}}

# Facets that are computed with all exclusions applied
general_facets = {
    "studies": {"terms": {"field": "_parent", "size": 100}},   # Return 100 top studies
}
for field_name in ("Array or chip design", "Attach Duration (hours)", "*Material"):
    general_facets[field_name] = facet_terms(field_name)

result = es.search(index='cbit', doc_type='sample', body={
    "size": 100,
    "query": {"bool": {
        # A "bool" query with a "should" clause but no "must" clause is
        # ES's quirky way of expressing 'A or B'
        "should": [
            text_clause,
            {"has_parent": {"type": "study", "query": text_clause}},
        ],
    }},
    "aggs": {
        "all_filters": {
            "filter": {"bool": {"must_not": all_metadata_exclusions}},
            "aggs": general_facets,
        },
        "all_but_compound_filters": {
            "filter": {"bool": {"must_not": all_but_compound_metadata_exclusions}},
            "aggs": {"*Compound": facet_terms("*Compound")},
        },
    },
    # The exclusions are applied to the hits here rather than inside "query",
    # so each aggregation above can apply its own set of exclusions
    "post_filter": {"bool": {"must_not": all_metadata_exclusions}},
    "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                'Sample Name', '*Study ID'],
})

print(result['hits']['total'])
hit_labels = []
for hit in result['hits']['hits']:
    source = hit['_source']
    fallback_title = source.get('STUDY', {}).get('Study Title', '?')
    hit_labels.append(source.get('Sample Name', fallback_title))
print(hit_labels)
p(result['aggregations'], None)

21
[u'5937337021_E', u'5937337059_E', u'5937337021_B', u'5937337021_L', u'5937337044_J', u'5937337044_A', u'5937337021_G', u'5937337059_G', u'5937337061_D', u'5342595019_D', u'5342595019_G', u'5342595019_A', u'5342595028_D', u'5342595028_G', u'5342595028_A', u'5342595019_E', u'5342595019_B', u'5342595019_H', u'5342595028_H', u'5342595028_E', u'5342595028_B']
{
  "all_but_compound_filters": {
    "*Compound": {
      "buckets": [
        {
          "doc_count": 6, 
          "key": "8-br-cAMP - 8-bromo-cAMP"
        }, 
        {
          "doc_count": 15, 
          "key": "<None>"
        }, 
        {
          "doc_count": 6, 
          "key": "db-cAMP - dibutyryl-cAMP"
        }
      ], 
      "doc_count_error_upper_bound": 0, 
      "sum_other_doc_count": 0
    }, 
    "doc_count": 27
  }, 
  "all_filters": {
    "*Material": {
      "buckets": [
        {
          "doc_count": 3, 
          "key": "BCP - biphasic calcium phosphate"
        }, 
        {
          "doc_count": 

In [31]:
# Faceted full-text search that also adds the matching samples' controls back in.
# This needs two queries:
#   1. find the matching samples that reference a control (they have a
#      "Sample Match" field) and collect the referenced values;
#   2. run the real query, which additionally matches any sample whose
#      "Sample Name" is one of the collected controls.
# As before, matching a study results in all of its samples matching, and one
# compound is excluded.
#
# (In the real app, make sizes of `terms` aggregation large enough to not matter too much, but
#  degrade gracefully by ordering by document count in the query, then resorting
#  alphabetically client-side.  That way, the filters will show values alphabetically, but
#  only for the most common filters.  Should show a warning if there's truncation)
#
query_text = "db-cam"  # partial for "db-camp".  there's 6 samples matching & 6 controls
all_metadata_exclusions = [
    { "term": { "*Compound": "8-br-cAMP - 8-bromo-cAMP" }}
]
all_but_compound_metadata_exclusions = [
]

# Query 1: which controls do the matching samples point at?
controls_result = es.search(index='cbit', doc_type='sample', body={
        "size": 100,
        "query": {
            "bool": {
                'must': { "exists": { "field": "Sample Match" }},
                "should": [
                    { "match_phrase": { "_all": query_text } },
                    { "has_parent": {
                        "type": "study",
                        "query": { "match_phrase": { "_all": query_text } }
                    }}
                ],
                # This "bool" has a "must" clause, which makes its "should"
                # clauses optional by default.  Require at least one of them,
                # otherwise query_text would not restrict the lookup at all.
                "minimum_should_match": 1,
                "must_not": all_metadata_exclusions
            }
        },
        "_source": ['Sample Match']
    })

controls = [sample['_source']['Sample Match'] for sample in controls_result['hits']['hits']]

# Alphabetically ordered `terms` facets, computed with all exclusions applied
metadata_facets = {
    field: {
        "terms": {
            "field": field,
            "missing": "<None>",
            "size": 100,   # Return first 100 field values
            "order": { "_term": "asc" }
        }
    }
    for field in ("Array or chip design", "Attach Duration (hours)", "*Material")
}
metadata_facets["studies"] = {
    "terms": {
        "field": "_parent",
        "size": 100   # Return 100 top studies
    }
}

# Query 2: the real search, with the controls OR-ed in by name
result = es.search(index='cbit', doc_type='sample', body={
        "size": 100,
        "query": {
            "bool": {
                # A "bool" query with a "should" clause but no "must" clause is
                # ES's quirky way of expressing 'A or B'
                "should": [
                    { "match_phrase": { "_all": query_text } },
                    { "has_parent": {
                        "type": "study",
                        "query": { "match_phrase": { "_all": query_text } }
                    }},
                    { "terms": { "Sample Name": controls } }
                ]
            }
        },
        "aggs": {
            "all_filters": {
                "filter": {"bool": {"must_not": all_metadata_exclusions}},
                "aggs": metadata_facets
            },
            "all_but_compound_filters": {
                "filter": {"bool": {"must_not": all_but_compound_metadata_exclusions}},
                "aggs": {
                    "*Compound": {
                        "terms": {
                            "field": "*Compound",
                            "missing": "<None>",
                            "size": 100,   # Return first 100 field values
                            "order": { "_term": "asc" }
                        }
                    }
                }
            }
        },
        "post_filter": {"bool": {"must_not": all_metadata_exclusions}},
        "_source": ['STUDY.Study Title', 'STUDY.Study Researchers Involved',
                    'Sample Name', '*Study ID']
    })
print(result['hits']['total'])
print([hit['_source'].get('Sample Name', hit['_source'].get('STUDY', {}).get('Study Title', '?'))
       for hit in result['hits']['hits']])
p(result['aggregations'], None)

12
[u'5342595019_E', u'5342595019_B', u'5342595019_H', u'5342595028_H', u'5342595028_E', u'5342595028_B', u'5342595019_D', u'5342595019_G', u'5342595019_A', u'5342595028_D', u'5342595028_G', u'5342595028_A']
{
  "all_but_compound_filters": {
    "*Compound": {
      "buckets": [
        {
          "doc_count": 6, 
          "key": "<None>"
        }, 
        {
          "doc_count": 6, 
          "key": "db-cAMP - dibutyryl-cAMP"
        }
      ], 
      "doc_count_error_upper_bound": 0, 
      "sum_other_doc_count": 0
    }, 
    "doc_count": 12
  }, 
  "all_filters": {
    "*Material": {
      "buckets": [
        {
          "doc_count": 12, 
          "key": "PS - polystyrene"
        }
      ], 
      "doc_count_error_upper_bound": 0, 
      "sum_other_doc_count": 0
    }, 
    "Array or chip design": {
      "buckets": [
        {
          "doc_count": 12, 
          "key": "Illumina HT12v3"
        }
      ], 
      "doc_count_error_upper_bound": 0, 
      "sum_other_doc_cou

In [42]:
# Look up controls using an aggregation, to avoid bringing back duplicates over the wire:
# the distinct "Sample Match" values of the matching samples come back as the
# bucket keys of the "controls" aggregation.
query_text = "db-cam"  # partial for "db-camp".  there's 6 samples matching & 6 controls
all_metadata_exclusions = [
    { "term": { "*Compound": "8-br-cAMP - 8-bromo-cAMP" }}
]
all_but_compound_metadata_exclusions = [
]

controls_result = es.search(index='cbit', doc_type='sample', body={
        "size": 0,   # no hits requested; only the aggregation buckets are used
        "query": {
            "bool": {
                'must': { "exists": { "field": "Sample Match" }},
                "should": [
                    { "match_phrase": { "_all": query_text } },
                    { "has_parent": {
                        "type": "study",
                        "query": { "match_phrase": { "_all": query_text } }
                    }}
                ],
                # This "bool" has a "must" clause, which makes its "should"
                # clauses optional by default.  Require at least one of them,
                # otherwise query_text would not restrict the lookup at all.
                "minimum_should_match": 1,
                "must_not": all_metadata_exclusions
            }
        },
        "aggs": {
            "controls": {
                "terms": {
                    "field": "Sample Match",
                    "size": 10000
                }
            }
        }
    })

p(controls_result, None)

{
  "_shards": {
    "failed": 0, 
    "successful": 5, 
    "total": 5
  }, 
  "aggregations": {
    "controls": {
      "buckets": [
        {
          "doc_count": 1, 
          "key": "5342595019_A"
        }, 
        {
          "doc_count": 1, 
          "key": "5342595019_D"
        }, 
        {
          "doc_count": 1, 
          "key": "5342595019_G"
        }, 
        {
          "doc_count": 1, 
          "key": "5342595028_A"
        }, 
        {
          "doc_count": 1, 
          "key": "5342595028_D"
        }, 
        {
          "doc_count": 1, 
          "key": "5342595028_G"
        }
      ], 
      "doc_count_error_upper_bound": 0, 
      "sum_other_doc_count": 0
    }
  }, 
  "hits": {
    "hits": [], 
    "max_score": 0.0, 
    "total": 6
  }, 
  "timed_out": false, 
  "took": 8
}


---

AWESOME!  I can do absolutely everything I need in two queries.  It seems there's also a plug-in to implement [filter joins](https://github.com/sirensolutions/siren-join) to do the lookup of relevant controls in the same query.  So with that, I could get by with exactly 1 query.  I'll put that down to Sprint 3.

In [34]:
# Fetch one study together with its samples in a single request: a document
# matches if it is the study itself, or a sample whose parent is that study.
study_clause = {"bool": {"must": [
    {"match": {"_type": "study"}},
    {"match": {"_id": study1_id}},
]}}
samples_clause = {"bool": {"must": [
    {"match": {"_type": "sample"}},
    {"has_parent": {
        "type": "study",
        "query": {"match": {"_id": study1_id}},
    }},
]}}
es.search(index='cbit', body={
    "size": 100,
    "query": {"bool": {"should": [study_clause, samples_clause]}},
})

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'hits': {u'hits': [{u'_id': u'AVhDnx1kcZts01DBflFT',
    u'_index': u'cbit',
    u'_score': 0.5,
    u'_source': {u'INVESTIGATION': {u'Investigation Identifier': u'1478015927860',
      u'Investigation Title': u'Investigation'},
     u'ONTOLOGY SOURCE REFERENCE': {u'Term Source Description': u'Experimental Factor Ontology',
      u'Term Source File': u'http://data.bioontology.org/ontologies/EFO',
      u'Term Source Name': u'EFO',
      u'Term Source Version': u'149'},
     u'STUDY': {u'Study Description': u'Osteogenic differentiation of human mesenchymal stromal cells (hMSCs) may potentially be used in cell-based bone tissue-engineering applications to enhance the bone-forming potential of these cells. Osteogenic differentiation and adipogenic differentiation are thought to be mutually exclusive, and although several signaling pathways and cues that induce osteogenic or adipogenic differentiation, respectively, have been id

In [36]:
# Fetch the mapping of the "sample" type (for generating list of applicable filters)
# and pretty-print it without truncation
sample_mappings = es.indices.get_mapping(index='cbit', doc_type='sample')
p(sample_mappings, None)

{
  "cbit": {
    "mappings": {
      "sample": {
        "_all": {
          "analyzer": "autocomplete", 
          "search_analyzer": "standard"
        }, 
        "_parent": {
          "type": "study"
        }, 
        "_routing": {
          "required": true
        }, 
        "dynamic": "true", 
        "dynamic_templates": [
          {
            "default": {
              "mapping": {
                "include_in_all": true, 
                "index": "not_analyzed", 
                "type": "string"
              }, 
              "match": "*"
            }
          }
        ], 
        "properties": {
          "*Compound": {
            "include_in_all": true, 
            "index": "not_analyzed", 
            "type": "string"
          }, 
          "*Material": {
            "include_in_all": true, 
            "index": "not_analyzed", 
            "type": "string"
          }, 
          "*Phase composition - % HA": {
            "include_in_all": true, 
           

In [37]:
# Show the ids of the two studies, one per line
for study_id in (study1_id, study2_id):
    print(study_id)

AVhDnx1kcZts01DBflFT
AVhDnyLlcZts01DBflFU


In [27]:
# List samples that have a "Control" field (returning only that field) and
# count them per parent study
has_control_field = {"exists": {"field": "Control"}}
per_study_counts = {"terms": {"field": "_parent", "size": 10000}}
es.search(index='cbit', doc_type='sample', body={
    "size": 100,
    "query": {"bool": {"must": has_control_field}},
    "_source": ["Control"],
    "aggs": {"studies": per_study_counts},
})

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'studies': {u'buckets': [{u'doc_count': 54,
     u'key': u'AVhJhnMu8X1My4__bYgF'},
    {u'doc_count': 18, u'key': u'AVhJhfxO8X1My4__bYgD'}],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [{u'_id': u'AVhJhwUC8X1My4__bYgG',
    u'_index': u'cbit',
    u'_parent': u'AVhJhfxO8X1My4__bYgD',
    u'_routing': u'AVhJhfxO8X1My4__bYgD',
    u'_score': 1.0,
    u'_source': {u'Control': True},
    u'_type': u'sample'},
   {u'_id': u'AVhJhwUC8X1My4__bYgH',
    u'_index': u'cbit',
    u'_parent': u'AVhJhfxO8X1My4__bYgD',
    u'_routing': u'AVhJhfxO8X1My4__bYgD',
    u'_score': 1.0,
    u'_source': {u'Control': False},
    u'_type': u'sample'},
   {u'_id': u'AVhJhwUC8X1My4__bYgI',
    u'_index': u'cbit',
    u'_parent': u'AVhJhfxO8X1My4__bYgD',
    u'_routing': u'AVhJhfxO8X1My4__bYgD',
    u'_score': 1.0,
    u'_source': {u'Control': False},
    u'_type': u'sample'},
   {u'_id':

In [29]:
# Pretty-print the full (untruncated) mapping of the "sample" type
p(es.indices.get_mapping(index='cbit', doc_type='sample'), max_lines=None)

{
  "cbit": {
    "mappings": {
      "sample": {
        "_all": {
          "analyzer": "autocomplete", 
          "search_analyzer": "standard"
        }, 
        "_parent": {
          "type": "study"
        }, 
        "_routing": {
          "required": true
        }, 
        "dynamic": "true", 
        "dynamic_templates": [
          {
            "default": {
              "mapping": {
                "include_in_all": true, 
                "index": "not_analyzed", 
                "type": "string"
              }, 
              "match": "*"
            }
          }
        ], 
        "properties": {
          "*Compound": {
            "include_in_all": true, 
            "index": "not_analyzed", 
            "type": "string"
          }, 
          "*Material": {
            "include_in_all": true, 
            "index": "not_analyzed", 
            "type": "string"
          }, 
          "*Phase composition - % HA": {
            "include_in_all": true, 
           

In [30]:
# Count samples per distinct value of the "Control" field; no hits are requested
control_counts_body = {
    'size': 0,
    'aggs': {'controls': {'terms': {'field': 'Control'}}},
}
es.search(index='cbit', doc_type='sample', body=control_counts_body)

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'controls': {u'buckets': [{u'doc_count': 66,
     u'key': u'false'},
    {u'doc_count': 6, u'key': u'true'}],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [], u'max_score': 0.0, u'total': 72},
 u'timed_out': False,
 u'took': 3}

In [35]:
# Collect the "Sample Match" values (the "controls" buckets) of samples that
# have that field and whose own text matches the search text.  The clause that
# would also match through the parent study is commented out.
search_text = "23632322"
es.search(index='cbit', doc_type='sample', body={
    "size": 10,
    "query": {
        "bool": {
            "must": [
                {"exists": {"field": "Sample Match"}},
            ],
            "should": [
                {"match_phrase": {"_all": search_text}},
                # {"has_parent": {
                #     "type": "study",
                #     "query": {"match_phrase": {"_all": "23632322"}}
                # }},
            ],
            "minimum_should_match": 1,
            "must_not": [],
        }
    },
    "aggs": {
        "controls": {
            "terms": {"field": "Sample Match", "size": 10000},
        }
    },
})

{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
 u'aggregations': {u'controls': {u'buckets': [],
   u'doc_count_error_upper_bound': 0,
   u'sum_other_doc_count': 0}},
 u'hits': {u'hits': [], u'max_score': None, u'total': 0},
 u'timed_out': False,
 u'took': 1}