# Getting started with Elastic

Start by importing the `elasticsearch` library. Make sure it is installed with `python3 -m pip install --user elasticsearch`.

In [181]:
from elasticsearch import Elasticsearch

Establish a connection. It will default to `localhost:9200` if `hosts` argument is omitted.

In [182]:
# Client handle used by all following cells; the host is spelled out here
# even though it matches the default.
es = Elasticsearch(hosts=["localhost:9200"])

Always make sure your cluster connection is actually alive. 

In [183]:
# Left as a bare expression so the notebook displays the returned boolean.
es.ping()

True

Index a first document.

In [184]:
# Sample payload; later cells index this same dictionary again.
document = dict(field1="val1", field2="val1", field3=123)
# Explicit id, so re-running the cell rewrites the same document.
es.index(index="second", doc_type="doc", body=document, id="BBBB")

{'_index': 'second',
 '_type': 'doc',
 '_id': 'BBBB',
 '_version': 16,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 16,
 '_primary_term': 1}

Note that the `elasticsearch` library is just a wrapper around the HTTP API, so the previous example is roughly equivalent to this:

In [185]:
import requests
import json

# Same document sent by hand: index "second", type "doc", id "BBBB" are all
# encoded in the URL path.
url = "http://localhost:9200/second/doc/BBBB"
headers = { "Content-Type": "application/json" }

# requests waits indefinitely by default; the timeout keeps this cell from
# hanging forever when the node is unreachable.
resp = requests.post(url, data=json.dumps(document), headers=headers, timeout=10)
print(resp.json())

{'_index': 'second', '_type': 'doc', '_id': 'BBBB', '_version': 17, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 17, '_primary_term': 1}


Then retrieve it.

In [186]:
# Fetch the document back by index, type and id; the response wraps the
# payload in metadata.
newdoc = es.get(index="second", doc_type="doc", id="BBBB")
print(newdoc)

{'_index': 'second', '_type': 'doc', '_id': 'BBBB', '_version': 17, 'found': True, '_source': {'field1': 'val1', 'field2': 'val1', 'field3': 123}}


Elastic attaches a fair amount of metadata. The actual source document is in the `_source` field.

In [187]:
# Bind the payload to its own name instead of overwriting `newdoc`: with the
# self-referential assignment, running this cell a second time raised a
# KeyError because `_source` had already been stripped away.
source_doc = newdoc["_source"]
print(source_doc)

{'field1': 'val1', 'field2': 'val1', 'field3': 123}


Every request to Elasticsearch travels over HTTP (or the transport protocol), so indexing documents one at a time is fairly expensive, especially for high-volume data such as IDS logs. The proper way is to use the `bulk` API.

See:
 * https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-bulk.html

The bulk format requires a metadata line before each document to indicate which action should be taken, which index should be used, etc. Consider the illustration:

In [188]:
# Build the bulk payload as one flat list in which every document is
# preceded by its action line. An action line has the shape
#   {"index": {"_index": "third", "_type": "_doc", "_id": <id>}}
# and is followed immediately by the document it applies to.
bulk = []
for i in range(100):
    # Action line: index into "third", using the loop counter as document id.
    meta = {
        "index": {
            "_index": "third",
            "_type": "_doc",
            "_id": i
        }
    }
    doc = {
        "message": "this is message {}".format(i),
        "count": i
    }

    # Order matters: action line first, then its document.
    bulk.extend((meta, doc))

In [189]:
# Peek at the first ten entries, i.e. five action/document pairs.
for entry in bulk[:10]:
    print(entry)

{'index': {'_index': 'third', '_type': '_doc', '_id': 0}}
{'message': 'this is message 0', 'count': 0}
{'index': {'_index': 'third', '_type': '_doc', '_id': 1}}
{'message': 'this is message 1', 'count': 1}
{'index': {'_index': 'third', '_type': '_doc', '_id': 2}}
{'message': 'this is message 2', 'count': 2}
{'index': {'_index': 'third', '_type': '_doc', '_id': 3}}
{'message': 'this is message 3', 'count': 3}
{'index': {'_index': 'third', '_type': '_doc', '_id': 4}}
{'message': 'this is message 4', 'count': 4}


In [190]:
# Ship the whole list in a single bulk request.
resp = es.bulk(body=bulk)

In [191]:
# Top-level keys of the bulk response.
print(resp.keys())

dict_keys(['took', 'errors', 'items'])


In [192]:
# Error flag for the request as a whole; per-document results are under "items".
print(resp["errors"])

False


In [193]:
# One result entry per submitted document; show the first ten only.
for item in resp["items"][:10]:
    print(item)

{'index': {'_index': 'third', '_type': '_doc', '_id': '0', '_version': 5, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 84, '_primary_term': 1, 'status': 200}}
{'index': {'_index': 'third', '_type': '_doc', '_id': '1', '_version': 5, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 72, '_primary_term': 1, 'status': 200}}
{'index': {'_index': 'third', '_type': '_doc', '_id': '2', '_version': 5, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 68, '_primary_term': 1, 'status': 200}}
{'index': {'_index': 'third', '_type': '_doc', '_id': '3', '_version': 5, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 92, '_primary_term': 1, 'status': 200}}
{'index': {'_index': 'third', '_type': '_doc', '_id': '4', '_version': 5, 'result': 'updated', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 69, '_primary_term': 1, 'status': 2

Now we can run a search against this index, looking for documents where the `count` field is `>= 12` and `<= 20`. Because `size` is set to 3, only three results are reported back to the user.

In [194]:
# Range query on `count` (12 through 20 inclusive), capped at three hits.
range_query = {
    "size": 3,
    "query": {"range": {"count": {"gte": 12, "lte": 20}}},
}
results = es.search(index="third", body=range_query)

# Inspect the response envelope and the shape of its "hits" section.
print(results.keys())
print(results["hits"].keys())

dict_keys(['took', 'timed_out', '_shards', 'hits'])
dict_keys(['total', 'max_score', 'hits'])


In [195]:
# Nothing is listed when the search timed out; otherwise print every hit.
hits = [] if results["timed_out"] else results["hits"]["hits"]
for hit in hits:
    print(hit)

{'_index': 'third', '_type': '_doc', '_id': '14', '_score': 1.0, '_source': {'message': 'this is message 14', 'count': 14}}
{'_index': 'third', '_type': '_doc', '_id': '19', '_score': 1.0, '_source': {'message': 'this is message 19', 'count': 19}}
{'_index': 'third', '_type': '_doc', '_id': '12', '_score': 1.0, '_source': {'message': 'this is message 12', 'count': 12}}


Note that omitting `_id` will cause Elastic to autogenerate one. If you index the same log again with an explicit ID, the existing document is updated in place. Without an explicit ID, the second indexing round stores a duplicate of the log under a new autogenerated ID.

In [196]:
# Deliberately indexed twice with the same explicit id: the second call
# updates the existing document instead of adding another one.
es.index("fourth", doc_type="doc", body=document, id="BBBB")
es.index("fourth", doc_type="doc", body=document, id="BBBB")

{'_index': 'fourth',
 '_type': 'doc',
 '_id': 'BBBB',
 '_version': 8,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 7,
 '_primary_term': 1}

In [197]:
# Deliberately indexed twice without an id: each call gets an autogenerated
# id, so the same document is stored again on every call.
es.index("fifth", doc_type="doc", body=document)
es.index("fifth", doc_type="doc", body=document)

{'_index': 'fifth',
 '_type': 'doc',
 '_id': 'O-f3dWgB6y6dRW9rRBkm',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 3,
 '_primary_term': 1}

In [198]:
# Fetch the index listing; it is printed in the next cell.
idx = es.cat.indices()

In [199]:
# print() keeps the listing at one index per line in the cell output.
print(idx)

yellow open second Auogp1nfRAyqQHYYS9CiAg 5 1   2 0  9.3kb  9.3kb
yellow open fifth  jIYL8jZFRmGnvNXUBroZLw 5 1   8 0 21.4kb 21.4kb
green  open first  dKmyapUCTSWaGunmnybU9A 5 0   2 0 31.5kb 31.5kb
yellow open fourth jhY3bWMkR862MlqoZNwXrw 5 1   1 0  9.1kb  9.1kb
yellow open third  osXNyLiETa--XFGTnQjySw 5 1 100 0 49.1kb 49.1kb



Notice that `fourth` has only one document while `fifth` has more (depending on how many times you executed this notebook). Yet the document that was indexed is identical in both cases.

In [200]:
# No query body: list whatever is stored in "fifth".
es.search(index="fifth")

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 8,
  'max_score': 1.0,
  'hits': [{'_index': 'fifth',
    '_type': 'doc',
    '_id': 'NufbdWgB6y6dRW9raxlg',
    '_score': 1.0,
    '_source': {'field1': 'val1', 'field2': 'val1', 'field3': 123}},
   {'_index': 'fifth',
    '_type': 'doc',
    '_id': 'N-fbdWgB6y6dRW9raxl8',
    '_score': 1.0,
    '_source': {'field1': 'val1', 'field2': 'val1', 'field3': 123}},
   {'_index': 'fifth',
    '_type': 'doc',
    '_id': 'OefgdWgB6y6dRW9rARkX',
    '_score': 1.0,
    '_source': {'field1': 'val1', 'field2': 'val1', 'field3': 123}},
   {'_index': 'fifth',
    '_type': 'doc',
    '_id': 'OOfgdWgB6y6dRW9rABn8',
    '_score': 1.0,
    '_source': {'field1': 'val1', 'field2': 'val1', 'field3': 123}},
   {'_index': 'fifth',
    '_type': 'doc',
    '_id': 'NOfRdWgB6y6dRW9rnhki',
    '_score': 1.0,
    '_source': {'field1': 'val1', 'field2': 'val1', 'field3': 123}},
   {'_index': 'fi

In [201]:
# No query body: list whatever is stored in "fourth".
es.search(index="fourth")

{'took': 2,
 'timed_out': False,
 '_shards': {'total': 5, 'successful': 5, 'skipped': 0, 'failed': 0},
 'hits': {'total': 1,
  'max_score': 1.0,
  'hits': [{'_index': 'fourth',
    '_type': 'doc',
    '_id': 'BBBB',
    '_score': 1.0,
    '_source': {'field1': 'val1', 'field2': 'val1', 'field3': 123}}]}}

Finally, note that we can manage pretty much anything via the Elastic Python API. For example, we could create a mapping template programmatically.

In [202]:
# Index-level settings carried by the template.
DEFAULT_SETTINGS = {
    "index": dict(
        number_of_shards=3,
        number_of_replicas=0,
        refresh_interval="30s",
    ),
}

# Fields that are given an explicit type.
DEFAULT_PROPERTIES = {
    "@timestamp": dict(
        type="date",
        format="strict_date_optional_time||epoch_millis||date_time",
    ),
    "@version": dict(type="keyword"),
    "ip": dict(type="ip"),
}

# Dynamic template for string values at path `message`: text, norms off.
_MESSAGE_FIELD_TEMPLATE = {
    "message_field": {
        "path_match": "message",
        "mapping": dict(norms=False, type="text"),
        "match_mapping_type": "string",
    },
}

# Dynamic template for every other string field ("*"): text with an
# additional `keyword` sub-field.
_STRING_FIELDS_TEMPLATE = {
    "string_fields": {
        "mapping": dict(
            norms=False,
            type="text",
            fields={"keyword": dict(type="keyword")},
        ),
        "match_mapping_type": "string",
        "match": "*",
    },
}

# Mapping definition: the dynamic templates in list order, plus the
# explicitly typed properties.
DEFAULT_MAPPINGS = {
    "_default_": {
        "dynamic_templates": [_MESSAGE_FIELD_TEMPLATE, _STRING_FIELDS_TEMPLATE],
        "properties": DEFAULT_PROPERTIES,
    },
}

# Complete template body, targeting indices that match "logstash-*".
DEFAULT_TEMPLATE = dict(
    order=0,
    version=0,
    index_patterns=["logstash-*"],
    settings=DEFAULT_SETTINGS,
    mappings=DEFAULT_MAPPINGS,
    aliases={},
)

In [203]:
# Show the assembled template with the nested constants expanded.
print(DEFAULT_TEMPLATE)

{'order': 0, 'version': 0, 'index_patterns': ['logstash-*'], 'settings': {'index': {'number_of_shards': 3, 'number_of_replicas': 0, 'refresh_interval': '30s'}}, 'mappings': {'_default_': {'dynamic_templates': [{'message_field': {'path_match': 'message', 'mapping': {'norms': False, 'type': 'text'}, 'match_mapping_type': 'string'}}, {'string_fields': {'mapping': {'norms': False, 'type': 'text', 'fields': {'keyword': {'type': 'keyword'}}}, 'match_mapping_type': 'string', 'match': '*'}}], 'properties': {'@timestamp': {'type': 'date', 'format': 'strict_date_optional_time||epoch_millis||date_time'}, '@version': {'type': 'keyword'}, 'ip': {'type': 'ip'}}}}, 'aliases': {}}


In [204]:
# Upload the template under the name "logstash" and show the server's reply.
tpl = DEFAULT_TEMPLATE
resp = es.indices.put_template(name="logstash", body=tpl)
print(resp)

{'acknowledged': True}


In [205]:
# Read the stored template back to confirm what the cluster now holds.
resp = es.indices.get_template(name="logstash")
print(resp)

{'logstash': {'order': 0, 'version': 0, 'index_patterns': ['logstash-*'], 'settings': {'index': {'number_of_shards': '3', 'number_of_replicas': '0', 'refresh_interval': '30s'}}, 'mappings': {'_default_': {'dynamic_templates': [{'message_field': {'path_match': 'message', 'mapping': {'norms': False, 'type': 'text'}, 'match_mapping_type': 'string'}}, {'string_fields': {'mapping': {'norms': False, 'type': 'text', 'fields': {'keyword': {'type': 'keyword'}}}, 'match_mapping_type': 'string', 'match': '*'}}], 'properties': {'@timestamp': {'type': 'date', 'format': 'strict_date_optional_time||epoch_millis||date_time'}, '@version': {'type': 'keyword'}, 'ip': {'type': 'ip'}}}}, 'aliases': {}}}


## Task

 * Use elasticsearch `_bulk` API to index entire `eve.json` to index `suricata`
 * Loading entire file into memory and sending one huge bulk may be fine in class, but is a very bad idea in production;
   * Choose an arbitrary bulk size N and flush logs to elastic when buffer is full;
   * Don't forget the tail;
 * Index EVE logs dynamically based on `event_type` field;
   * `alert` should be sent to index `suricata-alert` while `stats` should be sent to `suricata-stats`, etc;
 * Using information from EVE `timestamp` field, set up hourly index pattern;
   * Final index pattern should look like `suricata-<event_type>-<YEAR>.<MONTH>.<DAY>.<HOUR>`
   * For example, `suricata-alert-2019.01.22.16`
   * Verify by writing a wildcard query for pattern `suricata-dns-2019.*`
 * Set up indexing template that matches all `suricata` prefixed indices;
   * Disable replicas;
   * Use non-default number of shards;
   * Set `refresh_interval` to 3 seconds;
   * Verify by deleting all suricata indices and reindexing using your scripts; use the `_cat/indices` and `_cat/shards` APIs to validate;
 * Everyone usually uses logstash to do this stuff, some tools assume this and look for logstash-specific fields when running queries;
   * Fix one future issue by adding a new field `@timestamp` to each document. It should correspond to `timestamp` field from EVE log;