# Import packages

In [None]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
import pandas as pd
import time
from csv import writer

In [None]:
# Silence every warning category for the rest of the session
import warnings
warnings.simplefilter("ignore")

# Login and connect

In [None]:
import sys
# Put the parent directory (relative to the working directory) on the module
# search path so that the `credentials` module can be imported from there.
sys.path.append("..")
# The wildcard import brings every public name of the credentials module into
# the notebook; the connection cell reads `hosts`, `username` and `password`.
from credentials import *  # make sure to set the required files

In [None]:
# Open the client connection to the cluster.
# NOTE: verify_certs=False switches off TLS certificate verification.
connection_options = {
    'hosts': hosts,  # make sure to configure the hosts in the credentials file
    'verify_certs': False,
    'scheme': 'https',
    'http_auth': (username, password),  # login details from the credentials.py file
    # 'api_key': (api_username, api_password),  # alternative: api login details from the credentials.py file
}
es = Elasticsearch(**connection_options)

In [None]:
# Fetch the cluster info and print one "key ----- value" line per field
elastic_info = es.info()
for field, value in elastic_info.items():
    dashes = '-' * (20 - len(field))  # shorter keys get more dashes so values line up
    print(field, dashes, value)

In [None]:
# Ask for the cluster health, waiting for at least 'yellow' status, with a
# one-second request timeout, then print one "key ----- value" line per field
es_health = es.cluster.health(request_timeout=1, wait_for_status='yellow')
for health_key, health_value in es_health.items():
    separator = '-' * (20 - len(health_key))
    print(health_key, separator, health_value)

# Check the list of indices and columns

In [None]:
# Print the name of every index returned by the mapping API
index_mappings = es.indices.get_mapping()
for index_name in index_mappings.keys():
    print(index_name)

In [None]:
# Print every field (column) name defined in the mapping of one index
index = 'gstt_clinical_documents_letters'
index_mapping = es.indices.get_mapping(index=index)
field_definitions = index_mapping[index]['mappings']['properties']
for field_name in field_definitions.keys():
    print(field_name)

# Set parameters

In [None]:
# Identifiers to search for; the query cell matches them against the
# `patient_TrustNumber` field. Left empty here - fill in before searching.
pt_list: list = [] # example list of patients' patient_TrustNumber here

## Columns of interest

Select the fields you want and list them in the order in which they should appear as output columns.

In [None]:

# Fields requested for every hit; the list order is also the order of the
# output columns (after the hit metadata columns).
columns = [
    "body_analysed",
    "patient_RaceCode",
    # patient_Id can be different from the trust number which you are searching for
    "patient_Id",
    "patient_DOB",
    "patient_GenderCode",
    "patient_MaritalStatusCode",
    "patient_ReligionCode",
    "patient_DeceasedDtm",
    "patient_LastName",
    "patient_FirstName",
]

## Build query

Further information on how to build a query [can be found here](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html).


In [None]:
# Keep only documents whose patient_TrustNumber is one of the values in pt_list
patient_filter = {"terms": {"patient_TrustNumber": pt_list}}

# Search body: the filter above, restricted to the fields listed in `columns`.
# NOTE(review): helpers.scan takes its own `size` argument as well - confirm
# which of the two batch sizes takes effect.
query = {
    "from": 0,
    "size": 10000,
    "query": {"bool": {"filter": patient_filter}},
    "_source": columns,
}

# Search, retrieve, and save

In [None]:
# Set up the scrolling search. helpers.scan returns a generator, so the hits
# are fetched lazily and can only be iterated once; re-run this cell before
# looping over the results a second time.
# NOTE(review): the mapping cell above inspects 'gstt_clinical_documents_letters'
# while this searches 'gstt_clinical_epr_*' - confirm that is intended.
search_results = scan(
    es,
    query=query,
    index=["gstt_clinical_epr_*"],
    preserve_order=True,  # keeps the search order, at a major cost in performance
)

### Simple search

In [None]:
# Construct a df with one row per hit: the hit metadata first, then the
# requested `_source` fields in the order given by `columns`.
metadata_fields = ['_index', '_type', '_id', '_score']
temp_results = []

for hit in search_results:
    # .get() instead of [] so that metadata a cluster does not return
    # (e.g. `_type`, dropped in Elasticsearch 8) becomes None rather than
    # raising KeyError.
    row = {field: hit.get(field) for field in metadata_fields}
    row.update(hit['_source'])
    temp_results.append(row)

df_headers = [*metadata_fields, *columns]
df = pd.DataFrame(temp_results)
# Put the expected columns first (created empty when no hit supplied them)
# and keep any unexpected extra fields at the end. reindex avoids
# concatenating with an empty frame, which recent pandas versions deprecate
# and which turns every column into object dtype.
extra_fields = [name for name in df.columns if name not in df_headers]
df = df.reindex(columns=df_headers + extra_fields)

### Simple search and save to file
Use this option for large searches: the hits are written to a CSV file in batches instead of all being held in memory.

In [None]:
# Stream the hits into a CSV file in batches so that a large result set never
# has to be held in memory all at once.
# `search_results` is consumed while it is iterated: if another cell has
# already looped over it, re-run the scan cell first.
search_results_filename = 'search_results.csv'  # change output filename
temp_results = []
counter = 0
save_interator = 5000  # saves every x hits
df_headers = ['_index', '_type', '_id', '_score']
df_headers.extend(columns)

# newline='' is required by the csv module (avoids blank lines on Windows);
# utf-8 is what pd.read_csv expects by default when the file is read back.
with open(search_results_filename, 'a', newline='', encoding='utf-8') as f_object:
    writer_object = writer(f_object)
    # The file is opened in append mode: write the header only when the file
    # is still empty, so re-running the cell does not put a second header row
    # in the middle of the data. Change the filename if `columns` changes.
    if f_object.tell() == 0:
        writer_object.writerow(df_headers)
    for hit in search_results:
        row = {
            '_index': hit['_index'],
            # .get(): some clusters do not return these keys (e.g. `_type`
            # was dropped in Elasticsearch 8); write an empty cell instead
            # of raising KeyError.
            '_type': hit.get('_type'),
            '_id': hit['_id'],
            '_score': hit.get('_score'),
        }
        row.update(hit['_source'])
        # Emit the values in header order; a field missing from this hit
        # becomes an empty cell instead of shifting later values left.
        temp_results.append([row.get(header) for header in df_headers])
        counter += 1
        # Flush only after a full batch has been collected (the check used to
        # run before the first append and reported a save of zero rows).
        if counter % save_interator == 0:
            writer_object.writerows(temp_results)
            temp_results = []
            print(f'Saved {save_interator} docs')

    # Write whatever is left over from the last, partial batch. The `with`
    # block closes the file, so no explicit close() is needed.
    writer_object.writerows(temp_results)
    temp_results = []

# Process

In [None]:
# Load the CSV named by `search_results_filename` (written by the
# "Simple search and save to file" cell) into a DataFrame; add any further
# processing of `df` here.
df = pd.read_csv(search_results_filename)

In [None]:
# Preview the first rows of the DataFrame
df.head()

# Save

In [None]:
# Write the DataFrame to a CSV file, leaving out the row index
output_filename = "file_name.csv"  # change output filename
df.to_csv(output_filename, index=False)