# Search the Data

A very quick search utility for finding things on the map.

In [None]:
import glob
import json
import os
import pandas as pd

## Load Data

In [None]:
# Directory (relative to the working directory) that is scanned for the
# per-district `*.data.json` files loaded below.
DATA_PATH = '../map/output/district/'

def list_district_data(path):
    """Return the paths of all `*.data.json` files directly inside *path*.

    The result is sorted because glob yields matches in arbitrary,
    filesystem-dependent order; sorting keeps downstream processing
    reproducible between runs and machines.
    """
    return sorted(glob.glob(os.path.join(path, '*.data.json')))

def read_district_data(pathname):
    """Parse the JSON file at *pathname* and return the decoded object.

    The file is read as UTF-8 explicitly so that non-ASCII text decodes the
    same way regardless of the platform's default encoding.
    """
    with open(pathname, encoding='utf-8') as file:
        return json.load(file)

# Eagerly parse every district file found under DATA_PATH; the result holds
# one decoded JSON document per matching file.
raw_data = list(map(read_district_data, list_district_data(DATA_PATH)))

## Build Index (or open it if it already exists)

In [None]:
from whoosh.index import create_in, open_dir, exists_in
from whoosh.fields import *
# Layout of one indexed document. Every field is stored so that search hits
# can return its value (the report writer reads all four back from each hit).
SCHEMA = Schema(
    my_eu_id=ID(stored=True),
    postcode=ID(stored=True),
    title=TEXT(stored=True),
    content=TEXT(stored=True))

In [None]:
def build_index(raw_district_data):
    """Add one district's project records to the global whoosh INDEX.

    Args:
        raw_district_data: decoded district JSON. It must provide
            'outwardCode' and a 'datasets' mapping of dataset name to a
            'split'-oriented table (as understood by pandas.read_json).

    Only the 'cordis', 'creative', 'erasmus' and 'esif' datasets are
    indexed; any other dataset is ignored. Rows whose content column is
    missing (NaN) are skipped. All documents for the district are committed
    together, and the writer is cancelled if anything fails part-way so the
    index lock is not left held.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from io import StringIO

    outward_code = raw_district_data['outwardCode']

    # dataset name -> (column used as title, column used as content)
    columns_by_dataset = {
        'cordis': ('projectTitle', 'objective'),
        'creative': ('project', 'summary'),
        'erasmus': ('project', 'summary'),
        'esif': ('projectTitle', 'summary'),
    }

    # Used as a context manager, the writer commits on success and cancels
    # on an exception, releasing the index lock either way.
    with INDEX.writer() as writer:
        for dataset, data in raw_district_data['datasets'].items():
            columns = columns_by_dataset.get(dataset)
            if columns is None:
                continue
            title_column, content_column = columns

            # Wrap the JSON text in a file-like object: passing a literal
            # JSON string to read_json is deprecated in recent pandas.
            df = pd.read_json(StringIO(json.dumps(data)), orient='split')

            for _index, row in df[df[content_column].notna()].iterrows():
                writer.add_document(
                    postcode='{} {}'.format(outward_code, row['inwardCode']),
                    my_eu_id=row['myEuId'],
                    title=row[title_column],
                    content=row[content_column]
                )

# Make sure the index directory exists before touching it: the whoosh
# quickstart creates the directory itself before calling create_in, and the
# HTML reports are written into the same folder later on.
os.makedirs('output', exist_ok=True)

# Reuse an index that is already on disk; otherwise create one and fill it
# from every loaded district file.
if exists_in('output'):
    INDEX = open_dir('output')
else:
    INDEX = create_in('output', SCHEMA)
    for raw_district_data in raw_data:
        build_index(raw_district_data)

## Search

In [None]:
from whoosh.query import *
from whoosh.qparser import MultifieldParser

def search_word(word, output_file_name=None, limit=20):
    """Search for a single term in the title or content field.

    The term is matched with a plain Term query on each field (no query
    parsing). When *output_file_name* is omitted the report is written to
    'output/<word>.html'. Returns whatever search_with_query returns.
    """
    target = f'output/{word}.html' if output_file_name is None else output_file_name
    either_field = Or([Term(field_name, word) for field_name in ('title', 'content')])
    return search_with_query(either_field, target, limit)

def search(query_string, output_file_name, limit=20):
    """Parse *query_string* against title and content and write a report.

    The string is parsed with a MultifieldParser bound to SCHEMA; the
    resulting query is handed to search_with_query along with the output
    path and hit limit.
    """
    parsed_query = MultifieldParser(['title', 'content'], schema=SCHEMA).parse(query_string)
    return search_with_query(parsed_query, output_file_name, limit)

def search_with_query(query, output_file_name, limit=20):
    """Run *query* against INDEX and write the hits to an HTML report.

    Args:
        query: a whoosh query object (as built by search_word or search).
        output_file_name: path of the HTML file to create or overwrite.
        limit: maximum number of hits to list, passed to searcher.search.

    Returns:
        None; the report file is the only output.

    The page is assembled in memory and written only after the search has
    succeeded, so a failing query does not leave a truncated report behind.
    Stored field values and the query text are HTML-escaped; the markup
    returned by hit.highlights() is written as-is.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from html import escape

    with INDEX.searcher() as searcher:
        hits = searcher.search(query, limit=limit)
        hits.fragmenter.surround = 100  # context setting for highlight fragments

        page = [
            # Declare the charset so it matches the encoding used on write.
            '<html><head><meta charset="utf-8"></head><body>',
            '<h1>{} hits for <tt>{}</tt></h1>'.format(len(hits), escape(str(query))),
            '<dl>',
        ]
        # Highlights must be produced while the searcher is still open.
        for hit in hits:
            postcode = str(hit['postcode'])
            page.append(
                '<dt><a href="https://www.myeu.uk/#/postcode/{}">{}</a> ({})</dt>'.format(
                    escape(postcode.replace(' ', '/')),
                    escape(postcode),
                    escape(str(hit['my_eu_id']))
                ))
            page.append('<dd><p style="text-decoration: underline;">')
            # Fall back to the plain stored title when nothing in it matched.
            title_html = hit.highlights('title')
            if title_html == '':
                title_html = escape(str(hit['title']))
            page.append(title_html)
            page.append('</p><p>')
            page.append(hit.highlights('content', top=5))
            page.append('</p></dd>')
        page.append('</dl></body></html>')

    # Explicit UTF-8 so non-ASCII project text cannot fail on platforms whose
    # default encoding is narrower.
    with open(output_file_name, 'w', encoding='utf-8') as output_file:
        output_file.write(''.join(page))

In [None]:
# Single-term lookup; with no file name given the report goes to output/women.html.
search_word('women', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("women's services", 'output/womens_services.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("domestic violence", 'output/domestic_violence.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("domestic violence services", 'output/domestic_violence_services.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("local refuge", 'output/local_refuge.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("employment rights", 'output/employment_rights.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("gender pay gap", 'output/gender_pay_gap.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("equality human rights protection", 'output/equality_human_rights_protection.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("women's rights", 'output/womens_rights.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("women's rights gender equality", 'output/womens_rights_gender_equality.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("charter of fundamental rights", 'output/charter_of_fundamental_rights.html', limit=200)

In [None]:
# Parsed query over title and content; lists at most 200 hits in the named report.
search("gender equality", 'output/gender_equality.html', limit=200)

In [None]:
# Single-term lookup; with no file name given the report goes to output/gender.html.
search_word('gender', limit=200)