# Get Sentences with SPIKE's query API

The following script takes a SPIKE query and lists of words, runs the query on SPIKE, and downloads the list of sentences that match the query.

In [7]:
import csv
import json
import requests
from tqdm import tqdm
from collections import defaultdict
import jsonlines
from random import shuffle

### Materials to prepare
This script assumes the following materials:
1. json file with the desired patterns, located at `./src`, in the following format:
```
{
    "0": {
        "query": "$[w={roles}]guitarist <E>musician:[e=PERSON]John plays the piano",
        "type": "syntactic", # other options are boolean or token 
        "case_strategy": "ignore", # other options are exact or smart 
        "label": "positive",
        "lists": ["roles"] # should match the name within brackets in the query. Leave empty list if irrelevant.
    },
    "1": {
        ...
    }...
}
```
2. Lists of words stored in text files under `../data/lists`. The name of each file (without the `.txt` extension) should match the list name used in the patterns file. Note that you can download a list straight from SPIKE, or create one yourself, with a single item per line.

In [8]:
# Root output directory; matches are written into one sub-directory per pattern label.
SPIKE_MATCHES_DIR = '../data/spike_matches'
# JSON file with the query patterns, in the format described under "Materials to prepare".
PATTERNS_FILE = 'patterns.json'
# NOTE: despite the name, this is the *directory* holding the word-list text files.
LISTS_FILE = '../data/lists'
# we can either take 1000 and send to file, or take 5000, shuffle and take the first 1000 of the shuffled list
# LIMIT is sent to SPIKE as the `limit` query parameter of every stream request.
LIMIT = 5000
# Maximum number of (shuffled) sentences main() writes per pattern.
MAX_SENTENCES = 1000
# Multiplier applied to MAX_SENTENCES for patterns labelled "negative".
# Original note: 10 * len(positive patterns); "uncomment if LIMIT is 1000" - TODO confirm what that refers to.
NEG_PRODUCT = 60
# Entities whose label starts with this value are kept on each match.
# If your desired capture is not an entity, leave an empty string.
ENTITY_TYPE='PERSON'


In [11]:
def read_patterns_from_file(path):
    """Load the query patterns stored as JSON at `path`.

    Args:
        path: location of the JSON patterns file.

    Returns:
        The parsed JSON content (for the documented format, a dict mapping
        pattern ids to pattern definitions).
    """
    with open(path, "r") as patterns_file:
        patterns = json.load(patterns_file)
    return patterns


def write_pattern_matches(pattern):
    """Return the SPIKE matches for a single pattern.

    Thin wrapper around `search_single_query`; despite the name, nothing is
    written here - callers decide what to do with the returned matches.

    Args:
        pattern: a pattern definition dict.

    Returns:
        Whatever `search_single_query(pattern)` returns.
    """
    return search_single_query(pattern)

        
def search_single_query(pattern):
    """Run one pattern against SPIKE and collect all of its matches.

    Args:
        pattern: a pattern definition dict.

    Returns:
        A list with every item yielded by `search_stream`, or None when
        `get_stream_location` returns a falsy stream location.
    """
    spike_url = "https://spike.staging.apps.allenai.org"
    location = get_stream_location(spike_url, pattern)
    if location:
        return list(search_stream(spike_url, location, pattern))
    return None


def get_lists(pattern, lists_dir=None):
    """Load the word lists a pattern refers to.

    Args:
        pattern: a pattern definition dict; `pattern["lists"]` holds the list
            names (an empty list or None means the pattern uses no lists).
        lists_dir: directory containing one `<name>.txt` file per list.
            Defaults to the module-level `LISTS_FILE`, so the lists are read
            from the same place the notebook writes generated lists to.

    Returns:
        A defaultdict mapping each list name to its items, one per non-blank
        line, with surrounding whitespace stripped.

    Raises:
        FileNotFoundError: if a referenced list file does not exist.
    """
    if lists_dir is None:
        lists_dir = LISTS_FILE
    lists = defaultdict(list)
    for name in pattern["lists"] or []:
        with open(f"{lists_dir}/{name}.txt", "r") as f:
            for line in f:
                item = line.strip()
                # Blank lines (e.g. a trailing newline) must not become empty list items.
                if item:
                    lists[name].append(item)
    return lists
    

def get_stream_location(spike_url, pattern):
    """Submit a pattern to SPIKE's multi-search endpoint.

    Args:
        spike_url: base URL of the SPIKE server.
        pattern: a pattern definition dict providing "type", "query",
            "case_strategy" and (via `get_lists`) "lists".

    Returns:
        The value of the `stream-location` response header, or None when the
        response does not carry that header.
    """
    query_type = pattern["type"]
    query_text = pattern["query"]
    case_strategy = pattern["case_strategy"]
    word_lists = get_lists(pattern)

    context = {
        "lists": word_lists,
        "tables": {},
        "case_strategy": case_strategy,
        "attempt_fuzzy": False,
    }
    payload = {
        "queries": {"main": {query_type: query_text}},
        "data_set_name": "wikipedia",
        "context": context,
    }
    response = requests.post(spike_url + "/api/3/multi-search/query", json=payload)
    # A missing header yields None, which callers treat as "no stream available".
    return response.headers.get('stream-location')


def search_stream(spike_url, stream_location, pattern):
    """Fetch a SPIKE result stream and yield one dict per match.

    The module-level `LIMIT` is sent as the `limit` query parameter and
    `ENTITY_TYPE` selects which sentence entities are kept on each match.

    Args:
        spike_url: base URL of the SPIKE server.
        stream_location: path returned by `get_stream_location`.
        pattern: the pattern being searched; only used in the "no results" message.

    Yields:
        Dicts with the keys 'words', 'captures', 'sentence_index',
        'highlights' and 'entities'. 'entities' holds the sentence entities
        whose label starts with `ENTITY_TYPE`, or an empty list when
        `ENTITY_TYPE` is empty.
    """
    # `include_sentence=true` only needs to be sent once.
    stream_url = spike_url + stream_location + f"?include_sentence=true&limit={LIMIT}"
    response = requests.get(stream_url)
    # The body is JSON Lines; skip blank lines so they do not break json.loads.
    results = [json.loads(jl) for jl in response.text.splitlines() if jl.strip()]
    if not results:
        print(f"Couldn't find any results for pattern {pattern}")
    for result in results:
        # Records of these kinds are not matches.
        if result['kind'] in ('continuation_url', 'tip'):
            continue
        data = result['value']['sub_matches']['main']
        if ENTITY_TYPE:
            entities = [
                ent for ent in data['sentence']['entities']
                if ent["label"].startswith(ENTITY_TYPE)
            ]
        else:
            entities = []
        yield {
            'words': data['words'],
            'captures': data['captures'],
            'sentence_index': data['sentence_index'],
            'highlights': data['highlights'],
            'entities': entities
        }

        
def main():
    """Download matches for every pattern in `PATTERNS_FILE` and store them.

    For each pattern, the matches are fetched, shuffled and at most
    `MAX_SENTENCES` of them (`MAX_SENTENCES * NEG_PRODUCT` for patterns
    labelled "negative") are written as JSON lines to
    `{SPIKE_MATCHES_DIR}/{label}/{idx}.json`. A pattern without matches
    produces an empty file and a "no matches" message.
    """
    patterns = read_patterns_from_file(f'{PATTERNS_FILE}')
    for idx, pattern in tqdm(patterns.items()):
        label = pattern["label"]
        max_sents = MAX_SENTENCES*NEG_PRODUCT if label == "negative" else MAX_SENTENCES
        # Query first: a failing request must not truncate an existing output file.
        matches = write_pattern_matches(pattern)
        with jsonlines.open(f'{SPIKE_MATCHES_DIR}/{label}/{idx}.json', 'w') as f:
            if matches:
                # `matches` is None when SPIKE returned no stream location,
                # so shuffle only after the emptiness check.
                shuffle(matches)
                for match in matches[:max_sents]:
                    f.write(match)
            else:
                print("no matches")

### Add Musicians using Hearst Patterns

We can use SPIKE to collect names of musicians, and then call the SPIKE API again to retrieve sentences that contain those names.

In [4]:
# there's currently a bug in using lemma+lists. Replace the ORs with a list once it's fixed. https://github.com/allenai/spike/issues/2056
# NOTE(review): the queries below already reference the {roles} list rather than ORs - this note may be stale; confirm.


# Hearst-style patterns; every query names its capture "musician",
# which get_hearst_based_list_of_musicians reads back from the matches.
hearst_patterns = {
    # NOTE(review): unlike patterns "2"-"4", the capture here is written `e=PERSON`
    # without brackets or an example word - confirm this is intended.
    "1": {
        "query": "$[l={roles}]musicians $such as <E>musician:e=PERSON",
        "type": "syntactic",
        "case_strategy": "ignore",
        "label": "positive",
        "lists": ["roles"]
    },
    "2": {
        "query": "<E>musician:[e=PERSON]John $and $other $[l={roles}]musicians",
        "type": "syntactic",
        "case_strategy": "ignore",
        "label": "positive",
        "lists": ["roles"]
    }, 
    "3": {
        "query": "$such $[l={roles}]musicians as <E>musician:[e=PERSON]John",
        "type": "syntactic",
        "case_strategy": "ignore",
        "label": "positive",
        "lists": ["roles"]
    },
    # This pattern constrains the role with `w=` where the others use `l=`.
    "4": {
        "query": "<E>musician:[e=PERSON]John is a $[w={roles}]musician",
        "type": "syntactic",
        "case_strategy": "ignore",
        "label": "positive",
        "lists": ["roles"]
    }
}

In [5]:
def get_hearst_based_list_of_musicians(patterns_dict):
    """Collect the names captured as "musician" by a set of patterns.

    Args:
        patterns_dict: dict mapping pattern ids to pattern definitions whose
            queries define a capture named "musician".

    Returns:
        A set of names, each built by joining the captured words with spaces.
        Captures whose first and last index coincide are skipped.
    """
    musicians = set()

    for idx, pattern in patterns_dict.items():
        # write_pattern_matches returns None when SPIKE gives no stream
        # location; treat that as "no matches" instead of crashing.
        matches = write_pattern_matches(pattern) or []
        for sent in matches:
            capture = sent["captures"]["musician"]
            first, last = capture["first"], capture["last"]
            if first != last:
                # `last` is inclusive, hence the +1 in the slice.
                musicians.add(" ".join(sent["words"][first:last+1]))
    return musicians

Create a list of musicians and store it with the rest of the lists.

In [11]:
# NOTE: this rebinds the module-level LIMIT, so every later search in this
# kernel (including main()) also requests at most 1000 results until the
# configuration cell is re-run.
LIMIT = 1000
musicians = get_hearst_based_list_of_musicians(hearst_patterns)

# Sets iterate in an arbitrary order that can differ between runs; sorting
# makes the file reproducible and easier to review by eye.
with open(f'{LISTS_FILE}/hearst_musicians.txt', "w") as f:
    for musician in sorted(musicians):
        if musician:
            f.write(musician+"\n")

It is recommended to briefly go over the list to verify there are no weird items. 
Take this sentence for example, taken from the wikipedia article about [Bonzo Dog Doo-Dah Band](https://en.wikipedia.org/wiki?curid=18949367)
```
In this number every member of the band was introduced and played a solo, starting with the genuine band members before including such improbable guest musicians as John Wayne on xylophone, Adolf Hitler on vibes, J. Arthur Rank on gong, Prime Minister Harold Wilson on violin, the Wild Man of Borneo, Val Doonican, Horace Batchelor, and Lord Snooty and His Pals.
```
Retrieving this sentence adds "improbable guest musicians" such as `Adolf Hitler` and `John Wayne` to the list, even though they definitely do not belong there.

Add the following pattern to your patterns file. If you run `main()` again, you will get another file with sentences that contain these musicians.

```
"7": {
        "query": "musician:w={hearst_musicians}",
        "type": "boolean",
        "case_strategy": "ignore",
        "label": "positive",
        "lists": ["hearst_musicians"]
    }
```

In [12]:
# Re-run the pipeline over every pattern in PATTERNS_FILE; once the
# "hearst_musicians" pattern has been added, this also writes its matches.
main()

100%|███████████████████████████████████████████████████████████| 8/8 [01:05<00:00,  8.17s/it]


In [13]:
# Two deliberately generic boolean queries, used to sample "negative" sentences.
meaningless_query = {
    "10": {
        "query": "the",
        "type": "boolean",
        "case_strategy": "ignore",
        "label": "negative",
        "lists": []
    },
    "11": {
        "query": "t=NN",
        "type": "boolean",
        "case_strategy": "ignore",
        "label": "negative",
        "lists": []
    }
}

for idx, pattern in meaningless_query.items():
    # write_pattern_matches returns None when SPIKE gives no stream location;
    # treat that as "no matches" instead of failing on iteration.
    matches = write_pattern_matches(pattern) or []
    # Keep only sentences with no entity of the configured ENTITY_TYPE.
    sentences = [sent for sent in matches if not sent["entities"]]

    if sentences:
        shuffle(sentences)
        with jsonlines.open(f'{SPIKE_MATCHES_DIR}/negative/{idx}.jsonl', 'w') as f:
            for sent in sentences:
                f.write(sent)
