In [1]:
import json
import sys
import time
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from neuvueclient import NeuvueQueue

sys.path.append(str(Path.cwd().parents[1]))
from utils.constants import NEUVUE_QUEUE_URL

# Query Task Notebook

This notebook shows a couple of examples of how to use the client to query task data and retrieve completed tasks for downstream processing.

In [2]:
# Create the queue client against the URL imported from utils.constants.
# The output printed below reports which auth method the client picked up.
client = NeuvueQueue(NEUVUE_QUEUE_URL)

Auth method: Config File


In [3]:
# Shared identifiers for the forced-choice example.
AUTHOR = 'dxenes1'
NAMESPACE = 'forcedChoiceExample'
ASSIGNEES = ['dxenes1']
INSTRUCTIONS = dict(
    prompt="Is the selected seg ID a multi-soma? (two valid neurons merged together)",
)

Let's say that in this case I want to get the tasks that user `dxenes1` has completed in the Forced Choice Example namespace, so that I can examine the decision they made in each of those tasks.

In [4]:
# Fetch every closed task for the example assignee/namespace with the client's
# default settings, and print how long the request took (seconds, 2 decimals).
# The filter reuses the constants from the configuration cell instead of
# re-typing the same literals.
sieve = {
    'assignee': ASSIGNEES[0],
    'status': 'closed',
    'namespace': NAMESPACE,
}

# perf_counter is a monotonic, high-resolution clock meant for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
tasks = client.get_tasks(sieve=sieve)
print(round(time.perf_counter() - start, 2))

3.44


The above query will give me ALL of the information for the tasks that fit the criteria established in the sieve. It takes a while to grab the tasks because it also converts the JSON state links into JSON strings automatically. I can disable that to get a faster query.


In [5]:
# Same filter as the previous query, but with convert_states_to_json=False so
# the client returns the state links as-is. Prints the elapsed seconds.
# The filter reuses the constants from the configuration cell instead of
# re-typing the same literals.
sieve = {
    'assignee': ASSIGNEES[0],
    'status': 'closed',
    'namespace': NAMESPACE,
}

# perf_counter is a monotonic clock intended for interval timing.
start = time.perf_counter()
tasks = client.get_tasks(sieve=sieve, convert_states_to_json=False)
print(round(time.perf_counter() - start, 2))

0.54


In [6]:
# Preview the first five rows of the query result.
tasks.head(n=5)

Unnamed: 0_level_0,active,assignee,author,closed,created,instructions,metadata,namespace,opened,priority,duration,status,seg_id,ng_state,points,tags,__v
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
65e225bdd4cc5f67dd8fb954,True,dxenes1,dxenes1,2024-03-01 19:52:58.603,2024-03-01 19:00:12.823,{'prompt': 'Is the selected seg ID a multi-som...,"{'nuclei': [531788, 525961, 528862, 528334, 52...",forcedChoiceExample,2024-03-01 19:08:28.407,1000,93,closed,864691135968943973,https://global.daf-apis.com/nglstate/api/v1/57...,,,0
65e225bdd4cc5f67dd8fb957,True,dxenes1,dxenes1,2024-03-01 19:53:10.116,2024-03-01 19:00:13.235,{'prompt': 'Is the selected seg ID a multi-som...,"{'nuclei': [198054, 198124, 198037, 198128, 19...",forcedChoiceExample,2024-03-01 19:52:59.431,1000,5,closed,864691135270095013,https://global.daf-apis.com/nglstate/api/v1/55...,,,0
65e225bdd4cc5f67dd8fb95a,True,dxenes1,dxenes1,2024-03-01 19:53:15.224,2024-03-01 19:00:13.453,{'prompt': 'Is the selected seg ID a multi-som...,"{'nuclei': [208610, 276973, 208491, 176123, 37...",forcedChoiceExample,2024-03-01 19:53:10.424,1000,3,closed,864691135479319750,https://global.daf-apis.com/nglstate/api/v1/59...,,,0
65e225bdd4cc5f67dd8fb95d,True,dxenes1,dxenes1,2024-03-01 19:53:19.113,2024-03-01 19:00:13.672,{'prompt': 'Is the selected seg ID a multi-som...,"{'nuclei': [608855, 608854, 611313, 642226], '...",forcedChoiceExample,2024-03-01 19:53:15.518,1000,2,closed,864691135888983689,https://global.daf-apis.com/nglstate/api/v1/67...,,,0
65e225bdd4cc5f67dd8fb960,True,dxenes1,dxenes1,2024-03-01 19:53:23.256,2024-03-01 19:00:13.868,{'prompt': 'Is the selected seg ID a multi-som...,"{'nuclei': [592840, 495316, 594720, 497173], '...",forcedChoiceExample,2024-03-01 19:53:19.406,1000,2,closed,864691136974041116,https://global.daf-apis.com/nglstate/api/v1/57...,,,0


Let's say you only care about `metadata`, since that's where the decision is stored. You can use the `select` kwarg to restrict the query to certain columns.

In [7]:
# Query the same closed tasks but request only the `metadata` column via
# `select`, still skipping the state-link conversion. Prints the elapsed seconds.
# The filter reuses the constants from the configuration cell instead of
# re-typing the same literals.
sieve = {
    'assignee': ASSIGNEES[0],
    'status': 'closed',
    'namespace': NAMESPACE,
}

# perf_counter is a monotonic clock intended for interval timing.
start = time.perf_counter()
tasks = client.get_tasks(
    sieve=sieve,
    select=['metadata'],
    convert_states_to_json=False,
)
print(round(time.perf_counter() - start, 2))

0.9


We can then add a `decision` column by pulling the decision out of each task's metadata.

In [8]:
def _decision_of(meta):
    """Look up the 'decision' entry of a single task's metadata via ``.get``."""
    return meta.get('decision')


# Derive a `decision` column from the per-task metadata values.
tasks['decision'] = tasks['metadata'].apply(_decision_of)

In [9]:
# Tally how many tasks received each decision value.
decision_counts = tasks['decision'].value_counts()
decision_counts

decision
yes       7
no        6
unsure    3
Name: count, dtype: int64

Let's say we only want tasks after a certain datetime. We can do that with a datetime query on one of the dated columns: `created`, `opened`, or `closed`.

- created: time task was created
- opened: time task was opened by a proofreader (gets reset if the proofreader skipped and was assigned to another)
- closed: time task was closed by a proofreader

You can query datetimes using MongoDB logical operators. `$gt` corresponds to greater than and `$lt` corresponds to less than

In [10]:
from datetime import datetime

In [11]:
# Query only the tasks whose `closed` timestamp is later than a cutoff, using
# the MongoDB-style `$gt` operator, and keep just the `metadata` column.
# Prints the elapsed seconds.
# NOTE(review): the cutoff is a naive datetime; confirm which timezone the
# queue assumes for it.
closed_after = datetime(2024, 2, 29)

# The filter reuses the constants from the configuration cell instead of
# re-typing the same literals.
sieve = {
    'assignee': ASSIGNEES[0],
    'status': 'closed',
    'namespace': NAMESPACE,
    'closed': {"$gt": closed_after},
}

# perf_counter is a monotonic clock intended for interval timing.
start = time.perf_counter()
tasks = client.get_tasks(
    sieve=sieve,
    select=['metadata'],
    convert_states_to_json=False,
)
print(round(time.perf_counter() - start, 2))

0.57


In [12]:
# Display the result of the date-filtered query (only `metadata` was selected).
tasks

Unnamed: 0_level_0,metadata
_id,Unnamed: 1_level_1
65e225bdd4cc5f67dd8fb954,"{'nuclei': [531788, 525961, 528862, 528334, 52..."
65e225bdd4cc5f67dd8fb957,"{'nuclei': [198054, 198124, 198037, 198128, 19..."
65e225bdd4cc5f67dd8fb95a,"{'nuclei': [208610, 276973, 208491, 176123, 37..."
65e225bdd4cc5f67dd8fb95d,"{'nuclei': [608855, 608854, 611313, 642226], '..."
65e225bdd4cc5f67dd8fb960,"{'nuclei': [592840, 495316, 594720, 497173], '..."
65e225bed4cc5f67dd8fb963,"{'nuclei': [208627, 208571, 176130, 208524], '..."
65e225bed4cc5f67dd8fb966,"{'nuclei': [557416, 523216, 523790, 557479], '..."
65e225bed4cc5f67dd8fb969,"{'nuclei': [236847, 133203, 305448, 136670], '..."
65e225bed4cc5f67dd8fb96c,"{'nuclei': [136327, 177213, 140114], 'selected..."
65e225bfd4cc5f67dd8fb96f,"{'nuclei': [439291, 439345, 377674], 'selected..."
