In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to imported modules take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.vectorstores import Qdrant
from qdrant_client import QdrantClient

In [None]:
# Connect to the Qdrant server on localhost and expose the "dandi_collection"
# collection as a LangChain vector store backed by OpenAI embeddings.
qdrant_client = QdrantClient(host="localhost")
embedding_model = OpenAIEmbeddings()

vectordb = Qdrant(
    client=qdrant_client,
    collection_name="dandi_collection",
    embeddings=embedding_model,
    # Payload field that holds the document text in this collection.
    content_payload_key="text_content",
)

In [None]:
# Query the vector store directly (no LLM-generated query variants involved).
vectordb.similarity_search(query="glial cells")

In [None]:
# NOTE(review): num_queries is not passed to the retriever in this cell;
# confirm whether it is still needed elsewhere in the notebook.
num_queries = 3

# Build the multi-query retriever: an LLM (temperature 0) rewrites the user's
# question into several queries that are run against the vector store retriever.
base_retriever = vectordb.as_retriever()
query_llm = ChatOpenAI(temperature=0)
retriever_from_llm = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=query_llm,
)

In [None]:
# Example question; the two literals concatenate into a single string.
user_input = (
    "I am interested in the tuning properties of glial cells. "
    "Are there any good dandisets for studying that?"
)

# Display the query variants the LLM produces for this question.
retriever_from_llm.generate_queries(question=user_input)

In [None]:
# Retrieve the documents for the question through the multi-query retriever.
# The question is passed positionally because the parameter name differs
# between LangChain releases (`question` in the first MultiQueryRetriever,
# `query` on BaseRetriever later); a `question=` keyword raises TypeError on
# the latter.
unique_docs = retriever_from_llm.get_relevant_documents(user_input)
unique_docs

In [7]:
from utils.pipeline import suggest_relevant_dandisets
from utils.openai import get_llm_chat_answer

In [6]:
# Free-text research question handed to the suggestion pipeline.
user_input = "I want to study natural movement in humans"

# NOTE(review): the meaning of method=1 is defined in utils.pipeline, which is
# not visible here — confirm which strategy it selects.
suggestions = suggest_relevant_dandisets(
    user_input=user_input,
    model="gpt-3.5-turbo-16k",
    method=1,
)
print(suggestions)

The most relevant dandisets for your question are:

1. DANDISET:000055/draft - AJILE12: Long-term naturalistic human intracranial neural recordings and pose
   This dandiset contains intracranial neural recordings and upper body pose trajectories of humans performing unstructured, spontaneous movements in completely naturalistic settings. The dataset includes synchronized neural recordings and pose trajectories, along with relevant metadata such as wrist movement events and annotated behavioral states. This dandiset uses an electrophysiological approach and includes multi-electrode extracellular electrophysiology recordings, providing insights into the neural basis of natural human movement.

2. DANDISET:000540/draft - Dataset for: A change in behavioral state switches the pattern of motor output that underlies rhythmic head and orofacial movements
   This dandiset provides recorded multi-modal data from rats performing naturalistic foraging and rearing behaviors. The dataset includes 

In [16]:
# System message passed to get_llm_chat_answer as `system_prompt`.
system_prompt = """You are a helpful neuroscience programmer assistant, 
you help extract neuroscience methods information and write Python code for analysis"""

# User message made of four parts separated by `---`: the task plus a
# streaming-access snippet, the user's analysis request, a printed dump of
# nwbfile.acquisition, and a "Begin:" cue.
# NOTE(review): the prompt text contains typos ("analysis" for "analyzes",
# "occurence"); they are left unchanged because the string is sent verbatim
# to the model and changing it could change the answer.
prompt = """Given the sample file structure, and the user's input, write a Python script that analysis this neurophysiology data. 
You can stream data from dandi archive like this:
```
import fsspec 
import h5py
from pynwb import NWBHDF5IO
fs = fsspec.filesystem("http")
f = fs.open("file_url", "rb")
file = h5py.File(f)
io = NWBHDF5IO(file=file, mode="r", load_namespaces=True)
nwbfile = io.read()
```
---
user input:
I want to know if the occurence of a given event significantly influences the occurence of any next event.
---
file structure:
nwbfile.acquisition =>
{'BehavioralSyllable': BehavioralSyllable abc.LabeledEvents at 0x140407025449184
 Fields:
   data: <HDF5 dataset "data": shape (53771,), type "|u1">
   data__labels: ['Paw lick/scrunch' 'Pause' 'Pause, low rear' 'Pause' 'Scrunch'
  'Paused low rear' 'High sniff' 'Turn left' 'Reared sniff' 'Reared sniff'
  'High run' 'Rear/jump' 'Reared sniff' 'Body lick' 'Short run' 'Run'
  'Short dart' 'Wall rear' 'Forward run' 'Orient left' 'Run' 'High sniff'
  'Rear' 'Turn left' 'Rear up, turn left' 'Walk Forward' 'Pause'
  'Rightward rear down' 'Dive down' 'Pause' 'Leftward rear down' 'Groom'
  'Pause, turn left' 'Groom/paw lick' 'Orient right' 'Scrunch right' 'Rear'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)' 'Uncommon Syllable (frequency < 1%)'
  'Uncommon Syllable (frequency < 1%)']
   description: Behavioral Syllable identified by Motion Sequencing (MoSeq).
   timestamps: <HDF5 dataset "timestamps": shape (53771,), type "<f8">
   timestamps__unit: seconds}
---
Begin:"""

# Send both messages to the chat model; `r` holds the returned answer, which
# is printed in the next cell.
r = get_llm_chat_answer(prompt=prompt, system_prompt=system_prompt)

In [17]:
# Print the model's answer so its newlines and fenced code render as plain text.
print(r)

To analyze whether the occurrence of a given event significantly influences the occurrence of any next event, we can perform a contingency table analysis using the data from the `BehavioralSyllable` acquisition.

Here's a Python script that extracts the data and performs the contingency table analysis:

```python
import fsspec
import h5py
import numpy as np
from scipy.stats import chi2_contingency

# Stream data from DANDI archive
file_url = "URL_TO_NWB_FILE"
fs = fsspec.filesystem("http")
f = fs.open(file_url, "rb")
file = h5py.File(f)

# Get the data
data = file["nwbfile"]["acquisition"]["BehavioralSyllable"]["data"][:]
labels = file["nwbfile"]["acquisition"]["BehavioralSyllable"]["data__labels"][:]

# Count occurrences of each event
events, counts = np.unique(labels, return_counts=True)

# Create contingency table
contingency_table = np.zeros((len(events), len(events)))
event_indices = {event: i for i, event in enumerate(events)}

for i in range(len(labels) - 1):
    current_event =