In [None]:
# Import packages and load the article spreadsheets (CSV files under model_data/)

import pathlib
import os
from scipy import sparse
import pandas as pd
from joblib import Memory
from nilearn import plotting
from datetime import date
from neuroquery import datasets
from neuroquery.img_utils import coordinates_to_maps
from neuroquery.smoothed_regression import SmoothedRegression
from neuroquery.tokenization import TextVectorizer
from neuroquery.encoding import NeuroQueryModel

# Read the three article spreadsheets stored under model_data/.
# The paths are listed in the same order as the names they are unpacked into:
# consciousness, functional connectivity, and the "maybe" list.
# NOTE: `conscioussness_data` keeps its existing spelling so that any code
# referring to it continues to work.
conscioussness_data, connectivity_data, maybe_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "model_data/Sheet 2-Articles for Consciousness (Wdoi).csv",
        "model_data/Sheet 3-Articles for Functional Connectivity (Wdoi).csv",
        "model_data/Sheet 4-The Maybe List (Wdoi).csv",
    )
)

In [3]:
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# CACHE_DIRECTORY is handed to joblib.Memory for caching the brain maps;
# OUTPUT_DIRECTORY is where the trained model is written.
CACHE_DIRECTORY = "maybe_data_cache"
OUTPUT_DIRECTORY = "maybe_data_model"

# CORPUS_FILE would be the path of a CSV containing the studies you want in the
# neuroquery model (in the example below the .csv sat next to this script).
# It is disabled here: the studies to train on come from `maybe_data` instead.
#CORPUS_FILE = "autism_data.csv"

# CORPUS_FILE_MASTER is the path of the CSV containing all 13459 studies used
# to create the original neuroquery model.
CORPUS_FILE_MASTER = "corpus_metadata.csv"

# ---------------------------------------------------------------------------
# Data loading
# ---------------------------------------------------------------------------
# Directory returned by neuroquery for the published model's data files.
data_dir = pathlib.Path(datasets.fetch_neuroquery_model())

# corpus_masterdata: every study of the original corpus.
# corpus_metadata: the studies to train on (must provide a "pmid" column).
# NOTE(review): rows of `tfidf` are later selected with a mask derived from
# corpus_masterdata, which assumes this local corpus_metadata.csv lists the
# same studies, in the same order, as the copy shipped in `data_dir` - confirm.
corpus_masterdata = pd.read_csv(CORPUS_FILE_MASTER)
corpus_metadata = maybe_data

# Alternative kept for reference: build the vectorizer from a custom
# vocabulary file.
# vectorizer = TextVectorizer.from_vocabulary_file(
#     str(VOCAB_FILE)
# )
vectorizer = TextVectorizer.from_vocabulary_file(
    str(data_dir.joinpath("vocabulary.csv"))
)

# The TFIDF features stored with NeuroQuery data correspond to the terms in
# `vocabulary.csv` and the studies in `corpus_metadata.csv`;
# see `README.md` in the data directory for details.
tfidf = sparse.load_npz(str(data_dir.joinpath("corpus_tfidf.npz")))

# Peak activation coordinates for the corpus studies.
coordinates = pd.read_csv(datasets.fetch_peak_coordinates())

In [None]:
# `coordinates_to_maps` is wrapped in a joblib disk cache (rooted at
# CACHE_DIRECTORY) so the maps are not recomputed when a new model is trained.
memory = Memory(CACHE_DIRECTORY)
coord_to_maps = memory.cache(coordinates_to_maps)

# `target_affine` controls the image resolution: a smaller value gives finer
# images, a larger one reduces computation time. The model on neuroquery.org
# uses 4 mm resolution, i.e. target_affine=(4, 4, 4); a coarser 6 mm grid is
# used here. `fwhm` (Full Width at Half Maximum) sets the amount of smoothing.
brain_maps, masker = coord_to_maps(
    coordinates,
    fwhm=9.0,
    target_affine=(6, 6, 6),
)

# Keep only the rows that contain at least one non-zero value.
has_signal = (brain_maps.values != 0).any(axis=1)
brain_maps = brain_maps.loc[has_signal]

In [74]:
# Select the training studies and align the TFIDF rows with the brain maps.
#
# A study of the master corpus is kept only when its pmid
#   * appears in the target list (corpus_metadata["pmid"]), and
#   * still has a brain map (all-zero maps were dropped from brain_maps).
# Requiring both conditions for *both* tfidf and brain_maps, and ordering both
# like corpus_masterdata, guarantees that row i of tfidf and row i of
# brain_maps describe the same study. Filtering tfidf on the target list alone
# leaves extra rows whenever a target study has no usable brain map.
#
# NOTE: this cell overwrites `tfidf` and `brain_maps`; re-run the cells that
# load them before running this cell a second time.

master_pmids = corpus_masterdata["pmid"]

# tfidf is indexed positionally with a mask built from corpus_masterdata, so
# it must have exactly one row per master-corpus study.
if tfidf.shape[0] != len(master_pmids):
    raise ValueError(
        "tfidf has {} rows but the master corpus lists {} studies; reload "
        "both before selecting studies".format(tfidf.shape[0], len(master_pmids))
    )

# kept_idx is a boolean Series: kept_idx[i] == False means the study at
# corpus_masterdata row i is not used to train the neuroquery model.
# `isin` replaces the per-study list lookup (hash-based instead of a scan).
kept_idx = master_pmids.isin(corpus_metadata["pmid"]) & master_pmids.isin(
    brain_maps.index
)
if not kept_idx.any():
    raise ValueError(
        "none of the target pmids is present in both the master corpus and "
        "the brain maps; there is nothing to train on"
    )

# pmids of the kept studies, in master-corpus order.
pmids = pd.Index(master_pmids[kept_idx].to_numpy(), name="pmid")

# Slice the sparse matrix first and densify only the kept rows;
# `toarray()` is the documented equivalent of the `.A` shorthand.
kept_rows = kept_idx.to_numpy().nonzero()[0]
tfidf = tfidf.tocsr()[kept_rows, :].toarray()
brain_maps = brain_maps.loc[pmids, :]

assert tfidf.shape[0] == brain_maps.shape[0], (
    "tfidf and brain_maps must contain the same studies "
    "(duplicate pmids in the corpus or in brain_maps?)"
)

In [None]:
# Fit the smoothed regression from TFIDF features to brain maps; three
# candidate values are supplied for the `alphas` argument.
regressor = SmoothedRegression(alphas=[1.0, 10.0, 100.0])

n_samples = tfidf.shape[0]
print("Fitting smoothed regression model on {} samples...".format(n_samples))
regressor.fit(tfidf, brain_maps.values)

# Metadata of the training studies, looked up in the master corpus and
# arranged in the same order as `pmids`.
master_by_pmid = corpus_masterdata.set_index("pmid")
corpus_metadata = master_by_pmid.loc[pmids, :].reset_index()

# Bundle vectorizer, regressor, brain mask and corpus information into a
# model object, then write it to OUTPUT_DIRECTORY.
corpus_info = {
    "tfidf": sparse.csr_matrix(tfidf),
    "metadata": corpus_metadata,
}
encoder = NeuroQueryModel(
    vectorizer, regressor, masker.mask_img_, corpus_info=corpus_info
)
encoder.to_data_dir(OUTPUT_DIRECTORY)

In [None]:
# Run a free-text query through the trained model and inspect the result.
query = "autism spectrum disorder"
print('Encoding "{}"'.format(query))

result = encoder(query)
brain_map_img = result["brain_map"]

# Open the interactive brain-map viewer in the web browser.
plotting.view_img(brain_map_img, threshold=3.0).open_in_browser()

# Show the first rows of the related terms and documents.
print("Similar words:", result["similar_words"].head(), sep="\n")
print("\nSimilar documents:", result["similar_documents"].head(), sep="\n")

print("\nmodel saved in {}".format(OUTPUT_DIRECTORY))

# Display in notebook (the cell's last expression is rendered inline).
plotting.view_img(brain_map_img, threshold=3.0)