In [26]:
import pathlib

from scipy import sparse
import pandas as pd
from joblib import Memory
from nilearn import plotting

from neuroquery import datasets
from neuroquery.img_utils import coordinates_to_maps
from neuroquery.smoothed_regression import SmoothedRegression
from neuroquery.tokenization import TextVectorizer
from neuroquery.encoding import NeuroQueryModel
# Directory handed to `joblib.Memory` below; the cached result of
# `coordinates_to_maps` is stored there.
cache_directory = "cache"

In [27]:

# Tables read relative to the working directory.
CORPUS_FILE = "autism_data.csv"
CORPUS_FILE_MASTER = "corpus_metadata.csv"

# Location of the NeuroQuery model data returned by the fetcher.
data_dir = pathlib.Path(datasets.fetch_neuroquery_model())

# `corpus_metadata` is read from CORPUS_FILE and `corpus_masterdata` from
# CORPUS_FILE_MASTER; both are later accessed through their "pmid" column.
corpus_metadata = pd.read_csv(CORPUS_FILE)
corpus_masterdata = pd.read_csv(CORPUS_FILE_MASTER)

# The vocabulary shipped with the NeuroQuery data is used. To use a different
# vocabulary, pass the path of that file to `from_vocabulary_file` instead.
vectorizer = TextVectorizer.from_vocabulary_file(
    str(data_dir.joinpath("vocabulary.csv"))
)

# Per the NeuroQuery data `README.md`, the stored TFIDF features correspond to
# the terms in `vocabulary.csv` and the studies in `corpus_metadata.csv`.
tfidf = sparse.load_npz(str(data_dir.joinpath("corpus_tfidf.npz")))

# Peak activation coordinates (one row per coordinate).
coordinates = pd.read_csv(datasets.fetch_peak_coordinates())

In [28]:
# Wrap `coordinates_to_maps` in a joblib cache so that training another model
# later does not recompute the maps.
memory = Memory(cache_directory)
coord_to_maps = memory.cache(coordinates_to_maps)

# `target_affine` trades image resolution against computation time (the model
# on neuroquery.org uses 4 mm resolution, i.e. target_affine=(4, 4, 4)).
# `fwhm` (Full Width at Half Maximum) controls the amount of smoothing.
brain_maps, masker = coord_to_maps(
    coordinates, target_affine=(6, 6, 6), fwhm=9.0
)

# Keep only the rows that contain at least one non-zero value.
has_nonzero_value = (brain_maps.values != 0).any(axis=1)
brain_maps = brain_maps[has_nonzero_value]

________________________________________________________________________________
[Memory] Calling neuroquery.img_utils.coordinates_to_maps...
coordinates_to_maps(            pmid table_id table_name     x     y     z
0       26160289    t0010    Table 2  -3.0  42.0  12.0
1       26160289    t0010    Table 2  -3.0 -57.0  23.0
2       26160289    t0010    Table 2   4.0 -57.0  24.0
3       26160289    t0010    Table 2  -9.0  54.0  27.0
4       27535906       T1   Table 1.  15.0  36.0  22.0
...          ...      ...        ...   ...   ...   ...
469255  18559106       T2    Table 2 -26.0 -32.0  46.0
469256  18559106       T3    Table 3 -22.0  16.0  21.0
469257  18559106       T3    Table 3  24.0   9.0  23.0
469258  18559106       T3    Table 3 -39.0 -48.0  27.0
469259  18559106       T3    Table 3  38.0 -46.0  25.0

[469260 rows x 6 columns], target_affine=(6, 6, 6), fwhm=9.0)
Transforming 469260 coordinates for 13459 articles
100.0% pmid:  28928708           

If this happens often in your code, it can cause performance problems 
(results will be correct in all cases). 
The reason for this is probably some large input arguments for a wrapped
 function (e.g. large strings).
THIS IS A JOBLIB ISSUE. If you can, kindly provide the joblib's team with an
 example so that they can fix the problem.
  brain_maps, masker = coord_to_maps(


__________________________________________coordinates_to_maps - 3601.6s, 60.0min


In [29]:
# Select the TFIDF rows and the brain maps of the autism studies *in the same
# study order*, so that row i of `tfidf` and row i of `brain_maps` describe the
# same article when they are passed to the regression.

# This cell replaces `tfidf` with a dense subset. Running it a second time
# would index the already-reduced matrix, so fail loudly instead.
if not sparse.issparse(tfidf):
    raise RuntimeError(
        "`tfidf` has already been subset; reload it before re-running this cell"
    )

# Row i of the full TFIDF matrix is expected to describe the study in row i of
# `corpus_masterdata`; check what can be checked about that assumption.
all_pmids = pd.Index(corpus_masterdata["pmid"])
if not all_pmids.is_unique:
    raise ValueError("`corpus_masterdata` contains duplicated pmids")
if tfidf.shape[0] != len(all_pmids):
    raise ValueError(
        "`tfidf` has {} rows but `corpus_masterdata` lists {} studies".format(
            tfidf.shape[0], len(all_pmids)
        )
    )

# Studies that are part of the autism corpus and also have a brain map.
pmids = brain_maps.index.intersection(corpus_metadata["pmid"])

# Position of each selected study in the full corpus, listed in `pmids` order
# (-1 marks a study that is absent from `corpus_masterdata`).
row_positions = all_pmids.get_indexer(pmids)
if (row_positions < 0).any():
    raise ValueError(
        "some selected studies are missing from `corpus_masterdata`"
    )

# Boolean mask over `corpus_masterdata` marking the studies that are used.
kept_idx = pd.Series(all_pmids.isin(pmids))

# Subset while still sparse, then densify only the selected rows. Indexing by
# position (rather than by boolean mask) keeps the rows in `pmids` order, which
# is the order `brain_maps` is put into on the next line.
tfidf = tfidf.tocsr()[row_positions, :].toarray()
brain_maps = brain_maps.loc[pmids, :]

In [30]:
# Smoothed regression with three candidate `alphas` values.
regressor = SmoothedRegression(alphas=[1.0, 10.0, 100.0])

# Report how many rows of `tfidf` the model is trained on.
n_samples = tfidf.shape[0]
print("Fitting smoothed regression model on {} samples...".format(n_samples))

# Left as the last expression so the notebook displays the returned value.
regressor.fit(tfidf, brain_maps.values)

Fitting smoothed regression model on 30 samples...
keeping 115 features


SmoothedRegression(alphas=[1.0, 10.0, 100.0])

In [31]:
# Directory the trained model is written to.
output_directory = "autism_model"

# Restrict the metadata to the studies in `pmids`, in that order, and turn
# "pmid" back into a regular column.
corpus_metadata = (
    corpus_metadata
    .set_index("pmid")
    .loc[pmids, :]
    .reset_index()
)

# Corpus information bundled with the model: the TFIDF matrix (converted to
# CSR) and the metadata table.
corpus_info = {
    "tfidf": sparse.csr_matrix(tfidf),
    "metadata": corpus_metadata,
}

encoder = NeuroQueryModel(
    vectorizer, regressor, masker.mask_img_, corpus_info=corpus_info
)
encoder.to_data_dir(output_directory)

In [23]:
# Encode a text query with the trained model and inspect the result.
query = "Autism"
print('Encoding "{}"'.format(query))

result = encoder(query)

# Build the interactive view once and reuse it for both the browser and the
# inline display, instead of rendering the same map twice.
brain_map_view = plotting.view_img(result["brain_map"], threshold=3.0)
brain_map_view.open_in_browser()

print("Similar words:")
print(result["similar_words"].head())
print("\nSimilar documents:")
print(result["similar_documents"].head())

print("\nmodel saved in {}".format(output_directory))

# Display in notebook (must stay the last expression of the cell)
brain_map_view

Encoding "Autism"


ValueError: Size of label 'j' for operand 1 (6289) does not match previous terms (6308).

In [32]:
# Debugging aid: show the dimensions of the TFIDF matrix used for training,
# to compare against the sizes reported in the ValueError above.
tfidf.shape

(30, 6308)