In [98]:
# Kedro IPython line magic. `catalog` is used by later cells but is never
# defined in this notebook, so it presumably comes from this reload -- TODO confirm.
%reload_kedro

In [99]:
import re
from datetime import datetime
from xml.parsers.expat import ExpatError

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import xmltodict

In [106]:
# Load both abstract corpora from the data catalog, Neurosynth first.
ns, nq = (
    catalog.load(dataset_name)
    for dataset_name in ('neurosynth_text', 'neuroquery_text')
)

def drop_duplicated_neuroquery(ns, nq):
    '''
    Remove from the neuroquery frame every article that neurosynth also has.

    Articles are matched on the `study_id` column. The neurosynth copy is
    the one kept because the neuroquery metadata is less complete.

    Returns the tuple `(ns, nq_without_overlap)`; `ns` is passed through
    untouched.
    '''

    also_in_neurosynth = nq['study_id'].isin(ns['study_id'])
    nq_only = nq[~also_in_neurosynth]

    return ns, nq_only

In [107]:
# get metadata for unique neuroquery studies

def get_pubmed_citations(nq, timeout=30):
    '''
    Look up a citation string for every study in `nq` from the NIH PMC
    Open Access web service.

    Query NIH API. Likely request-limited. Use caution: one HTTP request is
    issued per row of `nq`.

    NOTE(review): every `study_id` is sent as `PMC<study_id>`. In the cached
    results, IDs numerically close to 1999 Neurosynth IDs came back with 2023
    citations, so the IDs may be PubMed IDs rather than PMC IDs -- confirm
    before trusting the returned citations.

    Parameters
    ----------
    nq : pd.DataFrame
        Frame with a `study_id` column (integers or strings).
    timeout : float, default 30
        Seconds to wait for each HTTP response before giving up on that study.

    Returns
    -------
    pd.DataFrame
        A copy of `nq` with a `citation` column. The value is NaN when the
        request failed, the response was not valid XML, or the response
        carried no citation record.
    '''

    service_root = 'https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi'

    citations = []

    for study_id in nq['study_id']:

        try:
            # f-string formatting works for integer IDs as well as strings.
            resp = requests.get(
                service_root,
                params={'id': f'PMC{study_id}'},
                timeout=timeout)
            resp.raise_for_status()

            # parse and haul out citation
            pub_meta = xmltodict.parse(resp.content)
            citation = pub_meta['OA']['records']['record']['@citation']
        except (requests.RequestException, ExpatError, KeyError, TypeError):
            # Best effort: one failed lookup must not abort the whole run.
            # TypeError covers a 'record' entry that is not a single mapping.
            citation = np.nan

        citations.append(citation)

    # Assigning a plain list is positional, so citations stay attached to the
    # right rows even when `nq` has a filtered / non-default index.
    return nq.assign(citation=citations)

def extract_dates_from_citations(df, date_formats=('%Y %b %d',)):
    '''
    Derive a `year` column from the free-text `citation` column.

    Each citation is split on `.`, `,` and `;`. The first piece that, once
    stripped of surrounding whitespace, matches one of `date_formats`
    (e.g. `2023 May 1`) supplies the year. Rows with a missing citation, or
    with no piece that parses, get NaN.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a `citation` column. It is not modified.
    date_formats : sequence of str, default ('%Y %b %d',)
        `strptime` formats tried in order on every piece. Extra formats such
        as '%Y %b' or '%Y' can be supplied for citations that omit the day
        or the month.

    Returns
    -------
    pd.DataFrame
        A copy of `df` with `year` added (or replaced).
    '''

    # Plain character class: the previous '[.|,|;]' also split on a literal '|'.
    separators = re.compile(r'[.,;]')

    def _year_of(citation):
        # Missing citations (NaN / None) carry no date.
        if not isinstance(citation, str):
            return np.nan

        for piece in separators.split(citation):
            candidate = piece.strip()
            for fmt in date_formats:
                try:
                    return datetime.strptime(candidate, fmt).year
                except ValueError:
                    continue

        return np.nan

    # A plain list is assigned positionally, so a non-unique index is harmless.
    years = [_year_of(citation) for citation in df['citation']]

    return df.assign(year=years)



# The two calls below hit the NIH API once per study, so they stay commented
# out; the CSV read next is presumably their cached result -- uncomment to rebuild.
# nq_citations = get_pubmed_citations(nq)
# nq_cite = extract_dates_from_citations(nq_citations)
nq_cite = pd.read_csv('../data/02_intermediate/nq_citations.csv')

# drop_duplicated_neuroquery returns an (ns, nq) tuple. Unpack it so that
# nq_cite stays a DataFrame; ns comes back unchanged.
ns, nq_cite = drop_duplicated_neuroquery(ns, nq_cite)

In [108]:
def combine_texts(ns: pd.DataFrame, nq: pd.DataFrame) -> pd.DataFrame:
    '''
    Stack the neurosynth and neuroquery frames, tagging each row's origin.

    A `source` column is set to 'neurosynth' for rows from `ns` and to
    'neuroquery' for rows from `nq`. The inputs are not modified: the tag is
    added to copies, which avoids SettingWithCopyWarning when `nq` is a
    filtered slice and keeps the calling cell safe to re-run.

    The original row labels are kept, so the combined index may contain
    duplicate values.
    '''

    return pd.concat([
        ns.assign(source='neurosynth'),
        nq.assign(source='neuroquery'),
    ])

# Stack the neurosynth frame and the de-duplicated neuroquery frame into one
# table; each row is tagged with its origin in the `source` column.
text = combine_texts(ns, nq_cite)
text

Unnamed: 0,id,study_id,contrast_id,abstract,authors,journal,year,title,source,citation
0,10022492-1,10022492,1,A fundamental characteristic of working memory...,"Callicott JH, Mattay VS, Bertolino A, Finn K, ...","Cerebral cortex (New York, N.Y. : 1991)",1999.0,Physiological characteristics of capacity cons...,neurosynth,
1,10022494-1,10022494,1,Electrophysiological studies on monkeys have b...,"Toni I, Schluter ND, Josephs O, Friston K, Pas...","Cerebral cortex (New York, N.Y. : 1991)",1999.0,"Signal-, set- and movement-related activity in...",neurosynth,
2,10022496-1,10022496,1,Most functional imaging studies of the auditor...,"Lockwood AH, Salvi RJ, Coad ML, Arnold SA, Wac...","Cerebral cortex (New York, N.Y. : 1991)",1999.0,The functional anatomy of the normal human aud...,neurosynth,
3,10051677-1,10051677,1,Positron emission tomography studies were cond...,"Denton D, Shade R, Zamarippa F, Egan G, Blair-...",Proceedings of the National Academy of Science...,1999.0,Correlation of regional cerebral blood flow an...,neurosynth,
4,10191322-1,10191322,1,The cortical organization of language in bilin...,"Chee MW, Tan EW, Thiel T",The Journal of neuroscience : the official jou...,1999.0,Mandarin and English single word processing st...,neurosynth,
...,...,...,...,...,...,...,...,...,...,...
13450,28912749-1,28912749,1,The corpus callosum (CC) plays an important ro...,,,,,neuroquery,
13451,28915853-1,28915853,1,"BACKGROUND: Human, hairy skin contains a subgr...",,,,,neuroquery,
13452,28919871-1,28919871,1,The human brain has the capacity to integrate ...,,,,,neuroquery,
13453,28924215-1,28924215,1,The human capacity to master multiple language...,,,,,neuroquery,


In [111]:
# Show only the rows that came from neuroquery.
text.loc[text['source'].eq('neuroquery')]

Unnamed: 0,id,study_id,contrast_id,abstract,authors,journal,year,title,source,citation
0,10202567-1,10202567,1,BACKGROUND: Patients with posttraumatic stress...,,,,,neuroquery,World J Nucl Med. 2023 May 1; 22(2):108-113
1,10439584-1,10439584,1,Impairment in semantic processing occurs early...,,,,,neuroquery,BMC Psychiatry. 2023 Aug 18; 23:604
2,10523407-1,10523407,1,1. Functional magnetic resonance imaging (fMRI...,,,,,neuroquery,Health Equity. 2023 Aug 30; 7(1):453-461
8,11273413-1,11273413,1,We used functional magnetic resonance imaging ...,,,,,neuroquery,
10,11584070-1,11584070,1,Studies of delayed nonmatching-to-sample (DNMS...,,,,,neuroquery,
...,...,...,...,...,...,...,...,...,...,...
13450,28912749-1,28912749,1,The corpus callosum (CC) plays an important ro...,,,,,neuroquery,
13451,28915853-1,28915853,1,"BACKGROUND: Human, hairy skin contains a subgr...",,,,,neuroquery,
13452,28919871-1,28919871,1,The human brain has the capacity to integrate ...,,,,,neuroquery,
13453,28924215-1,28924215,1,The human capacity to master multiple language...,,,,,neuroquery,
