In [22]:
%reload_kedro

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import requests
import xmltodict

In [3]:
ns = catalog.load('neurosynth_text')
nq = catalog.load('neuroquery_text')

def drop_duplicated_neuroquery(ns, nq):
    '''
    Find any articles that are in both databases.
    Drop them from neuroquery (less complete metadata)
    '''
    
    nq = nq[~nq['study_id'].isin(ns['study_id'])]
    return ns, nq

In [4]:
# get metadata for unique neuroquery studies

def get_pubmed_citations(nq):
    '''Query NIH API. Likely request-limited. Use caution.'''
    
    service_root= 'https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi'
    texts = nq['study_id']

    citations = []

    for text in texts:

        # compose query
        text = 'PMC' + text
        req = f'{service_root}?id={text}'

        # parse and hault out citation
        resp = requests.get(req)
        pub_meta = xmltodict.parse(resp.content)

        try:
            citation = pub_meta['OA']['records']['record']['@citation']
        except KeyError:
            citation = np.NaN

        citations.append(citation)

    nq_with_citations = pd.concat([
        nq, 
        pd.Series(citations, name='citation')], 
        axis=1)

    return nq_with_citations

def extract_dates_from_citations(df):

    pattern='[.|,|;]'

    expanded = df['citation'].str.split(pattern, regex=True, expand=True)
    expanded = expanded.astype('str')

    for c in expanded.columns:
            
        expanded[c] = pd.to_datetime(
            expanded[c].str.strip(), 
            format="%Y %b %d",
            errors='coerce')
        
    contracted = expanded.astype('str').sum(axis=1)
    contracted = pd.to_datetime(contracted.str.replace('NaT', ''))

    df['year'] = pd.DatetimeIndex(contracted).year

    return df



# nq_citations = get_pubmed_citations(nq)
# nq_cite = extract_dates_from_citations(nq)
nq_cite = pd.read_csv('../data/02_intermediate/nq_citations.csv')

nq_cite = drop_duplicated_neuroquery(ns, nq_cite)

In [5]:
def combine_texts(ns: pd.DataFrame, nq: pd.DataFrame):
    
    ns['source'] = 'neurosynth'
    nq['source'] = 'neuroquery'

    return pd.concat([ns, nq])

text = combine_texts(ns, nq_cite)
text.head()

In [None]:
ns

Unnamed: 0,id,study_id,contrast_id,abstract,authors,journal,year,title,source
0,10022492-1,10022492,1,A fundamental characteristic of working memory...,"Callicott JH, Mattay VS, Bertolino A, Finn K, ...","Cerebral cortex (New York, N.Y. : 1991)",1999,Physiological characteristics of capacity cons...,neurosynth
1,10022494-1,10022494,1,Electrophysiological studies on monkeys have b...,"Toni I, Schluter ND, Josephs O, Friston K, Pas...","Cerebral cortex (New York, N.Y. : 1991)",1999,"Signal-, set- and movement-related activity in...",neurosynth
2,10022496-1,10022496,1,Most functional imaging studies of the auditor...,"Lockwood AH, Salvi RJ, Coad ML, Arnold SA, Wac...","Cerebral cortex (New York, N.Y. : 1991)",1999,The functional anatomy of the normal human aud...,neurosynth
3,10051677-1,10051677,1,Positron emission tomography studies were cond...,"Denton D, Shade R, Zamarippa F, Egan G, Blair-...",Proceedings of the National Academy of Science...,1999,Correlation of regional cerebral blood flow an...,neurosynth
4,10191322-1,10191322,1,The cortical organization of language in bilin...,"Chee MW, Tan EW, Thiel T",The Journal of neuroscience : the official jou...,1999,Mandarin and English single word processing st...,neurosynth
...,...,...,...,...,...,...,...,...,...
14366,9819274-1,9819274,1,The cortical processing of vestibular informat...,"Lobel E, Kleine JF, Bihan DL, Leroy-Willig A, ...",Journal of neurophysiology,1998,Functional MRI of galvanic vestibular stimulat...,neurosynth
14367,9838166-1,9838166,1,An impaired ability to recite highly automated...,"Wildgruber D, Kischka U, Ackermann H, Klose U,...",Brain research. Cognitive brain research,1999,Dynamic pattern of brain activation during seq...,neurosynth
14368,9862924-1,9862924,1,Temporal and intensity coding of pain in human...,"Porro CA, Cettolo V, Francescato MP, Baraldi P",Journal of neurophysiology,1998,Temporal and intensity coding of pain in human...,neurosynth
14369,9886448-1,9886448,1,OBJECTIVES: To report on a patient with a lacu...,"Van Der Werf YD, Weerts JG, Jolles J, Witter M...","Journal of neurology, neurosurgery, and psychi...",1999,Neuropsychological correlates of a right unila...,neurosynth


In [31]:
# happiness?

from work.pipelines.word_helpers.nodes import parse_text, generic_sizerank_df

labMT = catalog.load('labMT')
labMT = labMT.rename(columns={'word': 'ngram'})
labMT.head()

Unnamed: 0,rank,ngram,english,happiness,std
0,0,laughter,laughter,8.5,0.93
1,1,happiness,happiness,8.44,0.97
2,2,love,love,8.42,1.11
3,3,happy,happy,8.3,0.99
4,4,laughed,laughed,8.26,1.16


In [50]:
def join_happiness(df, labMT):
    out = (df
     .set_index('ngram')
     .join(labMT.drop(columns='rank').set_index('ngram')))
    
    return out

In [64]:
abstracts_by_year = {}

grouped = ns.groupby('year')
for g in grouped.groups:
    year_abstracts = grouped.get_group(g)
    year_abstracts = year_abstracts['abstract'].str.cat()

    year_sizerank = generic_sizerank_df(year_abstracts)
    year_happiness = join_happiness(year_sizerank, labMT)

    abstracts_by_year[g] = year_happiness

In [65]:
happiness = pd.DataFrame()
tmp = pd.DataFrame()
for k, v in abstracts_by_year.items():

    tmp = v.copy()
    tmp.insert(0, 'year', k)

    happiness = pd.concat([happiness, tmp])
happiness

Unnamed: 0_level_0,year,rank,count,english,happiness,std
ngram,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
the,1997,0,120,the,4.98,0.91
of,1997,1,67,of,4.94,0.77
.,1997,2,67,,,
",",1997,3,60,,,
in,1997,4,48,in,5.50,1.02
...,...,...,...,...,...,...
Dyslexia,2018,8845,1,,,
CONCLUSIONS/SIGNIFICANCE,2018,8846,1,,,
Reading,2018,8847,1,,,
professionals,2018,8848,1,professionals,6.28,1.40


In [72]:
pubcounts = ns.groupby('year').count()['study_id']
pubcounts.reset_index()

Unnamed: 0,year,study_id
0,1997,7
1,1998,16
2,1999,48
3,2000,86
4,2001,110
5,2002,163
6,2003,356
7,2004,372
8,2005,509
9,2006,628


In [81]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import kaleido

def plot_pubcounts(df, happiness, title="Neurosynth Publication Counts and Happiness"):

    pubcounts = df.groupby('year').count()['study_id'].reset_index()

    # title = ''

    # fig = figure(figsize=(10, 2))
    # plt.plot(pubcounts)

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.scatter.Line(x=pubcounts['year'], y=pubcounts['study_id'],  name='publication count'),
        secondary_y=False,
    )

    fig.add_trace(
        go.Violin(x=happiness['year'], y=happiness['happiness'], name="average happiness"),
        secondary_y=True,
    )
    fig.update_layout(
        showlegend=True,
        plot_bgcolor='white',
        title_text=title
    )
    fig.update_yaxes(
        gridcolor='lightgrey',
        title='publication counts',
        secondary_y=False
    )
    fig.update_yaxes(
        title='average happiness',
        secondary_y=True
    )
    fig.show()
    fig.write_image('../data/08_reporting/project/ns_pubcount.pdf', width=1024, height=500)



plot_pubcounts(ns, happiness)

# px.violin(happiness, x='year', y='happiness')