In [3]:
%reload_kedro

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

import requests
import xmltodict

In [5]:
ns = catalog.load('neurosynth_text')
nq = catalog.load('neuroquery_text')

def drop_duplicated_neuroquery(ns, nq):
    '''
    Find any articles that are in both databases.
    Drop them from neuroquery (less complete metadata)
    '''
    
    nq = nq[~nq['study_id'].isin(ns['study_id'])]
    return ns, nq

In [6]:
# get metadata for unique neuroquery studies

def get_pubmed_citations(nq):
    '''Query NIH API. Likely request-limited. Use caution.'''
    
    service_root= 'https://www.ncbi.nlm.nih.gov/pmc/utils/oa/oa.fcgi'
    texts = nq['study_id']

    citations = []

    for text in texts:

        # compose query
        text = 'PMC' + text
        req = f'{service_root}?id={text}'

        # parse and hault out citation
        resp = requests.get(req)
        pub_meta = xmltodict.parse(resp.content)

        try:
            citation = pub_meta['OA']['records']['record']['@citation']
        except KeyError:
            citation = np.NaN

        citations.append(citation)

    nq_with_citations = pd.concat([
        nq, 
        pd.Series(citations, name='citation')], 
        axis=1)

    return nq_with_citations

def extract_dates_from_citations(df):

    pattern='[.|,|;]'

    expanded = df['citation'].str.split(pattern, regex=True, expand=True)
    expanded = expanded.astype('str')

    for c in expanded.columns:
            
        expanded[c] = pd.to_datetime(
            expanded[c].str.strip(), 
            format="%Y %b %d",
            errors='coerce')
        
    contracted = expanded.astype('str').sum(axis=1)
    contracted = pd.to_datetime(contracted.str.replace('NaT', ''))

    df['year'] = pd.DatetimeIndex(contracted).year

    return df



# nq_citations = get_pubmed_citations(nq)
# nq_cite = extract_dates_from_citations(nq)
nq_cite = pd.read_csv('../data/02_intermediate/nq_citations.csv')

nq_cite = drop_duplicated_neuroquery(ns, nq_cite)

In [7]:
# ns

In [8]:
# happiness?

from work.pipelines.word_helpers.nodes import parse_text, generic_sizerank_df

labMT = catalog.load('labMT')
labMT = labMT.rename(columns={'word': 'ngram'})
labMT.head()

Unnamed: 0,rank,ngram,english,happiness,std
0,0,laughter,laughter,8.5,0.93
1,1,happiness,happiness,8.44,0.97
2,2,love,love,8.42,1.11
3,3,happy,happy,8.3,0.99
4,4,laughed,laughed,8.26,1.16


In [9]:
def join_happiness(df, labMT):
    out = (df
     .set_index('ngram')
     .join(labMT.drop(columns='rank').set_index('ngram')))
    
    out = out[(out['happiness'] < 4) | (out['happiness'] > 6)]

    return out

In [10]:
abstracts_by_year = {}

grouped = ns.groupby('year')
for g in grouped.groups:
    year_abstracts = grouped.get_group(g)
    year_abstracts = year_abstracts['abstract'].str.cat()

    year_sizerank = generic_sizerank_df(year_abstracts)
    year_happiness = join_happiness(year_sizerank, labMT)

    abstracts_by_year[g] = year_happiness

In [11]:
happiness = pd.DataFrame()
tmp = pd.DataFrame()
for k, v in abstracts_by_year.items():

    tmp = v.copy()
    tmp.insert(0, 'year', k)

    happiness = pd.concat([happiness, tmp])
happiness

Unnamed: 0_level_0,year,rank,count,english,happiness,std
ngram,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
right,1997,23,10,right,6.54,1.27
performance,1997,26,9,performance,6.74,1.34
we,1997,27,9,we,6.38,1.31
activity,1997,34,8,activity,6.32,1.00
knowledge,1997,41,6,knowledge,7.24,1.46
...,...,...,...,...,...,...
forward,2018,8778,1,forward,6.10,0.84
acquire,2018,8801,1,acquire,6.36,1.32
soundtrack,2018,8808,1,soundtrack,6.34,1.71
composed,2018,8811,1,composed,6.06,1.25


In [23]:
def _count_publications(df: pd.DataFrame):

    df_agg = df.groupby('year').count()['study_id']
    return df_agg.reset_index()

_count_publications(ns)

Unnamed: 0,year,study_id
0,1997,7
1,1998,16
2,1999,48
3,2000,86
4,2001,110
5,2002,163
6,2003,356
7,2004,372
8,2005,509
9,2006,628


In [13]:
def _aggregate_happiness(df):

    hp = (df[['year', 'happiness']]
          .groupby('year')
          .agg(['mean', 'std'])
          .reset_index()
        )
    hp.columns =  hp.columns = ["_".join(a) for a in hp.columns.to_flat_index()]
    hp = hp.rename(columns={'year_': 'year'})

    return hp

In [24]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import kaleido

import plotly.express as px

def plot_pubcounts(df, title="Neurosynth Publication Counts", 
                   fpath='../data/08_reporting/project/ns_pubcount.pdf'):

    pubcounts = _count_publications(df)

    fig = px.line(pubcounts, x='year', y='study_id')
    fig.update_layout(
        title=title,
        xaxis_title='Year',
        yaxis_title='Publications'
    )
    fig.write_image(fpath, width=700, height=500)



plot_pubcounts(ns)

# px.violin(happiness, x='year', y='happiness')