In [6]:
%reload_kedro

In [10]:
import pandas as pd
import numpy as np
import nltk
import random

import matplotlib.pyplot as plt
import seaborn as sns
import scienceplots

import plotly.express as px
import plotly.graph_objects as go

import plotly.io as pio
pio.renderers.default = 'notebook'

from collections import Counter

from work.pipelines.word_helpers.nodes import parse_text, make_size_rank_dist, generic_sizerank_df

plt.style.use('science')

plt.rcParams["xtick.minor.visible"] =  False
plt.rcParams["ytick.minor.visible"] =  False
plt.rcParams['legend.frameon'] = True

In [3]:
text = catalog.load("detected_text")
text.head()
# text = text.drop_duplicates('study_id')
# text.head()

Unnamed: 0,study_id,id,contrast_id,abstract,authors,journal,year,title,keywords,det_sentences
0,10022492,10022492-1,1,A fundamental characteristic of working memory...,"Callicott JH, Mattay VS, Bertolino A, Finn K, ...","Cerebral cortex (New York, N.Y. : 1991)",1999,Physiological characteristics of capacity cons...,"response, contrast",Loci within dorsolateral prefrontal cortex (DL...
1,10022494,10022494-1,1,Electrophysiological studies on monkeys have b...,"Toni I, Schluter ND, Josephs O, Friston K, Pas...","Cerebral cortex (New York, N.Y. : 1991)",1999,"Signal-, set- and movement-related activity in...","event, response",By systematically varying the interval between...
2,10022496,10022496-1,1,Most functional imaging studies of the auditor...,"Lockwood AH, Salvi RJ, Coad ML, Arnold SA, Wac...","Cerebral cortex (New York, N.Y. : 1991)",1999,The functional anatomy of the normal human aud...,"response, network",We used positron emission tomography to map ne...
3,10051677,10051677-1,1,Positron emission tomography studies were cond...,"Denton D, Shade R, Zamarippa F, Egan G, Blair-...",Proceedings of the National Academy of Science...,1999,Correlation of regional cerebral blood flow an...,correlation,The correlation of regional cerebral blood flo...
4,10191322,10191322-1,1,The cortical organization of language in bilin...,"Chee MW, Tan EW, Thiel T",The Journal of neuroscience : the official jou...,1999,Mandarin and English single word processing st...,contrast,Blood oxygen level-dependent contrast function...


In [5]:
def rankcount_from_abstracts(df: pd.DataFrame):

    # turn abstracts into single string, parse
    # abs = df['abstract'].str.cat()
    abs = df['det_sentences'].str.cat()
    parsed = parse_text(abs)
    rankcount = generic_sizerank_df(parsed)

    return parsed, rankcount

parsed, rankcount = rankcount_from_abstracts(text)

In [6]:
rankcount.head()

Unnamed: 0,rank,ngram,count
0,0,the,48308
1,1,",",46418
2,2,and,35387
3,3,of,32190
4,4,in,28107


In [7]:
def add_nltk_pos(df: pd.DataFrame) -> pd.DataFrame:

    tags = nltk.pos_tag(df['ngram'], tagset='universal')
    pos = []
    for t in tags:
        pos.append(t[1])

    df['pos'] = pos

    return df

rankcount = add_nltk_pos(rankcount)
rankcount

Unnamed: 0,rank,ngram,count,pos
0,0,the,48308,DET
1,1,",",46418,.
2,2,and,35387,CONJ
3,3,of,32190,ADP
4,4,in,28107,ADP
...,...,...,...,...
34169,34169,re-sequencing,1,VERB
34170,34170,dilute,1,ADJ
34171,34171,0-100,1,ADJ
34172,34172,aspecific,1,NOUN


In [8]:
def filter_pos(df):
    pos_to_keep = ['NOUN', 'ADJ', 'VERB']
    simple_verbs = [
        'is', 'are', 'am',
        'were', 'was'
    ]
    df = df.drop(columns='rank')
    df = df[df['pos'].isin(pos_to_keep)]
    df = df[~df['ngram'].isin(simple_verbs)]

    df = (df
     .reset_index(drop=True)
     .reset_index(names='rank')
    )
    # reindex rank
    df['rank'] = df['rank'] + 1

    return df

rankcount_sub = filter_pos(rankcount)
rankcount_sub
    

Unnamed: 0,rank,ngram,count,pos
0,1,functional,7163,ADJ
1,2,brain,5400,NOUN
2,3,cortex,5100,ADJ
3,4,connectivity,4902,NOUN
4,5,regions,4872,NOUN
...,...,...,...,...
32027,32028,re-sequencing,1,VERB
32028,32029,dilute,1,ADJ
32029,32030,0-100,1,ADJ
32030,32031,aspecific,1,NOUN


In [11]:
def _decrease_plot_space(n_points):
    
    space_array = np.linspace(10, 100, n_points)
    posneg_array = np.resize([-1, 1], n_points)

    return space_array * posneg_array

def _make_annotation_dict(df, first_n, n_words_sampled, drop_n_prop=0):
    df = df.drop(columns='pos')
    df.columns = ['x', 'text', 'y']

    n_last = int(len(df) - len(df) * drop_n_prop)

    df_first_n = df.iloc[0:first_n]
    df_select = df.iloc[first_n+1:n_last]
    df_select= df_select.sample(n_words_sampled, weights=df['y'])
    df_last = df.iloc[[-1]]

    df = pd.concat([df_first_n, df_select, df_last])

    df['x'] = np.log10(df['x'])
    df['y'] = np.log10(df['y'])

    df['ay'] = _decrease_plot_space(len(df))
    print(df)

    return df.to_dict('records')


def make_rankcount_plot(df, first_n=10, n_words_sampled=10,
                        fpath='../data/08_reporting/project/rankcount.pdf'):


    annotations = _make_annotation_dict(df, first_n, n_words_sampled)

    fig1 = go.Scatter(
        x=df['rank'], 
        y=df['count'], 
        mode='markers'
        )

    # embedded plot
    fig2 = go.Histogram(
        x=df['pos'],
        xaxis='x2',
        yaxis='y2')

    data = [fig1, fig2]
    layout = go.Layout(
        yaxis=dict(
            title = dict(text='Number of Appearances'), 
            type='log',
            showline=True,
            linecolor='grey',
            linewidth=1
            ),
        xaxis=dict(
            title=dict(text='Term Rank'), 
            type='log',
            showline=True,
            linecolor='grey',
            linewidth=1
            ),

        # embedded plot position
        xaxis2=dict(
            domain=[0.1, 0.3],
            anchor='y2'
        ),

        yaxis2=dict(
            domain=[0.1, 0.4],
            anchor='x2'
        ),
        annotations=annotations,
        font_family="Computer Modern",
        showlegend=False,
        plot_bgcolor= "rgba(0, 0, 0, 0)",
        paper_bgcolor= "rgba(0, 0, 0, 0)",
        # title=dict(text='Rank-count distribution for content words')
    )

    fig = go.Figure(data=data, layout=layout)
    fig.write_image('../data/08_reporting/project/rankcount.pdf')

make_rankcount_plot(rankcount_sub)



              x           text         y     ay
0      0.000000     functional  3.855095  -10.0
1      0.301030          brain  3.732394   14.5
2      0.477121         cortex  3.707570  -19.0
3      0.602060   connectivity  3.690373   23.5
4      0.698970        regions  3.687707  -28.0
5      0.778151           fMRI  3.678791   32.5
6      0.845098        network  3.653598  -37.0
7      0.903090       response  3.616581   41.5
8      0.954243       activity  3.609488  -46.0
9      1.000000     activation  3.599774   50.5
324    2.511883      exhibited  2.523746  -55.0
75     1.880814         signal  3.068928   59.5
135    2.133539          level  2.871573  -64.0
102    2.012837           time  2.978181   68.5
234    2.371068         action  2.680336  -73.0
415    2.619093         humans  2.426511   77.5
95     1.982271            has  3.002598  -82.0
2013   3.304059     exhibiting  1.623249   86.5
1299   3.113943      unrelated  1.875061  -91.0
306    2.487138    connections  2.552668

In [7]:
catalog.load('rankcount_pos')

Unnamed: 0,rank,ngram,count,pos
0,1,brain,16318,NOUN
1,2,cortex,16284,ADJ
2,3,functional,15007,ADJ
3,4,activation,14733,NOUN
4,5,regions,14261,NOUN
...,...,...,...,...
57113,57114,GVS-induced,1,ADJ
57114,57115,re-sequencing,1,ADJ
57115,57116,:3312-3320,1,ADJ
57116,57117,dilute,1,NOUN
