In [1]:
%reload_kedro

In [2]:
import pandas as pd
import numpy as np
import nltk
import random

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.express as px
import plotly.graph_objects as go

import plotly.io as pio
pio.renderers.default = 'notebook'

from collections import Counter

from work.pipelines.word_helpers.nodes import parse_text, make_size_rank_dist, generic_sizerank_df

In [3]:
text = catalog.load("combined_text")
text = text.drop_duplicates('study_id')
text.head()

Unnamed: 0,id,study_id,contrast_id,abstract,authors,journal,year,title,source,citation
0,10022492-1,10022492,1,A fundamental characteristic of working memory...,"Callicott JH, Mattay VS, Bertolino A, Finn K, ...","Cerebral cortex (New York, N.Y. : 1991)",1999.0,Physiological characteristics of capacity cons...,neurosynth,
1,10022494-1,10022494,1,Electrophysiological studies on monkeys have b...,"Toni I, Schluter ND, Josephs O, Friston K, Pas...","Cerebral cortex (New York, N.Y. : 1991)",1999.0,"Signal-, set- and movement-related activity in...",neurosynth,
2,10022496-1,10022496,1,Most functional imaging studies of the auditor...,"Lockwood AH, Salvi RJ, Coad ML, Arnold SA, Wac...","Cerebral cortex (New York, N.Y. : 1991)",1999.0,The functional anatomy of the normal human aud...,neurosynth,
3,10051677-1,10051677,1,Positron emission tomography studies were cond...,"Denton D, Shade R, Zamarippa F, Egan G, Blair-...",Proceedings of the National Academy of Science...,1999.0,Correlation of regional cerebral blood flow an...,neurosynth,
4,10191322-1,10191322,1,The cortical organization of language in bilin...,"Chee MW, Tan EW, Thiel T",The Journal of neuroscience : the official jou...,1999.0,Mandarin and English single word processing st...,neurosynth,


In [4]:
# turn abstracts into single string, parse
abs = text['abstract'].str.cat()
parsed = parse_text(abs)

In [5]:
rankcount = generic_sizerank_df(parsed)
rankcount

Unnamed: 0,rank,ngram,count
0,0,the,199679
1,1,",",180871
2,2,.,158241
3,3,and,139884
4,4,of,139548
...,...,...,...
72818,72818,iS1,1
72819,72819,iPMv,1
72820,72820,Jebsen,1
72821,72821,RAVR-specific,1


In [6]:
def add_nltk_pos(df: pd.DataFrame) -> pd.DataFrame:

    tags = nltk.pos_tag(df['ngram'], tagset='universal')
    pos = []
    for t in tags:
        pos.append(t[1])

    df['pos'] = pos

    return df

rankcount = add_nltk_pos(rankcount)
rankcount

Unnamed: 0,rank,ngram,count,pos
0,0,the,199679,DET
1,1,",",180871,.
2,2,.,158241,.
3,3,and,139884,CONJ
4,4,of,139548,ADP
...,...,...,...,...
72818,72818,iS1,1,NOUN
72819,72819,iPMv,1,ADJ
72820,72820,Jebsen,1,NOUN
72821,72821,RAVR-specific,1,NOUN


In [7]:
def filter_pos(df):
    pos_to_keep = ['NOUN', 'ADJ', 'VERB']
    simple_verbs = [
        'is', 'are', 'am',
        'were', 'was'
    ]
    df = df.drop(columns='rank')
    df = df[df['pos'].isin(pos_to_keep)]
    df = df[~df['ngram'].isin(simple_verbs)]

    df = (df
     .reset_index(drop=True)
     .reset_index(names='rank')
    )

    # reindex rank
    df['rank'] = df['rank'] + 1

    return df

rankcount_sub = filter_pos(rankcount)
rankcount_sub
    

Unnamed: 0,rank,ngram,count,pos
0,1,brain,22737,NOUN
1,2,cortex,21133,ADJ
2,3,functional,19921,ADJ
3,4,activation,19223,NOUN
4,5,regions,18798,NOUN
...,...,...,...,...
68973,68974,iS1,1,NOUN
68974,68975,iPMv,1,ADJ
68975,68976,Jebsen,1,NOUN
68976,68977,RAVR-specific,1,NOUN


In [79]:
def _decrease_plot_space(n_points):
    
    space_array = np.linspace(10, 100, n_points)
    posneg_array = np.resize([-1, 1], n_points)

    return space_array * posneg_array

def make_annotation_dict(df, first_n, n_words_sampled, drop_n_prop=0):
    df = df.drop(columns='pos')
    df.columns = ['x', 'text', 'y']

    n_last = int(len(df) - len(df) * drop_n_prop)

    df_first_n = df.iloc[0:first_n]
    df_select = df.iloc[first_n+1:n_last]
    df_select= df_select.sample(n_words_sampled, weights=df['y'])
    df_last = df.iloc[[-1]]

    df = pd.concat([df_first_n, df_select, df_last])

    df['x'] = np.log10(df['x'])
    df['y'] = np.log10(df['y'])

    df['ay'] = _decrease_plot_space(len(df))
    print(df)

    return df.to_dict('records')


def make_rankcount_plot(df, first_n=10, n_words_sampled=10):


    annotations = make_annotation_dict(df, first_n, n_words_sampled)

    # fig1 = go.Figure(
    #     data=[go.Scatter(x=df['rank'], y=df['count'], mode='markers')],
    #     layout=go.Layout(
    #         yaxis=dict(
    #             title = dict(text='Number of Appearances'), 
    #             type='log'
    #             ),
    #         xaxis=dict(
    #             title=dict(text='Term Rank'), 
    #             type='log'
    #             )
    #     )
    # )

    fig1 = go.Scatter(
        x=df['rank'], 
        y=df['count'], 
        mode='markers'
        )


    fig2 = go.Histogram(
        x=df['pos'],
        xaxis='x2',
        yaxis='y2')

    data = [fig1, fig2]
        # setting layout
    layout = go.Layout(
        yaxis=dict(
            title = dict(text='Number of Appearances'), 
            type='log',
            showline=True,
            linecolor='grey',
            linewidth=1
            ),
        xaxis=dict(
            title=dict(text='Term Rank'), 
            type='log',
            showline=True,
            linecolor='grey',
            linewidth=1
            ),
        # setting y-axis position for chart 2
        xaxis2=dict(
            domain=[0.1, 0.3],
            anchor='y2'
            # position=0.2
        ),
    
        # setting y-axis position for chart 2
        yaxis2=dict(
            domain=[0.1, 0.4],
            anchor='x2'
            # position=0.2
        ),
        annotations=annotations,
        font_family="Computer Modern",
        showlegend=False,
        plot_bgcolor= "rgba(0, 0, 0, 0)",
        paper_bgcolor= "rgba(0, 0, 0, 0)",
        title=dict(text='Rank-count distribution for content words')
    )

    fig = go.Figure(data=data, layout=layout)
    fig.write_image('../data/08_reporting/project/rankcount.pdf')

make_rankcount_plot(rankcount_sub)

# fig3 = go.Figure(data=fig.data +fig2.data)
# fig.show()


              x                 text         y     ay
0      0.000000                brain  4.356733  -10.0
1      0.301030               cortex  4.324961   14.5
2      0.477121           functional  4.299311  -19.0
3      0.602060           activation  4.283821   23.5
4      0.698970              regions  4.274112  -28.0
5      0.778151             activity  4.203577   32.5
6      0.845098                 left  4.158393  -37.0
7      0.903090                 fMRI  4.157033   41.5
8      0.954243               neural  4.148387  -46.0
9      1.000000             patients  4.143920   50.5
1852   3.267875        complementary  2.309630  -55.0
96     1.986772               region  3.582404   59.5
61     1.792392                 data  3.769525  -64.0
162    2.212188          stimulation  3.394277   68.5
778    2.891537                small  2.772322  -73.0
2664   3.425697             familial  2.089905   77.5
93     1.973128                human  3.586362  -82.0
1171   3.068928          unc