In [8]:
import pandas as pd
import numpy as np
from bertopic import BERTopic

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification

from sklearn.feature_extraction.text import CountVectorizer


from work.pipelines.project.nodes import parse_text

In [9]:
ns = catalog.load("detected_text")
ns.head()

Unnamed: 0,study_id,id,contrast_id,abstract,authors,journal,year,title,keywords,det_sentences
0,10022492,10022492-1,1,A fundamental characteristic of working memory...,"Callicott JH, Mattay VS, Bertolino A, Finn K, ...","Cerebral cortex (New York, N.Y. : 1991)",1999,Physiological characteristics of capacity cons...,"response, contrast",Loci within dorsolateral prefrontal cortex (DL...
1,10022494,10022494-1,1,Electrophysiological studies on monkeys have b...,"Toni I, Schluter ND, Josephs O, Friston K, Pas...","Cerebral cortex (New York, N.Y. : 1991)",1999,"Signal-, set- and movement-related activity in...","event, response",By systematically varying the interval between...
2,10022496,10022496-1,1,Most functional imaging studies of the auditor...,"Lockwood AH, Salvi RJ, Coad ML, Arnold SA, Wac...","Cerebral cortex (New York, N.Y. : 1991)",1999,The functional anatomy of the normal human aud...,"response, network",We used positron emission tomography to map ne...
3,10051677,10051677-1,1,Positron emission tomography studies were cond...,"Denton D, Shade R, Zamarippa F, Egan G, Blair-...",Proceedings of the National Academy of Science...,1999,Correlation of regional cerebral blood flow an...,correlation,The correlation of regional cerebral blood flo...
4,10191322,10191322-1,1,The cortical organization of language in bilin...,"Chee MW, Tan EW, Thiel T",The Journal of neuroscience : the official jou...,1999,Mandarin and English single word processing st...,contrast,Blood oxygen level-dependent contrast function...


In [10]:
year_map = catalog.load('params:year_map')
ns['year_group'] = ns['year']
ns['year_group'] = ns['year_group'].replace(year_map)

In [15]:
non_missing = ns[ns['det_sentences'] != '']
non_missing.groupby('year_group').count()

Unnamed: 0_level_0,study_id,id,contrast_id,abstract,authors,journal,year,title,keywords,det_sentences
year_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1997-2002,347,347,347,346,347,347,347,347,347,347
2003-2008,2818,2818,2818,2818,2818,2818,2818,2818,2818,2818
2009-2013,5288,5288,5288,5287,5288,5288,5288,5288,5288,5288
2014-2018,4328,4328,4328,4328,4328,4328,4328,4328,4328,4328


In [16]:
len(non_missing)

[1;36m12781[0m

In [3]:
#  nltk.download('wordnet')

In [5]:
def _process_text(input_text):

    # 1-grams
    tokens = word_tokenize(input_text)

    # remove stop words
    stop_words = set(stopwords.words('english'))

    # condense words into lemma
    lem = WordNetLemmatizer()

    # execute
    text = [lem.lemmatize(t.lower()) for t in tokens if t not in stop_words]

    # recycle
    out = " ".join(text)

    return out


def add_cleaned_text_column(df: pd.DataFrame, input_col = 'det_sentences'):
    
    df = df[df[input_col] != '']
    df['cleaned'] = df[input_col].apply(_process_text)
    docs = df['cleaned'].to_list()

    return df, docs


df, docs = add_cleaned_text_column(ns)

In [6]:
df[['det_sentences', 'cleaned']].head().to_csv('../data/02_intermediate/text_example.csv')


In [5]:
def fit_BERTopic_model(docs: list):

    topic_model = BERTopic()
    topics, probs = topic_model.fit_transform(docs)

    return topic_model

topic_model = fit_BERTopic_model(docs)

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [10]:
len(docs)

[1;36m12781[0m

In [17]:
topic_model.get_topic_freq()['Count'].iloc[1]

[1;36m372[0m

In [19]:
def extract_BERTopic_topics(mod, timepoint=None):

    topics = mod.get_topics()
    frequencies = mod.get_topic_freq()
    topic_idx = list(topics)[1:]
    first_n_topics = {k:topics[k] for k in topic_idx}

    bert_topics = pd.DataFrame()

    for k, v in first_n_topics.items():
        tmp = pd.DataFrame()
        topic = 'Topic ' + str(k)
        term = [t[0] for t in v]
        weight = [t[1] for t in v]
        frequency = frequencies['Count'].iloc[k]


        tmp['term'] = term
        tmp['weight'] = weight
        tmp['topic'] = topic
        tmp['frequency'] = frequency

        bert_topics = pd.concat([bert_topics, tmp])

    bert_topics['model'] = 'BERTopic'

    if timepoint:
        bert_topics['timepoint'] = timepoint

    return bert_topics

bert_topics = extract_BERTopic_topics(topic_model)
bert_topics
    

Unnamed: 0,term,weight,topic,frequency,model
0,schizophrenia,0.049316,Topic 0,5550,BERTopic
1,patient,0.018675,Topic 0,5550,BERTopic
2,control,0.011633,Topic 0,5550,BERTopic
3,healthy,0.011366,Topic 0,5550,BERTopic
4,psychosis,0.009851,Topic 0,5550,BERTopic
...,...,...,...,...,...
5,str,0.033692,Topic 144,10,BERTopic
6,transcranial,0.032512,Topic 144,10,BERTopic
7,rsn,0.030876,Topic 144,10,BERTopic
8,sst,0.029602,Topic 144,10,BERTopic


In [20]:
def fit_lda_model(docs: list):

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(docs)

    lda = LatentDirichletAllocation(n_components=10)
    lda_fit= lda.fit_transform(X)

    return lda, vectorizer

lda, vectorizer = fit_lda_model(docs)

In [22]:
lda.components_


[1;35marray[0m[1m([0m[1m[[0m[1m[[0m[1;36m0.10000494[0m, [1;36m0.10492611[0m, [1;36m0.1[0m       , [33m...[0m, [1;36m0.1[0m       , [1;36m0.1002993[0m ,
        [1;36m0.1[0m       [1m][0m,
       [1m[[0m[1;36m0.10002096[0m, [1;36m0.10000101[0m, [1;36m0.10000874[0m, [33m...[0m, [1;36m0.1[0m       , [1;36m0.1[0m       ,
        [1;36m0.1[0m       [1m][0m,
       [1m[[0m[1;36m0.10001845[0m, [1;36m0.10000416[0m, [1;36m0.1[0m       , [33m...[0m, [1;36m1.09978763[0m, [1;36m0.1[0m       ,
        [1;36m0.1[0m       [1m][0m,
       [33m...[0m,
       [1m[[0m[1;36m6.7471009[0m , [1;36m0.10000031[0m, [1;36m0.1[0m       , [33m...[0m, [1;36m0.1[0m       , [1;36m0.1[0m       ,
        [1;36m0.1[0m       [1m][0m,
       [1m[[0m[1;36m0.10000209[0m, [1;36m0.10002521[0m, [1;36m0.10000742[0m, [33m...[0m, [1;36m0.1[0m       , [1;36m1.0997007[0m ,
        [1;36m0.1[0m       [1m][0m,
       [1m[[0m[1;36m0.1

In [172]:
def extract_LDA_topics(mod, vectorizer, timepoint=None):

    lda_topics = pd.DataFrame()
    feature_names = vectorizer.get_feature_names_out()
    for topic_idx, topic in enumerate(mod.components_):
        top_features_ind = topic.argsort()[::-1]
        top_features = feature_names[top_features_ind]
        weights = topic[top_features_ind]

        tmp = pd.DataFrame()
        topic = 'Topic ' + str(topic_idx)

        tmp['term'] = top_features
        tmp['weight'] = weights
        tmp['topic'] = topic

        lda_topics = pd.concat([lda_topics, tmp])

    lda_topics['model'] = 'LDA'

    if timepoint:
        lda_topics['timepoint'] = timepoint
        
    return lda_topics

lda_topics = extract_LDA_topics(lda, vectorizer)
lda_topics

Unnamed: 0,term,weight,topic,model
0,gyrus,2234.543057,Topic 0,LDA
1,left,1704.274932,Topic 0,LDA
2,frontal,1457.109768,Topic 0,LDA
3,right,1412.331250,Topic 0,LDA
4,cortex,1234.181688,Topic 0,LDA
...,...,...,...,...
17346,fms,0.100000,Topic 9,LDA
17347,gmpve,0.100000,Topic 9,LDA
17348,prog,0.100000,Topic 9,LDA
17349,dysmenorrhoea,0.100000,Topic 9,LDA


In [173]:
def join_output(*dfs):
    return pd.concat(dfs)

join_output(bert_topics, lda_topics)

Unnamed: 0,term,weight,topic,model
0,auditory,0.035804,Topic 0,BERTopic
1,speech,0.032819,Topic 0,BERTopic
2,sound,0.027622,Topic 0,BERTopic
3,audiovisual,0.012064,Topic 0,BERTopic
4,acoustic,0.011719,Topic 0,BERTopic
...,...,...,...,...
17346,fms,0.100000,Topic 9,LDA
17347,gmpve,0.100000,Topic 9,LDA
17348,prog,0.100000,Topic 9,LDA
17349,dysmenorrhoea,0.100000,Topic 9,LDA


In [174]:
# pipe draft



In [207]:
year_map = {
    1997: '1997-2002',
    1998: '1997-2002',
    1999: '1997-2002',
    2000: '1997-2002',
    2001: '1997-2002',
    2002: '1997-2002',
    2003: '2003-2008',
    2004: '2003-2008',
    2005: '2003-2008',
    2006: '2003-2008',
    2007: '2003-2008',
    2008: '2003-2008',
    2009: '2009-2013',
    2010: '2009-2013',
    2011: '2009-2013',
    2012: '2009-2013',
    2013: '2009-2013',
    2014: '2014-2018',
    2015: '2014-2018',
    2016: '2014-2018',
    2017: '2014-2018',
    2018: '2014-2018'
}

def fit_models_by_group(df: pd.DataFrame, year_map: dict):

    df['year_group'] = df['year']
    df['year_group'] = df['year_group'].replace(year_map)


    output = pd.DataFrame()
    grouped = df.groupby('year_group')
    for group_name, df in grouped:
        
        df, docs = add_cleaned_text_column(df)

        topic_model = fit_BERTopic_model(docs)
        lda, vectorizer = fit_lda_model(docs)

        bert_topics = extract_BERTopic_topics(topic_model, timepoint=group_name)
        lda_topics = extract_LDA_topics(lda, vectorizer, timepoint=group_name)

        tmp = join_output(bert_topics, lda_topics)

        output = pd.concat([output, tmp])

    return output

In [208]:
output 

Unnamed: 0,term,weight,topic,model,timepoint
0,schizophrenia,0.049201,Topic 0,BERTopic,1997-2002
1,patient,0.017947,Topic 0,BERTopic,1997-2002
2,control,0.011220,Topic 0,BERTopic,1997-2002
3,healthy,0.010965,Topic 0,BERTopic,1997-2002
4,psychosis,0.010314,Topic 0,BERTopic,1997-2002
...,...,...,...,...,...
17346,rissman,0.100000,Topic 9,LDA,2014-2018
17347,necklace,0.100000,Topic 9,LDA,2014-2018
17348,chow,0.100000,Topic 9,LDA,2014-2018
17349,caps,0.100000,Topic 9,LDA,2014-2018
