In [1]:
import os
import json

import numpy as np

from collections import defaultdict
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

In [2]:
# --- configuration: everything a reader might want to tune ---

# Number of samples to process; samples are indexed 0 .. N_samples-1.
N_samples = 3

# Directory that is scanned for the '*corpus.json' files.  The results file
# is written into this same directory.
path2dir = 'data_processing_feb2020/abstracts_2018_200k/'

# Name of the JSON file the results are saved to.
outfile = 'tfidf_results.json'
    

In [3]:
### list the paths to each sample's community corpus files in a standardized order

In [4]:
# Build corpus_paths[i_sample][i_community] -> corpus file name, so that the
# files can be addressed by (sample, community) index regardless of the
# arbitrary order in which os.listdir returns them.
#
# KEY:
# sample 0
#     community 0
#     community 1
#     etc
# sample 1
#     community 0
#     community 1
#     etc
# etc


def _cluster_index(filename):
    """Return the integer that directly follows 'cluster' in `filename`.

    The number is parsed in full, so 'cluster1' and 'cluster10' are told
    apart (a plain substring test would confuse them).  Returns None when
    the name contains no 'cluster<digits>' part.
    """
    tail = filename.partition('cluster')[2]
    digits = ''
    for ch in tail:
        if ch not in '0123456789':
            break
        digits += ch
    return int(digits) if digits else None


# NOTE this is the switch for titles vs abstracts: set to 'titles' to select
# the 'titles_<sample>_...' files instead.
corpus_kind = 'abstracts'

corpus_paths = []

files_list = [f for f in os.listdir(path2dir) if 'corpus.json' in f]

for i_sample in range(N_samples):

    sample_tag = '{}_{}_'.format(corpus_kind, i_sample)
    corpus_sublist = [f for f in files_list if sample_tag in f]

    N_communities = len(corpus_sublist)

    # exact cluster number -> file name for this sample
    path_by_cluster = {_cluster_index(f): f for f in corpus_sublist}

    sample_paths = []
    for i_community in range(N_communities):
        if i_community not in path_by_cluster:
            # Fail loudly instead of silently reusing the previous
            # community's path when a cluster file is missing.
            raise ValueError(
                "no corpus file for sample {} cluster {} in {}".format(
                    i_sample, i_community, path2dir))
        sample_paths.append(path_by_cluster[i_community])
    corpus_paths.append(sample_paths)

print(corpus_paths)

[['abstracts_0_cluster0_2018_200k_corpus.json', 'abstracts_0_cluster1_2018_200k_corpus.json', 'abstracts_0_cluster2_2018_200k_corpus.json', 'abstracts_0_cluster3_2018_200k_corpus.json', 'abstracts_0_cluster4_2018_200k_corpus.json', 'abstracts_0_cluster5_2018_200k_corpus.json', 'abstracts_0_cluster6_2018_200k_corpus.json', 'abstracts_0_cluster7_2018_200k_corpus.json', 'abstracts_0_cluster8_2018_200k_corpus.json'], ['abstracts_1_cluster0_2018_200k_corpus.json', 'abstracts_1_cluster1_2018_200k_corpus.json', 'abstracts_1_cluster2_2018_200k_corpus.json', 'abstracts_1_cluster3_2018_200k_corpus.json', 'abstracts_1_cluster4_2018_200k_corpus.json', 'abstracts_1_cluster5_2018_200k_corpus.json', 'abstracts_1_cluster6_2018_200k_corpus.json', 'abstracts_1_cluster7_2018_200k_corpus.json', 'abstracts_1_cluster8_2018_200k_corpus.json'], ['abstracts_2_cluster0_2018_200k_corpus.json', 'abstracts_2_cluster1_2018_200k_corpus.json', 'abstracts_2_cluster2_2018_200k_corpus.json', 'abstracts_2_cluster3_2018_2

In [5]:
# tfidf on cluster text

In [6]:
### merge the articles of each community into one large document,
###   then run tfidf across those documents to find the words that distinguish the communities

In [7]:
# Populate d[i_sample][i_community] = {word: tfidf score}, holding the
# top-scoring tfidf words of every community, for every sample.

# Percentile of a community's tfidf scores that a word must reach to be kept
# (99 -> only words at or above the 99th percentile of that community).
P_thresh = 99

# How many of the kept words are printed per community (best score first).
N_top_printed = 100


def _load_cluster_tokens(full_path):
    """Return every word of one community corpus as a single flat list.

    The file is parsed as JSON and iterated as a list of documents, each of
    which is iterated as a list of words (e.g. the words of one abstract).
    """
    # JSON text is UTF-8 by specification; do not depend on the platform's
    # default encoding (the vocabulary contains non-ASCII tokens).
    with open(full_path, 'r', encoding='utf-8') as f:
        documents = json.load(f)
    return [word for doc_text in documents for word in doc_text]


def _top_tfidf_terms(community_doc, dictionary, percentile):
    """Select the words of one community whose score reaches the percentile.

    community_doc : iterable of (word_id, score) pairs for one community
    dictionary    : mapping word_id -> word
    percentile    : percentile (0-100) of the scores used as the cut-off

    Returns (threshold, {word: score}); the dict keeps the pair order.
    """
    pairs = list(community_doc)
    scores = [score for _, score in pairs]
    # A community can end up with no scored words; skip np.percentile on an
    # empty list and report nan, which selects nothing.
    thresh = np.percentile(scores, percentile) if scores else float('nan')
    # float(): store plain Python floats rather than whatever scalar type
    # the model returned, so the later JSON dump does not depend on it.
    top_terms = {dictionary[w_id]: float(score)
                 for w_id, score in pairs
                 if score >= thresh}
    return thresh, top_terms


d = []
for i_sample in range(N_samples):

    N_communities = len(corpus_paths[i_sample])

    # one flat word list ("document") per community of this sample
    sample_texts = []
    for i_community in range(N_communities):

        # os.path.join works whether or not path2dir ends with a separator
        full_path = os.path.join(path2dir, corpus_paths[i_sample][i_community])
        cluster_text = _load_cluster_tokens(full_path)

        print("{} words in sample {} cluster {}".format(
                    len(cluster_text),
                    i_sample,
                    i_community))
        print()

        # here - optionally, stopwords
        # here - optionally, filter based on wordfrequency
        sample_texts.append(cluster_text)

    # Each community is one document, so the idf part is computed across the
    # communities of this sample only.
    sample_dictionary = Dictionary(sample_texts)
    sample_corpus = [sample_dictionary.doc2bow(t) for t in sample_texts]
    sample_tfidf_model = TfidfModel(sample_corpus)   # computes idf
    sample_tfidf = sample_tfidf_model[sample_corpus]  # applies tfidf

    d.append([])  # list of dicts for this sample
    for i_community, community_doc in enumerate(sample_tfidf):

        thresh, d_tfidf = _top_tfidf_terms(community_doc,
                                           sample_dictionary,
                                           P_thresh)
        print("score threshold: {}".format(thresh))
        d[i_sample].append(d_tfidf)

        print(sorted(d_tfidf, key=d_tfidf.get, reverse=True)[:N_top_printed])
        print()
        

4462612 words in sample 0 cluster 0

1845120 words in sample 0 cluster 1

1246692 words in sample 0 cluster 2

563895 words in sample 0 cluster 3

827687 words in sample 0 cluster 4

605475 words in sample 0 cluster 5

2587533 words in sample 0 cluster 6

1263811 words in sample 0 cluster 7

697911 words in sample 0 cluster 8

score threshold: 0.016992374053535493
['mir', 'akt', 'erk', 'mscs', 'autophagy', 'mice', 'jnk', 'knockdown', 'κb', 'inflammation', 'hfd', 'lps', 'ampk', 'apoptotic', 'nrf', 'downregulated', 'caspase', 'wnt', 'macrophages', 'blotting', 'bcl', 'tlr', 'nlrp', 'β', 'hmgb', 'oxidative', 'apoptosis', 'catenin', 'phosphorylation', 'mitochondrial', 'mrna', 'mesenchymal', 'bax', 'smad', 'inflammasome', 'sirna', 'sirt', 'rats', 'tgf', 'endothelial', 'reperfusion', 'invasion', 'cxcl', 'blot', 'cd', 'mapk', 'hcc', 'microglia', 'transcriptional', 'yap', 'bbb', 'resveratrol', 'fibroblasts', 'upregulated', 'ko', 'exosomes', 'overexpression', 'tnf', 'transfected', 'microbiota', 

score threshold: 0.029057829589973768
['os', 'carcinoma', 'lymph', 'adjuvant', 'dfs', 'neoadjuvant', 'chemotherapy', 'meier', 'woman', 'gy', 'metastases', 'radiotherapy', 'surgery', 'imrt', 'nsclc', 'resection', 'cyst', 'nivolumab', 'hcc', 'gastric', 'ipilimumab', 'tnm', 'tumors', 'wbrt', 'seer', 'trastuzumab', 'breast', 'boy', 'clinicopathological', 'adenocarcinoma', 'metastasis', 'thrombocytopenia', 'chop', 'cisplatin', 'kaplan', 'bevacizumab', 'chest', 'ccrt', 'nlr', 'ajcc', 'melanoma', 'pembrolizumab', 'relapsed', 'malignant', 'debulking', 'chemoradiotherapy', 'rituximab', 'recist', 'lenalidomide', 'mg', 'apatinib', 'sbrt', 'paclitaxel', 'stereotactic', 'iiic', 'invasion', 'mucinous', 'anemia', 'papillary', 'carboplatin', 'brachytherapy', 'gastrectomy', 'mediastinal', 'chemoradiation', 'gemcitabine', 'node', 'neutropenia', 'clinicopathologic', 'unresectable', 'anaplastic', 'orr', 'histologically', 'eribulin', 'asct', 'rfa', 'lymphadenectomy', 'hipec', 'braf', 'hepatocellular', 'egf

score threshold: 0.03163436749366373
['nurses', 'leadership', 'pharmacists', 'perceptions', 'attitudes', 'interviews', 'curriculum', 'nursing', 'thematic', 'interprofessional', 'respondents', 'students', 'facilitators', 'trainees', 'competencies', 'participants', 'nurse', 'semistructured', 'education', 'violence', 'workforce', 'carers', 'undergraduate', 'staff', 'palliative', 'graduates', 'physicians', 'themes', 'residency', 'graduate', 'residents', 'educators', 'midwives', 'curricula', 'moral', 'caregivers', 'scoping', 'leaders', 'pharmacy', 'mental', 'chws', 'mentorship', 'pharmacist', 'engagement', 'beliefs', 'rural', 'midwifery', 'doctors', 'organizations', 'stakeholders', 'youth', 'teaching', 'literacy', 'informal', 'mentoring', 'mentors', 'stakeholder', 'prep', 'subthemes', 'participatory', 'stigma', 'empowerment', 'aboriginal', 'psychosocial', 'professionalism', 'sexual', 'organisations', 'preparedness', 'interviewed', 'satisfaction', 'thematically', 'organisational', 'accredita

score threshold: 0.030074030351911155
['segmentation', 'cnn', 'convolutional', 'denoising', 'github', 'saliency', 'iot', 'subspace', 'phantom', 'gpu', 'hashing', 'dictionary', 'pixels', 'elm', 'kalman', 'regularization', 'wavelet', 'speckle', 'phantoms', 'dice', 'adversarial', 'packet', 'eeg', 'backstepping', 'snr', 'controller', 'beamforming', 'generative', 'sparsity', 'classifiers', 'lyapunov', 'snomed', 'wsns', 'fault', 'graph', 'cnns', 'cbct', 'svm', 'detruncation', 'gaussian', 'learns', 'autoencoder', 'semantic', 'reconstructed', 'acquisitions', 'hyperspectral', 'python', 'experiments', 'routing', 'regularized', 'noisy', 'observability', 'wsn', 'cest', 'convex', 'actuator', 'gnss', 'multiobjective', 'rois', 'voxels', 'motion', 'stochastic', 'nmf', 'computational', 'boolean', 'bioinformatics', 'imgt', 'ontologies', 'quantization', 'voxel', 'interpolation', 'adjacency', 'lstm', 'superpixel', 'priors', 'overfitting', 'adaboost', 'biclustering', 'entropy', 'factorization', 'computing'

In [8]:
### save results

In [9]:

# Write the collected results `d` as JSON to the file named by `outfile`,
# located under `path2dir` (an existing file of that name is overwritten).
path = ''.join([path2dir, outfile])
with open(path, mode='w') as f:
    json.dump(obj=d, fp=f)
    