## Source statistics 

In [56]:
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search,  Q
from datetime import datetime
import sys, json, os
import difflib 
import uuid
import spacy
import ast
from statistics import mean, median
from collections import Counter
import itertools
import traceback
import json
import gc

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [67]:
def readRawCandidates( list_NCT, label_type=None ):

    nct_ids = []
    tokens = []
    labels = []
    pos = []

    with open(list_NCT, 'r', encoding='latin1') as NCT_ids_file:

        for i, eachLine in enumerate(NCT_ids_file):
            annot = json.loads(eachLine)
            id_ = annot['id']

            for target_key, target in annot.items():

                if 'id' not in target_key:
                    for sentence_key, sentence in target.items():

                        if set(sentence['tokens'])!={0}:
                            tokens.append( sentence['tokens'] )
                            labels.append( sentence['annotation'] )
                            nct_ids.append( id_ )

                            # Generate dummy POS items
                            pos_i = [0] * len( sentence['tokens'] )
                            pos.append( pos_i )
                        else:
                            print('All the labels are nil')

    corpus_df = pd.DataFrame(
        {'ids': nct_ids,
        'tokens': tokens,
        'labels': labels,
        'pos': pos
        })

    df = corpus_df.sample(frac=1).reset_index(drop=True) # Shuffles the dataframe after creation
    
    # can delete this one (corpusDf)
    del corpus_df
    gc.collect() # mark if for garbage collection

    return df

## Majority and minority labels

In [62]:
merged_data = '/mnt/nas2/data/systematicReview/clinical_trials_gov/Weak_PICO/PICOS_data_preprocessed/merged_1_0.txt'

df = readRawCandidates( merged_data, label_type=None )

In [68]:
token_collection = []

for eachTokenList in df['labels']:
    token_collection.extend( eachTokenList )

In [76]:
labels_counted = Counter(token_collection)

In [82]:
percent_minor = ( (labels_counted[1] + labels_counted[2] + labels_counted[3] + labels_counted[4]) / labels_counted[0] * 100)

In [84]:
print('The percentage of minority labels: ', percent_minor)

The percentage of minority labels:  10.577408759933324


In [2]:
prim_outcomes = '/mnt/nas2/data/systematicReview/clinical_trials_gov/distant_pico_pre/primary_outcomes.txt'
#prim_outcomes = '/mnt/nas2/data/systematicReview/clinical_trials_gov/distant_pico_pre/secondary_outcomes.txt'
counter = 0

outcome_names = []
outcome_tokens = []
pos_all = []
posfine_all = []
outcome_tokens_all = []

with open(prim_outcomes, 'r') as pof:
    try:
        
        for eachOutcome in pof:
            counter = counter + 1
            #rint(counter)
            j = json.loads(eachOutcome)

            if j:
                for key, value in j.items():
                    for eachOne in value:
                        if 'text' in eachOne:
                            outcome_names.append( eachOne['text'] )
                        if 'tokens' in eachOne:
                            outcome_tokens.append( eachOne['tokens'] )
                            outcome_tokens_all.extend( list(map(lambda x: x.lower(), eachOne['tokens'])) ) 
                        if 'pos' in eachOne:
                            pos_all.extend( eachOne['pos'] )
                        if 'pos_fine' in eachOne:
                            posfine_all.extend( eachOne['pos_fine'] )

            #f counter == 10:
            #  break
    except:
        print('something strange happened')

print( len(outcome_names) )
print( len(outcome_tokens) )


longest_outcome = max(map(len, outcome_tokens))
print('Longest outcome term: ', longest_outcome)

shortest_outcome = min(map(len, outcome_tokens))
print('Longest outcome term: ', shortest_outcome)

mean_outcome = mean(map(len, outcome_tokens))
print('Mean outcome term: ', mean_outcome)

median_outcome = median(map(len, outcome_tokens))
print('Median outcome term: ', median_outcome)
print('####################################################################')
pos_counter = Counter(pos_all)
print( pos_counter.most_common(50) )
print('####################################################################')
posfine_counter = Counter(posfine_all)
print( posfine_counter.most_common(50) )

print('####################################################################')
# Most common words
print('Total number of tokens: ', len(outcome_tokens_all))
outcometerms_counter = Counter(outcome_tokens_all)
most_common_outcome_terms = outcometerms_counter.most_common(10)

for tuple_i in most_common_outcome_terms:
    print(tuple_i[0], '\t', tuple_i[1])

464037
464037
Longest outcome term:  68
Longest outcome term:  1
Mean outcome term:  10.124744363057257
Median outcome term:  8
####################################################################
[('NOUN', 1589426), ('PROPN', 744440), ('ADP', 718375), ('PUNCT', 510399), ('ADJ', 389293), ('VERB', 212547), ('DET', 168427), ('NUM', 144452), ('CCONJ', 118636), ('PART', 49679), ('ADV', 19831), ('PRON', 11321), ('AUX', 6121), ('X', 5819), ('SCONJ', 5440), ('SYM', 3718), ('INTJ', 332)]
####################################################################
[('NN', 1332428), ('IN', 722014), ('NNP', 686128), ('JJ', 380177), ('NNS', 257011), ('-RRB-', 176226), ('-LRB-', 169869), ('DT', 164240), ('CD', 144452), ('CC', 118636), ('VBN', 89007), (',', 68176), ('NNPS', 58312), ('VBG', 47613), ('.', 44736), ('TO', 42760), ('VB', 38053), (':', 31667), ('RB', 17871), ('HYPH', 15538), ('VBD', 15028), ('VBP', 13895), ('VBZ', 8951), ('WP', 8099), ('MD', 6121), ('POS', 5858), ('JJR', 5088), ('JJS', 4028), ('W

## TF-IDF POC

In [40]:
def get_ifidf_for_words(text):
    tfidf_matrix= vectorizer.transform([text]).todense()
    feature_index = tfidf_matrix[0,:].nonzero()[1]
    tfidf_scores = zip([feature_names[i] for i in feature_index], [tfidf_matrix[0, x] for x in feature_index])
    sorted_dict = {k: v for k, v in sorted(dict(tfidf_scores).items(), key=lambda item: item[1])}
    return sorted_dict

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
vectorizer = TfidfVectorizer()

In [6]:
outcome_tfidf = vectorizer.fit_transform(outcome_names)

In [33]:
feature_names = vectorizer.get_feature_names()

In [51]:
input_string = 'Flexion knees measured goniometer side side difference expressed degrees.'

In [52]:
get_ifidf_for_words(input_string)

{'measured': 0.168605292180929,
 'difference': 0.2244409287049211,
 'expressed': 0.3000135047712927,
 'flexion': 0.3118107683732015,
 'degrees': 0.3191784079340699,
 'goniometer': 0.38741566434403935,
 'knees': 0.41333903188241167,
 'side': 0.5578078148467132}

## Examine overlapping spans in PICOS weak annotations

In [33]:
labels = ['p', 'ic', 'o', 's']
label_combinations = itertools.combinations(labels, 2)
label_combinations = list(label_combinations)

In [47]:
label_combinations

[('p', 'ic'), ('p', 'o'), ('p', 's'), ('ic', 'o'), ('ic', 's'), ('o', 's')]

In [184]:
l1 = label_combinations[1][0]
l2 =  label_combinations[1][1]

overlaps = []
non_overlaps = []

annotations_global = '/home/anjani/distant-PICO/CandidateGeneration/ResultInspection/pico_multiclass.txt'
#annotations_global = '/home/anjani/distant-PICO/CandidateGeneration/ResultInspection/label_overlap_inspection.txt'

counter = 0
with open(annotations_global, 'r', encoding='latin1') as af:

        for annot in af: # Each annotation file
            try:
                counter = counter + 1
                j = json.loads( annot )
                for k,v in j.items(): # target
                    if 'id' not in k:
                        for k_i, v_i in v.items(): # each sentence
                            if l1 in v_i and l2 in v_i:
                                phrase = []
                                non_o_phrase = []
                                list1 = v_i[l1]
                                list2 = v_i[l2]
                                for n, (a1, a2) in enumerate( zip(list1, list2) ):
                                    if a1 != 0 and a2 != 0:
                                        phrase.append( v_i['tokens'][n] )
                                    if a1 != 0 or a2 != 0:
                                        non_o_phrase.append( v_i['tokens'][n] )
                                if phrase:
                                    overlaps.append( ' '.join(phrase) )
                                if non_o_phrase:
                                    non_overlaps.append( ' '.join(non_o_phrase) )
            except Exception as ex:
                pass
                template = "An exception of type {0} occurred. Arguments:{1!r}"
                message = template.format(type(ex).__name__, ex.args)
                #print( message )

                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                #print(exc_type, fname, exc_tb.tb_lineno)

                #print(traceback.format_exc())

print('Percentage overlap between ', l1, ' and ', l2, ' is: ', (len( set( overlaps ) ) / len( list(non_overlaps) ) * 100))

Percentage overlap between  p  and  o  is:  13.003077294394497


## Explore "outcome" sources

In [2]:
outcomes_file = '/mnt/nas2/data/systematicReview/clinical_trials_gov/distant_pico_pre/temp.txt'

outcomes = dict()

counter = 0
with open(outcomes_file, 'r') as rf:
    for eachOutcome in rf:
        j = json.loads( eachOutcome )
        if j:
            counter = counter + 1
            outcomes[counter] = j
            
            
print('The number of non-null outcome sources retrieved', len( outcomes ))

The number of non-null outcome sources retrieved 473893


In [3]:
outcomes[1]

{'PrimaryOutcome_0': [{'text': '( 0 - 10) Itching scale',
   'tokens': ['(', '0', '-', '10', ')', 'Itching', 'scale'],
   'lemma': ['(', '0', '-', '10', ')', 'itching', 'scale'],
   'pos': ['PUNCT', 'NUM', 'PUNCT', 'NUM', 'PUNCT', 'NOUN', 'NOUN'],
   'pos_fine': ['-LRB-', 'CD', 'HYPH', 'CD', '-RRB-', 'NN', 'NN']}]}

In [6]:
text = []
tokens = []
pos = []
pos_fine = []
tokens_lengths = []

counter_i = 0
for key, value in outcomes.items():
    for a_key, a_value in value.items():
        text.append( a_value[0]['text'] )
        tokens.append( a_value[0]['tokens'] )
        pos.append( a_value[0]['pos'] )
        pos_fine.append( a_value[0]['pos_fine'] )
        tokens_lengths.append( len(a_value[0]['tokens']) )
        counter_i = counter_i + 1

In [9]:
df = pd.DataFrame({'text' : text, 'tokens' : tokens, 'pos' : pos, 'pos_fine' : pos_fine })

In [35]:
df.tail()

Unnamed: 0,text,tokens,pos,pos_fine
1564619,Oswestry Disability Index,"[Oswestry, Disability, Index]","[PROPN, PROPN, PROPN]","[NNP, NNP, NNP]"
1564620,Numeric Rating Scale,"[Numeric, Rating, Scale]","[PROPN, PROPN, PROPN]","[NNP, NNP, NNP]"
1564621,Medical Outcomes Study Short-Form Health Surve...,"[Medical, Outcomes, Study, Short-Form, Health,...","[PROPN, PROPN, PROPN, PROPN, PROPN, PROPN, NOU...","[NNP, NNP, NNP, NNP, NNP, NNP, NN, CD, -LRB-, ..."
1564622,Centre for Epidemiological Studies Depression ...,"[Centre, for, Epidemiological, Studies, Depres...","[NOUN, ADP, PROPN, PROPN, PROPN, PROPN]","[NN, IN, NNP, NNPS, NNP, NNP]"
1564623,"Zurich claudification Scale, Numeric Rating Sc...","[Zurich, claudification, Scale, ,, Numeric, Ra...","[ADJ, NOUN, PROPN, PUNCT, PROPN, PROPN, PROPN,...","[JJ, NN, NNP, ,, NNP, NNP, NNP, IN, NN, NN, ,,..."


In [42]:
token_percents = dict()

for i in range(min(tokens_lengths), max(tokens_lengths)):
    #print('Percentage of the outcome mentions with token length ', str(i) , ' : ', ( tokens_lengths.count( i ) / len(tokens_lengths) ) * 100 )
    token_percents[i] = ( tokens_lengths.count( i ) / len(tokens_lengths) ) * 100