In [1]:
import os, sys, glob
import logging
from datetime import datetime 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline 

In [14]:
from pyscopus import Scopus
import json
import requests

In [15]:
MY_API_KEY = '6f08887e7863302c0431a454c96fb54c'
headerz_api={'Accept':'application/json', 'X-ELS-APIKey': MY_API_KEY}

class TargetObject: 
    def __init__(self, urlz, headerz, parser_xtra ):
        self.urlz = urlz
        self.headerz = headerz
        self.parser_xtra = parser_xtra
        print("Target: ", self.urlz)
        


In [16]:
scopus = Scopus(MY_API_KEY)

In [17]:
resp = requests.get("http://api.elsevier.com/content/search/scopus?query=fundus&count=25", #af-id(60032114)+OR+af-id(60022265)
                    headers={'Accept':'application/json',
                             'X-RateLimit-Limit':'1',                             
                             'X-ELS-APIKey': MY_API_KEY})


for k, v in resp.headers.items():
    print("{:20}  {}".format(k, v))

Date                  Sat, 10 Apr 2021 03:04:05 GMT
Content-Type          application/json;charset=UTF-8
Transfer-Encoding     chunked
Connection            keep-alive
allow                 GET
Content-Encoding      gzip
Vary                  Accept-Encoding, Origin
X-ELS-APIKey          6f08887e7863302c0431a454c96fb54c
X-ELS-ReqId           bf388d067f3600d8
X-ELS-ResourceVersion  default
X-ELS-Status          OK
X-ELS-TransId         2c8b40325ea4934f
X-RateLimit-Limit     20000
X-RateLimit-Remaining  20000
X-RateLimit-Reset     1618628644000
CF-Cache-Status       DYNAMIC
cf-request-id         095b55a03a00001b5a699d4000000001
Expect-CT             max-age=604800, report-uri="https://report-uri.cloudflare.com/cdn-cgi/beacon/expect-ct"
Server                cloudflare
CF-RAY                63d8bee05a4a1b5a-NBO


In [20]:
def run_search(query, fname):
    def save_to_file(data, fname):
        with open(fname, 'a', encoding='utf-8') as f:
            json.dump( data, f, ensure_ascii=False, indent=4)
            f.write(",\n")

    pg = 0
    TOTAL_ITEMS = 0
    PAGEZ = 10 
    PG_MAX = 25
    # while pg is not None:
    # for pg in range(PAGEZ): 
    while pg < PAGEZ:
        ## 1. get page    
        resp = requests.get(f"http://api.elsevier.com/content/search/scopus?query={query}&count={PG_MAX}&start={pg*PG_MAX}", #af-id(60032114)+OR+af-id(60022265)
                        headers={'Accept':'application/json',
                                 'X-ELS-APIKey': MY_API_KEY})
        REZ = resp.headers['X-ELS-Status']
        if REZ.strip() != 'OK':
            print( f"*****{REZ}")
            break
        pg += 1
        
        ## 2. get next page id  and content object 
        for k, v in resp.json().items(): 
            for r, v in resp.json()['search-results'].items():
                if r == 'opensearch:totalResults' and TOTAL_ITEMS == 0:
                    TOTAL_ITEMS = int( v )
                    PAGEZ = TOTAL_ITEMS//PG_MAX
                    mod = TOTAL_ITEMS%PG_MAX
                    PAGEZ += 0 if mod == 0 else 1 
                    print( "*** TOTAL_ITEMS: ", TOTAL_ITEMS, ", << ", PAGEZ) 
                    
                if r == 'entry':        
                    ## save to file
                    save_to_file(v, f"0ea_{PAGEZ}__{fname}")
                    print(f"PG: {pg} dumped to file")

In [36]:
def parse_json_file_to_csv(fname): 
    keyz = [ 'prism:url', 
             "dc:identifier", "dc:title", "prism:publicationName",
             "prism:coverDate", "citedby-count", 
            ]
    affilz = "affiliation" ## sub = affilname, affiliation-city, affiliation-country
    first_affilz_country = "country_1"
    first_affilz_org = "institution_1"
    
    has_fundus_in_title = "fundus_in_title"
    pub_year = "pub_year"
    
    abhref = 'abstract'
    
    def parse_elsevier_json_item(item):    
        
        def get_pub_year(pub_date):
            return str( datetime.strptime(pub_date, "%Y-%m-%d").year )
        
        def has_fundus(titl):
            return str( int( 'fundus' in titl.lower() ) )
        
        O_ = []
        ## 1. everything top level 
        for k in keyz:
            O_.append( item.get(k, '')) #item[k] ) 
        
        ## 2. parse affiliation details and join org,city,country
        k_affz = ['affilname', 'affiliation-city', 'affiliation-country']
        A_ = []
        I_ = '-'
        C_ = '-'
        ia = 0 
        affitem = item.get("affiliation", None)
        if affitem:
            for aff in affitem:
                if aff:
                    a_ = []
                    for k in k_affz:
                        x = aff.get(k, "-") 
                        a_.append( x if x is not None else "-")
#                     print( a_ )
                    A_.append(", ".join(a_) )         
                    if ia == 0:
                        C_ = a_[-1]  
                        I_ = a_[0]
                        ia = 100 
            O_.append( "++".join(A_) )
        else:
            O_.append( "-" )
        O_.append( I_ )
        O_.append( C_ )
        
        ## 3. preprocs
        O_.append( has_fundus( item.get("dc:title", "") ) )
        O_.append( get_pub_year(item.get("prism:coverDate", "")))
        
        ## 4. parse abstract url 
        abstract = "link" # @ref = "self", "@href" 
        for abl in item[abstract]:
            if abl['@ref'] == 'self':
                O_.append( abl['@href'] ) 
        
        return O_ 
    
        
    def fetch_abstract(ahref):
        pass 
    
    to_csv = []
    with open( f'{fname}', 'r') as fd:
        pagez = json.load( fd)
#         print( type(pagez), len(pagez) )
        for page in pagez:
            for item in page:
                try:
                    to_csv.append( parse_elsevier_json_item(item) )
                except:
                    print(f"failed at item: {str(item)}")
            
    
    def dump_csv_row(rec):
#         print(type(rec), len(rec), rec)
        fd.write( "\t".join( rec) )
        fd.write("\n")
        
    fcsv = fname.split(".")[0] 
    with open( f"{fcsv}.csv", 'w') as fd:
        headz = keyz + [affilz, first_affilz_org, first_affilz_country, 
                        has_fundus_in_title, pub_year, 
                        abhref] 
        dump_csv_row(headz)
        for rec in to_csv:
            dump_csv_row(rec)
            

# Search Strategies

[From Scopus Webinar](https://blog.scopus.com/posts/6-simple-search-tips-lessons-learned-from-the-scopus-webinar)
- Phareses in quotes (loose phrase) else treated as AND
- TITLE-ABS-KEY is default search field --> 
- Auto thingies
    - case insensitive
    - accented characters - with or without 
    - lemmatization --> conjugations etc <-- EXACT PHRASE marker if don't want this
    - equivalents @ terms or symbols
    - punctuations are ignored
    - stopwords are exluded
    - override with exact phase --> enclose in braces {}
- Proximity operators to find words near each other --> e.g. preceding Pre/n, within W/n, <-- E.g. traditional Pre/2 features to capture 'in between words' or W/2 if id doesn't matter which words comes first
- Wildcards - in any word or loose phrase e.g. spelling variations. `?` or `*` == any single character AND n >= 0 respectively 
- [Scopus help files](https://blog.scopus.com/topics/tips-and-tricks)

In [22]:
searchez = [ ("fundus 'traditional W/2 analysis' 'deep W/2 learning'", 'fundus_hc_dl.json'),
           ("medical 'image W/2 preprocessing'", "medical_img_preproc.json"),
           ("image 'fuse W/2 features'", 'fuse_hc_dl.json'),
           ("image 'combine W/2 features'", 'fuse2_hc_dl.json'),
           ("image 'qualtity W/2 measurement'", 'quality.json')]


for q, f in searchez:
    run_search(q, f)


*** TOTAL_ITEMS:  14 , <<  1
PG: 1 dumped to file
*** TOTAL_ITEMS:  3479 , <<  140
PG: 1 dumped to file
PG: 2 dumped to file
PG: 3 dumped to file
PG: 4 dumped to file
PG: 5 dumped to file
PG: 6 dumped to file
PG: 7 dumped to file
PG: 8 dumped to file
PG: 9 dumped to file
PG: 10 dumped to file
PG: 11 dumped to file
PG: 12 dumped to file
PG: 13 dumped to file
PG: 14 dumped to file
PG: 15 dumped to file
PG: 16 dumped to file
PG: 17 dumped to file
PG: 18 dumped to file
PG: 19 dumped to file
PG: 20 dumped to file
PG: 21 dumped to file
PG: 22 dumped to file
PG: 23 dumped to file
PG: 24 dumped to file
PG: 25 dumped to file
PG: 26 dumped to file
PG: 27 dumped to file
PG: 28 dumped to file
PG: 29 dumped to file
PG: 30 dumped to file
PG: 31 dumped to file
PG: 32 dumped to file
PG: 33 dumped to file
PG: 34 dumped to file
PG: 35 dumped to file
PG: 36 dumped to file
PG: 37 dumped to file
PG: 38 dumped to file
PG: 39 dumped to file
PG: 40 dumped to file
PG: 41 dumped to file
PG: 42 dumped to file
PG

In [24]:
searchez = [ ("fundus 'traditional W/2 features'", 'fundus-2_hc_dl.json'),
             ("fundus 'handcrafted W/2 features'", 'fundus-3_hc_dl.json'), 
             ("fundus 'deep W/2 features'", 'fundus-4_hc_dl.json'),  
           ("fundus 'multi W/2 learning'", 'fundus-multi-1_hc_dl.json'), 
           ("fundus 'multiple W/2 disease'", 'fundus-multi-2_hc_dl.json'),
           ("image 'handcrafted W/2 features'", 'quality.json')]


for q, f in searchez:
    run_search(q, f)

*** TOTAL_ITEMS:  50 , <<  2
PG: 1 dumped to file
PG: 2 dumped to file
*** TOTAL_ITEMS:  88 , <<  4
PG: 1 dumped to file
PG: 2 dumped to file
PG: 3 dumped to file
PG: 4 dumped to file
*** TOTAL_ITEMS:  1114 , <<  45
PG: 1 dumped to file
PG: 2 dumped to file
PG: 3 dumped to file
PG: 4 dumped to file
PG: 5 dumped to file
PG: 6 dumped to file
PG: 7 dumped to file
PG: 8 dumped to file
PG: 9 dumped to file
PG: 10 dumped to file
PG: 11 dumped to file
PG: 12 dumped to file
PG: 13 dumped to file
PG: 14 dumped to file
PG: 15 dumped to file
PG: 16 dumped to file
PG: 17 dumped to file
PG: 18 dumped to file
PG: 19 dumped to file
PG: 20 dumped to file
PG: 21 dumped to file
PG: 22 dumped to file
PG: 23 dumped to file
PG: 24 dumped to file
PG: 25 dumped to file
PG: 26 dumped to file
PG: 27 dumped to file
PG: 28 dumped to file
PG: 29 dumped to file
PG: 30 dumped to file
PG: 31 dumped to file
PG: 32 dumped to file
PG: 33 dumped to file
PG: 34 dumped to file
PG: 35 dumped to file
PG: 36 dumped to file
P

In [37]:
# datz = ['0ea_4__fuse_hc_dl', 
#         '0ea_5__fundus_hc_dl', 
#         '0ea_642__medical_img_preproc', 
#         '0ea_3926__fundus'
#        ]

datz = glob.glob("*.json")

for fname in datz:
    parse_json_file_to_csv(fname)
    print("******** FINISHED - ", fname )

******** FINISHED -  0ea_140__medical_img_preproc.json
******** FINISHED -  0ea_149__fuse2_hc_dl.json
******** FINISHED -  0ea_19__fundus-multi-2_hc_dl.json
******** FINISHED -  0ea_1__fundus_hc_dl.json
******** FINISHED -  0ea_26__fundus-multi-1_hc_dl.json
******** FINISHED -  0ea_2__fundus-2_hc_dl.json
******** FINISHED -  0ea_45__fundus-4_hc_dl.json
******** FINISHED -  0ea_4__fundus-3_hc_dl.json
******** FINISHED -  0ea_65__fuse_hc_dl.json
******** FINISHED -  0ea_92__quality.json


In [64]:
def merge_csv_files(ls_fnamez, ls_category_labelz, mname):
    dfz = [pd.read_csv(f"0ea_{f}.csv", sep='\t', ) for f in ls_fnamez ]
    print( len(dfz ) )
    print( len(dfz[0]), dfz[0].columns ) 
    #add_category = lambda x: x[0]['ctype'] = x[1]
    def add_category(df, lbl):
        df['ctype'] = lbl 
        return df
    dfz = [ add_category(d, l) for d, l in zip(dfz, ls_category_labelz) ]
    df = pd.concat( dfz , ignore_index=True)
    df.to_csv(f"{mname}.csv")

In [65]:
mergez = [(['1__fundus_hc_dl', '2__fundus-2_hc_dl', '4__fundus-3_hc_dl', '45__fundus-4_hc_dl',
                "140__medical_img_preproc",  '92__quality'],
           ['Fundus HC or DL', 'FUndus HC', 'Fundus HC', 'Fundus DL', 
                'IMG Preproc', 'IMG HC-Qy'], 
           'funduz_hc_dl_ALL'),
         
          (['65__fuse_hc_dl', '149__fuse2_hc_dl'],
           ['IMG fusion', 'IMG fusion'],
           'general_fuse'),
          
          (['26__fundus-multi-1_hc_dl', '19__fundus-multi-2_hc_dl'], 
           ['Fundus multi-disease', 'Fundus multi-disease'],
           "funduz_multitask")
         ]

In [66]:
for fz, lb, n in mergez:
    print("MERGING: ", fz)
    merge_csv_files( fz, lb, n)

MERGING:  ['1__fundus_hc_dl', '2__fundus-2_hc_dl', '4__fundus-3_hc_dl', '45__fundus-4_hc_dl', '140__medical_img_preproc', '92__quality']
6
14 Index(['prism:url', 'dc:identifier', 'dc:title', 'prism:publicationName',
       'prism:coverDate', 'citedby-count', 'affiliation', 'institution_1',
       'country_1', 'fundus_in_title', 'pub_year', 'abstract'],
      dtype='object')
MERGING:  ['65__fuse_hc_dl', '149__fuse2_hc_dl']
2
1622 Index(['prism:url', 'dc:identifier', 'dc:title', 'prism:publicationName',
       'prism:coverDate', 'citedby-count', 'affiliation', 'institution_1',
       'country_1', 'fundus_in_title', 'pub_year', 'abstract'],
      dtype='object')
MERGING:  ['26__fundus-multi-1_hc_dl', '19__fundus-multi-2_hc_dl']
2
635 Index(['prism:url', 'dc:identifier', 'dc:title', 'prism:publicationName',
       'prism:coverDate', 'citedby-count', 'affiliation', 'institution_1',
       'country_1', 'fundus_in_title', 'pub_year', 'abstract'],
      dtype='object')
