# Read libraries

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from copy import copy, deepcopy
from pathlib import Path
from sys import path

path.append( str(Path.cwd().parent) )

In [2]:
import requests
import pandas as pd

from numpy import nan
from collections import Counter
from bs4 import BeautifulSoup
from selenium import webdriver
from string import punctuation, whitespace
from time import sleep

from Project_libraries.pubmed import ( search_for_journal_match,
                                       assign_publisher, 
                                       manual_assignment_of_publisher,
                                       consensus_publisher,
                                     )


# Initialize variables

In [3]:
chrome_bin = Path.home() / 'share' / 'bin'

url = 'https://www.ncbi.nlm.nih.gov/nlmcatalog?term='

catalogue_file = Path.cwd() / 'Project_libraries' / 'journal_catalogue.json'

catalogue_NLM = Path.cwd() / 'Project_libraries' / 'journal_NLM_query.json'

folders = [ 'Case_brca2', 
            'Case_chest_imaging_pneumonia', 
            'Case_green_synthesis_silver_np', 
            'Case_skin_wound_healing', 
            ]

filename = 'articles_clean.json'

# Import journal data

In [None]:
journals = {}
for folder in folders:
    print(f"Downloading from folder {folder}...")
    with open(Path.cwd() / folder / filename, 'r', encoding = 'utf-8') as f_json:
        articles = json.load(f_json)
    
    for article in articles:
        journal = article['journal']
        if journal in journals.keys():
            if article['copyright']:
                journals[journal]['cr_note'] += ' ' + article['copyright']
            continue
            
        journals[journal] = {}
        if article['copyright']:
            journals[journal]['cr_note'] = article['copyright']
        else:
            journals[journal]['cr_note'] = ''
            
print(f"\nThere are {len(journals)} unique journals.")


# Load NLM query results 

In [4]:
catalogue = {}
with open(catalogue_NLM, 'r', encoding = 'utf-8') as file_in:
    catalogue = json.load(file_in)
    
print('Loaded NLM query results.')

Loaded NLM query results.


# Search NLM catalogue for journal information

**Skip this section** in case you already have searched all the journals in the NLM catalogue.

In [None]:
with webdriver.Chrome() as browser:
    browser.get(url)
    
    for i, journal in enumerate( journals ):
        if journal in catalogue.keys() and catalogue[journal] is not None:
            continue

        j_url = f"{url}{journal}[Journal]"
        print(f"{i:>4} -- {j_url}")
        response = requests.get(j_url)

        # The analysis of the response to request can yield one of three outcomes:
        #     - a journal match (table will have something in it)
        #     - a list of possible matches (table will be empty)
        #     - no possible matches (table will be empty)
        #
        # For the latter two, we call the function 'search_for_journal_match'
        #
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('dl', {'class': 'nlmcat_dl'})

        if table is None:
            table = search_for_journal_match( browser, journal, j_url )

            # If table is None, we are under the third case above
            #
            if table is None:
                print(f"The search for journal {journal} did not yield results.\n")
                catalogue[journal] = None
                continue

        info = list( table.children )        
        catalogue[journal] = {}

        search_strings = [ ('Title(s):', 'full_title'),
                           ('Publication Start Year:', 'start_date' ), 
                           ('Country of Publication:', 'country'),
                           ('Publisher:', 'publisher_note'),
                           ('Latest Publisher:', 'last_publisher_note'),
                           ('MeSH:', 'mesh'),
                           ('Notes:', 'notes'),
                          ]

        for text, key in search_strings:
            k = 0
            while k < len(info) and info[k].text != text:
                k += 1

            if k == len(info):
                catalogue[journal][key] = None

            else:
                k += 1
                catalogue[journal][key] = info[k].text.strip(whitespace)   

        catalogue[journal]['cr_note'] = journals[journal]['cr_note']

        print(f"\t{catalogue[journal]}\n")   
        sleep(3)


## Save NLM query results 

In [None]:
with open(catalogue_NLM, 'w', encoding = 'utf-8') as file_out:
    json.dump(catalogue, file_out)
    
print('Saved NLM query results.')

# Check catalogue

In [5]:
catalogue['Vascular']

{'full_title': 'Vascular.',
 'start_date': '2004',
 'country': 'England',
 'publisher_note': 'Hamilton, Ont., Canada : BC Decker, [2004]-',
 'last_publisher_note': 'Dec. 2012- : London : Sage',
 'mesh': 'Vascular DiseasesVascular Surgical Procedures*',
 'notes': 'Title from cover.Also issued online.Official publication of: the International Society for Vascular Surgery; International Society of Vascular Specialists; International Society for Cardiovascular Surgery.',
 'cr_note': ''}

In [6]:
catalogue['Molecules']

In [7]:
n = len(catalogue)
empty_entries = 0
for journal in catalogue:
    if catalogue[journal] is None:
        empty_entries += 1
    
df_publ = pd.DataFrame(catalogue)
print(f"There are:\n\t- {empty_entries} empty entries\n\t- "
      f"{n - df_publ.loc['publisher_note'].count()} journals without a "
      f"publisher note,\n\t- "
      f"{n - df_publ.loc['last_publisher_note'].count()} journals without "
      f"a last publisher note.\n")


There are:
	- 328 empty entries
	- 329 journals without a publisher note,
	- 1829 journals without a last publisher note.



# Assign publisher names in journal catalogue

In [8]:
for i, journal in enumerate(sorted( list(catalogue) )[:] ):
    
    # Fix empty catalogue entries
    #
    if catalogue[journal] is None:
        catalogue[journal] = {}
     
    journal_keys = catalogue[journal].keys()
    
    if 'last_publisher_note' not in journal_keys:
        catalogue[journal]['last_publisher_note'] = None
        
    if 'publisher_note' not in journal_keys:
        catalogue[journal]['publisher_note'] = None
        
    if 'cr_note' not in journal_keys:
        catalogue[journal]['cr_note'] = None
    
    # Guess publisher from NLM entries or rule assignment
    #
    publisher = consensus_publisher(
                assign_publisher(catalogue[journal]['last_publisher_note']),
                assign_publisher(catalogue[journal]['publisher_note']),
                assign_publisher(catalogue[journal]['cr_note']) )
    
    manual_publisher = manual_assignment_of_publisher( journal )
    
    # Decide on publisher based on guesses
    #
    if publisher == manual_publisher:
        catalogue[journal]['publisher'] = publisher
        
    elif manual_publisher:
        print(f"\n{i:>4} ----- {journal[:40]:40} -- {manual_publisher}")
        catalogue[journal]['publisher'] = manual_publisher
        
    else:
        print(f"\n{i:>4} -- {journal[:40]:40} -- {publisher}")
        catalogue[journal]['publisher'] = publisher
                
#         print(f"{'LP':3} -- {catalogue[journal]['last_publisher_note']}")
#         print(f"{'P':3} -- {catalogue[journal]['publisher_note']}")
#         print(f"{'CR':3} -- {catalogue[journal]['cr_note']}")
        




   0 -- 3 Biotech                                -- Springer Nature

   1 -- A A Pract                                -- Wolters Kluwer

   2 -- AACN Adv Crit Care                       -- Wolters Kluwer

   3 -- AACN Clin Issues                         -- Wolters Kluwer

   4 -- AAPS PharmSciTech                        -- Springer Nature

   5 -- ACS Appl Bio Mater                       -- American Chemical Society

   6 -- ACS Appl Mater Interfaces                -- American Chemical Society

   7 -- ACS Biomater Sci Eng                     -- American Chemical Society

   8 -- ACS Chem Biol                            -- American Chemical Society

   9 -- ACS Chem Neurosci                        -- American Chemical Society

  10 -- ACS Nano                                 -- American Chemical Society

  11 -- ACS Omega                                -- American Chemical Society

  12 -- ACS Pharmacol Transl Sci                 -- American Chemical Society

  13 -- ADMET DMPK       

In [9]:
publishers = []
for journal in catalogue:
    publishers.append( catalogue[journal]['publisher'] )
    
print(f"There are {len(set(publishers))} distinct publishers identified in the "
      f"journal catalogue.")

aux = Counter(publishers)
aux.most_common(25)

There are 103 distinct publishers identified in the journal catalogue.


[(None, 968),
 ('Elsevier', 547),
 ('Springer Nature', 449),
 ('Wiley', 245),
 ('Taylor & Francis', 128),
 ('Wolters Kluwer', 119),
 ('Oxford University Press', 93),
 ('Sage', 81),
 ('Hindawi', 54),
 ('MDPI', 41),
 ('Bentham', 35),
 ('Karger', 33),
 ('Mary Ann Liebert', 32),
 ('Frontiers', 29),
 ('American Chemical Society', 28),
 ('Thieme Group', 25),
 ('British Medical Association', 21),
 ('Cell Press', 18),
 ('CRC Press', 18),
 ('De Gruyter', 13),
 ('Cambridge University Press', 13),
 ('IEEE', 13),
 ('Royal Society of Chemistry', 12),
 ('Future Science', 10),
 ('American Medical Association', 10)]

## Check for unmatched journals

In [None]:
for i, journal in enumerate(sorted( list(catalogue) )[:] ):
    if catalogue[journal]['publisher'] is None:
        print(f"{i:>4} -- {journal}")
        

# Save updated Journal Catalogue

In [10]:
with open(catalogue_file, 'w', encoding = 'utf-8') as file_out:
    json.dump(catalogue, file_out)
    
print('Saved updated Journal Catalogue.')

Saved updated Journal Catalogue.


In [11]:
print(catalogue['Molecules'])

{'last_publisher_note': None, 'publisher_note': None, 'cr_note': None, 'publisher': 'MDPI'}


# Early work for direct assignment of publisher to journals lacking other information

In [None]:
for i, journal in enumerate( sorted(journals) ):
    if journal in catalogue.keys() and catalogue[journal] is not None:
        continue
        
    print(f"{i:>4} -- {journal}") 

In [None]:
catalogue['Sys Cell']

In [None]:
    elif article['journal'] in [ 'J Immunol', ]:
        article['publisher'] = 'American Association of Immunologists'
        
    elif article['journal'] in [ 'Int J Nanomedicine', ]:
        article['publisher'] = 'Dove Medical Press'
    
    elif article['journal'] in [ 'Int J Clin Exp Pathol', ]:
        article['publisher'] = 'e-Century Pub. Corp.'
        
    elif article['journal'] in [ 'Int J Inflam', ]:
        article['publisher'] = 'Hindawi'

    elif article['journal'] in [ 'Mol Biol (Mosk)', ]:
        article['publisher'] = 'Izdatelstvo Nauka'
        

    elif article['journal'][:4] == 'PLoS':
        article['publisher'] = 'PLoS'
        
    elif article['journal'] in ['Biomater Sci', ]:
        article['publisher'] = 'Royal Society of Chemistry'

    elif 'J Invest Dermatol' == article['journal']:
        article['publisher'] = 'Society for Investigative Dermatology'

    elif article['journal'] in [ 'Acta Pharmacol Sin', 'Methods Mol Biol', 
                                 'Sci Rep', 
                                 'Stem Cell Res Ther' ]:
        article['publisher'] = 'Springer Nature' 

    elif article['journal'] in [ 'Wound Repair Regen', 
                                 'Exp Dermatol' ]:
        article['publisher'] = 'Wiley'
