# Read libraries

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from copy import copy, deepcopy
from pathlib import Path
from sys import path

# Add the parent of the current working directory to the module search path
path.append( str(Path.cwd().parent) )

In [2]:
import json
from collections import Counter
from string import punctuation, whitespace
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from numpy import nan
from selenium import webdriver

from Project_libraries.pubmed import ( search_for_journal_match,
                                       assign_publisher, 
                                       manual_assignment_of_publisher,
                                       consensus_publisher,
                                     )


# Initialize variables

In [3]:
# Presumably the folder holding the Chrome driver binary -- it is not
# referenced by any other visible cell (TODO confirm it is still needed)
chrome_bin = Path.home().joinpath('share', 'bin')

# Base address of an NLM catalogue search; the search term is appended to it
url = 'https://www.ncbi.nlm.nih.gov/nlmcatalog?term='

# JSON files stored in the 'Project_libraries' folder of the working directory:
# the journal catalogue with assigned publishers and the raw NLM query results
catalogue_file, catalogue_NLM = (
    Path.cwd().joinpath('Project_libraries', json_name)
    for json_name in ('journal_catalogue.json', 'journal_NLM_query.json')
)

# Name of the article file read from every 'Case_*' folder
filename = 'articles_clean.json'

# Import journal data

In [4]:
def merge_copyright_notes(articles, journals):
    """Accumulate the copyright notes of `articles` per journal.

    Parameters
    ----------
    articles : iterable of dict
        Each article must provide the keys 'journal' and 'copyright'.
    journals : dict
        Maps a journal name to {'cr_note': str}; updated in place.

    Returns
    -------
    dict
        The same `journals` object, for convenience.

    The first article seen for a journal sets 'cr_note' to its copyright
    text ('' if the text is empty or None). Every later non-empty copyright
    text is appended after a single space, so a journal whose first article
    had no copyright text ends up with a note that starts with a space.
    """
    for article in articles:
        journal = article['journal']
        note = article['copyright']

        if journal not in journals:
            journals[journal] = {'cr_note': note if note else ''}
        elif note:
            journals[journal]['cr_note'] += ' ' + note

    return journals


cwd = Path.cwd()

journals = {}
# glob() returns the folders in file-system order, which is not guaranteed to
# be stable; sorting keeps the concatenated notes identical between runs
for folder in sorted( cwd.glob('Case_*') ):
    print(f"Downloading from folder '{folder.parts[-1]}' ...")
    # Paths produced by cwd.glob() already start at cwd
    with open(folder / filename, 'r', encoding = 'utf-8') as f_json:
        articles = json.load(f_json)

    merge_copyright_notes(articles, journals)

print(f"\nThere are {len(journals)} unique journals.\n\n")


journal_list = sorted(journals)

# Show the first and last journal as a sanity check; without any journal
# there is nothing to index
if journal_list:
    print(journal_list[0])
    print(journal_list[-1])

Downloading from folder 'Case_green_synthesis_np' ...
Downloading from folder 'Case_brain_cancer_stem_cells' ...
Downloading from folder 'Case_graphene_sensors' ...
Downloading from folder 'Case_chest_imaging_pneumonia' ...
Downloading from folder 'Case_green_synthesis_silver_np' ...
Downloading from folder 'Case_statins_cancer' ...
Downloading from folder 'Case_rnai_cancer' ...
Downloading from folder 'Case_prions' ...
Downloading from folder 'Case_deep_learning_tumor' ...
Downloading from folder 'Case_brca2' ...
Downloading from folder 'Case_skin_wound_healing' ...

There are 5613 unique journals.


21 Century Pathol
touchREV Endocrinol


# NLM queries

## Load prior results 

In [5]:
# Start from an empty catalogue, then replace it with the results of the
# NLM queries stored by a previous run
catalogue = {}
catalogue = json.loads( catalogue_NLM.read_text(encoding = 'utf-8') )

print('Loaded NLM query results.')

Loaded NLM query results.


## Search NLM catalogue for journal information

**Skip this subsection** if you have not added any new journals since the last run.

In [None]:
# Labels found in an NLM catalogue record, paired with the catalogue key
# under which the text following each label is stored
search_strings = [ ('Title(s):', 'full_title'),
                   ('Publication Start Year:', 'start_date' ), 
                   ('Country of Publication:', 'country'),
                   ('Publisher:', 'publisher_note'),
                   ('Latest Publisher:', 'last_publisher_note'),
                   ('MeSH:', 'mesh'),
                   ('Notes:', 'notes'),
                 ]

# Seconds to wait for an answer before a request is abandoned. A stalled
# connection then raises an exception instead of blocking the notebook;
# results gathered so far stay in 'catalogue', so re-running the cell resumes.
request_timeout = 30

with webdriver.Chrome() as browser:
    browser.get(url)
    
    for i, journal in enumerate( journal_list ):
        # Skip journals that already have an entry; entries stored as None
        # (earlier searches without results) are queried again
        if catalogue.get(journal) is not None:
            continue

        j_url = f"{url}{journal}[Journal]"
        print(f"{i:>4} -- {j_url}")
        response = requests.get(j_url, timeout = request_timeout)

        # The analysis of the response to request can yield one of three outcomes:
        #     - a journal match (table will have something in it)
        #     - a list of possible matches (table will be empty)
        #     - no possible matches (table will be empty)
        #
        # For the latter two, we call the function 'search_for_journal_match'
        #
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('dl', {'class': 'nlmcat_dl'})

        if table is None:
            table = search_for_journal_match( browser, journal, j_url )

            # If table is None, we are under the third case above or
            # there were too many pages
            #
            if table is None:
                print(f"The search for journal {journal} did not yield results.\n")
                catalogue[journal] = None
                continue

        info = list( table.children )        
        catalogue[journal] = {}

        for text, key in search_strings:
            # The value is the element right after the first element whose
            # text equals the label. Stopping one short of the end means a
            # label in last position yields None instead of an IndexError.
            value = None
            for k in range(len(info) - 1):
                if info[k].text == text:
                    value = info[k + 1].text.strip(whitespace)
                    break

            catalogue[journal][key] = value

        catalogue[journal]['cr_note'] = journals[journal]['cr_note']

        print(f"\t{catalogue[journal]}\n")   
        # Pause between successful queries
        sleep(3)


## Save results of new NLM queries

In [None]:
# Overwrite the stored NLM query results with the current catalogue
with catalogue_NLM.open('w', encoding = 'utf-8') as nlm_file:
    json.dump(catalogue, nlm_file)

print('Saved NLM query results.')

# Check catalogue after NLM queries

## Summary

In [6]:
# Journals whose NLM search gave no result are stored as None
empty_entries = sum(1 for entry in catalogue.values() if entry is None)
n = len(catalogue)

# One column per journal, one row per catalogue field; count() skips the
# missing values of a row
df_publ = pd.DataFrame(catalogue)
without_publisher = n - df_publ.loc['publisher_note'].count()
without_last_publisher = n - df_publ.loc['last_publisher_note'].count()

print(f"There are:\n\t- {empty_entries} empty entries\n\t- "
      f"{without_publisher} journals without a "
      f"publisher note,\n\t- "
      f"{without_last_publisher} journals without "
      f"a last publisher note.\n")


There are:
	- 138 empty entries
	- 139 journals without a publisher note,
	- 3061 journals without a last publisher note.



## View by country

In [None]:
# Text searched for in the 'country' field of each catalogue entry
country_query = 'Korea'

for journal in sorted(catalogue):
    entry = catalogue[journal]

    # Entries can be None (search without results) or empty (reset by the
    # publisher assignment cell), and 'country' itself can be None when the
    # NLM record has no such field; none of these can match the query
    if not entry or not entry.get('country'):
        continue

    if country_query in entry['country']:
        print(entry.get('publisher_note'))

## Check for manual assignment

In [None]:
# Journal whose catalogue entry is shown; edit the name to inspect another one
journal = 'Transplantation'
is_known = journal in catalogue
if is_known:
    print( catalogue[journal] )

# Assign publisher names in journal catalogue

In [7]:
for i, journal in enumerate( sorted(catalogue) ):

    # Replace empty catalogue entries by a dictionary so that the notes
    # below can be filled in
    #
    entry = catalogue[journal]
    if entry is None:
        entry = catalogue[journal] = {}

    # Make sure the three notes used for guessing exist
    #
    for note_key in ('last_publisher_note', 'publisher_note', 'cr_note'):
        entry.setdefault(note_key, None)

    # Guess publisher from NLM entries or rule assignment
    #
    guesses = [ assign_publisher(entry[note_key])
                for note_key in ('last_publisher_note',
                                 'publisher_note',
                                 'cr_note') ]
    publisher = consensus_publisher(*guesses)

    manual_publisher = manual_assignment_of_publisher( journal )

    # Decide on publisher based on guesses: a manual assignment that differs
    # from the consensus wins; every disagreement is reported
    #
    chosen = publisher
    if publisher != manual_publisher:
        if manual_publisher:
            print(f"\n{i:>4} ----- {journal[:40]:40} -- {manual_publisher}")
            chosen = manual_publisher
        else:
            print(f"\n{i:>4} -- {journal[:40]:40} -- {publisher}")

    entry['publisher'] = chosen
        




   1 -- 2d Mater                                 -- IOP Publishing

   2 -- 3 Biotech                                -- Springer Nature

   3 -- A A Case Rep                             -- Wolters Kluwer

   4 -- A A Pract                                -- Wolters Kluwer

   5 -- AACN Adv Crit Care                       -- Wolters Kluwer

   6 -- AACN Clin Issues                         -- Wolters Kluwer

   9 -- AAPS PharmSciTech                        -- Springer Nature

  10 -- ACS Appl Bio Mater                       -- American Chemical Society

  11 -- ACS Appl Electron Mater                  -- American Chemical Society

  12 -- ACS Appl Energy Mater                    -- American Chemical Society

  13 -- ACS Appl Mater Interfaces                -- American Chemical Society

  14 -- ACS Appl Nano Mater                      -- American Chemical Society

  15 -- ACS Appl Polym Mater                     -- American Chemical Society

  16 -- ACS Bio Med Chem Au                    

In [8]:
# Publisher assigned to every journal, in catalogue order (None when unmatched)
publishers = [ entry['publisher'] for entry in catalogue.values() ]

print(f"There are {len(set(publishers))} distinct publishers identified in the "
      f"journal catalogue.")

# Show the 25 most frequent publishers with their journal counts
aux = Counter(publishers)
aux.most_common(25)

There are 105 distinct publishers identified in the journal catalogue.


[(None, 1463),
 ('Elsevier', 936),
 ('Springer Nature', 767),
 ('Wiley', 503),
 ('Taylor & Francis', 238),
 ('Wolters Kluwer', 219),
 ('Sage', 174),
 ('Oxford University Press', 173),
 ('Hindawi', 95),
 ('Karger', 81),
 ('MDPI', 71),
 ('Mary Ann Liebert', 65),
 ('Bentham', 63),
 ('American Chemical Society', 57),
 ('Frontiers', 56),
 ('Thieme Group', 50),
 ('Royal Society of Chemistry', 34),
 ('British Medical Association', 32),
 ('Cell Press', 30),
 ('IEEE', 26),
 ('Cambridge University Press', 25),
 ('CRC Press', 25),
 ('De Gruyter', 23),
 ('IOS Press', 21),
 ('Future Science', 21)]

## Check for unmatched journals

In [9]:
# List the journals without a publisher, numbered by their position in the
# alphabetically sorted catalogue
for i, journal in enumerate( sorted(catalogue) ):
    if catalogue[journal]['publisher'] is not None:
        continue
    print(f"{i:>4} -- {journal}")
        

   0 -- 21 Century Pathol
   7 -- AAPS J
   8 -- AAPS PharmSci
  37 -- AIDS
  39 -- AIDS Read
  41 -- AIDS Rev
  42 -- AIMS Genet
  43 -- AIMS Neurosci
  46 -- AJR Am J Roentgenol
  55 -- ARP Rheumatol
  60 -- Abdom Radiol (NY)
  64 -- Access Microbiol
  70 -- Acta Biomed
  75 -- Acta Chir Orthop Traumatol Cech
  76 -- Acta Chromatogr
  77 -- Acta Cir Bras
  79 -- Acta Clin Croat
  85 -- Acta Derm Venereol
  87 -- Acta Ethol
  88 -- Acta Gastroenterol Belg
  91 -- Acta Histochem Cytochem
  92 -- Acta Inform Med
  94 -- Acta Med Indones
  95 -- Acta Med Iran
  96 -- Acta Med Litu
  97 -- Acta Med Okayama
  98 -- Acta Med Port
  99 -- Acta Medica (Hradec Kralove)
 100 -- Acta Medica (Hradec Kralove) Suppl
 101 -- Acta Microbiol Immunol Hung
 102 -- Acta Naturae
 103 -- Acta Neurobiol Exp (Wars)
 104 -- Acta Neurochir (Wien)
 106 -- Acta Neurol Belg
 108 -- Acta Neurol Taiwan
 114 -- Acta Ortop Bras
 115 -- Acta Ortop Mex
 117 -- Acta Otorhinolaryngol Belg
 118 -- Acta Otorhinolaryngol It

# Save updated Journal Catalogue

In [10]:
# Write the catalogue, now including the assigned publishers, to its own file
with catalogue_file.open('w', encoding = 'utf-8') as catalogue_out:
    json.dump(catalogue, catalogue_out)

print('Saved updated Journal Catalogue.')

Saved updated Journal Catalogue.


In [11]:
# Spot check: show the complete catalogue entry of one journal
# (raises KeyError if the journal is not in the catalogue)
print(catalogue['Molecules'])

{'full_title': 'Molecules : a journal of synthetic chemistry and natural product chemistry.', 'start_date': '1996', 'country': 'Switzerland', 'publisher_note': 'Basel, Switzerland : MDPI, c1995-', 'last_publisher_note': None, 'mesh': 'Biological Factors/chemistry*Biological Products/chemistry*', 'notes': 'Title from journal homepage (viewed Sept. 11, 2000).Vol. 1, 1996 was published in collaboration with Springer-Verlag.Also available on CD-ROM yearly.Mode of access: Internet.Vols. for 1997-2001 include: Molbank, which became a separate publication in 2002.', 'cr_note': '', 'publisher': 'MDPI'}


In [12]:
# Spot check: show the complete catalogue entry of one journal
# (raises KeyError if the journal is not in the catalogue)
print(catalogue['Oncotarget'])

{'full_title': 'Oncotarget.', 'start_date': '2009', 'country': 'United States', 'publisher_note': 'Albany, N.Y. : Impact Journals', 'last_publisher_note': None, 'mesh': 'Oncogenes*', 'notes': '"Open access impact journal."', 'cr_note': ' Copyright: © 2021 Paracchini et al Copyright: © 2021 Barakeh et al Copyright: © 2019 Terashima et al Copyright: © 2021 Okawa et al Copyright: © 2020 Petrova et al', 'publisher': 'Impact Journals'}
