# Import libraries

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from copy import copy, deepcopy
from pathlib import Path
from sys import path

# Put the parent of the current working directory on the module search path
# (presumably so that project packages stored there can be imported).
path.append( str(Path.cwd().parent) )

In [None]:
import json
from collections import Counter
from string import punctuation, whitespace
from time import sleep

import pandas as pd
import requests
from bs4 import BeautifulSoup
from numpy import nan
from selenium import webdriver

from Project_libraries.pubmed import ( search_for_journal_match,
                                       assign_publisher, 
                                       manual_assignment_of_publisher,
                                       consensus_publisher,
                                     )


# Initialize variables

In [None]:
# Name of the articles file expected inside every case folder
filename = 'articles_clean.json'

# Base address of an NLM catalogue search; the search term is appended to it
url = 'https://www.ncbi.nlm.nih.gov/nlmcatalog?term='

# NOTE(review): presumably the folder holding the Chrome driver binary; it is
# not referenced by the cells shown here.
chrome_bin = Path.home().joinpath('share', 'bin')

# Journal catalogue with the assigned publishers
catalogue_file = Path.cwd().joinpath('Project_libraries', 'journal_catalogue.json')

# Raw results of the NLM catalogue queries
catalogue_NLM = Path.cwd().joinpath('Project_libraries', 'journal_NLM_query.json')

# Import journal data

In [None]:
# Collect every journal that appears in the articles file of any 'Case_*'
# folder, together with the concatenated copyright notes of its articles.
cwd = Path.cwd()

journals = {}
# Sorted, so that the order in which copyright notes are concatenated does not
# depend on the order in which the file system lists the folders.
for folder in sorted( cwd.glob('Case_*') ):
    articles_path = folder / filename
    if not articles_path.is_file():
        # A 'Case_*' match without an articles file is reported and skipped
        # instead of aborting the whole import.
        print(f"Skipping '{folder.name}': no '{filename}' found.")
        continue

    print(f"Downloading from folder '{folder.name}' ...")
    with open(articles_path, 'r', encoding = 'utf-8') as f_json:
        articles = json.load(f_json)

    for article in articles:
        journal = article['journal']
        # A missing (falsy) copyright is stored as an empty note
        note = article['copyright'] or ''

        if journal not in journals:
            journals[journal] = {'cr_note': note}
        elif note:
            journals[journal]['cr_note'] += ' ' + note

print(f"\nThere are {len(journals)} unique journals.\n\n")


journal_list = sorted(journals)

# Show the first and last journal as a sanity check; nothing to show when no
# journal was found.
if journal_list:
    print(journal_list[0])
    print(journal_list[-1])

# NLM queries

## Load prior results 

In [None]:
# Start from the results of earlier NLM queries when they were saved; on a
# first run (no results file yet) keep the empty catalogue instead of failing.
catalogue = {}
if Path(catalogue_NLM).is_file():
    with open(catalogue_NLM, 'r', encoding = 'utf-8') as file_in:
        catalogue = json.load(file_in)

    print('Loaded NLM query results.\n')

else:
    print(f"No saved NLM query results found at '{catalogue_NLM}'; "
          f"starting with an empty catalogue.\n")

print(f"Have records for {len(catalogue)} journals.")

## Search NLM catalogue for journal information

**Skip this subsection** if you have not added any new journals.

In [None]:
# Query the NLM catalogue for every journal that has no usable record yet and
# store what is found (or the reason of the failure) in `catalogue`.

# Maximum number of pages or results I will check
n_max = 10
# Offset of the first journal in journal_list to process
i0 = 0
# NOTE(review): i1 is never read below; the loop always runs to the end of
# journal_list.
i1 = 7000

# Labels looked up in the NLM record, each paired with the catalogue key its
# value is stored under
search_strings = [ ('Title(s):', 'full_title'),
                   ('Publication Start Year:', 'start_date' ),
                   ('Country of Publication:', 'country'),
                   ('Publisher:', 'publisher_note'),
                   ('Latest Publisher:', 'last_publisher_note'),
                   ('MeSH:', 'mesh'),
                   ('Notes:', 'notes'),
                 ]

with webdriver.Chrome() as browser:
    sleep(5)
    browser.get(url)
    sleep(5)

    for i, journal in enumerate( journal_list[i0:] ):
        # Query only journals without a record, with a None record, or whose
        # earlier attempt stored a 'failed' value that is neither a string nor
        # an integer above n_max.
        previous = catalogue.get(journal)
        if previous is not None:
            if 'failed' not in previous:
                continue

            failed = previous['failed']
            if isinstance(failed, str):
                continue

            if isinstance(failed, int) and failed > n_max:
                continue

        j_url = f"{url}{journal}[Journal]"
        print(f"{i0+i:>4} -- {j_url}")
        # Without a timeout a stalled connection would block the loop forever;
        # with it the request raises and the records gathered so far remain
        # in `catalogue`.
        response = requests.get(j_url, timeout = 30)

        # The analysis of the response to request can yield one of three outcomes:
        #     - a journal match (table will have something in it)
        #     - a list of possible matches (table will be empty)
        #     - no possible matches (table will be empty)
        #
        # For the latter two, we call the function 'search_for_journal_match'
        #
        soup = BeautifulSoup(response.text, 'html.parser')
        table = soup.find('dl', {'class': 'nlmcat_dl'})

        if table is None:
            table = search_for_journal_match( browser, journal, j_url, n_max )

            # A string or an integer is stored as the reason of the failure;
            # None (third case above or too many pages) is stored as
            # 'No results'.
            #
            if table is None or isinstance(table, (str, int)):
                print(f"The search for journal {journal} did not yield results.\n")
                catalogue[journal] = {
                    'failed': 'No results' if table is None else table
                }
                continue

        info = list( table.children )

        # Build the record completely before storing it, so that an error in
        # the middle does not leave a partial record in the catalogue.
        record = {}
        for text, key in search_strings:
            # Position of the first node whose text is exactly the label
            k = next( ( idx for idx, node in enumerate(info)
                        if node.text == text ), len(info) )

            # The value is the node right after the label. A label that is
            # absent, or that is the very last node, yields None instead of
            # an IndexError.
            if k + 1 < len(info):
                record[key] = info[k + 1].text.strip(whitespace)
            else:
                record[key] = None

        record['cr_note'] = journals[journal]['cr_note']
        catalogue[journal] = record

        print(f"\t{catalogue[journal]}\n")
        sleep(3)


## Save results of new NLM queries

In [None]:
# Serialize first: if the catalogue cannot be encoded, the exception is raised
# before the file is opened, so the previously saved results are not truncated.
payload = json.dumps(catalogue)
with open(catalogue_NLM, 'w', encoding = 'utf-8') as file_out:
    file_out.write(payload)

print('Saved NLM query results.')

# Check catalogue after NLM queries

## Summary

In [None]:
# Report the size of the catalogue, how many entries are None and how many
# journals have no (last) publisher note.
n = len(catalogue)
print(f"The journal catalogue has {n} entries.\n")

empty_entries = sum( 1 for record in catalogue.values() if record is None )

# One column per journal, one row per record key
df_publ = pd.DataFrame(catalogue)
without_publisher = n - df_publ.loc['publisher_note'].count()
without_last_publisher = n - df_publ.loc['last_publisher_note'].count()

print(f"There are:\n\t- {empty_entries} empty entries\n\t- "
      f"{without_publisher} journals without a "
      f"publisher note,\n\t- "
      f"{without_last_publisher} journals without "
      f"a last publisher note.\n")


## View by country

In [None]:
# Print the publisher note of every journal whose country contains 'Korea'
for journal in sorted(catalogue):
    record = catalogue[journal]
    if record is None:
        continue

    # Failed queries only hold a 'failed' key and the country of a successful
    # one may be None; both count as "no country" instead of raising.
    country = record.get('country') or ''
    if 'Korea' in country:
        print(record.get('publisher_note'))

## Check for manual assignment

In [None]:
# Show the catalogue record of a single journal, when the catalogue has one
journal = 'Transplantation'
try:
    record = catalogue[journal]
except KeyError:
    # Journal not in the catalogue: nothing to show
    pass
else:
    print( record )

# Assign publisher names in journal catalogue

In [None]:
# Store a 'publisher' in every catalogue entry: the manual assignment when
# there is one, otherwise the guess obtained from the entry's notes.
note_keys = ('last_publisher_note', 'publisher_note', 'cr_note')

for i, journal in enumerate( sorted(catalogue) ):

    # Replace empty (None) entries by a dictionary and make sure the three
    # notes exist, defaulting to None
    #
    if catalogue[journal] is None:
        catalogue[journal] = {}

    entry = catalogue[journal]
    for note_key in note_keys:
        entry.setdefault(note_key, None)

    # One guess per note, all three handed to consensus_publisher
    #
    guesses = [ assign_publisher(entry[note_key]) for note_key in note_keys ]
    publisher = consensus_publisher(*guesses)

    manual_publisher = manual_assignment_of_publisher( journal )

    # Agreement is stored without output; any other outcome is printed
    #
    if publisher == manual_publisher:
        entry['publisher'] = publisher
        continue

    if manual_publisher:
        print(f"\n{i:>4} ----- {journal[:40]:40} -- {manual_publisher}")
        entry['publisher'] = manual_publisher
    else:
        print(f"\n{i:>4} -- {journal[:40]:40} -- {publisher}")
        entry['publisher'] = publisher
        



In [None]:
# Count how often each publisher was assigned and show the 25 most frequent
publishers = [ catalogue[journal]['publisher'] for journal in catalogue ]

n_distinct = len( set(publishers) )
print(f"There are {n_distinct} distinct publishers identified in the "
      f"journal catalogue.")

aux = Counter(publishers)
aux.most_common(25)

## Check for unmatched journals

In [None]:
# List, with its position in the sorted catalogue, every journal that ended
# up without a publisher
for i, journal in enumerate( sorted(catalogue) ):
    if catalogue[journal]['publisher'] is not None:
        continue

    print(f"{i:>4} -- {journal}")
        

# Save updated Journal Catalogue

In [None]:
# Serialize first: if the catalogue cannot be encoded, the exception is raised
# before the file is opened, so the previously saved catalogue is not truncated.
payload = json.dumps(catalogue)
with open(catalogue_file, 'w', encoding = 'utf-8') as file_out:
    file_out.write(payload)

print('Saved updated Journal Catalogue.')

In [None]:
# Spot check of one record; prints None when the journal is not in the
# catalogue instead of raising a KeyError
print(catalogue.get('Molecules'))

In [None]:
# Spot check of one record; prints None when the journal is not in the
# catalogue instead of raising a KeyError
print(catalogue.get('Oncotarget'))