# Read libraries

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from colorama import Back, Fore, Style
from copy import copy, deepcopy
from pathlib import Path
from sys import path

path.append( str(Path.cwd().parent) )


In [None]:
import json

import matplotlib.pyplot as plt
import pandas as pd

from collections import Counter
from string import punctuation, whitespace

from Project_libraries.my_stats import ( place_commas )

from Project_libraries.pubmed import ( concatenate_lines, 
                                       get_articles_start_line,
                                       extract_publication_date, 
                                       get_article_data,
                                       classify_articles )


# Import and process data files

In [None]:
# Gather the names of every 'Case_*' entry found in the working directory,
# in sorted order, and show each one next to its position in the list.
cwd = Path.cwd()
cases = sorted( folder.parts[-1] for folder in cwd.glob('Case_*') )

for index, name in enumerate(cases):
    print(f"{index:>2} -- {name}")

In [None]:
# Select the case to process (by its position in the listing printed above)
# and locate the abstract files that belong to it.
case = cases[10]
print(f"We will be processing the data in folder '{case}'.\n")

case_folder = cwd / case
abstracts_files = [ *case_folder.glob( 'abstract-*.txt' ) ]

print(f"The files to be read are:")
for abstracts_file in abstracts_files:
    print(f"\t* {abstracts_file.parts[-1]}")

print('\n\n')

# Load the files; `articles_start` holds the line at which each record begins
# inside `data` (both are sliced that way by the cells that follow).
articles_start, data = get_articles_start_line( abstracts_files )

## Highlight retracted articles and articles that fail to process

**NOTE: If alert about Copyright Surprise prints two lines and the first one
is the correct copyright, then this is not a problem!**

In [None]:
# Parse every record delimited by consecutive entries of `articles_start`.
#
# `articles` receives one entry per record: the parsed article, or None when
# parsing failed, so that positions in `articles` keep matching record
# indices. Articles for which `get_article_data` returns a truthy flag are
# additionally collected in `retracted_articles`.
articles = []
retracted_articles = []

count1 = 0   # records that raised IndexError while being parsed
count2 = 0   # records that raised ValueError (reported as lacking a year)
for i in range(len(articles_start) - 1):
    info = copy( data[articles_start[i]:articles_start[i+1]] )

    try:
        flag, article = get_article_data(i, info)

    except IndexError:
        count1 += 1
        print( f"{count1} -- Article {i+1}, lines {articles_start[i]} to "
               f"{articles_start[i+1]} failed to process" )
        articles.append( None )

    except ValueError:
        count2 += 1
        print( f"{count2} -- Article {i+1}, lines {articles_start[i]} to "
               f"{articles_start[i+1]} lacks year." )
        articles.append( None )

    except Exception:
        # Unexpected failure: report where it happened, then re-raise the
        # original exception so its real traceback is shown.
        print( f"Article {i+1}, lines {articles_start[i]} to "
               f"{articles_start[i+1]}." )
        raise

    else:
        if flag:
            retracted_articles.append( article )

        articles.append( article )


print(f"\n----> There are {place_commas(len(articles))} articles for analysis.\n\n" )



## Check output for problematic record

In [None]:
# Manual inspection of a single record: show its line span, its raw lines,
# and the article obtained from parsing it.
i = 1940
record_span = slice(articles_start[i], articles_start[i+1])
print(record_span.start, record_span.stop)
print()

info = copy( data[record_span] )
print(info)
print('\n\n')

flag, article = get_article_data(i, info)
print('\n')
print(article)

# Verify records

The function call implements manual corrections to problematic articles.

**If this cell prints anything, it means that some articles are not processed correctly.**

This may be fixable within function or require a manual correction function.

In [None]:
# Sanity check: every parsed article is expected to have an `other_ids` field
# that begins with one of the known identifier labels. Anything else is
# printed together with its position in `articles`.
expected_labels = ('DOI', 'PMID', 'PMCID')

for i, article in enumerate( articles ):
    if not article:
        continue

    if article['other_ids'].startswith(expected_labels):
        continue

    print(i, article['other_ids'], '\n')


print(f"There are {place_commas(len(articles))} articles for analysis.\n\n" )


## Check output for problematic record

In [None]:
# Manual inspection of a single record: print its raw lines (stripped) and
# then the result of parsing it.
i = 25
info = copy( data[articles_start[i]:articles_start[i+1]] )

for line in info:
    print(line.strip())
print()

# Alternative view, kept for debugging: the record after line concatenation.
# for line in concatenate_lines(info):
#     print(line.strip())
#     print()

print('------')
# The parser is called with the record index as well as its lines, exactly
# as the main processing loop calls it; passing `info` alone does not match
# that call signature.
print(get_article_data(i, info))

# Clean article data

## Remove articles reporting retractions

In [None]:
# Show every article that carries retraction information and remember the
# positions of those that are themselves retraction notices.
to_remove = []
for i, article in enumerate( articles ):
    if not article or not article['retraction']:
        continue

    print( f"{i} -- {article['journal']}. {article['year']}; {article['volume']} " 
           f"\n\t{article['title']}\n\t{article['retraction']}\n")

    if article['retraction'].startswith('Retraction Notice: '):
        to_remove.append(i)
        print(to_remove)

print(to_remove)

In [None]:
# Delete the flagged entries from the end of the list backwards, so that the
# positions still waiting to be deleted are not shifted.
for position in to_remove[::-1]:
    del articles[position]

print(f"There are {place_commas(len(articles))} articles left for for analysis.\n\n" )

## Remove errata, i.e., articles reporting corrections

In [None]:
# Show every article that carries erratum information and remember the
# positions of those that are themselves errata for another paper.
to_remove = []
for i, article in enumerate( articles ):
    if not article or not article['erratum']:
        continue

    print( f"{i} -- {article['journal']}. {article['year']}; {article['volume']} " 
           f"\n\t{article['title']}\n\t{article['erratum']}\n")

    if article['erratum'].startswith('Erratum for '):
        to_remove.append(i)
        print(to_remove)

print(to_remove)

In [None]:
# Delete the flagged errata, highest position first, so earlier deletions do
# not change the positions of the ones that remain.
for position in to_remove[::-1]:
    del articles[position]

print(f"There are {place_commas(len(articles))} articles left for for analysis.\n\n" )

## Remove comments

**Dealing with *Comments* is complicated!**

Some *Comments* are perspectives aiming to publicize the target paper (like News & Views in Nature). These are published in the same issue by different authors.

We remove the actual *Comment* (if available) from the set of publications to analyze. We adjust `comment` key of target paper to convey this positive information. 

Some *Comments* appear to be summaries of the target papers. They are published in a different journal by a subset of the original authors.

We remove the actual *Comment* (if available) from the set of publications to analyze. We adjust `comment` key of target paper to convey this information. 

Some *Comments* appear to be actual criticisms of the target papers. They are published in the same journal as the target but later than the target and are authored by different researchers.

We remove the actual *Comment* (if available) from the set of publications to analyze. We adjust `comment` key of target paper to convey this negative information.  

In [None]:
# Classify every article that has a `comment` entry.
#
# The reference found in the comment (the "target") is parsed into journal,
# year, date, volume, pages and doi, and is compared with the article itself
# (the "focus"). Depending on journal, year, volume, date and first page, the
# focus article is either scheduled for removal (its position is appended to
# `to_remove`) or kept, with its `comment` text prefixed by a 'Received ...'
# label.

def _first_page(pages):
    """Return the first page of a 'first-last' range (or the single page).

    The value is an int when it consists of decimal digits only, otherwise
    the stripped string (e.g. electronic page identifiers).
    """
    first = pages.split('-')[0].strip()
    return int(first) if first.isdecimal() else first


def _comparable_pages(focus, target):
    """Return the first pages of both page strings in a mutually comparable form.

    Both values are ints when both are numeric; if only one is numeric both
    are returned as strings so that `<` never mixes int and str.
    """
    focus_page = _first_page(focus)
    target_page = _first_page(target)
    if isinstance(focus_page, int) != isinstance(target_page, int):
        focus_page, target_page = str(focus_page), str(target_page)
    return focus_page, target_page


def _strip_comment_prefix(text):
    """Remove a leading 'Comment on' / 'Comment in' label from `text`.

    Unlike `str.lstrip(chars)`, which removes any leading characters that
    belong to a set, this only removes the exact label, so a journal name
    that happens to start with those letters is left intact.
    """
    text = text.strip()
    for label in ('Comment on', 'Comment in'):
        if text.startswith(label):
            return text[len(label):].lstrip(':').strip()
    return text


to_remove = []
for i, article in enumerate( articles[:] ):
    if article and article['comment']:
        print('\n', i, 'Focus: ', article['journal'], article['year'], 
              article['date'], article['volume'],  
              article['pages'], article['doi'])
        print('--', article['comment'])
        
        # Extract info about paper discussed in comment
        target = {}
        aux = article['comment'].split(';')
        if len(aux) == 1:
            # No ';' separator: the reference cannot be parsed
            to_remove.append(i)
            continue
    
        aux = aux[0].split('.')
        date_string =  aux[-1].strip()
        if len(date_string) < 4:
            # Too short to start with a four-digit year
            to_remove.append(i)
            continue

        # The code would otherwise fail if there is no period after journal name
        #
        try:
            target['year'] = int( date_string[:4] )
            target['date'] = date_string[4:]
        except ValueError:
            to_remove.append(i)
            continue
            
        target['journal'] = aux[-2].strip()
        
        # Text after the first ';' is read as 'volume:pages[...:doi]'
        aux = article['comment'].split(';')[1]
        aux = aux.split(':')
        target['volume'] = aux[0]
        target['pages'] = aux[1].split('.')[0]
        
        if len(aux) > 2:
            target['doi'] = aux[2].split()[0].strip('.')
        else:
            target['doi'] = None
          
        # Extract dates of publication
        date_focus = extract_publication_date(article)
        date_target = extract_publication_date(target)
        
        # Determine type of comment
        #
        # NOTE(review): target['volume'] and target['date'] are strings
        # obtained by splitting/slicing the comment text, so the volume and
        # date comparisons below are plain string comparisons -- confirm that
        # this ordering is what is wanted.
        #
        # Same journal
        target_journal = _strip_comment_prefix(target['journal'])
        print(article['journal'])
        print(target_journal)
        if article['journal'] == target_journal:
            print('===>>', article['journal'])
            print('===>>', article['year'], target['year'])
            
            # Same year 
            if article['year'] == target['year']:
                print('===>>', article['volume'], target['volume'])
            
                # Same volume
                if target['volume'] and article['volume'] == target['volume']:
                    print('--->', article['date'], target['date'])
                    
                    # Same date
                    if article['date'] == target['date']:
                        print('--->', article['date'])

                        focus_pages, target_pages = _comparable_pages(
                            article['pages'], target['pages'] )

                        print('--->', focus_pages, target_pages)
                        
                        # Same date, earlier pages
                        if focus_pages < target_pages:
                            to_remove.append(i)
                            print(f"---> Article {i} is a commentary and is to be removed")

                        # Same date, later pages
                        else:
                            article['comment'] = ('Received highlight article! ' 
                                                  + article['comment'])
                            print('---> Received highlight article\n')

                    # Same volume, later time
                    elif article['date'] > target['date']:
                        to_remove.append(i)
                        print(f"---> Article {i} is a comment and is to be removed")

                    # Same volume, earlier time
                    elif article['date'] < target['date']:
                        article['comment'] = 'Received comment! ' + article['comment']
                        print('---> Received comment\n')

                # Same year, volumes that exist, later volume
                elif ( target['volume'] and article['volume'] and 
                       article['volume'] > target['volume'] ):
                    to_remove.append(i)
                    print(f"---> Article {i} is a comment and is to be removed")
                
                # Same year, volumes that exist, earlier volume
                elif ( target['volume'] and article['volume'] and 
                       article['volume'] < target['volume'] ):
                    article['comment'] = 'Received comment! ' + article['comment']
                    print('---> Received comment\n')

                # Same journal, same year, missing volumes 
                else:
                    # Same journal, same year, missing volumes, same date
                    if article['date'] == target['date']:
                        print('--->', article['date'])

                        # Pages are always recomputed for this article, so
                        # values from an earlier iteration are never reused
                        # and int/str are never mixed in the comparison.
                        focus_pages, target_pages = _comparable_pages(
                            article['pages'], target['pages'] )

                        print('--->', focus_pages, target_pages)

                        # Same journal, same year, missing volumes, same date, earlier pages
                        if focus_pages < target_pages:
                            to_remove.append(i)
                            print(f"---> Article {i} is a commentary and is to be removed")

                        # Same journal, same year, missing volumes, same date, later pages
                        else:
                            article['comment'] = ('Received highlight article! ' 
                                                  + article['comment'])
                            print('---> Received highlight article\n')
        
                    # Same journal, same year, missing volumes, later time
                    elif article['date'] > target['date']:
                        to_remove.append(i)
                        print(f"---> Article {i} is a comment and is to be removed")

                    # Same journal, same year, missing volumes, earlier time
                    elif article['date'] < target['date']:
                        article['comment'] = 'Received comment! ' + article['comment']
                        print('---> Received comment\n')
        
            # Same journal, later year 
            elif article['year'] > target['year']:
                to_remove.append(i)
                print(f"---> Article {i} is a comment and is to be removed")

            # Same journal, earlier year
            elif article['year'] < target['year']:
                article['comment'] = 'Received comment! ' + article['comment']
                print('---> Received comment\n')


        # Different journals, later year
        elif article['year'] > target['year']:
            to_remove.append(i)
            print(f"---> Article {i} is a commentary and is to be removed")

        # Different journals, earlier year
        elif article['year'] < target['year']: 
            article['comment'] = 'Received commentary! ' + article['comment']
            print('---> Received commentary\n')

        # Different journals, same year
        else:
            if date_focus > date_target:
                to_remove.append(i)
                print(f"---> Article {i} is a commentary and is to be removed")
            
            elif date_focus <= date_target:
                article['comment'] = 'Received commentary! ' + article['comment']
                print('---> Received commentary\n')
                
            # Only reached when the two dates are neither '>' nor '<=' to
            # each other, i.e. they cannot be ordered.
            else:
                print( Style.BRIGHT, Fore.RED,'----------RED ALERT!!!---------', 
                       Style.RESET_ALL )

print(to_remove)

In [None]:
# Delete the flagged comment articles starting from the highest position, so
# the positions that remain to be deleted stay valid.
for position in to_remove[::-1]:
    del articles[position]

print(f"There are {place_commas(len(articles))} articles left for for analysis.\n\n" )

## Remove articles with no information

In [None]:
# Positions of the placeholder entries (None) left by records that could not
# be parsed.
to_remove = [ position
              for position, entry in enumerate( articles )
              if entry is None ]

print(to_remove)

In [None]:
# Delete the placeholder entries from the back of the list to the front, so
# that pending positions are not shifted by earlier deletions.
for position in to_remove[::-1]:
    del articles[position]

print(f"There are {place_commas(len(articles))} articles left for for analysis.\n\n" )

## Count number of articles of different types 

In [None]:
# Classify the cleaned articles of the selected case.
# NOTE(review): the result stored in `a` is not used by any later cell shown
# here -- confirm whether the call is made only for its printed output.
a = classify_articles( articles, case )


# Save cleaned article data to file

In [None]:
# Store the cleaned article records as JSON in the case folder.
clean_file = case_folder / 'articles_clean.json'
with clean_file.open('w', encoding = 'utf-8') as f_json:
    json.dump(articles, f_json)

print('Done saving file!')

In [None]:
# Print, in sorted order, the first five characters of every non-empty
# copyright entry.
temp = [ article['copyright'][:5]
         for article in articles
         if article['copyright'] ]

print(sorted(temp))

In [None]:
# Scratch cell: a bare list literal that is only displayed, never assigned.
# NOTE(review): presumably candidate markers for recognising a copyright
# line -- confirm, and give the list a name if it is meant to be used.
['©', '(c)', 'Copyright', 'All rights', 
 'Published by', 'Published on behalf', 'Published under',
 'AG', 'A.G.', 'BV', 'B.V.', 'GmbH', 'Inc', 'LLC', 'Ltd', 'S.A.U.']

In [None]:
    # Scratch fragment: an indented `if` condition with no body, so this cell
    # is not runnable as written. The condition is true when info[5]
    # contains any one of the listed substrings.
    # NOTE(review): looks like a draft of copyright-line detection for the
    # record parser -- confirm, or delete the cell.
    if ( '©' in info[5] or 'Copyright' in info[5]  
         or 'All rights reserved' in info[5] 
         or 'Published by' in info[5] 
         or 'Verlag KG' in info[5] 
         or 'Wiley-Liss, Inc.' in info[5] 
         or 'John Wiley & Sons' in info[5] 
         or 'Wiley Periodicals, Inc' in info[5]
         or 'wileyonlinelibrary.com/journal/jgc4' in info[5]
         or 'Thieme' in info[5] 
         or 'Thieme Medical Publishers' in info[5] 
         or '(Cancer Epidemiol Biomarkers Prev' in info[5]
         or 'American Cancer Society' in info[5] 
         or 'Massachusetts Medical Society' in info[5]
         or 'BMJ Publishing Group Ltd' in info[5]
         or 'Celsius' in info[5]
         or 'Karger AG' in info[5] 
         or 'APA' in info[5] 
         or 'RSNA' in info[5]
         or 'AACR' in info[5] 
         or 'Multimed Inc' in info[5]
         or 'in the public domain' in info[5]
         or 'Creative Commons Attribution' in info[5]
         or 'Radiological Society of North America, Inc' in info[5]
         or 'American Institute of Chemical Engineers Biotechnol' in info[5] 
         or 'American Academy of Family Physicians'  in info[5] ):

In [None]:
    
    
#         article['copyright'] = info[5].strip(punctuation + whitespace)
#         article['other_ids'] = info[6]
#         m = 6
#     else:
#         article['copyright'] = None
#         article['other_ids'] = info[5]
#         m = 5
        
#     # Check for error due to abstract in foreign language
#     if article['other_ids'][:10] == 'Publisher:' or len(article['other_ids']) > 300:
#         if article['abstract']:
#             article['abstract'] += ' ' + article['other_ids']
#         else:
#             article['abstract'] = article['other_ids']
            
#         if ( '©' in info[m+1] or 'Copyright' in info[m+1]  
#              or 'All rights reserved' in info[m+1] 
#              or 'Published by' in info[m+1] 
#              or 'Verlag KG' in info[m+1] 
#              or 'Wiley-Liss, Inc.' in info[m+1] 
#              or 'John Wiley & Sons' in info[m+1] 
#              or 'Wiley Periodicals, Inc' in info[m+1]
#              or 'wileyonlinelibrary.com/journal/jgc4' in info[m+1]
#              or 'Thieme' in info[m+1]
#              or 'Thieme Medical Publishers' in info[m+1]
#              or '(Cancer Epidemiol Biomarkers Prev' in info[m+1]
#              or 'American Cancer Society' in info[m+1]
#              or 'Massachusetts Medical Society' in info[m+1]
#              or 'BMJ Publishing Group Ltd' in info[m+1]
#              or 'Celsius' in info[m+1]
#              or 'Karger AG' in info[m+1] 
#              or 'APA' in info[m+1]
#              or 'RSNA' in info[m+1]
#              or 'AACR' in info[m+1]
#              or 'Multimed Inc' in info[m+1]
#              or 'in the public domain' in info[m+1]
#              or 'Creative Commons Attribution License' in info[m+1]
#              or 'Radiological Society of North America, Inc' in info[m+1]
#              or 'American Institute of Chemical Engineers Biotechnol' in info[m+1] 
#              or 'American Academy of Family Physicians'  in info[m+1] ):
#             article['copyright'] = info[m+1].strip(punctuation + whitespace)
#             article['other_ids'] = info[m+2]
#         else:
#             article['copyright'] = None
#             article['other_ids'] = info[m+1]
