# Read libraries

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from copy import copy, deepcopy
from pathlib import Path
from sys import path

# Put the parent of the current working directory on the import path.
# NOTE(review): presumably this is what makes `Project_libraries` (imported in
# the next cell) importable — confirm the package lives one level up.
path.append( str(Path.cwd().parent) )


In [2]:
import json
import re

import matplotlib.pyplot as plt
import pandas as pd

from collections import Counter
from string import punctuation, whitespace

from Project_libraries.pubmed import ( concatenate_lines, 
                                       extract_publication_date, 
                                       get_article_data )

In [3]:
# Both input files are looked up in the current working directory.
csv_file = Path.cwd().joinpath('csv-greensynth-set.csv')
abstracts_file = Path.cwd().joinpath('abstract-greensynth-set.txt')

# Import CSV file

In [4]:
# Load the article metadata table from the CSV file.
df = pd.read_csv(csv_file)
# Bare expression on the last line: the notebook renders the frame as output.
df

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI
0,30328732,Green synthesis of silver nanoparticles toward...,"Mousavi SM, Hashemi SA, Ghasemi Y, Atapour A, ...",Artif Cells Nanomed Biotechnol. 2018;46(sup3):...,Mousavi SM,Artif Cells Nanomed Biotechnol,2018,2018/10/18,,,10.1080/21691401.2018.1517769
1,34769419,Bionanofactories for Green Synthesis of Silver...,"Jain AS, Pawar PS, Sarkar A, Junnuthula V, Dya...",Int J Mol Sci. 2021 Nov 5;22(21):11993. doi: 1...,Jain AS,Int J Mol Sci,2021,2021/11/13,PMC8584914,,10.3390/ijms222111993
2,24937409,"Silver nanoparticles: Synthesis methods, bio-a...","Abbasi E, Milani M, Fekri Aval S, Kouhi M, Akb...",Crit Rev Microbiol. 2016;42(2):173-80. doi: 10...,Abbasi E,Crit Rev Microbiol,2016,2014/06/18,,,10.3109/1040841X.2014.912200
3,27649147,"Silver Nanoparticles: Synthesis, Characterizat...","Zhang XF, Liu ZG, Shen W, Gurunathan S.",Int J Mol Sci. 2016 Sep 13;17(9):1534. doi: 10...,Zhang XF,Int J Mol Sci,2016,2016/09/21,PMC5037809,,10.3390/ijms17091534
4,35426251,Medicinal plants mediated the green synthesis ...,"Habeeb Rahuman HB, Dhandapani R, Narayanan S, ...",IET Nanobiotechnol. 2022 Jun;16(4):115-144. do...,Habeeb Rahuman HB,IET Nanobiotechnol,2022,2022/04/15,PMC9114445,,10.1049/nbt2.12078
...,...,...,...,...,...,...,...,...,...,...,...
2984,36365347,Bioinspired Green Synthesis of Silver Nanopart...,"Tian Y, Luo J, Wang H, Zaki HEM, Yu S, Wang X,...",Plants (Basel). 2022 Oct 28;11(21):2892. doi: ...,Tian Y,Plants (Basel),2022,2022/11/11,PMC9654092,,10.3390/plants11212892
2985,35479817,Chemical and biological studies on the soft co...,"Abdelhafez OH, Fahim JR, El Masri RR, Salem MA...",RSC Adv. 2021 Jul 5;11(38):23654-23663. doi: 1...,Abdelhafez OH,RSC Adv,2021,2022/04/28,PMC9036784,,10.1039/d1ra03045k
2986,35424016,Highly sensitive and selective colorimetric de...,"Paw R, Hazarika M, Boruah PK, Kalita AJ, Guha ...",RSC Adv. 2021 Apr 20;11(24):14700-14709. doi: ...,Paw R,RSC Adv,2021,2022/04/15,PMC8697840,,10.1039/d0ra09926k
2987,36015345,Moringa concanensis-Mediated Synthesis and Cha...,"Zafar N, Uzair B, Menaa F, Khan BA, Niazi MBK,...",Pharmaceutics. 2022 Aug 17;14(8):1719. doi: 10...,Zafar N,Pharmaceutics,2022,2022/08/26,PMC9412270,,10.3390/pharmaceutics14081719


In [5]:
# One row per paper; journals are de-duplicated by their name string.
unique_journals = set(df['Journal/Book'])
n_papers = df.shape[0]
n_journals = len(unique_journals)

print(f"There are {n_papers} published in {n_journals} unique journals.")

There are 2989 published in 455 unique journals.


In [6]:
# Tally how many rows of the table belong to each journal/book name.
journals = Counter(df['Journal/Book'])

In [7]:
# Show the 20 journals with the most rows, as (name, count) pairs.
journals.most_common(20)

[('Nanomaterials (Basel)', 120),
 ('J Photochem Photobiol B', 111),
 ('Molecules', 102),
 ('Int J Biol Macromol', 101),
 ('Int J Nanomedicine', 89),
 ('Mater Sci Eng C Mater Biol Appl', 85),
 ('Sci Rep', 78),
 ('IET Nanobiotechnol', 77),
 ('Spectrochim Acta A Mol Biomol Spectrosc', 74),
 ('Artif Cells Nanomed Biotechnol', 69),
 ('Carbohydr Polym', 62),
 ('Colloids Surf B Biointerfaces', 59),
 ('Environ Sci Pollut Res Int', 54),
 ('RSC Adv', 53),
 ('Heliyon', 39),
 ('Saudi J Biol Sci', 38),
 ('ACS Omega', 38),
 ('Int J Mol Sci', 37),
 ('Materials (Basel)', 34),
 ('J Nanosci Nanotechnol', 33)]

# Import abstract file

In [8]:
# Read the whole abstracts file into memory, one list entry per line.
with open(abstracts_file, 'r', encoding = 'utf-8') as f_abs:
    data = f_abs.readlines()

# Identify starting line of each article
#
# A record starts with its running number followed by ". " (e.g. "12. ").
# The pattern is a raw string: "\d" inside a plain literal is an invalid
# escape sequence and triggers a warning on recent Python versions.
pattern = r'\d{1,4}[.] '

articles_start = []
# Iterating with enumerate (instead of a manual counter) also copes with an
# empty file, which previously raised IndexError on data[0].
for k, line in enumerate(data):
    potential_match = re.match(pattern, line)
    if not potential_match:
        continue

    # Drop the trailing ". " of the matched text to get the record number.
    potential_index = int(potential_match.group()[:-2])

    # Other lines can also begin with "<number>. "; only accept the number
    # that continues the running sequence 1, 2, 3, ...
    if potential_index - 1 == len(articles_start):
        articles_start.append( k )

print(f"There are {len(articles_start)} articles in file." )

# Need to add last line of file so all articles are checked later
articles_start.append( len(data) )

There are 2989 articles in file.


In [9]:
articles = []
retracted_articles = []

# Consecutive entries of `articles_start` delimit the lines of one record.
for first_line, next_first_line in zip(articles_start, articles_start[1:]):
    # Slicing already yields a fresh list, so the parser gets its own copy.
    record_lines = data[first_line:next_first_line]
    is_flagged, parsed_record = get_article_data(record_lines)

    # Every record is kept; flagged ones are additionally tracked separately.
    articles.append( parsed_record )
    if is_flagged:
        retracted_articles.append( parsed_record )
    

-----> 182. RETRACTED ARTICLE

-----> 818. RETRACTED ARTICLE

-----> 941. RETRACTED ARTICLE

-----> 1169. RETRACTED ARTICLE

-----> 1410. RETRACTED ARTICLE

-----> 1413. RETRACTED ARTICLE

-----> 1492. RETRACTED ARTICLE

-----> 1507. RETRACTED ARTICLE

-----> 1745. RETRACTED ARTICLE

-----> 1818. RETRACTED ARTICLE

-----> 1841. RETRACTED ARTICLE

-----> 1862. RETRACTED ARTICLE

-----> 2338. RETRACTED ARTICLE



## Manual corrections

In [None]:
# Scratch cell: dump the raw lines of one record and what the parser makes of
# them. Change `i` to look at a different record.
i = 1817
first_line, next_first_line = articles_start[i], articles_start[i+1]
info = data[first_line:next_first_line]

for raw_line in info:
    print(raw_line.strip())
print()

# Optional: look at the lines after `concatenate_lines` has been applied.
# for line in concatenate_lines(info):
#     print(line.strip())
#     print()

print('------')
print(get_article_data(info))

In [10]:
# Hand-patch the record at position 2116: show it, overwrite the two fields,
# then show it again.
# NOTE(review): the recorded output of this cell shows 'doi': '2022' for this
# record, which is not corrected here — confirm whether it should be.
i = 2116
record = articles[i]
print(record, '\n')

record.update({
    'abstract': '[Formula: see text].',
    'other_ids': 'DOI: 10.1177/15347346221133627 PMID: 36325727',
})

print(record, '\n------')

{'retraction': None, 'journal': 'Int J Low Extrem Wounds', 'year': 2022, 'date': ' Nov 3:15347346221133627', 'volume': None, 'pages': None, 'doi': '2022', 'title': 'Green Synthesis of Silver Nanoparticles from Aqueous Extract of Tinospora crispa Stems Accelerate Wound Healing in Rats.', 'authors': 'Osman Mahmud S(1)(2), Hamad Shareef S(3), Jabbar AAJ(4), Hassan RR(5), Jalal HK(2), Abdulla MA(1)', 'affiliations': 'Author information: (1)Department of Medical Microbiology, College of Science, 357115Cihan University-Erbil, Erbil, Iraq. (2)Department of Pharmacognosy, College of Pharmacy, 125618Hawler Medical University, Erbil, Iraq. (3)Department of Biology, College of Education, 275716Salahaddin University-Erbil, Erbil, Iraq. (4)Department of Medical Laboratory Technology, Erbil Technical Health and Medical College, 566876Erbil Polytechnic University, Erbil, Iraq. (5)Department of Medical Laboratory Science, College of Science, 594155Knowledge University, Erbil, Iraq', 'erratum': None, '

# Verify records

If this cell prints anything, it means that some articles were not processed correctly.

This may be fixable within the parsing function, or it may require manual correction.

In [12]:
# A correctly parsed `other_ids` field starts with one of these identifiers.
expected_prefixes = ('DOI', 'PMID', 'PMCID')

for i, article in enumerate( articles ):
    other_ids = article['other_ids']

    # A missing (None or empty) field is itself a parsing failure: report it
    # like any other malformed value instead of failing on the prefix check.
    if other_ids and other_ids.startswith(expected_prefixes):
        continue

    print(i, other_ids, '\n')

# Classify articles

In [13]:
# List every record that carries retraction information and collect the
# positions of those whose text starts with 'Retraction Notice: '.
# NOTE(review): presumably those are the notices themselves rather than the
# retracted papers — confirm against the parser.
to_remove = []
for i, article in enumerate( articles ):
    if not article['retraction']:
        continue

    print( f"{i} -- {article['journal']}. {article['year']}; {article['volume']} " 
           f"\n\t{article['title']}\n\t{article['retraction']}\n")

    if article['retraction'].startswith('Retraction Notice: '):
        to_remove.append(i)
        print(to_remove)

print(to_remove)

181 -- Spectrochim Acta A Mol Biomol Spectrosc. 2013; 115 
	Green synthesis of silver nanoparticles from Gloriosa superba L. leaf extract and their catalytic activity.
	Retraction in Spectrochim Acta A Mol Biomol Spectrosc. 2017 Feb 15;173:969.

619 -- Carbohydr Polym. 2015; 134 
	Cationic guar gum orchestrated environmental synthesis for silver nano-bio-composite films.
	Expression of concern in Carbohydr Polym. 2016 Feb 10;137:748.

817 -- Spectrochim Acta A Mol Biomol Spectrosc. 2014; 121 
	Synthesis, characterization and catalytic activity of silver nanoparticles using Tribulus terrestris leaf extract.
	Retraction in Spectrochim Acta A Mol Biomol Spectrosc. 2017 Jan 15;171:527.

940 -- Int J Nanomedicine. 2020; 15 
	Green Synthesis of Silver Nanoparticles Using Extract of Jasminum officinal L. Leaves and Evaluation of Cytotoxic Activity Towards Bladder (5637) and Breast Cancer (MCF-7) Cell Lines.
	Retraction in Int J Nanomedicine. 2022 Jun 29;17:2805-2806.

1168 -- Int J Nanomedici

In [14]:
# Drop the records collected in `to_remove`. Positions are removed from the
# highest down so that a removal never shifts a position still to be removed;
# sorting and de-duplicating makes this hold whatever order they were added in.
for i in sorted(set(to_remove), reverse=True):
    articles.pop(i)

# The positions referred to the list as it was before the removal. Empty the
# list so that re-running this cell cannot delete unrelated articles.
to_remove.clear()

print(len(articles))

2987


In [15]:
# List every record that carries erratum information and collect the
# positions of those whose text starts with 'Erratum for '.
# NOTE(review): presumably those are the erratum notes themselves rather than
# the corrected papers — confirm against the parser.
to_remove = []
for i, article in enumerate( articles ):
    if not article['erratum']:
        continue

    print( f"{i} -- {article['journal']}. {article['year']}; {article['volume']} " 
           f"\n\t{article['title']}\n\t{article['erratum']}\n")

    if article['erratum'].startswith('Erratum for '):
        to_remove.append(i)
        print(to_remove)

print(to_remove)

126 -- Artif Cells Nanomed Biotechnol. 2018; 46(sup1) 
	Green synthesis of silver nanoparticles using Thymbra spicata L. var. spicata (zahter) aqueous leaf extract and evaluation of their morphology-dependent antibacterial and cytotoxic activity.
	Erratum in Artif Cells Nanomed Biotechnol. 2018;46(sup1):392.

379 -- J Biotechnol. 2017; 260 
	Investigation of the effect of pomegranate extract and monodisperse silver nanoparticle combination on MCF-7 cell line.
	Erratum in J Biotechnol. 2021 Feb 20;328:115-116.

669 -- Int J Nanomedicine. 2021; 16 
	In vitro Anticancer Effects of Vernonia amygdalina Leaf Extract and Green-Synthesised Silver Nanoparticles.
	Erratum in Int J Nanomedicine. 2021 Sep 13;16:6263-6264.

713 -- J Microbiol Methods. 2019; 162 
	Cyanobacteria as a bioreactor for synthesis of silver nanoparticles-an effect of different reaction conditions on the size of nanoparticles and their dye decolorization ability.
	Erratum in J Microbiol Methods. 2020 Jan;168:105764.

719 --

In [16]:
# Drop the records collected in `to_remove`. Positions are removed from the
# highest down so that a removal never shifts a position still to be removed;
# sorting and de-duplicating makes this hold whatever order they were added in.
for i in sorted(set(to_remove), reverse=True):
    articles.pop(i)

# The positions referred to the list as it was before the removal. Empty the
# list so that re-running this cell cannot delete unrelated articles.
to_remove.clear()

print(len(articles))

2980


## Dealing with *Comments* is complicated

Some *Comments* are perspectives aiming to publicize the target paper (like News & Views in Nature). These are published in the same issue by different authors.

We remove the actual *Comment* (if available) from the set of publications to analyze. We adjust `comment` key of target paper to convey this positive information. 

Some *Comments* appear to be summaries of the target papers. They are published in a different journal by a subset of the original authors.

We remove the actual *Comment* (if available) from the set of publications to analyze. We adjust `comment` key of target paper to convey this information. 

Some *Comments* appear to be actual criticisms of the target papers. They are published in the same journal as the target, but later than the target, and are authored by different researchers.

We remove the actual *Comment* (if available) from the set of publications to analyze. We adjust `comment` key of target paper to convey this negative information.  

In [17]:
# Annotations prepended to the `comment` field of a paper that received a
# comment, each paired with the progress message printed when it is applied.
_RECEIVED_COMMENT = ('Received comment! ', '---> Received comment\n')
_RECEIVED_COMMENTARY = ('Received commentary! ', '---> Received commentary\n')
_RECEIVED_HIGHLIGHT = ('Received highlight article! ',
                       '---> Received highlight article\n')
_ANNOTATION_PREFIXES = (_RECEIVED_COMMENT[0], _RECEIVED_COMMENTARY[0],
                        _RECEIVED_HIGHLIGHT[0])

# Verdicts for records that are themselves the comment and must be dropped.
# The value is the word used in the progress message.
_REMOVE_COMMENT = 'comment'
_REMOVE_COMMENTARY = 'commentary'


def _parse_comment_target(comment):
    """Extract the citation of the paper that a `comment` string refers to.

    The text is expected to look like
    'Comment in <journal>. <year><date>;<volume>:<pages>.', optionally followed
    by a further ':'-separated field whose first word is stored as the DOI.

    Returns a dict with keys 'year' (int), 'date', 'journal', 'volume',
    'pages' and 'doi' (None when absent). 'journal' still includes the leading
    'Comment in'/'Comment on' words.

    Raises ValueError or IndexError when the text does not have that shape.
    """
    head, tail = comment.split(';')[:2]

    head_parts = head.split('.')
    date_string = head_parts[-1].strip()
    target = {'year': int(date_string[:4]),
              'date': date_string[4:],
              'journal': head_parts[-2].strip()}

    tail_parts = tail.split(':')
    target['volume'] = tail_parts[0]
    target['pages'] = tail_parts[1].split('.')[0]
    if len(tail_parts) > 2:
        target['doi'] = tail_parts[2].split()[0].strip('.')
    else:
        target['doi'] = None

    return target


def _first_page(pages):
    """Return the first page of a 'first-last' range, as an int when numeric.

    A single page (no '-') is treated the same way; a missing value gives ''.
    """
    if not pages:
        return ''
    first = pages.split('-')[0]
    return int(first) if first.isnumeric() else first


def _is_later_volume(focus_volume, target_volume):
    """Tell whether `focus_volume` comes after `target_volume`.

    Both arguments are non-empty, unequal strings. Leading digits are compared
    as numbers, because plain string comparison would rank '99' after '114'.
    Only when that cannot decide are the strings themselves compared.
    """
    focus_number = re.match(r'\d+', focus_volume)
    target_number = re.match(r'\d+', target_volume)
    if focus_number and target_number:
        focus_int = int(focus_number.group())
        target_int = int(target_number.group())
        if focus_int != target_int:
            return focus_int > target_int
    return focus_volume > target_volume


def _classify_by_date(article, target, date_focus, date_target):
    """Classify two records of the same journal, year and volume (or with
    missing volumes) using their dates and, for equal dates, their pages.

    Returns a verdict (see `_classify_comment`) or None when undecidable.
    """
    # Same date
    if article['date'] == target['date']:
        print('--->', article['date'])

        focus_pages = _first_page(article['pages'])
        target_pages = _first_page(target['pages'])
        if focus_pages == '' or target_pages == '':
            # Without page information the order within the issue is unknown
            return None
        if type(focus_pages) != type(target_pages):
            # A number cannot be ordered against text: compare both as text
            focus_pages, target_pages = str(focus_pages), str(target_pages)

        print('--->', focus_pages, target_pages)

        # Same date, earlier pages: this record is the commentary
        if focus_pages < target_pages:
            return _REMOVE_COMMENTARY
        # Same date, later pages: this record is the highlighted paper
        return _RECEIVED_HIGHLIGHT

    # The date fields are text such as ' Mar' or ' Jul 15'; comparing them
    # directly would order months alphabetically. Use the parsed dates, as the
    # different-journal case does.
    # NOTE(review): assumes extract_publication_date returns values that
    # compare chronologically — confirm.
    if date_focus > date_target:
        return _REMOVE_COMMENT
    if date_focus < date_target:
        return _RECEIVED_COMMENT
    return None


def _classify_comment(article, target, date_focus, date_target):
    """Decide whether `article` is a comment on `target` or received one.

    Returns _REMOVE_COMMENT / _REMOVE_COMMENTARY when `article` is itself the
    comment, one of the _RECEIVED_* pairs when `article` is the paper that was
    commented on, or None when the records cannot be ordered.
    """
    # Remove the leading 'Comment in'/'Comment on' words as a prefix, rather
    # than stripping individual characters.
    target_journal = re.sub(r'^Comment\s+(?:in|on)\s+', '', target['journal'])

    if article['journal'] != target_journal:
        # Different journals, later year
        if article['year'] > target['year']:
            return _REMOVE_COMMENTARY
        # Different journals, earlier year
        if article['year'] < target['year']:
            return _RECEIVED_COMMENTARY
        # Different journals, same year
        if date_focus > date_target:
            return _REMOVE_COMMENTARY
        if date_focus < date_target:
            return _RECEIVED_COMMENTARY
        return None

    # Same journal
    print('===>>', article['journal'])
    print('===>>', article['year'], target['year'])

    # Same journal, later year
    if article['year'] > target['year']:
        return _REMOVE_COMMENT
    # Same journal, earlier year
    if article['year'] < target['year']:
        return _RECEIVED_COMMENT

    # Same year
    print('===>>', article['volume'], target['volume'])

    # A volume missing on either side is handled as "missing volumes"; a None
    # volume must never reach an ordering comparison.
    if article['volume'] and target['volume']:
        if article['volume'] != target['volume']:
            # Same year, later volume / earlier volume
            if _is_later_volume(article['volume'], target['volume']):
                return _REMOVE_COMMENT
            return _RECEIVED_COMMENT

        # Same volume
        print('--->', article['date'], target['date'])

    # Same volume, or missing volumes
    return _classify_by_date(article, target, date_focus, date_target)


to_remove = []
for i, article in enumerate( articles ):
    if not article['comment']:
        continue

    # Already annotated by an earlier run of this cell. The annotated text no
    # longer parses as a plain citation, so classifying it again would give a
    # wrong verdict.
    if article['comment'].startswith(_ANNOTATION_PREFIXES):
        continue

    print('\n', i, 'Focus: ', article['journal'], article['year'], 
          article['date'], article['volume'],  
          article['pages'], article['doi'])
    print('--', article['comment'])

    # Extract info about paper discussed in comment
    try:
        target = _parse_comment_target(article['comment'])
    except (ValueError, IndexError):
        print(f"---> Could not parse the comment of article {i}; "
              "review manually")
        continue

    # Extract dates of publication
    date_focus = extract_publication_date(article)
    date_target = extract_publication_date(target)

    # Determine type of comment
    verdict = _classify_comment(article, target, date_focus, date_target)

    if verdict in (_REMOVE_COMMENT, _REMOVE_COMMENTARY):
        to_remove.append(i)
        print(f"---> Article {i} is a {verdict} and is to be removed")
    elif verdict is None:
        print(f"---> Could not order article {i} relative to the paper it "
              "refers to; review manually")
    else:
        prefix, message = verdict
        article['comment'] = prefix + article['comment']
        print(message)

print(to_remove)


 178 Focus:  Int J Biol Macromol 2018  Mar 108 1128-1139 10.1016/j.ijbiomac.2017.12.040
-- Comment in Int J Biol Macromol. 2018 Jul 15;114:1084-1085.
===>> Int J Biol Macromol
===>> 2018 2018
===>> 108 114
---> Received comment

[]


In [18]:
# Drop the records collected in `to_remove`. Positions are removed from the
# highest down so that a removal never shifts a position still to be removed;
# sorting and de-duplicating makes this hold whatever order they were added in.
for i in sorted(set(to_remove), reverse=True):
    articles.pop(i)

# The positions referred to the list as it was before the removal. Empty the
# list so that re-running this cell cannot delete unrelated articles.
to_remove.clear()

print(len(articles))

2980


## Create collections

In [19]:
# Substrings that mark an abstract as belonging to a review.
# NOTE(review): the test is case-sensitive, and 'reviewed' already contains
# 'review', so the second pattern never changes the outcome.
pattern1 = 'review'
pattern2 = 'reviewed'
review_patterns = (pattern1, pattern2)

erratum_articles = []
comment_articles = []
review_articles = []
no_abstract_articles = []

# Each non-retracted article lands in at most one collection; the order of
# the checks below is the order of precedence.
for article in articles:
    if article['retraction']:
        continue

    if article['erratum']:
        erratum_articles.append(article)
    elif article['comment'] and article['comment'].startswith('Received comment! '):
        comment_articles.append(article)
    elif not article['abstract']:
        no_abstract_articles.append(article)
    elif any(marker in article['abstract'] for marker in review_patterns):
        review_articles.append(article)


In [20]:
# Sizes, one per line: all articles, retracted, with erratum, with comment,
# without abstract, reviews.
for collection in (articles,
                   retracted_articles,
                   erratum_articles,
                   comment_articles,
                   no_abstract_articles,
                   review_articles):
    print(len(collection))

2980
17
30
1
5
169


# Save to file

In [21]:
# Write the cleaned list of articles as JSON into the working directory.
output_file = Path.cwd().joinpath('articles_clean.json')
with output_file.open('w', encoding = 'utf-8') as f_json:
    json.dump(articles, f_json)

print('Done saving file!')

Done saving file!
