# Read libraries

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from copy import copy, deepcopy
from pathlib import Path
from sys import path

path.append( str(Path.cwd().parent) )


In [2]:
import json
import re

import matplotlib.pyplot as plt
import pandas as pd

from collections import Counter
from string import punctuation, whitespace

from Project_libraries.pubmed import ( concatenate_lines, 
                                       extract_publication_date, 
                                       get_article_data )

In [11]:
csv_file = Path.cwd() / 'csv-brca2-set.csv'
abstracts_file = Path.cwd() / 'abstract-brca2-set.txt'

# Import CSV file

In [205]:
df = pd.read_csv(csv_file)
df

Unnamed: 0,PMID,Title,Authors,Citation,First Author,Journal/Book,Publication Year,Create Date,PMCID,NIHMS ID,DOI
0,29687286,The BRCA1 and BRCA2 Genes in Early-Onset Breas...,"Saleem M, Ghazali MB, Wahab MAMA, Yusoff NM, M...",Adv Exp Med Biol. 2020;1292:1-12. doi: 10.1007...,Saleem M,Adv Exp Med Biol,2020,2018/04/25,,,10.1007/5584_2018_147
1,23747889,Breast cancer genes: beyond BRCA1 and BRCA2,"Filippini SE, Vega A.",Front Biosci (Landmark Ed). 2013 Jun 1;18(4):1...,Filippini SE,Front Biosci (Landmark Ed),2013,2013/06/11,,,10.2741/4185
2,26187060,Comprehensive spectrum of BRCA1 and BRCA2 dele...,"Kwong A, Shin VY, Ho JC, Kang E, Nakamura S, T...",J Med Genet. 2016 Jan;53(1):15-23. doi: 10.113...,Kwong A,J Med Genet,2016,2015/07/19,PMC4681590,NIHMS721868,10.1136/jmedgenet-2015-103132
3,31915789,BRCA2 gene mutation and prostate cancer risk. ...,"Junejo NN, AlKhateeb SS.",Saudi Med J. 2020 Jan;41(1):9-17. doi: 10.1553...,Junejo NN,Saudi Med J,2020,2020/01/10,PMC7001059,,10.15537/smj.2020.1.24759
4,32005245,Mutations of BRCA2 in canine mammary tumors an...,"Thumser-Henner P, Nytko KJ, Rohrer Bley C.",BMC Vet Res. 2020 Jan 31;16(1):30. doi: 10.118...,Thumser-Henner P,BMC Vet Res,2020,2020/02/02,PMC6995156,,10.1186/s12917-020-2247-4
...,...,...,...,...,...,...,...,...,...,...,...
9987,33112397,Activity of Platinum-Based Chemotherapy in Pat...,"Schmid S, Omlin A, Higano C, Sweeney C, Martin...",JAMA Netw Open. 2020 Oct 1;3(10):e2021692. doi...,Schmid S,JAMA Netw Open,2020,2020/10/28,PMC7593810,,10.1001/jamanetworkopen.2020.21692
9988,27197191,"Cross-Cancer Genome-Wide Analysis of Lung, Ova...","Fehringer G, Kraft P, Pharoah PD, Eeles RA, Ch...",Cancer Res. 2016 Sep 1;76(17):5103-14. doi: 10...,Fehringer G,Cancer Res,2016,2016/05/20,PMC5010493,NIHMS780930,10.1158/0008-5472.CAN-15-2980
9989,23535731,Multiple independent variants at the TERT locu...,"Bojesen SE, Pooley KA, Johnatty SE, Beesley J,...","Nat Genet. 2013 Apr;45(4):371-84, 384e1-2. doi...",Bojesen SE,Nat Genet,2013,2013/03/29,PMC3670748,NIHMS467552,10.1038/ng.2566
9990,33990587,Author Correction: A case-only study to identi...,"Coignard J, Lush M, Beesley J, O'Mara TA, Denn...",Nat Commun. 2021 May 14;12(1):2986. doi: 10.10...,Coignard J,Nat Commun,2021,2021/05/15,PMC8121813,,10.1038/s41467-021-23162-4


In [206]:
n_papers = len(df)
n_journals = len(set(df['Journal/Book']))

print(f"There are {n_papers} published in {n_journals} unique journals.")

There are 9992 published in 1454 unique journals.


In [207]:
journals = Counter(df['Journal/Book'])

In [208]:
journals.most_common(20)

[('Breast Cancer Res Treat', 456),
 ('Fam Cancer', 227),
 ('Gynecol Oncol', 209),
 ('J Clin Oncol', 197),
 ('Clin Cancer Res', 151),
 ('BMC Cancer', 146),
 ('PLoS One', 143),
 ('J Genet Couns', 121),
 ('Cancer', 116),
 ('Breast Cancer Res', 114),
 ('Br J Cancer', 110),
 ('Cancers (Basel)', 109),
 ('Clin Genet', 106),
 ('Genet Med', 105),
 ('Oncotarget', 105),
 ('Int J Cancer', 104),
 ('Ann Oncol', 101),
 ('Cancer Res', 98),
 ('J Med Genet', 92),
 ('J Natl Cancer Inst', 91)]

# Import abstract file

In [209]:
with open(abstracts_file, 'r', encoding = 'utf-8') as f_abs:
    data = f_abs.readlines()

# Identify starting line of each article
#
pattern = '\d{1,4}[.] '

articles_start = []
k = 0
while True:
    
    # Start of record  
    #
    potential_match = re.match(pattern, data[k])
    if potential_match:
        potential_index = int( data[k][potential_match.start(): 
                                       potential_match.end()-2] )
        
        if potential_index - 1 == len(articles_start):
            articles_start.append( k )
        
    k += 1
    if k == len(data):
        break

print(f"There are {len(articles_start)} articles in file." )

# Need to add last line of file so all articles are checked later
articles_start.append( len(data) )

There are 9992 articles in file.


In [210]:
articles = []
retracted_articles = []

count1 = 0
count2 = 0
for i in range(0, len(articles_start) - 1):
#     print(articles_start[i],articles_start[i+1])
    info = copy( data[articles_start[i]:articles_start[i+1]])
    
    try:
        flag, article = get_article_data(info)
        
        if flag:
            retracted_articles.append( article )
    
        articles.append( article )

    except IndexError:
        count1 += 1
        print( f"{count1} -- Article {i+1}, lines {articles_start[i]} to "
               f"{articles_start[i+1]} failed to process" )
        articles.append( None )
        
    except ValueError:
        count2 += 1
        print( f"{count2} -- Article {i+1}, lines {articles_start[i]} to "
               f"{articles_start[i+1]} lacks year." )
        articles.append( None )
        
    except:
        print( f"Article {i+1}, lines {articles_start[i]} to "
               f"{articles_start[i+1]}." )
        flag, article = get_article_data(info)
        
    
    

1 -- Article 172, lines 5493 to 5503 failed to process
2 -- Article 248, lines 7768 to 7777 failed to process
3 -- Article 462, lines 12742 to 12753 failed to process
4 -- Article 486, lines 13086 to 13096 failed to process
5 -- Article 490, lines 13135 to 13144 failed to process
6 -- Article 494, lines 13185 to 13194 failed to process
7 -- Article 497, lines 13220 to 13231 failed to process
8 -- Article 499, lines 13245 to 13255 failed to process
9 -- Article 505, lines 13385 to 13397 failed to process
10 -- Article 579, lines 15725 to 15735 failed to process
11 -- Article 590, lines 15979 to 15989 failed to process
12 -- Article 762, lines 21970 to 21980 failed to process
13 -- Article 801, lines 23427 to 23437 failed to process
14 -- Article 1131, lines 35444 to 35454 failed to process
15 -- Article 1209, lines 38443 to 38455 failed to process
16 -- Article 1365, lines 45017 to 45028 failed to process
-----> 2151. RETRACTED ARTICLE

1 -- Article 3546, lines 146933 to 147010 lacks ye

## Manual corrections

In [211]:
i = 265
print(articles[i], '\n')
articles[i]['abstract'] = 'Breast conservation surgery is safe in selected women when combined with adjuvant therapy'
articles[i]['other_ids'] = 'DOI: 10.1136/bmj.39114.354248.80 PMCID: PMC1808129 PMID: 17332541 [Indexed for MEDLINE]'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'BMJ', 'year': 2007, 'date': ' Mar 3', 'volume': '334(7591)', 'pages': '437-8', 'doi': '10.1136/bmj.39114.354248.80', 'title': 'Management of breast cancer in women with BRCA gene mutation.', 'authors': 'Kell MR, Burke JP', 'affiliations': None, 'erratum': None, 'comment': None, 'abstract': None, 'copyright': None, 'other_ids': 'Breast conservation surgery is safe in selected women when combined with adjuvant therapy'} 

{'retraction': None, 'journal': 'BMJ', 'year': 2007, 'date': ' Mar 3', 'volume': '334(7591)', 'pages': '437-8', 'doi': '10.1136/bmj.39114.354248.80', 'title': 'Management of breast cancer in women with BRCA gene mutation.', 'authors': 'Kell MR, Burke JP', 'affiliations': None, 'erratum': None, 'comment': None, 'abstract': 'Breast conservation surgery is safe in selected women when combined with adjuvant therapy', 'copyright': None, 'other_ids': 'DOI: 10.1136/bmj.39114.354248.80 PMCID: PMC1808129 PMID: 17332541 [Indexed for MEDLINE]'} 
--

In [212]:
i = 364
print(articles[i], '\n')
articles[i]['other_ids'] = 'DOI: 10.7705/biomedica.5663 PMCID: PMC8768485 PMID: 34936260 [Indexed for MEDLINE]'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'Biomedica', 'year': 2021, 'date': ' Dec 15', 'volume': '41(4)', 'pages': '773-786', 'doi': '10.7705/biomedica.5663', 'title': 'Correlation between the number of false positive variants and the quality of results using Ion Torrent PGM™ sequencing to screen BRCA genes.', 'authors': 'Gouvêa Moreira TC(1), Da Silva Spínola P(2), Campos Rezende M(3), Moreira de Freitas CS(3), Borges Mury F(4), Rodrigues Bonvicino C(2), De Andrade Agostinho L(5)', 'affiliations': 'Author information: (1)Hospital do Câncer de Muriaé, Fundação Cristiano Varella, Muriaé, Brazil;\xa0Centro Universitário UNIFAMINAS, Muriaé, Brazil. biomedica@ins.gov.co. (2)Divisão de Genética, Instituto Nacional do Câncer, Rio de Janeiro, Brazil; Programa de Pós-Graduação em Genética, Universidade Federal do Rio de Janeiro, Rio de Janeiro, Brazil. biomedica@ins.gov.co. (3)Hospital do Câncer de Muriaé, Fundação Cristiano Varella, Muriaé, Brazil; Instituto de Ensino e Pesquisa Santa Casa BH, Belo Ho

In [213]:
i = 442
print(articles[i], '\n')
articles[i]['abstract'] = 'Breast surgeons recommend against the procedure unless cancer risk is increased.'
articles[i]['other_ids'] = 'DOI: 10.1097/01.NAJ.0000505576.27516.9d PMID: 27787313 [Indexed for MEDLINE]'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'Am J Nurs', 'year': 2016, 'date': ' Nov', 'volume': '116(11)', 'pages': '17', 'doi': '10.1097/01.NAJ.0000505576.27516.9d', 'title': 'New Advisory on Contralateral Prophylactic Mastectomy.', 'authors': 'Zolot J(1)', 'affiliations': 'Author information: (1)Joan Zolot, PA', 'erratum': None, 'comment': None, 'abstract': None, 'copyright': None, 'other_ids': 'Breast surgeons recommend against the procedure unless cancer risk is increased.'} 

{'retraction': None, 'journal': 'Am J Nurs', 'year': 2016, 'date': ' Nov', 'volume': '116(11)', 'pages': '17', 'doi': '10.1097/01.NAJ.0000505576.27516.9d', 'title': 'New Advisory on Contralateral Prophylactic Mastectomy.', 'authors': 'Zolot J(1)', 'affiliations': 'Author information: (1)Joan Zolot, PA', 'erratum': None, 'comment': None, 'abstract': 'Breast surgeons recommend against the procedure unless cancer risk is increased.', 'copyright': None, 'other_ids': 'DOI: 10.1097/01.NAJ.0000505576.27516.9d PMID: 27787313 [I

In [214]:
i = 1662
print(articles[i], '\n')
articles[i]['abstract'] = 'A collaborative assessment of options and trade-offs-perhaps using visual decision aids-can help.'
articles[i]['other_ids'] = 'PMID: 32555753 [Indexed for MEDLINE]'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'J Fam Pract', 'year': 2020, 'date': ' Jun', 'volume': '69(5)', 'pages': '237-243', 'doi': None, 'title': 'Managing a woman with BRCA mutations? Shared decision-making is key.', 'authors': 'Schrager S(1), Torell E(2), Ledford K(3), Elezaby M(4), Barroleit L(5), Sadowski E(4)', 'affiliations': 'Author information: (1)University of Wisconsin Department of Family Medicine and Community Health, Madison, USA. Email: sbschrag@wisc.edu. (2)Indian Health Board, Minneapolis, MN, USA. (3)Group Health of Wisconsin, Madison, USA. (4)University of Wisconsin Department of Radiology, Madison, USA. (5)University of Wisconsin Department of Obstetrics and Gynecology, Madison, USA', 'erratum': None, 'comment': None, 'abstract': None, 'copyright': None, 'other_ids': 'A collaborative assessment of options and trade-offs-perhaps using visual decision aids-can help.'} 

{'retraction': None, 'journal': 'J Fam Pract', 'year': 2020, 'date': ' Jun', 'volume': '69(5)', 'pages': '23

In [215]:
i = 2453
print(articles[i], '\n')
articles[i]['other_ids'] = 'PMID: 25191729 [Indexed for MEDLINE]'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'Probl Radiac Med Radiobiol', 'year': 2013, 'date': '', 'volume': '(18)', 'pages': '253-60', 'doi': None, 'title': 'Prognosis of probability of BRCA1 and BRCA2 mutations carriage in women with compromised family history of breast and/or ovarian cancer. [Article in English, Ukrainian; Abstract available in Ukrainian from the publisher]', 'authors': 'Rybchenko LA, Bychkova AM, Skyban GV, Klymenko SV', 'affiliations': None, 'erratum': None, 'comment': None, 'abstract': "Burdened family history of breast and/or ovarian cancer may indicate the mutations carriage in the BRCA1 and BRCA2 genes. OBJECTIVE: Estimation and compare of the Manchester Scoring system, Penn II and Myriad algorithm in an ability to distinguish the cases with BRCA1/2 mutation those and no mutant alleles at the individual level among the Ukrainian women with early onset of a breast cancer and/or compromised family history with breast cancer and/or ovarian cancer. MATERIAL AND METHODS: Resu

In [216]:
i = 4116
print(articles[i], '\n')
articles[i]['other_ids'] = 'DOI: 10.33145/2304-8336-2019-24-455-464 PMID: 31841487 [Indexed for MEDLINE]'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'Probl Radiac Med Radiobiol', 'year': 2019, 'date': ' Dec', 'volume': '24', 'pages': '455-464', 'doi': '10.33145/2304-8336-2019-24-455-464', 'title': 'MUTATIONS OF GENES BRCA1 AND BRCA2 IN WOMEN WITH OVARIAN CANCER EXPOSED TO FACTORS OF CHORNOBYL NUCLEAR ACCIDENT. [Article in English, Ukrainian; Abstract available in Ukrainian from the publisher]', 'authors': 'Rybchenko LA(1), Poluben LO(1), Bychkova GM(1), Stephanovych GV(1), Klymenko SV(1)', 'affiliations': 'Author information: (1)State Institution «National Research Center for Radiation Medicine of the National Academy of MedicalSciences of Ukraine», 53 Yuriia Illienka St., Kyiv, 04050, Ukraine', 'erratum': None, 'comment': None, 'abstract': "OBJECTIVE: to determine a frequency of germline mutations 185delAG, 5382insC in BRCA1 gene and 6174delT in BRCA2 gene in Ukrainian patients with OC including women who were exposed to the factors of Chornobyl nuclear accident. MATERIAL AND METHODS: In the study w

In [217]:
i = 5952
print(articles[i], '\n')
articles[i]['other_ids'] = 'PMID: 26695891'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'Probl Radiac Med Radiobiol', 'year': 2015, 'date': ' Dec', 'volume': '20', 'pages': '12-24', 'doi': None, 'title': 'State Institution "National Research Center for Radiation Medicine of the National Academy of Medical Sciences of Ukraine" - research activities and scientific advance in 2014. [Article in English, Ukrainian; Abstract available in Ukrainian from the publisher]', 'authors': 'Bazyka D(1), Sushko V(1), Chumak A(1), Buzunov V(1), Talko V(1), Yanovich L(1)', 'affiliations': 'Author information: (1)State Institution National Research Center for Radiation Medicine of the National Academy of MedicalSciences of Ukraine, Melnykov str., 53, Kyiv, 04050, Ukraine', 'erratum': None, 'comment': None, 'abstract': 'Research activities and scientific advance achieved in 2014 at the State Institution "National Research Center for Radiation Medicine of the National Academy of Medical Sciences of Ukraine" (NRCRM) concerning medical problems of the Chornobyl di

In [218]:
i = 6389
print(articles[i], '\n')
articles[i]['other_ids'] = 'DOI: 10.1001/jamanetworkopen.2021.7728 PMCID: PMC8105747 PMID: 33961040 [Indexed for MEDLINE]'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'JAMA Netw Open', 'year': 2021, 'date': ' May 3', 'volume': '4(5)', 'pages': 'e217728', 'doi': '10.1001/jamanetworkopen.2021.7728', 'title': 'Evaluation of BRCA1 and BRCA2 as Indicators of Response to Immune Checkpoint Inhibitors.', 'authors': 'Zhou Z(1), Li M(1)', 'affiliations': 'Author information: (1)Department of Medicine, The University of Oklahoma Health Sciences Center, Oklahoma City', 'erratum': None, 'comment': None, 'abstract': None, 'copyright': None, 'other_ids': 'This cohort study examines the association of BRCA1 and BRCA2 with tumor mutation burden and response to immune checkpoint inhibitors.'} 

{'retraction': None, 'journal': 'JAMA Netw Open', 'year': 2021, 'date': ' May 3', 'volume': '4(5)', 'pages': 'e217728', 'doi': '10.1001/jamanetworkopen.2021.7728', 'title': 'Evaluation of BRCA1 and BRCA2 as Indicators of Response to Immune Checkpoint Inhibitors.', 'authors': 'Zhou Z(1), Li M(1)', 'affiliations': 'Author information: (1)Departmen

In [219]:
i = 6567
print(articles[i], '\n')
articles[i]['other_ids'] = 'DOI: 10.1038/sj.onc.1209153 PMID: 16205630 [Indexed for MEDLINE]'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'Oncogene', 'year': 2006, 'date': ' Feb 23', 'volume': '25(8)', 'pages': '1186-94', 'doi': '10.1038/sj.onc.1209153', 'title': 'DSS1 is required for the stability of BRCA2.', 'authors': 'Li J(1), Zou C, Bai Y, Wazer DE, Band V, Gao Q', 'affiliations': 'Author information: (1)Division of Cancer Biology, Department of Medicine, Evanston Northwestern Healthcare Research Institute, Northwestern University Feinberg School of Medicine, Evanston, IL 60201, USA', 'erratum': None, 'comment': None, 'abstract': 'DSS1 is an evolutionarily conserved acidic protein that binds to BRCA2. However, study of the function of DSS1 in mammalian cells has been hampered because endogenous DSS1 has not been detectable by Western blotting. Here, we developed a modified Western blotting protocol that detects endogenous DSS1 protein, and used it to study the function of DSS1 and its interaction with BRCA2 in mammalian cells. We found that essentially all BRCA2 in human cell lines is

In [220]:
i = 6627
print(articles[i], '\n')
articles[i]['abstract'] = 'Clinical genomics is poised for a rapid expansion but more work must be done to build a supporting ethical infrastructure.'
articles[i]['other_ids'] = 'DOI: 10.1371/journal.pbio.1001663 PMCID: PMC3782420 PMID: 24086107 [Indexed for MEDLINE]'

print(articles[i], '\n------')

{'retraction': None, 'journal': 'PLoS Biol', 'year': 2013, 'date': ' Sep', 'volume': '11(9)', 'pages': 'e1001663', 'doi': '10.1371/journal.pbio.1001663', 'title': 'The actress, the court, and what needs to be done to guarantee the future of clinical genomics.', 'authors': 'Caplan AL(1)', 'affiliations': 'Author information: (1)Division of Medical Ethics, NYU Langone Medical Center, New York, New York, United States of America', 'erratum': None, 'comment': None, 'abstract': None, 'copyright': None, 'other_ids': 'Clinical genomics is poised for a rapid expansion but more work must be done to build a supporting ethical infrastructure.'} 

{'retraction': None, 'journal': 'PLoS Biol', 'year': 2013, 'date': ' Sep', 'volume': '11(9)', 'pages': 'e1001663', 'doi': '10.1371/journal.pbio.1001663', 'title': 'The actress, the court, and what needs to be done to guarantee the future of clinical genomics.', 'authors': 'Caplan AL(1)', 'affiliations': 'Author information: (1)Division of Medical Ethics,

In [221]:
i = 6628
print(articles[i], '\n')
articles[i]['comment'] = articles[i]['comment'].replace('Juni', 'Jun')

print(articles[i], '\n------')

{'retraction': None, 'journal': 'Laeknabladid', 'year': 2018, 'date': ' Jun', 'volume': '104(6)', 'pages': '281', 'doi': '10.17992/lbl.2018.06.187', 'title': '[To know or not to know, that is the question - screening for BRCA]. [Article in Icelandic]', 'authors': 'Johannsson OT(1)', 'affiliations': 'Author information: (1)MD PhD, Counsulting Specialist and Clinical Professor of Medical Oncology LSH', 'erratum': None, 'comment': 'Comment on Laeknabladid. 2018 Juni;104(6):289-296.', 'abstract': None, 'copyright': None, 'other_ids': 'DOI: 10.17992/lbl.2018.06.187 PMID: 29863479 [Indexed for MEDLINE]'} 

{'retraction': None, 'journal': 'Laeknabladid', 'year': 2018, 'date': ' Jun', 'volume': '104(6)', 'pages': '281', 'doi': '10.17992/lbl.2018.06.187', 'title': '[To know or not to know, that is the question - screening for BRCA]. [Article in Icelandic]', 'authors': 'Johannsson OT(1)', 'affiliations': 'Author information: (1)MD PhD, Counsulting Specialist and Clinical Professor of Medical Onc

In [222]:
i = 7382
print(articles[i], '\n')
articles[i]['date'] = articles[i]['date'].replace('Juni', 'Jun')
articles[i]['comment'] = articles[i]['comment'].replace('Juni', 'Jun')

print(articles[i], '\n------')

{'retraction': None, 'journal': 'Laeknabladid', 'year': 2018, 'date': ' Juni', 'volume': '104(6)', 'pages': '289-296', 'doi': '10.17992/lbl.2018.06.189', 'title': '[Views of Icelandic women towards genetic counseling - and testing of BRCA2 mutations]. [Article in Icelandic]', 'authors': 'Jonsdottir T(1), Valdimarsdottir H(2), Tryggvadottir L(3), Lund SH(1), Thordardottir M(1), Magnusson MK(4), Valdimarsdottir U(1)', 'affiliations': 'Author information: (1)Center of Public Health sciences, Faculty of Medicine, University of Iceland. (2)Reykjavik University. (3)Icelandic Cancer Registry, Icelandic Cancer Society, Skógarhlíð 8. (4)Faculty of Medicine, University of Iceland, Reykjavik', 'erratum': None, 'comment': 'Comment in Laeknabladid. 2018 Juni;104(6):281.', 'abstract': 'Introduction The aim of this study was to explore the attitudes of Icelandic women towards existing genetic information, genetic counseling and genetic testing for BRCA mutations which dramatically increase risk for a

In [223]:
i = 6875
print(articles[i], '\n')
articles[i]['abstract'] = 'A lupus causing anti-DNA antibody penetrates living cells and targets DNA repair for therapeutic advantage in human cancer cells.'
articles[i]['other_ids'] = 'DOI: 10.1126/scitranslmed.3004955 PMID: 23100623 [Indexed for MEDLINE]'
    
print(articles[i], '\n------')

{'retraction': None, 'journal': 'Sci Transl Med', 'year': 2012, 'date': ' Oct 24', 'volume': '4(157)', 'pages': '157fs38', 'doi': '10.1126/scitranslmed.3004955', 'title': 'Lupus antibody tops cancer cells.', 'authors': 'Ford JM(1)', 'affiliations': 'Author information: (1)Departments of Medicine and Genetics, Division of Oncology, Stanford University School of Medicine, Stanford, CA 94305, USA. jmf@stanford.edu', 'erratum': None, 'comment': None, 'abstract': None, 'copyright': None, 'other_ids': 'A lupus causing anti-DNA antibody penetrates living cells and targets DNA repair for therapeutic advantage in human cancer cells.'} 

{'retraction': None, 'journal': 'Sci Transl Med', 'year': 2012, 'date': ' Oct 24', 'volume': '4(157)', 'pages': '157fs38', 'doi': '10.1126/scitranslmed.3004955', 'title': 'Lupus antibody tops cancer cells.', 'authors': 'Ford JM(1)', 'affiliations': 'Author information: (1)Departments of Medicine and Genetics, Division of Oncology, Stanford University School of M

In [224]:
i = 7033
print(articles[i], '\n')
articles[i]['abstract'] = "A report of the Keystone Symposium 'DNA Replication and Recombination' held in Keystone, USA, 27 February to 4 March 2011."
articles[i]['copyright'] = '© 2011 BioMed Central Ltd'
articles[i]['other_ids'] = 'DOI: 10.1186/gb-2011-12-4-304 PMCID: PMC3218856 PMID: 21554750 [Indexed for MEDLINE]'
    
# print(articles[i], '\n------')

{'retraction': None, 'journal': 'Genome Biol', 'year': 2011, 'date': '', 'volume': '12(4)', 'pages': '304', 'doi': '10.1186/gb-2011-12-4-304', 'title': 'A top-down view on DNA replication and recombination from 9,000 feet above sea level.', 'authors': 'Johansson E(1), Speck C, Chabes A', 'affiliations': 'Author information: (1)Department of Medical Biochemistry and Biophysics, Umeå University, SE 90187 Umeå, Sweden', 'erratum': None, 'comment': None, 'abstract': None, 'copyright': None, 'other_ids': "A report of the Keystone Symposium 'DNA Replication and Recombination' held in Keystone, USA, 27 February to 4 March 2011."} 



In [225]:
i = 7576
print(articles[i], '\n')
articles[i]['other_ids'] = 'DOI: 10.1024/1661-8157/a001489 PMID: 24280604 [Indexed for MEDLINE]' 
    
print(articles[i], '\n------')

{'retraction': None, 'journal': 'Praxis (Bern 1994)', 'year': 2013, 'date': ' Nov 27', 'volume': '102(24)', 'pages': '1475-82', 'doi': '10.1024/1661-8157/a001489', 'title': '[Genetic diagnostics of cancer diseases]. [Article in German; Abstract available in German from the publisher]', 'authors': 'Cobilanschi J(1)', 'affiliations': 'Author information: (1)Institut für Medizinische Genetik, Universität Zürich, Schlieren', 'erratum': None, 'comment': None, 'abstract': "Cancer is caused by genetic alterations, but only 10% of the cancer diseases are inherited. The probability for an individual or a family of having inherited cancer, individual consequences of the respective results of genetic testing, as well as its costs and reimbursement by the health insurance must be addressed by expert genetic counseling which at-risk requires special expertise. Identification of a germline mutation which may predispose to a variety of different cancer types allows determination of an individual's sp

In [226]:
i = 8248
print(articles[i], '\n')
articles[i]['abstract'] = '[Image: see text]'
articles[i]['other_ids'] = 'DOI: 10.15252/embr.201744508 PMCID: PMC5494498 PMID: 28673926'
    
print(articles[i], '\n------')

{'retraction': None, 'journal': 'EMBO Rep', 'year': 2017, 'date': ' Jul', 'volume': '18(7)', 'pages': '1264', 'doi': '10.15252/embr.201744508', 'title': 'Structural basis for recruitment of BRCA2 by PALB2.', 'authors': 'Oliver AW, Swift S, Lord CJ, Ashworth A, Pearl LH', 'affiliations': None, 'erratum': 'Erratum for EMBO Rep. 2009 Sep;10(9):990-6.', 'comment': None, 'abstract': None, 'copyright': None, 'other_ids': '[Image: see text]'} 

{'retraction': None, 'journal': 'EMBO Rep', 'year': 2017, 'date': ' Jul', 'volume': '18(7)', 'pages': '1264', 'doi': '10.15252/embr.201744508', 'title': 'Structural basis for recruitment of BRCA2 by PALB2.', 'authors': 'Oliver AW, Swift S, Lord CJ, Ashworth A, Pearl LH', 'affiliations': None, 'erratum': 'Erratum for EMBO Rep. 2009 Sep;10(9):990-6.', 'comment': None, 'abstract': '[Image: see text]', 'copyright': None, 'other_ids': 'DOI: 10.15252/embr.201744508 PMCID: PMC5494498 PMID: 28673926'} 
------


In [200]:
i = 1732
info = copy(data[articles_start[i]:
                 articles_start[i+1]])

for line in info:
    print(line.strip())
print()

# for line in concatenate_lines(info):
#     print(line.strip())
#     print()

print('------')
print(get_article_data(info))

1733. Psychooncology. 2011 Dec;20(12):1301-8. doi: 10.1002/pon.1846. Epub 2010 Oct 3.

Novel one-stop multidisciplinary follow-up clinic for BRCA1/2 carriers: patient
satisfaction and decision making.

Firth C(1), Jacobs C, Evison M, Pichert G, Izatt L, Hunter MS.

Author information:
(1)Clinical Genetics Department, Guy's and St. Thomas' NHS Foundation Trust,
London, UK. clare.firth@nhs.net

OBJECTIVE: To evaluate patient' satisfaction and cancer risk management decision
making, following attendance at a novel multidisciplinary one-stop follow-up
clinic (MDOSC) for BRCA1/2 carriers.
PATIENTS AND METHODS: 172 patients attended the MDOSC over a 2-year period
between 2006 and 2008. A total of 96 and 76 patients were seen in the first and
second year, respectively. All patients who attended the MDOSC were sent a
17-item Satisfaction Questionnaire (SQ) designed to examine their views about
the MDOSC, using rating scales and open questions after the first year. Patients
were asked to commen

# Verify records

If this cell print anything, it means that some articles are not processed correctly.

This may be fixable within function or require manual correction.

In [227]:
for i, article in enumerate( articles ):
    if article:
        if ( article['other_ids'][:3] == 'DOI' 
             or article['other_ids'][:4] == 'PMID' 
             or article['other_ids'][:5] == 'PMCID' ):
            continue
        
        print(i, article['other_ids'], '\n')

# Classify articles

In [202]:
articles_save = deepcopy( articles )

In [230]:
to_remove = []
for i, article in enumerate( articles ):
    if article and article['retraction']:
        print( f"{i} -- {article['journal']}. {article['year']}; {article['volume']} " 
               f"\n\t{article['title']}\n\t{article['retraction']}\n")
        if article['retraction'][:19] == 'Retraction Notice: ':
            to_remove.append(i)
            print(to_remove)

print(to_remove)

1984 -- Comput Math Methods Med. 2021; 2021 
	Detection of BRCA1/2 Mutation and Analysis of Clinicopathological Characteristics in 141 Cases of Ovarian Cancer.
	Retraction in Comput Math Methods Med. 2023 Dec 6;2023:9825317.

3905 -- PLoS Genet. 2009; 5(10) 
	p63 and p73 transcriptionally regulate genes involved in DNA repair.
	Retraction in PLoS Genet. 2023 Mar 27;19(3):e1010699.

6500 -- Hered Cancer Clin Pract. 2017; 15 
	The BRCA2 variant c.68-7 T>A is associated with breast cancer.
	Retraction in Hered Cancer Clin Pract. 2018 May 2;16:10.

6555 -- J Biol Chem. 2006; 281(31) 
	Up-regulation of Skp2 after prostate cancer cell adhesion to basement membranes results in BRCA2 degradation and cell proliferation.
	Retraction in J Biol Chem. 2014 Jun 20;289(25):17424.

7750 -- J Biol Chem. 2014; 289(25) 
	Down-regulation of BRCA2 expression by collagen type I promotes prostate cancer cell proliferation.
	Retraction Notice: Retraction of Moro L, Arbini AA, Marra E, Greco M. J Biol Chem. 20

In [231]:
for i in reversed(to_remove):
    articles.pop(i)

print(len(articles))

9584


In [232]:
to_remove = []
for i, article in enumerate( articles ):
    if article and article['erratum']:
        print( f"{i} -- {article['journal']}. {article['year']}; {article['volume']} " 
               f"\n\t{article['title']}\n\t{article['erratum']}\n")
        if article['erratum'][:12] == 'Erratum for ':
            to_remove.append(i)
            print(to_remove)
            
print(to_remove)

19 -- Clin Transl Oncol. 2021; 23(1) 
	Comprehensive study for BRCA1 and BRCA2 entire coding regions in breast cancer.
	Erratum in Clin Transl Oncol. 2021 Oct;23(10):2191.

111 -- BMJ. 2021; 375 
	Care of men with cancer-predisposing BRCA variants.
	Erratum in BMJ. 2021 Oct 19;375:n2550.

146 -- J Hum Genet. 2012; 57(3) 
	Spectra of BRCA1 and BRCA2 mutations in Korean patients with breast cancer: the importance of whole-gene sequencing.
	Erratum in J Hum Genet. 2012 Mar;57(3):222.

155 -- Hum Mutat. 2020; 41(10) 
	Variant effect on splicing regulatory elements, branchpoint usage, and pseudoexonization: Strategies to enhance bioinformatic prediction using hereditary cancer genes as exemplars.
	Erratum in Hum Mutat. 2022 Dec;43(12):2328.

186 -- Breast Cancer Res. 2009; 11(4) 
	Genomic profiling of breast tumours in relation to BRCA abnormalities and phenotypes.
	Erratum in Breast Cancer Res. 2009;11(5):404.

218 -- BMJ. 2013; 346 
	US Supreme Court hears arguments in case of BRCA1 and B

In [233]:
for i in reversed(to_remove):
    articles.pop(i)

print(len(articles))

9555


## Dealing with *Comments* is complicated

Some *Comments* are perspectives aiming to publicize the target paper (like News & Views in Nature). These are published in the same issue by different authors.

We remove the actual *Comment* (if available) from the set of publications to analyze. We adjust `comment` key of target paper to convey this positive information. 

Some *Comments* appear to be summaries of the target papers. They are published in a different journal by a subset of the original authors.

We remove the actual *Comment* (if available) from the set of publications to analyze. We adjust `comment` key of target paper to convey this information. 

Some *Comments* appear to be actual criticisms of the target papers. They are published in the same journal as the target bu later than the target and are authored by different researchers.

We remove the actual *Comment* (if available) from the set of publications to analyze. We adjust `comment` key of target paper to convey this negative information.  

In [228]:
to_remove = []
for i, article in enumerate( articles[:] ):
    if article and article['comment']:
        print('\n', i, 'Focus: ', article['journal'], article['year'], 
              article['date'], article['volume'],  
              article['pages'], article['doi'])
        print('--', article['comment'])
        
        # Extract info about paper discussed in comment
        target = {}
        aux = article['comment'].split(';')
        if len(aux) == 1:
            to_remove.append(i)
            continue
    
        aux = aux[0].split('.')
        date_string =  aux[-1].strip()
        if len(date_string) < 4:
            to_remove.append(i)
            continue
            
        target['year'] = int( date_string[:4] )
        target['date'] = date_string[4:]
        target['journal'] = aux[-2].strip()
        
        aux = article['comment'].split(';')[1]
        aux = aux.split(':')
        target['volume'] = aux[0]
        target['pages'] = aux[1].split('.')[0]
        
        if len(aux) > 2:
            target['doi'] = aux[2].split()[0].strip('.')
        else:
            target['doi'] = None
          
        # Extract dates of publication
        date_focus = extract_publication_date(article)
        date_target = extract_publication_date(target)
        
        # Determine type of comment
        #
        # Same journal
        print(article['journal'])
        print(target['journal'].lstrip('Coment').strip().lstrip('ion').strip())
        if ( article['journal'] == 
             target['journal'].lstrip('Coment').strip().lstrip('ion').strip() ):
            print('===>>', article['journal'])
            print('===>>', article['year'], target['year'])
            
            # Same year 
            if article['year'] == target['year']:
                print('===>>', article['volume'], target['volume'])
            
                # Same volume
                if target['volume'] and article['volume'] == target['volume']:
                    print('--->', article['date'], target['date'])
                    
                    # Same date
                    if article['date'] == target['date']:
                        print('--->', article['date'])

                        if '-' in article['pages']:
                            focus_pages = article['pages'].split('-')[0]
                            if focus_pages.isnumeric():
                                focus_pages = int(focus_pages)
                        else:
                            focus_pages = article['pages']
                            
                        if '-' in target['pages']:
                            target_pages = target['pages'].split('-')[0]
                            if target_pages.isnumeric():
                                target_pages = int(target_pages)
                        else:
                            target_pages = target['pages']
                            
                        if type(target_pages) != type(focus_pages):
                            target_pages = str(target_pages)
                            focus_pages = str(focus_pages)

                        print('--->', focus_pages, target_pages)
                        
                        # Same date, earlier pages
                        if focus_pages < target_pages:
                            to_remove.append(i)
                            print(f"---> Article {i} is a commentary and is to be removed")

                        # Same date, later pages
                        else:
                            article['comment'] = ('Received highlight article! ' 
                                                  + article['comment'])
                            print('---> Received highlight article\n')

                    # Same volume, later time
                    elif article['date'] > target['date']:
                        to_remove.append(i)
                        print(f"---> Article {i} is a comment and is to be removed")

                    # Same volume, earlier time
                    elif article['date'] < target['date']:
                        article['comment'] = 'Received comment! ' + article['comment']
                        print('---> Received comment\n')

                # Same year, volumes that exist, later volume
                elif target['volume'] and article['volume'] > target['volume']:
                    to_remove.append(i)
                    print(f"---> Article {i} is a comment and is to be removed")
                
                # Same year, volumes that exist, earlier volume
                elif target['volume'] and article['volume'] < target['volume']:
                    article['comment'] = 'Received comment! ' + article['comment']
                    print('---> Received comment\n')

                # Same journal, same year, missing volumes 
                else:
                    # Same journal, same year, missing volumes, same date
                    if article['date'] == target['date']:
                        print('--->', article['date'])

                        if '-' in article['pages']:
                            focus_pages = article['pages'].split('-')[0]
                            if focus_pages.isnumeric():
                                focus_pages = int(focus_pages)
                        if '-' in target['pages']:
                            target_pages = target['pages'].split('-')[0]
                            if target_pages.isnumeric():
                                target_pages = int(target_pages)

                        print('--->', focus_pages, target_pages)

                        # Same journal, same year, missing volumes, same date, earlier pages
                        if focus_pages < target_pages:
                            to_remove.append(i)
                            print(f"---> Article {i} is a commentary and is to be removed")

                        # Same journal, same year, missing volumes, same date, earlier pages
                        else:
                            article['comment'] = ('Received highlight article! ' 
                                                  + article['comment'])
                            print('---> Received highlight article\n')
        
                    # Same journal, same year, missing volumes, later time
                    elif article['date'] > target['date']:
                        to_remove.append(i)
                        print(f"---> Article {i} is a comment and is to be removed")

                    # Same journal, same year, missing volumes, earlier time
                    elif article['date'] < target['date']:
                        article['comment'] = 'Received comment! ' + article['comment']
                        print('---> Received comment\n')
        
            # Same journal, later year 
            elif article['year'] > target['year']:
                to_remove.append(i)
                print(f"---> Article {i} is a comment and is to be removed")

            # Same journal, earlier year
            elif article['year'] < target['year']:
                article['comment'] = 'Received comment! ' + article['comment']
                print('---> Received comment\n')


        # Different journals, later year
        elif article['year'] > target['year']:
            to_remove.append(i)
            print(f"---> Article {i} is a commentary and is to be removed")

        # Different journals, earlier year
        elif article['year'] < target['year']: 
            article['comment'] = 'Received commentary! ' + article['comment']
            print('---> Received commentary\n')

        # Different journals, same year
        else:
            if date_focus > date_target:
                to_remove.append(i)
                print(f"---> Article {i} is a commentary and is to be removed")
            
            elif date_focus < date_target:
                article['comment'] = 'Received commentary! ' + article['comment']
                print('---> Received commentary\n')
                
            else:
                print('FUCK!!!')

print(to_remove)


 5 Focus:  N Engl J Med 2021  Feb 4 384(5) 471-473 10.1056/NEJMe2035083
-- Comment on N Engl J Med. 2021 Feb 4;384(5):440-451. N Engl J Med. 2021 Feb 4;384(5):428-439.
N Engl J Med
N Engl J Med
===>> N Engl J Med
===>> 2021 2021
===>> 384(5) 384(5)
--->  Feb 4  Feb 4
--->  Feb 4
---> 471 440
---> Received highlight article


 10 Focus:  J Clin Oncol 2020  Nov 10 38(32) 3735-3739 10.1200/JCO.20.02246
-- Comment on J Clin Oncol. 2020 Nov 10;38(32):3763-3772.
J Clin Oncol
J Clin Oncol
===>> J Clin Oncol
===>> 2020 2020
===>> 38(32) 38(32)
--->  Nov 10  Nov 10
--->  Nov 10
---> 3735 3763
---> Article 10 is a commentary and is to be removed

 13 Focus:  Clin Cancer Res 2020  Apr 15 26(8) 2047-2064 10.1158/1078-0432.CCR-19-1570
-- Comment in Clin Cancer Res. 2020 Apr 15;26(8):1784-1786. Comment on Clin Cancer Res. 2020 Apr 15;26(8):1784-1786.
Clin Cancer Res
Clin Cancer Res
===>> Clin Cancer Res
===>> 2020 2020
===>> 26(8) 26(8)
--->  Apr 15  Apr 15
--->  Apr 15
---> 2047 1784
---> Received

In [229]:
for i in reversed(to_remove):
    articles.pop(i)

print(len(articles))

9587


In [236]:
to_remove = []

for i, article in enumerate( articles ):
    if article is None:
        to_remove.append(i)
        
print(to_remove)

[144, 200, 362, 385, 387, 390, 393, 395, 401, 467, 478, 639, 676, 997, 1073, 1227, 3344, 3558, 4587, 4790, 5867, 5890, 5933, 6062, 6126, 6261, 6384, 6485, 8747, 8800, 8857, 9017]


In [237]:
for i in reversed(to_remove):
    articles.pop(i)

print(len(articles))

9523


## Create collections

In [238]:
pattern1 = 'review'
pattern2 = 'reviewed'


erratum_articles = []
comment_articles = []
review_articles = []
no_abstract_articles = []

for article in articles:
    if not article['retraction']:
        if article['erratum']:
            erratum_articles.append(article)
            continue

        if article['comment'] and 'Received comment! ' == article['comment'][:18]:
            comment_articles.append(article)
            continue

        if not article['abstract']:
            no_abstract_articles.append(article)
        else:
            if pattern1 in article['abstract'] or pattern2 in article['abstract']:
                review_articles.append(article)


In [239]:
print(len(articles))
print(len(retracted_articles))
print(len(erratum_articles))
print(len(comment_articles))
print(len(no_abstract_articles))
print(len(review_articles))

9523
8
234
162
599
1299


# Save to file

In [240]:
with open(Path.cwd() / 'articles_clean.json', 'w', encoding = 'utf-8') as f_json:
    json.dump(articles, f_json)
    
print('Done saving file!')

Done saving file!
