Code to remove duplicates that appear after combining different databases (Web of Science, Scopus, Scielo, etc).
This is part of the project described in <https://github.com/amchagas/OSH_papers_DB>, check the project readme for more details.

In [1]:
#import necessary libraries
import os
import pandas as pd
import seaborn as sns
import bibtexparser
import matplotlib.pyplot as plt


  import pandas.util.testing as tm


In [2]:
#databases output path
dataPath = "../data/"
scopusPath = ["scopus1.bib","scopus2.bib"]
wosPath = ["wos1.bib","wos2.bib","wos3.bib","wos4.bib"]
#wos2Path = "wos_501to835.bib"
scieloPath = "scielo.bib"

#path to store figures
outputPath = "../figures/"

## notes on similarities and differences between the databases

### Document_type vs type

- Scopus sets "document_types" as a column, indicating Articles, proceedings, etc
- Web of Science sets this column as "type".

**Because of the above, we set Scopus to "type" so there aren't two columns with the same kind of data on the joint panda dataframe**

In [3]:
# Open Scopus database
scopusFrames = list()
for i in range(len(scopusPath)):
    with open(dataPath+scopusPath[i]) as scopusFile:
        scopusDatabase = bibtexparser.load(scopusFile)
        scData = pd.DataFrame(scopusDatabase.entries)
        scData = scData.rename(columns={"document_type": "type"})
        scData = scData.rename(columns={"ENTRYTYPE": "large_category"})
        scopusFrames.append(scData)

scData = pd.concat(scopusFrames, axis=0, join='outer', ignore_index=True, keys=None,
          levels=None, names=None, verify_integrity=False, copy=True,sort=False)

scData.to_csv(dataPath+"scData.csv")





In [6]:
#print(scData.keys())
#print(scData.head())
#print(list(set(scData["ENTRYTYPE"])))
#print(list(set(scData["document_type"])))

In [7]:
#Open Web of Science database

wosFrames = list()
for i in range(len(wosPath)):
    print("running file: " + str(i))
    with open(dataPath+wosPath[i]) as wosFile:
        wosDatabase = bibtexparser.load(wosFile)
        wosFrames.append(pd.DataFrame(wosDatabase.entries))

wosData = pd.concat(wosFrames, axis=0, join='outer', ignore_index=True, keys=None,
          levels=None, names=None, verify_integrity=False, copy=True,sort=False)

wosData.to_csv(dataPath+"wosData.csv")

running file: 0
running file: 1
running file: 2
running file: 3


In [8]:
#Open Scielo

with open(dataPath+scieloPath) as scieloFile:
    scieloDatabase = bibtexparser.load(scieloFile)
    scieloData = pd.DataFrame(scieloDatabase.entries)
    scieloData = scieloData.rename(columns={"ENTRYTYPE": "type"})
    

scieloData.to_csv(dataPath+"scieloData.csv")

In [9]:
scieloData

Unnamed: 0,url,pages,crossref,month,language,publisher,year,volume,author,journal,title,type,ID
0,http://www.scielo.br/scielo.php?script=sci_art...,1309-1317,10.21577/0103-5053.20190029,5.0,en,scielo,2019,30,"Silva, Daniel M., Pereira, Álvaro J., Pierre...",{Journal of the Brazilian Chemical Society},{Automatized Separation of Fractions from Petr...,article,SILVA2019
1,http://www.scielo.br/scielo.php?script=sci_art...,1196-1199,10.21577/0100-4042.20170289,12.0,"en, pt",scielo,2018,41,"Soares, Fernanda S. C., Vieira, Alan L., Soa...",{Química Nova},{CONSTRUÇÃO DE UMA BOMBA PERISTÁLTICA E DE UM ...,article,SOARES2018
2,http://www.scielo.br/scielo.php?script=sci_art...,38-41,10.1590/1678-457x.06617,12.0,en,scielo,2017,37,"NASCIMENTO, Wesley William Gonçalves, SOUZA, ...",{Food Science and Technology},{Results from portable and of low cost equipme...,article,NASCIMENTO2017
3,http://www.scielo.cl/scielo.php?script=sci_art...,45-54,,,"es, en",scielo,2017,28,"Guillot, Jordan D, Robles, Carlos A, Calleja...",{Información tecnológica},{Adquisición de Señales Ambientales para un Si...,article,GUILLOT2017
4,http://www.scielo.cl/scielo.php?script=sci_art...,57-66,10.4067/S0718-915X2016000300006,12.0,"es, en",scielo,2016,15,"Aparicio, Pablo, Salmerón, José Manuel, Ruiz...",{Revista de la construcción},{The globe thermometer in comfort and environm...,article,APARICIO2016
5,http://www.scielo.br/scielo.php?script=sci_art...,305-309,10.5935/0100-4042.20160020,4.0,en,scielo,2016,39,"González, Pablo, Pérez, Nicolás, Knochen, Mo...",{Química Nova},{LOW COST ANALYZER FOR THE DETERMINATION OF PH...,article,GONZÁLEZ2016
6,http://www.scielo.org.co/scielo.php?script=sci...,198-205,10.15446/dyna.v83n195.49828,2.0,"es, en",scielo,2016,83,"Aristizábal, Luis M., Rúa, Santiago, Gaviria...",{DYNA},{Design of an open source-based control platfo...,article,ARISTIZÁBAL2016
7,http://www.scielo.org.co/scielo.php?script=sci...,85-92,,1.0,"es, en, pt",scielo,2016,25,"Millán-Rojas, Edwin Eduardo, Gallego-Torres, ...",{Revista Facultad de Ingeniería},{Simulación de una red Grid con máquinas virtu...,article,MILLÁN-ROJAS2016
8,http://www.scielo.sa.cr/scielo.php?script=sci_...,15-23,,3.0,"en, es",scielo,2015,28,"Murillo-Soto, Luis Diego",{Revista Tecnología en Marcha},{Automatización de pequeña escala con Open Har...,article,MURILLO-SOTO2015
9,http://www.scielo.br/scielo.php?script=sci_art...,26-32,10.1590/2446-4740.0653,3.0,en,scielo,2015,31,"Araujo, Carlos Eduardo de, Abatti, Paulo José...",{Research on Biomedical Engineering},{In vitro evaluation of a closed-loop feedback...,article,ARAUJO2015


In [10]:
#join all databases
allData = pd.concat([wosData,scData,scieloData], axis=0, join='outer', ignore_index=True, keys=None,
          levels=None, names=None, verify_integrity=False, copy=True,sort=False)

#web of science exports data with {} on every entry. 
# so the next lines clean it up.
for key in allData.keys():
    allData[key]=allData[key].str.replace("{","")
    allData[key]=allData[key].str.replace("}","")

#convert the year string type to int (making it easier to plot histograms)
allData.year = allData.year.astype(int)

allData.to_csv(dataPath+"allData.csv")

In [None]:
len(set(allData.journal[allData.type=="Article"]))

In [None]:

fHandle = allData.year[allData.type=="Article"].plot(kind="hist",
                 fontsize=18,
                 bins=list(set(allData.year+0.5)),
                 rwidth=0.65,
                # color="blue",
                )
fHandle.set(xlabel="Year")
fHandle.axes.xaxis.set_label_text(label="year",fontsize=18)
fHandle.axes.yaxis.set_label_text(label="count",fontsize=18)
fHandle.legend(["N = " + str(len(allData.year[allData.type=="Article"]))])
#fHandle.suptitle("test")

fHandle.figure.savefig(outputPath+ "test.png")


print("\n"+"number of articles: " + str(len(allData.year[allData.type=="Article"]))+ "\n")


In [None]:
print("number of entries:" + str(len(allData)))
print(len(allData.language))


In [None]:
allData.head(5)

In [None]:
allData.keys()

In [None]:
test = allData.sort_values("title")
for i in range(len(test)):
    print(test.loc[i].title)

In [None]:
dupTitle = test[allData.duplicated("title",False)].title
for i in range(len(dupTitle)):
    print(dupTitle[dupTitle.index[i]])