Skip to content

Commit

Permalink
Ris fix (#129)
Browse files Browse the repository at this point in the history
- Fix being able to read RIS files with UTF-8 BOM characters.
- Add support for excel (.xlsx) files.
- Add support for PubMed XML files.
- Add wider support for ris/csv files.
- More test sets.
- Throw error if the reader can't find titles or abstracts.
  • Loading branch information
qubixes committed Nov 18, 2019
1 parent 4b3710f commit 647e11d
Show file tree
Hide file tree
Showing 23 changed files with 4,682 additions and 111 deletions.
2 changes: 2 additions & 0 deletions asreview/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
from asreview.readers import ASReviewData
from asreview.readers import read_csv
from asreview.readers import read_data
from asreview.readers import read_excel
from asreview.readers import read_pubmed_xml
from asreview.readers import read_ris
from asreview.review import get_reviewer
from asreview.review import MinimalReview
Expand Down
2 changes: 2 additions & 0 deletions asreview/exceptions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
class BadFileFormatError(Exception):
pass
179 changes: 148 additions & 31 deletions asreview/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,16 @@
import logging
from pathlib import Path
import warnings
import xml.etree.ElementTree as ET


import numpy as np
import pandas as pd
from RISparser import readris
from RISparser import TAG_KEY_MAPPING

from asreview.config import NOT_AVAILABLE
from asreview.exceptions import BadFileFormatError

with warnings.catch_warnings():
warnings.filterwarnings("ignore")
Expand Down Expand Up @@ -73,6 +76,13 @@ def __init__(self, raw_df, labels=None, title=None, abstract=None,
self.authors = authors
self.final_labels = final_labels

if authors is None:
logging.warning("Could not locate authors in data.")
if title is None:
BadFileFormatError("Could not locate titles in data.")
if abstract is None:
BadFileFormatError("Could not locate abstracts in data.")

if article_id is None:
self.article_id = np.arange(len(raw_df.index))

Expand Down Expand Up @@ -103,13 +113,21 @@ def from_data_frame(cls, raw_df, abstract_only=False):
data_kwargs['final_labels'] = data_kwargs['labels']
data_kwargs['labels'] = inclusion_codes > 0

def fill_column(dst_dict, key):
try:
dst_dict[key] = raw_df[key.lower()].fillna('').values
except KeyError:
pass

for key in ['title', 'abstract', 'keywords', 'authors']:
def fill_column(dst_dict, keys):
if not isinstance(keys, list):
keys = [keys]
dst_key = keys[0]
df_columns = {str(col_name).lower(): col_name
for col_name in list(raw_df)}
for key in keys:
try:
dst_dict[dst_key] = raw_df[df_columns[key]].fillna('').values
except KeyError:
pass

for key in [['title', 'primary_title'],
['authors', 'author names', 'first_authors'],
'abstract', 'keywords']:
fill_column(data_kwargs, key)

return cls(**data_kwargs)
Expand All @@ -122,15 +140,15 @@ def from_csv(cls, fp, *args, **kwargs):
def from_ris(cls, fp, *args, **kwargs):
return cls.from_data_frame(pd.DataFrame(read_ris(fp)), *args, **kwargs)

@classmethod
def from_excel(cls, fp, *args, **kwargs):
return cls.from_data_frame(
pd.DataFrame(read_excel(fp)), *args, **kwargs)

@classmethod
def from_file(cls, fp, *args, **kwargs):
"Create instance from csv/ris file."
if Path(fp).suffix in [".csv", ".CSV"]:
return cls.from_csv(fp, *args, **kwargs)
if Path(fp).suffix in [".ris", ".RIS"]:
return cls.from_ris(fp, *args, **kwargs)
raise ValueError(f"Unknown file extension: {Path(fp).suffix}.\n"
f"from file {fp}")
"Create instance from csv/ris/excel file."
return cls.from_data_frame(_df_from_file(fp), *args, **kwargs)

def preview_record(self, i, w_title=80, w_authors=40):
"Return a preview string for record i."
Expand Down Expand Up @@ -208,7 +226,11 @@ def fuzzy_find(self, keywords, threshold=50, max_return=10,
if self.authors is not None:
match_str += self.authors + " "
if self.keywords is not None:
match_str += self.keywords
if isinstance(self.keywords[0], list):
new_keywords = np.array([" ".join(x) for x in self.keywords])
else:
new_keywords = self.keywords
match_str += new_keywords

new_ranking = get_fuzzy_ranking(keywords, match_str)
sorted_idx = np.argsort(-new_ranking)
Expand Down Expand Up @@ -314,6 +336,24 @@ def write_ris(df, ris_fp):
fp.write("ER - \n\n")


def _df_from_file(fp):
if Path(fp).suffix.lower() == ".csv":
data = read_csv(fp)
elif Path(fp).suffix.lower() in [".ris", ".txt"]:
data = read_ris(fp)
elif Path(fp).suffix.lower() == ".xslx":
data = read_excel(fp)
elif Path(fp).suffix.lower() == ".xml":
data = read_pubmed_xml(fp)
else:
raise ValueError(f"Unknown file extension: {Path(fp).suffix}.\n"
f"from file {fp}")

# parse data in pandas dataframe
df = pd.DataFrame(data)
return df


def read_data(fp):
"""Load papers and their labels.
Expand All @@ -329,18 +369,7 @@ def read_data(fp):
The labels for each paper. 1 is included, 0 is excluded. If this column
is not available, this column is not returned.
"""

if Path(fp).suffix in [".csv", ".CSV"]:
data = read_csv(fp)
elif Path(fp).suffix in [".ris", ".RIS"]:
data = read_ris(fp)
else:
raise ValueError(f"Unknown file extension: {Path(fp).suffix}.\n"
f"from file {fp}")

# parse data in pandas dataframe
df = pd.DataFrame(data)

df = _df_from_file(fp)
# make texts
texts = (df['title'].fillna('') + ' ' + df['abstract'].fillna(''))

Expand Down Expand Up @@ -374,7 +403,6 @@ def read_csv(fp):
List with entries.
"""

try:
df = pd.read_csv(fp)
except UnicodeDecodeError:
Expand All @@ -383,6 +411,90 @@ def read_csv(fp):
return df.to_dict('records')


def read_excel(fp):
"""Excel file reader.
Parameters
----------
fp: str, pathlib.Path
File path to the Excel file (.xlsx).
Returns
-------
list:
List with entries.
"""
try:
dfs = pd.read_excel(fp, sheet_name=None)
except UnicodeDecodeError:
dfs = pd.read_excel(fp, sheet_name=None, encoding="ISO-8859-1")

best_sheet = None
sheet_obj_val = -1
wanted_columns = [
'title', 'primary_title', 'authors', 'author names', 'first_authors',
'abstract', 'keywords']
wanted_columns.extend(LABEL_INCLUDED_VALUES)
for sheet_name in dfs:
col_names = set([col.lower() for col in list(dfs[sheet_name])])
obj_val = len(col_names & set(wanted_columns))
if obj_val > sheet_obj_val:
sheet_obj_val = obj_val
best_sheet = sheet_name

return dfs[best_sheet].to_dict('records')


def read_pubmed_xml(fp):
"""PubMed XML file reader.
Parameters
----------
fp: str, pathlib.Path
File path to the XML file (.xml).
Returns
-------
list:
List with entries.
"""
tree = ET.parse(fp)
root = tree.getroot()

records = []
for child in root:
parts = []
elem = child.find('MedlineCitation/Article/ArticleTitle')
title = elem.text.replace('[', '').replace(']', '')

for elem in child.iter('AbstractText'):
parts.append(elem.text)
authors = []
for author in child.iter('Author'):
author_elems = []
for elem in author.iter('ForeName'):
author_elems.append(elem.text)
for elem in author.iter('LastName'):
author_elems.append(elem.text)
authors.append(" ".join(author_elems))

author_str = ", ".join(authors)
abstract = " ".join(parts)

keyword_list = [keyword.text for keyword in child.iter('Keyword')]
keywords = ", ".join(keyword_list)

new_record = {
"abstract": abstract,
"title": title,
"authors": author_str,
"keywords": keywords,
}
records.append(new_record)
return records


def read_ris(fp):
"""RIS file reader.
Expand All @@ -400,8 +512,13 @@ def read_ris(fp):
"""

with open(fp, 'r') as bibliography_file:
mapping = _tag_key_mapping(reverse=False)
entries = list(readris(bibliography_file, mapping=mapping))
try:
with open(fp, 'r', encoding='utf-8') as bibliography_file:
mapping = _tag_key_mapping(reverse=False)
entries = list(readris(bibliography_file, mapping=mapping))
except IOError:
with open(fp, 'r', encoding='utf-8-sig') as bibliography_file:
mapping = _tag_key_mapping(reverse=False)
entries = list(readris(bibliography_file, mapping=mapping))

return entries
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ def get_long_description():
'PyInquirer',
'fuzzywuzzy',
'h5py',
'xlrd>=1.0.0',
],
extras_require={
'dev': ['check-manifest'],
Expand Down

0 comments on commit 647e11d

Please sign in to comment.