Ris fix (#129)

- Fix being able to read RIS files with UTF-8 BOM characters. - Add support for excel (.xlsx) files. - Add support for PubMed XML files. - Add wider support for ris/csv files. - More test sets. - Throw error if the reader can't find titles or abstracts.
asreview · Nov 18, 2019 · 647e11d · 647e11d
1 parent 4b3710f
commit 647e11d
Show file tree

Hide file tree

Showing 23 changed files with 4,682 additions and 111 deletions.
diff --git a/asreview/__init__.py b/asreview/__init__.py
@@ -18,6 +18,8 @@
 from asreview.readers import ASReviewData
 from asreview.readers import read_csv
 from asreview.readers import read_data
+from asreview.readers import read_excel
+from asreview.readers import read_pubmed_xml
 from asreview.readers import read_ris
 from asreview.review import get_reviewer
 from asreview.review import MinimalReview

diff --git a/asreview/exceptions.py b/asreview/exceptions.py
@@ -0,0 +1,2 @@
+class BadFileFormatError(Exception):
+    pass
diff --git a/asreview/readers.py b/asreview/readers.py
@@ -15,13 +15,16 @@
 import logging
 from pathlib import Path
 import warnings
+import xml.etree.ElementTree as ET
+
 
 import numpy as np
 import pandas as pd
 from RISparser import readris
 from RISparser import TAG_KEY_MAPPING
 
 from asreview.config import NOT_AVAILABLE
+from asreview.exceptions import BadFileFormatError
 
 with warnings.catch_warnings():
     warnings.filterwarnings("ignore")
@@ -73,6 +76,13 @@ def __init__(self, raw_df, labels=None, title=None, abstract=None,
         self.authors = authors
         self.final_labels = final_labels
 
+        if authors is None:
+            logging.warning("Could not locate authors in data.")
+        if title is None:
+            BadFileFormatError("Could not locate titles in data.")
+        if abstract is None:
+            BadFileFormatError("Could not locate abstracts in data.")
+
         if article_id is None:
             self.article_id = np.arange(len(raw_df.index))
 
@@ -103,13 +113,21 @@ def from_data_frame(cls, raw_df, abstract_only=False):
             data_kwargs['final_labels'] = data_kwargs['labels']
             data_kwargs['labels'] = inclusion_codes > 0
 
-        def fill_column(dst_dict, key):
-            try:
-                dst_dict[key] = raw_df[key.lower()].fillna('').values
-            except KeyError:
-                pass
-
-        for key in ['title', 'abstract', 'keywords', 'authors']:
+        def fill_column(dst_dict, keys):
+            if not isinstance(keys, list):
+                keys = [keys]
+            dst_key = keys[0]
+            df_columns = {str(col_name).lower(): col_name
+                          for col_name in list(raw_df)}
+            for key in keys:
+                try:
+                    dst_dict[dst_key] = raw_df[df_columns[key]].fillna('').values
+                except KeyError:
+                    pass
+
+        for key in [['title', 'primary_title'],
+                    ['authors', 'author names', 'first_authors'],
+                    'abstract', 'keywords']:
             fill_column(data_kwargs, key)
 
         return cls(**data_kwargs)
@@ -122,15 +140,15 @@ def from_csv(cls, fp, *args, **kwargs):
     def from_ris(cls, fp, *args, **kwargs):
         return cls.from_data_frame(pd.DataFrame(read_ris(fp)), *args, **kwargs)
 
+    @classmethod
+    def from_excel(cls, fp, *args, **kwargs):
+        return cls.from_data_frame(
+            pd.DataFrame(read_excel(fp)), *args, **kwargs)
+
     @classmethod
     def from_file(cls, fp, *args, **kwargs):
-        "Create instance from csv/ris file."
-        if Path(fp).suffix in [".csv", ".CSV"]:
-            return cls.from_csv(fp, *args, **kwargs)
-        if Path(fp).suffix in [".ris", ".RIS"]:
-            return cls.from_ris(fp, *args, **kwargs)
-        raise ValueError(f"Unknown file extension: {Path(fp).suffix}.\n"
-                         f"from file {fp}")
+        "Create instance from csv/ris/excel file."
+        return cls.from_data_frame(_df_from_file(fp), *args, **kwargs)
 
     def preview_record(self, i, w_title=80, w_authors=40):
         "Return a preview string for record i."
@@ -208,7 +226,11 @@ def fuzzy_find(self, keywords, threshold=50, max_return=10,
         if self.authors is not None:
             match_str += self.authors + " "
         if self.keywords is not None:
-            match_str += self.keywords
+            if isinstance(self.keywords[0], list):
+                new_keywords = np.array([" ".join(x) for x in self.keywords])
+            else:
+                new_keywords = self.keywords
+            match_str += new_keywords
 
         new_ranking = get_fuzzy_ranking(keywords, match_str)
         sorted_idx = np.argsort(-new_ranking)
@@ -314,6 +336,24 @@ def write_ris(df, ris_fp):
             fp.write("ER  - \n\n")
 
 
+def _df_from_file(fp):
+    if Path(fp).suffix.lower() == ".csv":
+        data = read_csv(fp)
+    elif Path(fp).suffix.lower() in [".ris", ".txt"]:
+        data = read_ris(fp)
+    elif Path(fp).suffix.lower() == ".xslx":
+        data = read_excel(fp)
+    elif Path(fp).suffix.lower() == ".xml":
+        data = read_pubmed_xml(fp)
+    else:
+        raise ValueError(f"Unknown file extension: {Path(fp).suffix}.\n"
+                         f"from file {fp}")
+
+    # parse data in pandas dataframe
+    df = pd.DataFrame(data)
+    return df
+
+
 def read_data(fp):
     """Load papers and their labels.
 
@@ -329,18 +369,7 @@ def read_data(fp):
         The labels for each paper. 1 is included, 0 is excluded. If this column
         is not available, this column is not returned.
     """
-
-    if Path(fp).suffix in [".csv", ".CSV"]:
-        data = read_csv(fp)
-    elif Path(fp).suffix in [".ris", ".RIS"]:
-        data = read_ris(fp)
-    else:
-        raise ValueError(f"Unknown file extension: {Path(fp).suffix}.\n"
-                         f"from file {fp}")
-
-    # parse data in pandas dataframe
-    df = pd.DataFrame(data)
-
+    df = _df_from_file(fp)
     # make texts
     texts = (df['title'].fillna('') + ' ' + df['abstract'].fillna(''))
 
@@ -374,7 +403,6 @@ def read_csv(fp):
         List with entries.
 
     """
-
     try:
         df = pd.read_csv(fp)
     except UnicodeDecodeError:
@@ -383,6 +411,90 @@ def read_csv(fp):
     return df.to_dict('records')
 
 
+def read_excel(fp):
+    """Excel file reader.
+
+    Parameters
+    ----------
+    fp: str, pathlib.Path
+        File path to the Excel file (.xlsx).
+
+    Returns
+    -------
+    list:
+        List with entries.
+
+    """
+    try:
+        dfs = pd.read_excel(fp, sheet_name=None)
+    except UnicodeDecodeError:
+        dfs = pd.read_excel(fp, sheet_name=None, encoding="ISO-8859-1")
+
+    best_sheet = None
+    sheet_obj_val = -1
+    wanted_columns = [
+        'title', 'primary_title', 'authors', 'author names', 'first_authors',
+        'abstract', 'keywords']
+    wanted_columns.extend(LABEL_INCLUDED_VALUES)
+    for sheet_name in dfs:
+        col_names = set([col.lower() for col in list(dfs[sheet_name])])
+        obj_val = len(col_names & set(wanted_columns))
+        if obj_val > sheet_obj_val:
+            sheet_obj_val = obj_val
+            best_sheet = sheet_name
+
+    return dfs[best_sheet].to_dict('records')
+
+
+def read_pubmed_xml(fp):
+    """PubMed XML file reader.
+
+    Parameters
+    ----------
+    fp: str, pathlib.Path
+        File path to the XML file (.xml).
+
+    Returns
+    -------
+    list:
+        List with entries.
+    """
+    tree = ET.parse(fp)
+    root = tree.getroot()
+
+    records = []
+    for child in root:
+        parts = []
+        elem = child.find('MedlineCitation/Article/ArticleTitle')
+        title = elem.text.replace('[', '').replace(']', '')
+
+        for elem in child.iter('AbstractText'):
+            parts.append(elem.text)
+        authors = []
+        for author in child.iter('Author'):
+            author_elems = []
+            for elem in author.iter('ForeName'):
+                author_elems.append(elem.text)
+            for elem in author.iter('LastName'):
+                author_elems.append(elem.text)
+            authors.append(" ".join(author_elems))
+
+        author_str = ", ".join(authors)
+        abstract = " ".join(parts)
+
+        keyword_list = [keyword.text for keyword in child.iter('Keyword')]
+        keywords = ", ".join(keyword_list)
+
+        new_record = {
+            "abstract": abstract,
+            "title": title,
+            "authors": author_str,
+            "keywords": keywords,
+        }
+        records.append(new_record)
+    return records
+
+
 def read_ris(fp):
     """RIS file reader.
 
@@ -400,8 +512,13 @@ def read_ris(fp):
 
     """
 
-    with open(fp, 'r') as bibliography_file:
-        mapping = _tag_key_mapping(reverse=False)
-        entries = list(readris(bibliography_file, mapping=mapping))
+    try:
+        with open(fp, 'r', encoding='utf-8') as bibliography_file:
+            mapping = _tag_key_mapping(reverse=False)
+            entries = list(readris(bibliography_file, mapping=mapping))
+    except IOError:
+        with open(fp, 'r', encoding='utf-8-sig') as bibliography_file:
+            mapping = _tag_key_mapping(reverse=False)
+            entries = list(readris(bibliography_file, mapping=mapping))
 
     return entries
diff --git a/setup.py b/setup.py
@@ -67,6 +67,7 @@ def get_long_description():
         'PyInquirer',
         'fuzzywuzzy',
         'h5py',
+        'xlrd>=1.0.0',
     ],
     extras_require={
         'dev': ['check-manifest'],