Skip to content

Commit

Permalink
Merge pull request #400 from biolab/fix-excel-reader
Browse files: browse the repository at this point in the history
[FIX] ExcelReader - Use openpyxl to read xlsx and xlrd for xls
  • Loading branch information
VesnaT committed Mar 21, 2024
2 parents b28efeb + 68edf7a commit 43f70d9
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 25 deletions.
Binary file added orangecontrib/single_cell/tests/data/data.xls
Binary file not shown.
38 changes: 21 additions & 17 deletions orangecontrib/single_cell/tests/test_load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,13 @@ def test_file_summary_gz(self):
self.assertEqual(round(loader.sparsity, 2), 0.86)

def test_file_summary_xls(self):
file_name = os.path.join(os.path.dirname(__file__), "data/data.xlsx")
loader = ExcelLoader(file_name)
self.assertEqual(loader.file_size, 9160)
self.assertEqual(loader.n_rows, 11)
self.assertEqual(loader.n_cols, 15)
self.assertEqual(round(loader.sparsity, 2), 0.86)
for file, size in (("data/data.xlsx", 9160), ("data/data.xls", 27648)):
file_name = os.path.join(os.path.dirname(__file__), file)
loader = ExcelLoader(file_name)
self.assertEqual(loader.file_size, size)
self.assertEqual(loader.n_rows, 11)
self.assertEqual(loader.n_cols, 15)
self.assertEqual(round(loader.sparsity, 2), 0.86)

def test_file_summary_loom(self):
file_name = os.path.join(os.path.dirname(__file__), "data/data.loom")
Expand Down Expand Up @@ -121,17 +122,18 @@ def test_load_data_mtx(self):
npt.assert_array_equal(X, array)

def test_load_data_xls(self):
kwargs = {"header_rows": 0, "header_cols": 0}
xls_path = os.path.join(os.path.dirname(__file__), "data/data.xlsx")
xls_loader = ExcelLoader(xls_path)
xls_attrs, xls_X, xls_M, xls_M_index = xls_loader._load_data(**kwargs)
csv_loader = Loader(os.path.join(os.path.dirname(__file__),
"data/DATA_MATRIX_LOG_TPM.txt"))
csv_attrs, csv_X, csv_M, csv_M_index = csv_loader._load_data(**kwargs)
self.assertEqual(xls_attrs, csv_attrs)
npt.assert_array_almost_equal(xls_X, csv_X)
npt.assert_array_equal(xls_M, csv_M)
npt.assert_array_equal(xls_M_index, csv_M_index)
for file in ("data/data.xlsx", "data/data.xls"):
kwargs = {"header_rows": 0, "header_cols": 0}
xls_path = os.path.join(os.path.dirname(__file__), file)
xls_loader = ExcelLoader(xls_path)
xls_attrs, xls_X, xls_M, xls_M_index = xls_loader._load_data(**kwargs)
csv_loader = Loader(os.path.join(os.path.dirname(__file__),
"data/DATA_MATRIX_LOG_TPM.txt"))
csv_attrs, csv_X, csv_M, csv_M_index = csv_loader._load_data(**kwargs)
self.assertEqual(xls_attrs, csv_attrs)
npt.assert_array_almost_equal(xls_X, csv_X)
npt.assert_array_equal(xls_M, csv_M)
npt.assert_array_equal(xls_M_index, csv_M_index)

def test_n_genes_n_cells(self):
file_name = os.path.join(os.path.dirname(__file__),
Expand Down Expand Up @@ -200,3 +202,5 @@ def test_concatenate_union(self):
self.assertEqual(2 * len(data1) + len(data2), len(concat_data))
self.assertEqual(len(concat_data.domain.attributes), 8)
self.assertEqual(len(concat_data.domain.metas), 2)


21 changes: 14 additions & 7 deletions orangecontrib/single_cell/widgets/load_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
guess_data_type, sanitize_variable
)
from Orange.widgets.utils.filedialogs import RecentPath
from openpyxl.reader.excel import load_workbook


def separator_from_filename(file_name):
Expand Down Expand Up @@ -510,10 +511,9 @@ def _set_enable_annotations(self):

def _set_file_parameters(self):
try:
with open_compressed(self._file_name, "rb") as f:
self.n_rows, self.n_cols, non_zero_el = scipy.io.mminfo(f)[:3]
all_el = self.n_rows * self.n_cols
self.sparsity = (all_el - non_zero_el) / all_el
self.n_rows, self.n_cols, non_zero_el = scipy.io.mminfo(self._file_name)[:3]
all_el = self.n_rows * self.n_cols
self.sparsity = (all_el - non_zero_el) / all_el
except OSError:
pass
except ValueError:
Expand Down Expand Up @@ -616,9 +616,16 @@ def __init__(self, file_name):

def _set_file_parameters(self):
try:
sheet = xlrd.open_workbook(self._file_name).sheet_by_index(0)
self.n_cols = sheet.ncols
self.n_rows = sheet.nrows
if self._file_name.endswith(".xls"):
# xlrd supports only historic xls files
sheet = xlrd.open_workbook(self._file_name).sheet_by_index(0)
self.n_cols, self.n_rows = sheet.ncols, sheet.nrows
elif self._file_name.endswith(".xlsx"):
# use openpyxl library for xlsx files
wb = load_workbook(self._file_name, read_only=True)
sheet = wb.worksheets[0]
self.n_cols, self.n_rows = sheet.max_column, sheet.max_row
wb.close()
self._set_sparsity()
except Exception:
pass
Expand Down
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
'fastdtw==0.3.2',
'pandas>=0.23',
'loompy>=2.0.10',
'xlrd~=1.2.0',
'xlrd>=2.0.1',
'openpyxl',
'anndata>=0.6.21',
'numpy',
'scikit-learn',
Expand Down
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ deps =
oldest: orange3==3.34.0
oldest: orange-canvas-core==0.1.28
oldest: orange-widget-base==4.19.0
oldest: pandas==1.4.0
latest: https://github.com/biolab/orange3/archive/refs/heads/master.zip#egg=orange3
latest: https://github.com/biolab/orange-canvas-core/archive/refs/heads/master.zip#egg=orange-canvas-core
latest: https://github.com/biolab/orange-widget-base/archive/refs/heads/master.zip#egg=orange-widget-base
Expand Down

0 comments on commit 43f70d9

Please sign in to comment.