Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 47 additions & 32 deletions ingest/expression_files/dense_ingestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,21 +40,53 @@ def __init__(self, file_path, study_file_id, study_id, **matrix_kwargs):
self.gene_names = {}
self.header = DenseIngestor.process_header(next(self.csv_file_handler))

def matches_file_type(file_type):
return file_type == "dense"

def execute_ingest(self):
# Method can only be executed once due to
# dependency on position in text file.
# Row after header is needed for R format validation
row = next(self.csv_file_handler)
if not DenseIngestor.is_valid_format(self.header, row):
raise ValueError("Dense matrix has invalid format")
first_row = next(self.csv_file_handler)
DenseIngestor.check_valid(
self.header,
first_row,
query_params=(self.study_id, self.mongo_connection._client),
)
# Reset csv reader to first gene row
self.csv_file_handler = self.open_file(self.file_path)[0]
next(self.csv_file_handler)
for gene_docs, data_array_documents in self.transform():
self.load(gene_docs, data_array_documents)

def matches_file_type(file_type):
return file_type == "dense"
@staticmethod
def check_valid(header, first_row, query_params):
error_messages = []

try:
DenseIngestor.check_unique_header(header)
except ValueError as v:
error_messages.append(str(v))
try:
DenseIngestor.check_gene_keyword(header, first_row)
except ValueError as v:
error_messages.append(str(v))

try:
DenseIngestor.check_header_valid_values(header)
except ValueError as v:
error_messages.append(str(v))

try:
GeneExpression.check_unique_cells(header, *query_params)
except ValueError as v:
error_messages.append(str(v))

if len(error_messages) > 0:
raise ValueError('; '.join(error_messages))

return True


@staticmethod
def format_gene_name(gene):
Expand Down Expand Up @@ -112,42 +144,25 @@ def filter_expression_scores(scores: List, cells: List):
return valid_expression_scores, associated_cells

@staticmethod
def is_valid_format(header, row):
return all(
[
DenseIngestor.has_unique_header(header),
DenseIngestor.has_gene_keyword(header, row),
DenseIngestor.header_has_valid_values(header),
]
)

@staticmethod
def has_unique_header(header: List):
def check_unique_header(header: List):
"""Confirms header has no duplicate values"""
if len(set(header)) != len(header):
# Logger will replace this
print("Duplicate header values are not allowed")
return False
raise ValueError("Duplicate header values are not allowed")
return True

@staticmethod
def header_has_valid_values(header: List[str]):
def check_header_valid_values(header: List[str]):
"""Validates there are no empty header values"""
for value in header:
if not ("" == value or value.isspace()):
# If value is a string an Exception will occur because
# can't convert str to float
try:
if math.isnan(float(value)):
return False
except Exception:
pass
else:
return False
if value == "" or value.isspace():
raise ValueError("Header values cannot be blank")
if value.lower() == "nan":
raise ValueError(f"{value} is not allowed as a header value")

return True

@staticmethod
def has_gene_keyword(header: List, row: List):
def check_gene_keyword(header: List, row: List):
"""Validates that 'Gene' is the first value in header

Parameters:
Expand All @@ -163,7 +178,7 @@ def has_gene_keyword(header: List, row: List):
if (length_of_next_line - 1) == len(header):
return True
else:
return False
raise ValueError("Required 'GENE' header is not present")
else:
return True

Expand Down
44 changes: 43 additions & 1 deletion ingest/expression_files/expression_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
import ntpath
import sys
from dataclasses import dataclass
from typing import List # noqa: F401
from typing import List, Dict # noqa: F401

from bson.objectid import ObjectId
from mypy_extensions import TypedDict
Expand Down Expand Up @@ -64,6 +64,48 @@ def set_dataArray(self):
Each expression file will have its own implementation of setting the
DataArray with expression data."""

@staticmethod
def check_unique_cells(cell_names: List, study_id, client):
"""Checks cell names against database to confirm matrix contains unique
cell names

Parameters:
cell_names (List[str]): List of cell names in matrix
study_id (ObjectId): The study id the cell names belong to
client: MongoDB client
"""
COLLECTION_NAME = "data_arrays"
query = {
"$and": [
{"linear_data_type": "Study"},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not strictly necessary, but you could add {"linear_data_id": study_id}.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Im adding this suggestion in another PR.

{"array_type": "cells"},
{"study_id": study_id},
]
}
# Returned fields from query results
field_names = {"values": 1, "_id": 0}
# Dict = {values_1: [<cell names>]... values_n:[<cell names>]}
query_results: List[Dict] = list(
client[COLLECTION_NAME].find(query, field_names)
)
# Query did not return results
if not query_results:
return True
# Flatten query results
existing_cells = [
values
for cell_values in query_results
for values in cell_values.get("values")
]
dupes = set(existing_cells) & set(cell_names)
if len(dupes) > 0:
error_string = f'Expression file contains {len(dupes)} cells that also exist in another expression file. '
# add the first 3 duplicates to the error message
error_string += f'Duplicates include {", ".join(list(dupes)[:3])}'
raise ValueError(error_string)
return True


def set_data_array_cells(self, values: List, linear_data_id):
"""Sets DataArray for cells that were observed in an
expression matrix."""
Expand Down
112 changes: 56 additions & 56 deletions tests/test_dense.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys
import unittest
from unittest.mock import patch
from unittest.mock import patch, MagicMock
from bson.objectid import ObjectId

sys.path.append("../ingest")
from expression_files.dense_ingestor import DenseIngestor
Expand All @@ -9,23 +10,27 @@


class TestDense(unittest.TestCase):
# TODO add test method after SCP-2635 implemented

def test_process_header(self):
header = ["one", "two", '"three', "'four"]
actual_header = DenseIngestor.process_header(header)
self.assertEqual(["one", "two", "three", "four"], actual_header)

def test_header_has_valid_values(self):
def test_check_header_valid_values(self):
header = ["one", "two", '"three', "'four"]
self.assertTrue(DenseIngestor.header_has_valid_values(header))
self.assertTrue(DenseIngestor.check_header_valid_values(header))

empty_value = ["", "two"]
self.assertRaises(
ValueError, DenseIngestor.check_header_valid_values, empty_value
)
space_value = [" ", "two"]
self.assertRaises(
ValueError, DenseIngestor.check_header_valid_values, space_value
)
nan_value = ["nan", "two"]
self.assertFalse(DenseIngestor.header_has_valid_values(empty_value))
self.assertFalse(DenseIngestor.header_has_valid_values(space_value))
self.assertFalse(DenseIngestor.header_has_valid_values(nan_value))
self.assertRaises(
ValueError, DenseIngestor.check_header_valid_values, nan_value
)

def test_filter_expression_scores(self):
scores = ["BRCA1", 4, 0, 3, "0", None, "", " ", "nan", "NaN", "Nan", "NAN"]
Expand All @@ -51,28 +56,28 @@ def test_process_row(self):
invalid_row = ["' 1.BRCA1 '", '"3.45678"', "2"]
self.assertRaises(ValueError, DenseIngestor.process_row, invalid_row)

def test_has_gene_keyword(self):
def test_check_gene_keyword(self):
"""Validates validate_gene_keyword() returns false correctly"""
# Mimics the row following the header
row = ["BRCA1", "' 1.45678 '", '"3.45678"', "2"]

header = ["GENE", "foo", "foo2", "foo3"]
r_header = ["foo", "foo2", "foo3"]
self.assertTrue(DenseIngestor.has_gene_keyword(header, row))
self.assertTrue(DenseIngestor.has_gene_keyword(r_header, row))
self.assertTrue(DenseIngestor.check_gene_keyword(header, row))
self.assertTrue(DenseIngestor.check_gene_keyword(r_header, row))

empty_header = [""]
invalid_header = ["foo", "foo2", "foo3", "foo4"]
self.assertFalse(DenseIngestor.has_gene_keyword(invalid_header, row))
self.assertFalse(DenseIngestor.has_gene_keyword(empty_header, row))
self.assertRaises(ValueError, DenseIngestor.check_gene_keyword, invalid_header, row)
self.assertRaises(ValueError, DenseIngestor.check_gene_keyword, empty_header, row)

def test_has_unique_header(self):
def test_check_unique_header(self):
"""Validates validate_unique_header() returns false correctly"""
header = ["GENE", "foo", "foo2", "foo3"]
self.assertTrue(DenseIngestor.has_unique_header(header))
self.assertTrue(DenseIngestor.check_unique_header(header))

invalid_header = ["GENE", "foo", "foo", "foo3"]
self.assertFalse(DenseIngestor.has_unique_header(invalid_header))
self.assertRaises(ValueError, DenseIngestor.check_unique_header, invalid_header)

def test_duplicate_gene(self):
expression_matrix = DenseIngestor(
Expand All @@ -86,57 +91,52 @@ def test_duplicate_gene(self):
pass
self.assertEqual(str(error.exception), "Duplicate gene: Itm2a")

@patch("expression_files.dense_ingestor.DenseIngestor.has_unique_header")
@patch("expression_files.dense_ingestor.DenseIngestor.has_gene_keyword")
@patch("expression_files.dense_ingestor.DenseIngestor.header_has_valid_values")
def test_is_valid_format(
@patch("expression_files.expression_files.GeneExpression.check_unique_cells")
def test_check_valid(
self,
mock_has_unique_header,
mock_has_gene_keyword,
mock_header_has_valid_values,
mock_check_unique_cells
):
"""Confirms functions in is_valid_format() are called"""
# Should raise Value error
mock_has_unique_header.return_value = False
mock_has_gene_keyword.return_value = False
mock_header_has_valid_values.return_value = False
self.assertFalse(
DenseIngestor.is_valid_format(["foo", "foo1"], ["foo2", "foo3"])
)
self.assertTrue(mock_has_unique_header.called)
self.assertTrue(mock_has_gene_keyword.called)
self.assertTrue(mock_header_has_valid_values.called)

# Should raise Value error
mock_has_unique_header.return_value = True
mock_has_gene_keyword.return_value = False
mock_header_has_valid_values.return_value = False
self.assertFalse(
DenseIngestor.is_valid_format(["foo", "foo1"], ["foo2", "foo3"])
)

# Should raise Value error
mock_has_unique_header.return_value = True
mock_has_gene_keyword.return_value = True
mock_header_has_valid_values.return_value = False
self.assertFalse(
DenseIngestor.is_valid_format(["foo", "foo1"], ["foo2", "foo3"])
)
"""Confirms the errors are correctly promulgated"""
query_params = (ObjectId("5dd5ae25421aa910a723a337"), MagicMock())

# When is_valid_format() returns true
mock_has_unique_header.return_value = True
mock_has_gene_keyword.return_value = True
mock_header_has_valid_values.return_value = True
self.assertTrue(
DenseIngestor.is_valid_format(["foo", "foo1"], ["foo2", "foo3"])
DenseIngestor.check_valid(
["GENE", "foo", "foo2"],
["gene1", "0", "1"],
query_params
)
)

with self.assertRaises(ValueError) as cm:
DenseIngestor.check_valid(
["GENE", "foo", "foo"],
["gene1", "0", "1"],
query_params
)
expected_msg = "Duplicate header values are not allowed"
self.assertEqual(expected_msg, str(cm.exception))

# Should concatenate the two value errors
mock_check_unique_cells.return_value = True
with self.assertRaises(ValueError) as cm:
DenseIngestor.check_valid(
["foo", "nan"],
["foo2", "foo3"],
query_params
)
expected_msg = "Required 'GENE' header is not present; nan is not allowed as a header value"
self.assertEqual(expected_msg, str(cm.exception))

@patch("expression_files.expression_files.GeneExpression.load")
@patch(
"expression_files.dense_ingestor.DenseIngestor.transform",
return_value=[("foo1", "foo2")],
)
def test_execute_ingest(self, mock_load, mock_transform):
@patch(
"expression_files.expression_files.GeneExpression.check_unique_cells",
return_value=True,
)
def test_execute_ingest(self, mock_load, mock_transform, mock_has_unique_cells):
"""
Integration test for execute_ingest()
"""
Expand Down
39 changes: 39 additions & 0 deletions tests/test_expression_files.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import sys
import unittest
from unittest.mock import MagicMock
from bson.objectid import ObjectId

sys.path.append("../ingest")
from expression_files.expression_files import GeneExpression


class TestExpressionFiles(unittest.TestCase):
def test_check_unique_cells(self):
client_mock = MagicMock()
header = ["GENE", "foo", "foo2", "foo3"]

client_mock["data_arrays"].find.return_value = [
{"values": ["foo3", "foo4", "foo5"]},
{"values": ["foo6", "foo7", "foo8"]},
]
with self.assertRaises(ValueError) as cm:
GeneExpression.check_unique_cells(header, ObjectId(), client_mock)
self.assertTrue("contains 1 cells" in str(cm.exception))
self.assertTrue("foo3" in str(cm.exception))

header = ["GENE", "foo", "foo3", "foo2", "foo6"]
with self.assertRaises(ValueError) as cm:
GeneExpression.check_unique_cells(header, ObjectId(), client_mock)
self.assertTrue("contains 2 cells" in str(cm.exception))
self.assertTrue("foo3" in str(cm.exception))
self.assertTrue("foo6" in str(cm.exception))

header = ["GENE", "foo", "foo2"]
self.assertTrue(
GeneExpression.check_unique_cells(header, ObjectId(), client_mock)
)

client_mock["data_arrays"].find.return_value = []
self.assertTrue(
GeneExpression.check_unique_cells(header, ObjectId(), client_mock)
)
Loading