2 changes: 1 addition & 1 deletion README.md
@@ -1,5 +1,5 @@
# data-registry-api
![Coverage](https://img.shields.io/badge/coverage-57%25-brightgreen)
![Coverage](https://img.shields.io/badge/coverage-56%25-brightgreen)

## Running the server
### Running Locally
8 changes: 8 additions & 0 deletions dataregistry/api/model.py
@@ -300,6 +300,7 @@ class SGCCohortFile(BaseModel):
file_path: str
file_name: str
file_size: int
column_mapping: Union[Dict, None] = None
uploaded_at: Union[datetime, None] = None


@@ -321,6 +322,13 @@ class SGCCoOccurrenceMetadata(BaseModel):
phenotype_pair_counts: Dict[str, int] = {} # {"phenotype1|phenotype2": count}


class SGCPhenotypeCaseTotals(BaseModel):
phenotype_code: str
total_cases_across_cohorts: int
total_controls_across_cohorts: int
num_cohorts: int


class FileUpload(BaseModel):
id: UUID
dataset_name: str
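The model changes above are small: SGCCohortFile gains an optional column_mapping dict, and SGCPhenotypeCaseTotals is a new response shape for per-phenotype aggregates. Below is a minimal sketch of the new model, not part of the PR: the values are invented and the import path is inferred from the file location dataregistry/api/model.py.

```python
from dataregistry.api.model import SGCPhenotypeCaseTotals  # import path inferred from this diff

# Invented example values; the model only declares these four fields.
totals = SGCPhenotypeCaseTotals(
    phenotype_code="T2D",
    total_cases_across_cohorts=1250,
    total_controls_across_cohorts=48000,
    num_cohorts=3,
)
assert totals.num_cohorts == 3
```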
91 changes: 87 additions & 4 deletions dataregistry/api/query.py
@@ -809,16 +809,20 @@ def insert_sgc_cohort_file(engine, cohort_file) -> str:
file_id = str(cohort_file.id).replace('-', '') if cohort_file.id else str(uuid.uuid4()).replace('-', '')
cohort_id_hex = str(cohort_file.cohort_id).replace('-', '')

import json
column_mapping_json = json.dumps(cohort_file.column_mapping) if cohort_file.column_mapping else None

conn.execute(text("""
INSERT INTO sgc_cohort_files (id, cohort_id, file_type, file_path, file_name, file_size)
VALUES (:id, :cohort_id, :file_type, :file_path, :file_name, :file_size)
INSERT INTO sgc_cohort_files (id, cohort_id, file_type, file_path, file_name, file_size, column_mapping)
VALUES (:id, :cohort_id, :file_type, :file_path, :file_name, :file_size, :column_mapping)
"""), {
'id': file_id,
'cohort_id': cohort_id_hex,
'file_type': cohort_file.file_type,
'file_path': cohort_file.file_path,
'file_name': cohort_file.file_name,
'file_size': cohort_file.file_size
'file_size': cohort_file.file_size,
'column_mapping': column_mapping_json
})
conn.commit()
return file_id
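A hedged usage sketch, not from the PR, of how the new column_mapping field flows into insert_sgc_cohort_file: the mapping dict is serialized with json.dumps and stored in the new sgc_cohort_files.column_mapping column. Import paths are inferred from the file locations in this diff; the connection URL, ids, file path and mapping semantics are invented, and the sketch assumes SGCCohortFile declares the id, cohort_id and file_type fields the function reads (they sit above the visible part of the model diff).

```python
from uuid import uuid4

from sqlalchemy import create_engine

from dataregistry.api import query
from dataregistry.api.model import SGCCohortFile

# Placeholder connection string; the repo's real configuration is not shown in this diff.
engine = create_engine("mysql+pymysql://user:pass@localhost/dataregistry")

cohort_file = SGCCohortFile(
    id=uuid4(),
    cohort_id=uuid4(),                       # hypothetical cohort id
    file_type="cases_controls_both",
    file_path="sgc/example_cohort/cases_controls.tsv",  # hypothetical path
    file_name="cases_controls.tsv",
    file_size=1024,
    column_mapping={"phenotype": "pheno_code", "cases": "n_cases"},  # assumed semantics: upload column -> expected column
)

file_id = query.insert_sgc_cohort_file(engine, cohort_file)
```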
@@ -845,7 +849,7 @@ def get_sgc_cohort_by_id(engine, cohort_id: str):
c.id as cohort_id, c.name, c.uploaded_by, c.total_sample_size,
c.number_of_males, c.number_of_females, c.cohort_metadata, c.validation_status, c.created_at, c.updated_at,
f.id as file_id, f.file_type, f.file_path, f.file_name,
f.file_size, f.uploaded_at as file_uploaded_at
f.file_size, f.column_mapping, f.uploaded_at as file_uploaded_at
FROM sgc_cohorts c
LEFT JOIN sgc_cohort_files f ON c.id = f.cohort_id
WHERE c.id = :cohort_id
Expand All @@ -859,6 +863,8 @@ def get_sgc_cohort_by_id(engine, cohort_id: str):
row_dict = dict(row)
if row_dict['cohort_metadata']:
row_dict['cohort_metadata'] = json.loads(row_dict['cohort_metadata'])
if row_dict['column_mapping']:
row_dict['column_mapping'] = json.loads(row_dict['column_mapping'])
parsed_result.append(row_dict)
return parsed_result
return None
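And the read side, again only a sketch: after this change each row returned by get_sgc_cohort_by_id carries column_mapping already decoded from its JSON column, or None when no mapping was stored. The id format below follows the dashes-stripped hex convention used by insert_sgc_cohort_file; whether the function also accepts dashed UUIDs is not visible in this diff.

```python
# `engine` as in the previous sketch; the cohort id is an invented example.
rows = query.get_sgc_cohort_by_id(engine, "0f8fad5bd9cb469fa165b7e3d01cec55")
for row in rows or []:
    mapping = row["column_mapping"]  # already a dict, e.g. {"phenotype": "pheno_code"}, or None
```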
@@ -1122,3 +1128,80 @@ def delete_sgc_cooccurrence_metadata(engine, file_id: str) -> bool:
"""), {"file_id": file_id})
conn.commit()
return result.rowcount > 0


def get_sgc_phenotype_case_totals(engine):
"""
Get total cases and controls across all SGC cohorts by phenotype.
Returns a list of phenotype statistics aggregated from all cohorts.
"""
import json

with engine.connect() as conn:
# First, get all the phenotype_counts data
query = """
SELECT
c.id as cohort_id,
c.name as cohort_name,
ccm.phenotype_counts
FROM sgc_cohorts c
JOIN sgc_cohort_files cf ON c.id = cf.cohort_id
JOIN sgc_cases_controls_metadata ccm ON cf.id = ccm.file_id
WHERE cf.file_type = 'cases_controls_both'
AND ccm.phenotype_counts IS NOT NULL
"""

result = conn.execute(text(query)).mappings().all()

# Process the data in Python since JSON operations are complex in MySQL
phenotype_totals = {}

for row in result:
cohort_id = row['cohort_id']
cohort_name = row['cohort_name']

try:
phenotype_counts = json.loads(row['phenotype_counts'])

# Iterate through each phenotype in this cohort
for phenotype_code, counts in phenotype_counts.items():
if isinstance(counts, dict) and 'cases' in counts:
cases = counts.get('cases', 0)
controls = counts.get('controls', 0)

# Initialize if this is the first time we see this phenotype
if phenotype_code not in phenotype_totals:
phenotype_totals[phenotype_code] = {
'phenotype_code': phenotype_code,
'total_cases_across_cohorts': 0,
'total_controls_across_cohorts': 0,
'cohorts': set()
}

# Add to totals
if cases is not None:
phenotype_totals[phenotype_code]['total_cases_across_cohorts'] += int(cases)
if controls is not None:
phenotype_totals[phenotype_code]['total_controls_across_cohorts'] += int(controls)

phenotype_totals[phenotype_code]['cohorts'].add(cohort_id)

except (json.JSONDecodeError, TypeError) as e:
# Skip malformed JSON data
print(f"Error parsing phenotype_counts for cohort {cohort_id}: {e}")
continue

# Convert to final format
results = []
for phenotype_code, data in phenotype_totals.items():
results.append({
'phenotype_code': data['phenotype_code'],
'total_cases_across_cohorts': data['total_cases_across_cohorts'],
'total_controls_across_cohorts': data['total_controls_across_cohorts'],
'num_cohorts': len(data['cohorts'])
})

# Sort by total cases descending
results.sort(key=lambda x: x['total_cases_across_cohorts'], reverse=True)

return results
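Finally, a hedged sketch of how the new aggregation might be consumed, not part of this PR. The dictionaries it returns use exactly the field names of the SGCPhenotypeCaseTotals model added in model.py, so they can be validated directly; for example, if two cohorts each report a phenotype with 100 cases and 900 controls, the function yields one entry with total_cases_across_cohorts=200, total_controls_across_cohorts=1800 and num_cohorts=2, and the list is sorted by total cases descending.

```python
# `engine` as in the earlier sketches; import paths inferred from this diff.
from dataregistry.api import query
from dataregistry.api.model import SGCPhenotypeCaseTotals

rows = query.get_sgc_phenotype_case_totals(engine)
totals = [SGCPhenotypeCaseTotals(**row) for row in rows]  # dict keys match the model's fields
for t in totals[:10]:
    print(t.phenotype_code, t.total_cases_across_cohorts, t.total_controls_across_cohorts, t.num_cohorts)
```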