Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Data dictionary #5

Closed
wants to merge 9 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
/download
/brick
/unzip
37 changes: 37 additions & 0 deletions data_definitions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from functools import reduce
import httpx
from itertools import groupby
import json


# Endpoint that serves the AACT table/column definitions as JSON.
URL = "https://aact.ctti-clinicaltrials.org/definitions.json"

# 10 s default for every timeout phase, with a longer 60 s allowance for reads.
timeout = httpx.Timeout(10, read=60)
# Shared module-level client; every request made through it uses `timeout`.
client = httpx.Client(timeout=timeout)


def merger(acc: dict, d: dict):
    """Fold ``d`` into ``acc`` in place and return ``acc``.

    Shaped as a reducer for ``functools.reduce``: the accumulator is mutated
    (keys in ``d`` override keys already in ``acc``) and the very same
    accumulator object is handed back for the next step.
    """
    acc.update(d)
    return acc


def create_data_dict(ds):
    """Collapse ``(table_name, rows)`` pairs into one merged dict per table.

    Args:
        ds: Iterable of ``(table_name, rows)`` pairs, where ``rows`` is an
            iterable of dicts. Typically the output of ``itertools.groupby``.

    Returns:
        A dict mapping each table name to the union of all of its row dicts.
        Within a table, later rows override earlier ones on key collisions.
        The input dicts are not mutated.
    """
    out = {}
    for table_name, grouper in ds:
        # ``itertools.groupby`` only groups *consecutive* items, so the same
        # table name can arrive more than once when the input is not sorted.
        # Merge into whatever was already collected instead of overwriting it,
        # otherwise only the last run of rows for each table survives.
        columns = out.setdefault(table_name, {})
        for row in grouper:
            columns.update(row)
    return out


def save_json(resp):
    """Write a ``{table: {column: data type}}`` dictionary to disk.

    Args:
        resp: HTTP response whose JSON body is a list of objects carrying
            ``"table"``, ``"column"`` and ``"data type"`` keys.

    Raises:
        Whatever ``resp.raise_for_status()`` raises on a 4xx/5xx response.

    Side effects:
        Overwrites ``data_dictionary.json`` in the current working directory.
    """
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    resp.raise_for_status()

    # ``groupby`` only merges adjacent rows, so the rows must be ordered by
    # table first. ``sorted`` is stable, which keeps each table's columns in
    # the order the service returned them.
    rows = sorted(resp.json(), key=lambda v: v["table"])

    # Group on the raw rows and build the column mappings afterwards, so the
    # grouping key does not end up inside each table as a fake "table" column
    # (and cannot clobber a real column that happens to be named "table").
    column_groups = (
        (table_name, [{v["column"]: v["data type"]} for v in group])
        for table_name, group in groupby(rows, lambda v: v["table"])
    )
    data_dict = create_data_dict(column_groups)

    with open("data_dictionary.json", "w", encoding="utf-8") as f:
        json.dump(data_dict, f, indent=4)


if __name__ == "__main__":
    # Script entry point: download the definitions and write the dictionary.
    save_json(client.get(URL))
277 changes: 277 additions & 0 deletions data_dictionary.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,277 @@
{
"studies": {
"biospec_retention": "string",
"table": "studies",
"enrollment_type": "string",
"is_fda_regulated_drug": "boolean",
"is_us_export": "boolean",
"last_known_status": "string"
},
"analyzed_studies": {
"oversight": "string",
"table": "analyzed_studies",
"nct_id": "string",
"url": "string",
"brief_title": "string",
"start_month": "string",
"start_year": "string",
"overall_statusc": "string",
"p_completion_month": "string",
"p_completion_year": "string",
"completion_month": "string",
"completion_year": "string",
"verification_month": "string",
"verification_year": "string",
"p_comp_mn": "string",
"p_comp_yr": "string",
"received_year": "string",
"mntopcom": "string",
"enrollment": "string",
"number_of_arms": "string",
"allocation": "string",
"masking": "string",
"phasec": "string",
"primary_purpose": null,
"sponsor_name": null,
"agency_classc": "string",
"collaborator_names": "string",
"funding": "string",
"responsible_party_type": "string",
"responsible_party_organization": "string",
"behavioral": "string",
"results": "string",
"resultsreceived_month": "string",
"resultsreceived_year": "string",
"firstreceived_results_dt": "string",
"t2result": "string",
"t2result_imp": "string",
"t2resmod": "string",
"results12": "string",
"delayed": "string",
"dr_received_dt": "string",
"mn2delay": "string",
"delayed12": "string",
"id": "integer",
"intervg1": "string",
"biological": "string",
"device": "string",
"dietsup": "string",
"drug": "string",
"genetic": "string",
"procedure": "string",
"radiation": "string",
"otherint": "string"
},
"outcomes": {
"units_analyze": "string",
"table": "outcomes",
"dispersion_type": "string",
"param_type": "string"
},
"browse_conditions": {
"mesh_term": "string",
"table": "browse_conditions"
},
"pending_results": {
"event_date": "date",
"table": "pending_results"
},
"ipd_information_types": {
"name": "string",
"table": "ipd_information_types"
},
"design_group_interventions": {
"id": "integer",
"table": "design_group_interventions"
},
"brief_summaries": {
"description": "text",
"table": "brief_summaries"
},
"facility_contacts": {
"phone": "string",
"table": "facility_contacts"
},
"interventions": {
"description": "text",
"table": "interventions"
},
"browse_interventions": {
"id": "integer",
"table": "browse_interventions",
"mesh_term": "string"
},
"overall_officials": {
"name": "string",
"table": "overall_officials",
"affiliation": "string"
},
"reported_events": {
"assessment": "string",
"table": "reported_events"
},
"outcome_analyses": {
"ci_n_sides": "string",
"table": "outcome_analyses"
},
"calculated_values": {
"registered_in_calendar_year": "integer",
"table": "calculated_values"
},
"central_contacts": {
"contact_type": "string",
"table": "central_contacts"
},
"conditions": {
"name": "string",
"table": "conditions"
},
"design_groups": {
"group_type": "string",
"table": "design_groups"
},
"countries": {
"removed": "boolean",
"table": "countries",
"name": "string"
},
"design_outcomes": {
"outcome_type": "string",
"table": "design_outcomes"
},
"designs": {
"intervention_model": "string",
"table": "designs"
},
"detailed_descriptions": {
"description": "text",
"table": "detailed_descriptions"
},
"documents": {
"comment": "string",
"table": "documents",
"url": "string"
},
"provided_documents": {
"has_icf": "boolean",
"table": "provided_documents"
},
"eligibilities": {
"gender": "string",
"table": "eligibilities",
"gender_based": "boolean",
"healthy_volunteers": "string"
},
"facilities": {
"status": "string",
"table": "facilities"
},
"responsible_parties": {
"responsible_party_type": "string",
"table": "responsible_parties"
},
"id_information": {
"id_type": "string",
"table": "id_information"
},
"intervention_other_names": {
"name": "string",
"table": "intervention_other_names"
},
"facility_investigators": {
"role": "string",
"table": "facility_investigators"
},
"keywords": {
"name": "string",
"table": "keywords"
},
"links": {
"url": "string",
"table": "links",
"description": "text"
},
"sponsors": {
"agency_class": "string",
"table": "sponsors"
},
"study_references": {
"reference_type": "string",
"table": "study_references"
},
"search_results": {
"id": "integer",
"table": "search_results"
},
"result_agreements": {
"pi_employee": "string",
"table": "result_agreements"
},
"study_searches": {
"id": "integer",
"table": "study_searches"
},
"result_contacts": {
"name": "string",
"table": "result_contacts",
"phone": "string",
"email": "string"
},
"result_groups": {
"description": "text",
"table": "result_groups"
},
"baseline_measurements": {
"category": "string",
"table": "baseline_measurements"
},
"outcome_measurements": {
"param_type": "string",
"table": "outcome_measurements"
},
"baseline_counts": {
"scope": "string",
"table": "baseline_counts",
"units": "string"
},
"outcome_analysis_groups": {
"ctgov_group_code": "string",
"table": "outcome_analysis_groups"
},
"outcome_counts": {
"scope": "string",
"table": "outcome_counts"
},
"milestones": {
"count": "integer",
"table": "milestones",
"description": "text",
"title": "string",
"period": "string",
"ctgov_group_code": "string"
},
"participant_flows": {
"recruitment_details": "string",
"table": "participant_flows",
"pre_assignment_details": "string"
},
"drop_withdrawals": {
"period": "string",
"table": "drop_withdrawals"
},
"tagged_terms": {
"id": "integer",
"table": "tagged_terms"
},
"cdek_organizations": {
"id": "integer",
"table": "cdek_organizations"
},
"cdek_synonyms": {
"name": "string",
"table": "cdek_synonyms",
"preferred_name": "string",
"downcase_name": "string",
"downcase_preferred_name": "string"
}
}
50 changes: 34 additions & 16 deletions dvc.lock
Original file line number Diff line number Diff line change
@@ -1,36 +1,54 @@
schema: '2.0'
stages:
download:
cmd: Rscript stages/download.R
cmd: python3 stages/00_download.py
deps:
- path: https://aact.ctti-clinicaltrials.org/pipe_files
hash: md5
checksum: '270187926789710790283625292984097059272'
- path: stages/download.R
checksum: '286645490874361551640663643212643142456'
- path: stages/00_download.py
hash: md5
md5: 25796c42a85949526905b1c02c917ab7
size: 586
md5: 8d117a0f83077989efddea036576c11a
size: 1085
outs:
- path: download/
hash: md5
md5: 3be97a6b93b71c58a0f0a4195162ee9b.dir
size: 1805514254
md5: 1aa7eedcafa09cf85dea3f26171d3099.dir
size: 1814330916
nfiles: 1
build:
cmd: Rscript stages/build.R
cmd: python3 stages/03_build.py
deps:
- path: stages/03_build.py
hash: md5
md5: ca6f58e4fe1b01bb5a62a62c5a60e500
size: 1420
- path: unzip/
hash: md5
md5: c60563edc132bc7ca753bee8864f8285.dir
size: 12008296453
nfiles: 47
outs:
- path: brick/
hash: md5
md5: 81e5a88e9d89c4d4525551ce7f596d72.dir
size: 2732820598
nfiles: 47
unzip:
cmd: python3 stages/02_unzip.py
deps:
- path: download/
hash: md5
md5: 3be97a6b93b71c58a0f0a4195162ee9b.dir
size: 1805514254
md5: 1aa7eedcafa09cf85dea3f26171d3099.dir
size: 1814330916
nfiles: 1
- path: stages/build.R
- path: stages/02_unzip.py
hash: md5
md5: 838d95032e611a7168262b12616d9d0f
size: 517
md5: 3510bb5e24a50ec96768e9f375b69e31
size: 552
outs:
- path: brick/
- path: unzip/
hash: md5
md5: 162aa0bcf28a151c2d6aef3bea897f47.dir
size: 3150743193
md5: c60563edc132bc7ca753bee8864f8285.dir
size: 12008296453
nfiles: 47
Loading
Loading