### Obtain a list of candidate ICD-10 codes using a standard dictionary

The files required to run this notebook are available here: https://www.ohdsi.org/analytic-tools/athena-standardized-vocabularies/


Alternatively, obtain them from this Google Drive link and unzip them to `../ohdsi-vocab`: https://drive.google.com/drive/folders/12Qm6nrlqOW2CrluHYmvw27XZdxtpgWwu?usp=sharing

In [1]:
import pandas as pd
import os

# Remove the column-width cap so long cell values are displayed without truncation.
pd.set_option("display.max_colwidth", None)

### Mounting Google Drive

In [2]:
# Google Colab's Drive helper (assumes this notebook runs in a Colab runtime — TODO confirm).
from google.colab import drive
# Mount Google Drive at /content/gdrive; DATA_HOME below is rooted under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Root of the shared-drive project folder, located under the /content/gdrive mount point.
DATA_HOME = "/content/gdrive/Shareddrives/PROJECT_ROOT_DIR"

# Directory (inside the project root) that the vocabulary files are read from and written to.
OHDSI_VOCAB_HOME = os.path.join(DATA_HOME, "ohdsi-vocab")

## Loading vocabularies

In [4]:
# Load the tab-separated CONCEPT table. The three listed columns are forced to str;
# the later injury filter relies on concept_code being a string (len / startswith).
concepts = pd.read_csv(
    os.path.join(OHDSI_VOCAB_HOME, 'CONCEPT.csv.gz'),
    sep='\t',
    dtype=dict.fromkeys(('standard_concept', 'concept_code', 'invalid_reason'), str),
)

In [5]:
# Preview the first five rows of the loaded concept table.
concepts.head(n=5)

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,45756805,Pediatric Cardiology,Provider,ABMS,Physician Specialty,S,OMOP4821938,19700101,20991231,
1,45756804,Pediatric Anesthesiology,Provider,ABMS,Physician Specialty,S,OMOP4821939,19700101,20991231,
2,45756803,Pathology-Anatomic / Pathology-Clinical,Provider,ABMS,Physician Specialty,S,OMOP4821940,19700101,20991231,
3,45756802,Pathology - Pediatric,Provider,ABMS,Physician Specialty,S,OMOP4821941,19700101,20991231,
4,45756801,Pathology - Molecular Genetic,Provider,ABMS,Physician Specialty,S,OMOP4821942,19700101,20991231,


In [6]:
# Number of concepts per vocabulary, largest first; show only the top 40.
(
    concepts
    .groupby('vocabulary_id')
    .size()
    .sort_values(ascending=False)
    .head(40)
)

vocabulary_id
RxNorm Extension    2091918
NDC                 1080136
SNOMED              1035027
SPL                  563307
Nebraska Lexicon     465801
dm+d                 387449
RxNorm               300300
LOINC                258836
OSM                  203339
ICD10PCS             194981
DPD                  193647
AMT                  136850
OMOP Genomic         120991
Read                 108696
MedDRA               105787
ICD10CM               97114
NDFRT                 69567
ICDO3                 64471
CIEL                  50881
BDPM                  44376
OPS                   41101
VANDF                 40136
NAACCR                34473
GGR                   27208
UK Biobank            19337
ICD9CM                17564
CPT4                  16615
ICD10                 16519
MeSH                  13636
OPCS4                 11000
HCPCS                 10793
CCAM                  10206
Multum                 9770
CTD                    8698
OXMIS                  8118
ClinVa

In [7]:
# Keep only rows of the 'ICD10CM' vocabulary. Note the concept table also lists a
# separate 'ICD10' vocabulary, which is NOT included here despite the variable name.
icd10_concepts = concepts.loc[concepts['vocabulary_id'].eq('ICD10CM')]

In [8]:
# (rows, columns) of the ICD10CM subset — a quick sanity check on the filter above.
icd10_concepts.shape

(97114, 10)

In [9]:
# Save the ICD10CM subset next to the source vocabulary files as a gzip-compressed,
# tab-separated file, without writing the DataFrame index.
icd10_concepts.to_csv(
    os.path.join(OHDSI_VOCAB_HOME, 'ICD10_CONCEPT.csv.gz'),
    index=False,
    sep='\t',
    compression='gzip',
)

In [10]:
# Keep the ICD10CM concepts whose code starts with "S" and is at most 5 characters long
# (the dot counts as a character, so e.g. "S00.0" passes and "S00.00" does not).
# Vectorized .str operations replace the row-wise lambda; a missing concept_code is
# simply excluded (na=False / NaN length) instead of raising TypeError inside len().
injuries = icd10_concepts[
    icd10_concepts['concept_code'].str.startswith('S', na=False)
    & icd10_concepts['concept_code'].str.len().le(5)
]
# NOTE(review): unlike the ICD10_CONCEPT.csv.gz export, this file is written with
# to_csv defaults (comma-separated, index included) and has no file extension —
# confirm what downstream readers expect before changing it.
injuries.to_csv(os.path.join(DATA_HOME, 'injury_ICD10'))

In [11]:
# Display the ICD10CM subset (the rendered output shows only the first and last rows).
icd10_concepts

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
533062,1567237,Cholera,Condition,ICD10CM,3-char nonbill code,,A00,20120101,20991231,
533063,35205411,Other shigellosis,Condition,ICD10CM,4-char billing code,,A03.8,20070101,20991231,
533064,1567260,Tularemia,Condition,ICD10CM,3-char nonbill code,,A21,20120101,20991231,
533065,45571436,Tabes dorsalis,Condition,ICD10CM,5-char billing code,,A52.11,19700101,20991231,
533066,35205707,Sandfly fever,Condition,ICD10CM,4-char billing code,,A93.1,20070101,20991231,
...,...,...,...,...,...,...,...,...,...,...
630171,1576313,Dependence on other enabling machines and devices,Condition,ICD10CM,4-char nonbill code,,Z99.8,20120101,20991231,
630172,45600392,Dependence on supplemental oxygen,Condition,ICD10CM,5-char billing code,,Z99.81,19700101,20991231,
630173,45595570,Dependence on other enabling machines and devices,Condition,ICD10CM,5-char billing code,,Z99.89,19700101,20991231,
630174,766502,Post COVID-19 condition,Condition,ICD10CM,3-char nonbill code,,U09,20211001,20991231,
