In [25]:
import requests
import zipfile
import os
import xml.etree.ElementTree as ET
import csv
import pandas as pd

In [26]:

# URL of the 2024 ICD-10-CM "Table and Index" ZIP archive on the CDC FTP mirror
icd10cm_dotted_url_2024 = "https://ftp.cdc.gov/pub/health_statistics/nchs/publications/icd10cm/2024/icd10cm-Table%20and%20Index-2024.zip"

# Local path where the downloaded ZIP file is saved
download_path = "./data/icd10cm-Table-and-Index-2024.zip"

# Ensure the download directory exists, creating it if needed.
# exist_ok=True keeps this safe if the directory appears between the check and the call.
download_directory = os.path.dirname(download_path)
if not os.path.exists(download_directory):
    os.makedirs(download_directory, exist_ok=True)
    print("Created directory:", download_directory)

# Perform the download.
# The timeout stops the cell from hanging forever when the server stops responding,
# and network-level errors are reported the same way as HTTP error statuses.
download_succeeded = False
try:
    response = requests.get(icd10cm_dotted_url_2024, timeout=60)
except requests.RequestException as exc:
    response = None
    print("Failed to download file:", exc)
else:
    if response.status_code == 200:
        with open(download_path, 'wb') as f:
            f.write(response.content)
        download_succeeded = True
        print("Download complete.")
    else:
        print("Failed to download file. HTTP status:", response.status_code)

# Directory that receives the contents of the ZIP file
extraction_path = "./data/icd10cm-Table-and-Index-2024"

# Ensure the extraction directory exists, creating it if needed
if not os.path.exists(extraction_path):
    os.makedirs(extraction_path, exist_ok=True)
    print("Created directory:", extraction_path)

# Extract the archive if one is available on disk
if os.path.exists(download_path):
    if not download_succeeded:
        # Make it visible that the archive comes from an earlier run, not this download
        print("Using previously downloaded archive:", download_path)
    with zipfile.ZipFile(download_path, 'r') as zip_ref:
        zip_ref.extractall(extraction_path)
    print("Extraction complete.")
else:
    print("Downloaded file not found.")


Download complete.
Extraction complete.


In [27]:

# Convert the <diag> entries of an XML file into a two-column CSV
def parse_xml_to_csv(xml_file_path, csv_file_path):
    """Write the name and description of nested <diag> elements to a CSV file.

    The XML file is parsed in full. For each direct child of the root element,
    every <diag> descendant of that child contributes one row, in document
    order. A <diag> that is itself a direct child of the root is not emitted,
    but its own <diag> descendants are.

    Args:
        xml_file_path: Path of the XML file to read.
        csv_file_path: Path of the CSV file to create or overwrite. It is
            written as UTF-8 with the header row ``code,description``.

    A missing or empty <name> / <desc> child yields an empty field.
    """
    document_root = ET.parse(xml_file_path).getroot()

    def diag_rows():
        # findtext returns "" both when the child is absent (via default) and
        # when it is present without text, so each field is always a string.
        for top_level in document_root:
            for diag in top_level.findall(".//diag"):
                yield [
                    diag.findtext("name", default=""),
                    diag.findtext("desc", default=""),
                ]

    with open(csv_file_path, 'w', newline='', encoding='utf-8') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(['code', 'description'])
        writer.writerows(diag_rows())

# Example usage: convert the extracted tabular XML into a CSV of codes and descriptions
# Source XML file (update with the path to your XML file)
xml_file_path = './data/icd10cm-Table-and-Index-2024/icd10cm-tabular-2024.xml'
# Desired path for the output CSV file
csv_file_path = './data/icd10cm-Table-and-Index-2024/icd10cm_with_dots_2024.csv'
parse_xml_to_csv(xml_file_path=xml_file_path, csv_file_path=csv_file_path)


In [28]:
# Read every column as text and keep empty cells as "" rather than NaN.
# With the defaults, an empty code or description becomes a float NaN, which
# breaks later string operations on the column (e.g. calling .replace on it).
icd_withdots = pd.read_csv(csv_file_path, dtype=str, keep_default_na=False)
icd_withdots.head(20)

Unnamed: 0,code,description
0,A00,Cholera
1,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol..."
2,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor"
3,A00.9,"Cholera, unspecified"
4,A01,Typhoid and paratyphoid fevers
5,A01.0,Typhoid fever
6,A01.00,"Typhoid fever, unspecified"
7,A01.01,Typhoid meningitis
8,A01.02,Typhoid fever with heart involvement
9,A01.03,Typhoid pneumonia


In [29]:

# Function to remove all dots from a code
def remove_dots(code):
    """Return ``code`` with every '.' character removed.

    Args:
        code: The code to normalise, normally a string such as "A01.00".

    Returns:
        The string without dots. A value that is not a string (for example
        None, or the float NaN that pandas uses for a missing cell) is
        returned unchanged instead of raising AttributeError.
    """
    if not isinstance(code, str):
        # Missing cells reach this function when it is applied over a column
        return code
    return code.replace('.', '')

# Build the dot-free form of each code, then store it as a new 'icd10cm_code' column
dotless_codes = icd_withdots['code'].apply(remove_dots)
icd_withdots['icd10cm_code'] = dotless_codes
icd_withdots.head(20)

Unnamed: 0,code,description,icd10cm_code
0,A00,Cholera,A00
1,A00.0,"Cholera due to Vibrio cholerae 01, biovar chol...",A000
2,A00.1,"Cholera due to Vibrio cholerae 01, biovar eltor",A001
3,A00.9,"Cholera, unspecified",A009
4,A01,Typhoid and paratyphoid fevers,A01
5,A01.0,Typhoid fever,A010
6,A01.00,"Typhoid fever, unspecified",A0100
7,A01.01,Typhoid meningitis,A0101
8,A01.02,Typhoid fever with heart involvement,A0102
9,A01.03,Typhoid pneumonia,A0103


In [30]:
# DataFrame.to_csv does not create missing parent directories, and nothing earlier
# in the notebook creates ./data/filtered, so make sure it exists before writing.
# The row index is still written as the first, unnamed column, as before.
filtered_csv_path = "./data/filtered/icd10cm_2024.csv"
os.makedirs(os.path.dirname(filtered_csv_path), exist_ok=True)
icd_withdots.to_csv(filtered_csv_path)