Process i2b2 annotated concept files, extracting contents, and creating DataFrame with the file names, content

In [1]:
import os
import pandas as pd
import re

# Function to process a file and extract information
def process_file(file_path):
    with open(file_path, 'r') as file:
        file_content = file.read()

    return os.path.basename(file_path), file_content

# Specify the folder path
folder_path = "./i2b2_2010_VA_training_data/consolidated_concpet_training_data"

# List all files in the folder
files = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]

# Process each file and store the results in a list of tuples
file_contents = []
for file in files:
    file_path = os.path.join(folder_path, file)
    file_name, content = process_file(file_path)
    file_name_without_extension, _ = os.path.splitext(file_name)
    file_contents.append((file_name, file_name_without_extension, content))

# Create a DataFrame
df = pd.DataFrame(file_contents, columns=['File Name', 'File Name without Extension', 'File Content'])

# Display the result DataFrame
print(df)


               File Name File Name without Extension  \
0       018636330_DH.con                018636330_DH   
1      026350193_RWH.con               026350193_RWH   
2      037945397_RWH.con               037945397_RWH   
3    044687343_ELMVH.con             044687343_ELMVH   
4       060376519_DH.con                060376519_DH   
..                   ...                         ...   
165        record-80.con                   record-80   
166        record-81.con                   record-81   
167        record-82.con                   record-82   
168        record-83.con                   record-83   
169        record-84.con                   record-84   

                                          File Content  
0    c="a workup" 27:2 27:3||t="test"\nc="pain" 55:...  
1    c="flexeril" 39:5 39:5||t="treatment"\nc="cons...  
2    c="ivf" 39:0 39:0||t="treatment"\nc="near sync...  
3    c="lisinopril pump" 88:5 88:6||t="treatment"\n...  
4    c="dizziness" 31:0 31:0||t="problem"\

In [2]:
df.head()

Unnamed: 0,File Name,File Name without Extension,File Content
0,018636330_DH.con,018636330_DH,"c=""a workup"" 27:2 27:3||t=""test""\nc=""pain"" 55:..."
1,026350193_RWH.con,026350193_RWH,"c=""flexeril"" 39:5 39:5||t=""treatment""\nc=""cons..."
2,037945397_RWH.con,037945397_RWH,"c=""ivf"" 39:0 39:0||t=""treatment""\nc=""near sync..."
3,044687343_ELMVH.con,044687343_ELMVH,"c=""lisinopril pump"" 88:5 88:6||t=""treatment""\n..."
4,060376519_DH.con,060376519_DH,"c=""dizziness"" 31:0 31:0||t=""problem""\nc=""benig..."


Extract concepts and associated concpet type from the file content

In [3]:
import pandas as pd

def extract_information(data):
    # Initialize lists to store extracted information
    concept_list = []
    concept_type_list = []

    # Parse data and extract information into lists
    for row in data.split('\n'):
        if row:
            # Split the row by double quotes to extract concept and concept type
            concept_list.append(row.split('"')[1].strip())
            concept_type_list.append(row.split('"')[-2].strip())

    return pd.Series([concept_list, concept_type_list])

# Assuming df is already defined
df[['Concept List', 'Concept Type List']] = df['File Content'].apply(extract_information)

# Display the updated DataFrame
print(df)


               File Name File Name without Extension  \
0       018636330_DH.con                018636330_DH   
1      026350193_RWH.con               026350193_RWH   
2      037945397_RWH.con               037945397_RWH   
3    044687343_ELMVH.con             044687343_ELMVH   
4       060376519_DH.con                060376519_DH   
..                   ...                         ...   
165        record-80.con                   record-80   
166        record-81.con                   record-81   
167        record-82.con                   record-82   
168        record-83.con                   record-83   
169        record-84.con                   record-84   

                                          File Content  \
0    c="a workup" 27:2 27:3||t="test"\nc="pain" 55:...   
1    c="flexeril" 39:5 39:5||t="treatment"\nc="cons...   
2    c="ivf" 39:0 39:0||t="treatment"\nc="near sync...   
3    c="lisinopril pump" 88:5 88:6||t="treatment"\n...   
4    c="dizziness" 31:0 31:0||t="prob

In [4]:
df.head()

Unnamed: 0,File Name,File Name without Extension,File Content,Concept List,Concept Type List
0,018636330_DH.con,018636330_DH,"c=""a workup"" 27:2 27:3||t=""test""\nc=""pain"" 55:...","[a workup, pain, microscopic anterior cervical...","[test, problem, treatment, problem, treatment,..."
1,026350193_RWH.con,026350193_RWH,"c=""flexeril"" 39:5 39:5||t=""treatment""\nc=""cons...","[flexeril, constipation, left shoulder / neck ...","[treatment, problem, problem, treatment, probl..."
2,037945397_RWH.con,037945397_RWH,"c=""ivf"" 39:0 39:0||t=""treatment""\nc=""near sync...","[ivf, near syncope, recurrent dizziness, dehyd...","[treatment, problem, problem, problem, test, p..."
3,044687343_ELMVH.con,044687343_ELMVH,"c=""lisinopril pump"" 88:5 88:6||t=""treatment""\n...","[lisinopril pump, bipap, copd, nad, fatigue, g...","[treatment, treatment, problem, problem, probl..."
4,060376519_DH.con,060376519_DH,"c=""dizziness"" 31:0 31:0||t=""problem""\nc=""benig...","[dizziness, benign positional vertigo, meclizi...","[problem, problem, treatment, problem, problem..."


mapping concept with UMLS and extract CUIs & TUIs

https://uts-ws.nlm.nih.gov/rest/search/current?string=cancer&apiKey=efd9c726-5226-43c1-8cb1-c5ac40bae98c&searchType=exact

https://uts-ws.nlm.nih.gov/rest/content/2015AB/CUI/C0006826?apiKey=efd9c726-5226-43c1-8cb1-c5ac40bae98c

scope of update,

https://uts-ws.nlm.nih.gov/rest/content/2023AB/CUI/C0006826?apiKey=efd9c726-5226-43c1-8cb1-c5ac40bae98c

In [6]:
# this is the final version of the UMLS mapping code, considers paginate and latest UMLS library version='2023AB'
import requests
import time
from cachetools import cached, TTLCache

# Define a cache with a TTL (time-to-live) of 24 hours for search results
cache = TTLCache(maxsize=100, ttl=86400)

@cached(cache)
def search_umls(term, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/search/current', pageSize=25):
    all_cuis = []
    pageNumber = 1
    totalResults = float('inf')  # Set to infinity initially to enter the loop

    while len(all_cuis) < totalResults:
        try:
            response = requests.get(f'{base_url}?string={term}&apiKey={api_key}&searchType=exact&pageSize={pageSize}&pageNumber={pageNumber}')
            data = response.json()
            results = data['result']['results']
            totalResults = data['result']['recCount']  # Total number of results for the term

            all_cuis.extend([result['ui'] for result in results])

            pageNumber += 1  # Increment page number for next iteration
        except Exception as e:
            print(f"Error processing term '{term}': {e}")
            break  # Exit loop in case of an error

    return all_cuis

@cached(cache)
def get_tuis_for_cui(cui, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/content', version='2023AB'):
    try:
        response = requests.get(f'{base_url}/{version}/CUI/{cui}?apiKey={api_key}')
        data = response.json()
        semantic_types = data['result']['semanticTypes']
        return [semantic_type['uri'].split('/')[-1] for semantic_type in semantic_types if semantic_type.get('uri')]
    except Exception as e:
        print(f"Error processing CUI '{cui}': {e}")
        return []

def retrieve_cuis_and_tuis_from_umls(word_list, api_key, version='2023AB', request_delay=0.1):
    cuis = set()
    tuis = set()

    for term in word_list:
        term_cuis = search_umls(term, api_key)
        cuis.update(term_cuis)
        time.sleep(request_delay)

    for cui in cuis:
        cui_tuis = get_tuis_for_cui(cui, api_key, version=version)
        tuis.update(cui_tuis)
        time.sleep(request_delay)

    return list(cuis), list(tuis)

# Your UMLS API key
api_key = 'efd9c726-5226-43c1-8cb1-c5ac40bae98c'  # Replace 'YOUR_API_KEY_HERE' with your actual UMLS API key


# Function to map tags to CUIs and TUIs for each row in the dataframe
def map_tags_to_cuis_and_tuis(tags, api_key):
    cuis, tuis = retrieve_cuis_and_tuis_from_umls(tags, api_key)
    return cuis, tuis

# Applying the mapping function to each row in the dataframe and storing the result in new columns
df[['CUIs', 'TUIs']] = df['Concept List'].apply(lambda tags: pd.Series(map_tags_to_cuis_and_tuis(tags, api_key)))

Error processing term 'intraparenchymal bleed': ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error processing term 'lymphocytic ? chromocytoma': 'result'
Error processing term '&quot; locked in &quot;': 'result'
Error processing term '&quot; top of the basilar &quot; syndrome': 'result'
Error processing term 'tylenol #3': 'result'
Error processing term 'tylenol # 3': 'result'
Error processing term 'csf labeled tube # 1': 'result'
Error processing term 'csf tube labeled # 4': 'result'
Error processing term '# 19 st. jude valve': 'result'
Error processing term '&quot; feeling down &quot;': 'result'
Error processing term '': 'result'
Error processing CUI 'C0030049': ('Connection aborted.', ConnectionResetError(10054, 'An existing connection was forcibly closed by the remote host', None, 10054, None))
Error processing term 'a #21 mosaic porcine valve': 'result'
Error processing CUI 'C3540792': 'resu

In [7]:
# import pandas as pd
# import requests
# import time
# from cachetools import cached, TTLCache

# # Define a cache with a TTL (time-to-live) of 24 hours for search results
# cache = TTLCache(maxsize=100, ttl=86400)

# @cached(cache)
# def search_umls(term, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/search/current'):
#     try:
#         response = requests.get(f'{base_url}?string={term}&apiKey={api_key}&searchType=exact')
#         data = response.json()
#         results = data['result']['results']
#         return [result['ui'] for result in results]
#     except Exception as e:
#         print(f"Error processing term '{term}': {e}")
#         return []

# @cached(cache)
# def get_tuis_for_cui(cui, api_key, base_url='https://uts-ws.nlm.nih.gov/rest/content', version='2015AB'):
#     try:
#         response = requests.get(f'{base_url}/{version}/CUI/{cui}?apiKey={api_key}')
#         data = response.json()
#         semantic_types = data['result']['semanticTypes']
#         return [semantic_type['uri'].split('/')[-1] for semantic_type in semantic_types if semantic_type.get('uri')]
#     except Exception as e:
#         print(f"Error processing CUI '{cui}': {e}")
#         return []

# def retrieve_cuis_and_tuis_from_umls(word_list, api_key, version='2015AB', request_delay=0.1):
#     cuis = set()
#     tuis = set()

#     for term in word_list:
#         term_cuis = search_umls(term, api_key)
#         cuis.update(term_cuis)
#         time.sleep(request_delay)

#     for cui in cuis:
#         cui_tuis = get_tuis_for_cui(cui, api_key, version=version)
#         tuis.update(cui_tuis)
#         time.sleep(request_delay)

#     return list(cuis), list(tuis)

# # Your UMLS API key
# api_key = 'efd9c726-5226-43c1-8cb1-c5ac40bae98c'

# # Function to map tags to CUIs and TUIs for each row in the dataframe
# def map_tags_to_cuis_and_tuis(tags, api_key):
#     cuis, tuis = retrieve_cuis_and_tuis_from_umls(tags, api_key)
#     return cuis, tuis

# # Applying the mapping function to each row in the dataframe and storing the result in new columns
# df[['CUIs', 'TUIs']] = df['Concept List'].apply(lambda tags: pd.Series(map_tags_to_cuis_and_tuis(tags, api_key)))

In [8]:
df.head()

Unnamed: 0,File Name,File Name without Extension,File Content,Concept List,Concept Type List,CUIs,TUIs
0,018636330_DH.con,018636330_DH,"c=""a workup"" 27:2 27:3||t=""test""\nc=""pain"" 55:...","[a workup, pain, microscopic anterior cervical...","[test, problem, treatment, problem, treatment,...","[C0020473, C0592278, C0015967, C0728797, C0162...","[T169, T121, T201, T168, T047, T033, T184, T06..."
1,026350193_RWH.con,026350193_RWH,"c=""flexeril"" 39:5 39:5||t=""treatment""\nc=""cons...","[flexeril, constipation, left shoulder / neck ...","[treatment, problem, problem, treatment, probl...","[C0012306, C0728797, C0009214, C0262573, C4255...","[T121, T201, T037, T184, T109]"
2,037945397_RWH.con,037945397_RWH,"c=""ivf"" 39:0 39:0||t=""treatment""\nc=""near sync...","[ivf, near syncope, recurrent dizziness, dehyd...","[treatment, problem, problem, problem, test, p...","[C0015915, C2263086, C1419864, C4284399, C2751...","[T026, T028, T201, T047, T184, T061, T059, T037]"
3,044687343_ELMVH.con,044687343_ELMVH,"c=""lisinopril pump"" 88:5 88:6||t=""treatment""\n...","[lisinopril pump, bipap, copd, nad, fatigue, g...","[treatment, treatment, problem, problem, probl...","[C5539771, C3539604, C1623258, C2051415, C0302...","[T055, T025, T201, T078, T033, T184, T191, T03..."
4,060376519_DH.con,060376519_DH,"c=""dizziness"" 31:0 31:0||t=""problem""\nc=""benig...","[dizziness, benign positional vertigo, meclizi...","[problem, problem, treatment, problem, problem...","[C0155502, C0018681, C0012833, C0015967, C1457...","[T169, T121, T201, T033, T047, T184, T060, T109]"


In [9]:
df.to_csv('.\i2b2_2010_VA_training_data\consolidated_concpet_training_data\i2b2_annotated_concepts_2023AB.csv', index=False)