In [27]:
import pandas as pd
pd.set_option('display.max_colwidth', None)

import re
import os
import csv

from tqdm.notebook import tqdm

In [28]:
DATA_PATH = 'outputs/humanatlas.csv'

data = pd.read_csv(DATA_PATH, lineterminator="\n")

print(f'Loaded dataset with {len(data)} rows')

Loaded dataset with 457 rows


In [29]:
with open('EDAM/edam_topics.txt', 'r') as f:
    edam_topics = [topic.strip() for topic in f.readlines()]

quoted_topics = [topic for topic in edam_topics if topic.startswith('"') and topic.endswith('"')]

# Remove quotes
edam_topics = [topic[1:-1] if topic.startswith('"') and topic.endswith('"') else topic for topic in edam_topics]

## Format Outputs

Split outputs on tab, and check for other separators that GPT may have used in error.

In [30]:
data['Predictions'] = data['Predictions'].str.replace('\\t', '\t')


In [31]:
def split_topics(topics):
    cleaned_topics = [topic.strip() for topic in topics.split('\t')]
    for i in range(len(cleaned_topics)):
        for quoted_topic in quoted_topics:
            if quoted_topic.replace('\"', '').lower() in cleaned_topics[i].lower():
                cleaned_topics[i] = cleaned_topics[i].replace(quoted_topic.replace('\"', ''), quoted_topic)
                break
            else:
                cleaned_topics[i] = cleaned_topics[i].replace('\"', '')
    return cleaned_topics

data['Predictions'] = data['Predictions'].apply(split_topics)

In [32]:
separators = ['    ', '   ', '  ', '\n', '<TAB>', 'TAB', '<tab>', '(tab)', '<Tab>', '[tab]', '▪️', '(Tab)', '\xa0\xa0\xa0\xa0', '\xa0', '\u2003', '、', '\x0b', '\x0c', ';', '.', '--', '-', '–', '_', '\\', '\\n', '/', '@', '|', '\r', '+', '<', '>', '·']

# Join the separators with the regex OR operator |
sep_pattern = '|'.join(map(re.escape, separators))

for i in tqdm(range(len(data))):
    # Replace commas not enclosed in double quotes with |
    data['Predictions'][i] = [re.sub(r',(?=(?:[^"]*"[^"]*")*[^"]*$)', '|', pred) for pred in data['Predictions'][i]]
    # Split on the separators
    split_list = [re.split(sep_pattern, pred) for pred in data['Predictions'][i]]
    # Flatten the list
    data['Predictions'][i] = [item for sublist in split_list for item in sublist]

data['Predictions'] = data['Predictions'].apply(lambda x: [item.strip() for item in x])

data['Predictions'] = data['Predictions'].apply(lambda x: [re.sub(r'Category \d+:', '', pred) for pred in x])

  0%|          | 0/457 [00:00<?, ?it/s]

In [33]:
is_one_dimensional = all(isinstance(pred, str) for preds in data['Predictions'] for pred in preds)
print(is_one_dimensional)

True


In [34]:
# ## Capture any weirdly formatted outputs (using the wrong separators)

# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('    ') if 0 < len(x) <= 1 and '    ' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('   ') if 0 < len(x) <= 1 and '   ' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('  ') if 0 < len(x) <= 1 and '  ' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('\n') if 0 < len(x) <= 1 and '\n' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('<TAB>') if 0 < len(x) <= 1 and '<TAB>' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('TAB') if 0 < len(x) <= 1 and 'TAB' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('<tab>') if 0 < len(x) <= 1 and '<tab>' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('(tab)') if 0 < len(x) <= 1 and '(tab)' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('<Tab>') if 0 < len(x) <= 1 and '<Tab>' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('[tab]') if 0 < len(x) <= 1 and '[tab]' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('▪️') if 0 < len(x) <= 1 and '▪️' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('(Tab)') if 0 < len(x) <= 1 and '<Tab>' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('\xa0\xa0\xa0\xa0') if 0 < len(x) <= 1 and '\xa0\xa0\xa0\xa0' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('\xa0') if 0 < len(x) <= 1 and '\xa0' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('\u2003') if 0 < len(x) <= 1 and '\u2003' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('、') if 0 < len(x) <= 1 and '、' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('\x0b') if 0 < len(x) <= 1 and '\x0b' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('\x0c') if 0 < len(x) <= 1 and '\x0c' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split(';') if 0 < len(x) <= 1 and ';' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('.') if 0 < len(x) <= 1 and '.' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('--') if 0 < len(x) <= 1 and '--' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('-') if 0 < len(x) <= 1 and '-' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('–') if 0 < len(x) <= 1 and '–' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('_') if 0 < len(x) <= 1 and '_' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('\\') if 0 < len(x) <= 1 and '\\' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('\\n') if 0 < len(x) <= 1 and '\\n' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('/') if 0 < len(x) <= 1 and '/' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('@') if 0 < len(x) <= 1 and '@' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('|') if 0 < len(x) <= 1 and '|' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('\r') if 0 < len(x) <= 1 and '\r' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('+') if 0 < len(x) <= 1 and '+' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('<') if 0 < len(x) <= 1 and '<' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split('>') if 0 < len(x) <= 1 and '>' in list(x)[0] else x)
# data['Predictions'] = data['Predictions'].apply(lambda x: [pred.strip() for pred in csv.reader([list(x)[0]],skipinitialspace=True, delimiter=',', quotechar='"').__next__()])
# data['Predictions'] = data['Predictions'].apply(lambda x: [re.sub(r'Category \d+:', '', pred) for pred in x])
# # data['Predictions'] = data['Predictions'].apply(lambda x: list(x)[0].split(', ') if len(x) <= 1 and ', ' in list(x)[0] else x)

In [35]:
# check if any of the quoted topics, or their equivalent without quotes, 
# are in any of the prediction sets with length less than or equal to 1. ""
# If there is, then add the quotes back in if they don't have them,
#  and then split on commas while avoiding anything inside quotes
def process_predictions(predictions):
    processed_predictions = []
    for prediction in predictions:
        formatted = False
        for topic in quoted_topics:
            formatted_topic = topic.replace('\"', '')
            if formatted_topic in prediction:
                processed_prediction = prediction.replace(formatted_topic, f'{topic}')
                processed_predictions.append(processed_prediction)
                formatted = True
                break
        if not formatted:
            processed_predictions.append(prediction)
        
    final_predictions = []
    for prediction in processed_predictions:
        if '\"' in prediction:
            parts = re.findall(r'[^"]+|"[^"]+"', prediction)
            final_predictions.extend(parts)
        else:
            final_predictions.extend([pred.strip() for pred in prediction.split(',')])
    return set(final_predictions)

data['Predictions'] = data['Predictions'].apply(process_predictions)

In [36]:
filtered_predictions = data[data['Predictions'].apply(len) <= 1]['Predictions']

# Filter out any expected predictions. May or may not be hallucinations
unexpected_predictions = []
for original_index, pred_set in filtered_predictions.items():
    lst = list(pred_set)
    if len(lst) == 0:
        continue
    prediction = lst[0]
    if '\"' not in prediction and ' ' in prediction and prediction not in edam_topics:
        unexpected_predictions.append((original_index, prediction))

# Print the unexpected predictions and their corresponding original indices
count = len(unexpected_predictions)
print(f"Number of unexpected predictions: {count}")
for original_index, prediction in unexpected_predictions:
    print(f"Original Index: {original_index}, Prediction: {prediction}")

Number of unexpected predictions: 3
Original Index: 277, Prediction: Spatial transcriptomic profiling
Original Index: 389, Prediction: Single cell RNA sequencing (RNAseq)
Original Index: 392, Prediction: Single cell transcriptome sequencing


## Hallucinations

Filter out topics not in the EDAM topics list. The filtered topics may be matched to a topic or synonym->topic in the next section.

In [37]:
data['Hallucinations'] = data['Predictions'].apply(lambda preds: set([pred.replace('.', '').replace('\"', '') for pred in preds if pred.replace('.', '').replace('\"', '') not in edam_topics]))

In [38]:
data['Predictions'] = data['Predictions'].apply(lambda preds: set([pred.replace('.', '').replace('\"', '') for pred in preds if pred.replace('.', '').replace('\"', '') in edam_topics]))
data['Predictions'] = data.apply(lambda row: set([topic for topic in row['Predictions'] if topic not in row['Hallucinations']]), axis=1)

## Synonym matching

Check for mispelled/misformatted topics or synonyms using levenshtein

In [39]:
edam = pd.read_csv('EDAM/EDAM.csv')

edam = edam[edam['Class ID'].str.contains('topic')].reset_index(drop=True)
# edam['Preferred Label'].apply(lambda topic: topic.replace('\"', ''))
edam = edam[edam['Preferred Label'].isin([topic.replace('\"', '') for topic in edam_topics])].reset_index(drop=True)

In [40]:
edam['Synonyms'] = edam['Synonyms'].fillna('').apply(lambda x: x.split('|') if x != '' else [])

In [41]:
missing_topics = set([topic.replace('\"', '') for topic in edam_topics]) - set(edam['Preferred Label'])
missing_topics

set()

In [42]:
synonym_dict = {}

for index, row in edam.iterrows():
    for synonym in row['Synonyms']:
        synonym_dict[synonym] = row['Preferred Label']

In [43]:
synonym_dict

{'Molecular docking': 'Molecular modelling',
 'Homology modeling': 'Molecular modelling',
 'Docking': 'Molecular modelling',
 'Comparative modelling': 'Molecular modelling',
 'Homology modelling': 'Molecular modelling',
 'Evolution': 'Evolutionary biology',
 'Freshwater science': 'Freshwater biology',
 'Nutrition': 'Nutritional science',
 'Nutrition science': 'Nutritional science',
 'Dietetics': 'Nutritional science',
 'Cardiovascular medicine': 'Cardiology',
 'Heart disease': 'Cardiology',
 'Cardiovascular disease': 'Cardiology',
 'Gene features': 'Gene structure',
 'Fusion genes': 'Gene structure',
 'Transcriptome': 'Transcriptomics',
 'Comparative transcriptomics': 'Transcriptomics',
 'Ancestral genomes': 'Paleogenomics',
 'Paleogenetics': 'Paleogenomics',
 'Panomics': 'Multiomics',
 'Pan-omics': 'Multiomics',
 'Integrative omics': 'Multiomics',
 'Multi-omics': 'Multiomics',
 'RNA metabarcoding': 'Metabarcoding',
 'DNA metabarcoding': 'Metabarcoding',
 'eDNA metabarcoding': 'Metabar

In [44]:
import Levenshtein

hallucinations = data['Hallucinations']

matched_topics = {}

for hallucination_set in tqdm(hallucinations):
    for hallucination in hallucination_set:
        if hallucination in matched_topics:
            continue
        matched = False
        # First check for a match in the topics list
        sorted_topics = sorted(edam_topics, key=lambda topic: Levenshtein.distance(hallucination, topic))
        for topic in sorted_topics:
            distance = Levenshtein.distance(hallucination, topic)
            if  0 < distance <= 2:
                matched_topics[hallucination] = topic
                matched = True
                break
        
        # If the hallucination has already been matched, skip to the next hallucination
        if matched:
            continue
        
        # If no match in the topics list, look through the available synonyms
        sorted_synonyms = sorted(synonym_dict.keys(), key=lambda topic: Levenshtein.distance(hallucination, topic))
        for topic in sorted_synonyms:
            distance = Levenshtein.distance(hallucination, topic)
            if 0 <= distance <= 1:
                matched_topics[hallucination] = synonym_dict[topic]
                matched = True
                break     

        if matched:
            continue

        for topic in sorted_topics:
            if topic.lower() in hallucination.lower().split():
                matched_topics[hallucination] = topic
                break
        # No break reached
        else:
            for topic in sorted_synonyms:
                if topic.lower() in hallucination.lower().split():
                    matched_topics[hallucination] = synonym_dict[topic]
                    break

matched_topics

  0%|          | 0/457 [00:00<?, ?it/s]

{'cell RNA': 'RNA',
 'Spatial mapping of developing human brain regions': 'Mapping',
 'cell RNA sequencing': 'RNA',
 'Transcription factor inhibition': 'Gene expression',
 'RNA sequencing': 'RNA',
 'Development of light': 'Developmental biology',
 'nucleus RNA sequencing': 'RNA',
 'Transcriptome analysis': 'Transcriptomics',
 'Bulk transcriptome analysis': 'Transcriptomics',
 'Single cell RNA sequencing (scRNAseq)': 'RNA',
 'Gene ontology functional enrichment analysis': 'Ontology and terminology',
 'cell RNA sequencing (scRNA': 'RNA',
 'Dense genotyping data integration': 'Genotype and phenotype',
 'cell transcriptome studies': 'Transcriptomics',
 'clamp electrophysiology': 'Physiology',
 'cell RNA sequencing (scRNAseq)': 'RNA',
 '10X Genomics technology': 'Genomics',
 'marker expression analysis': 'Gene expression',
 'Single cell RNA sequencing': 'RNA',
 'nucleus transcriptome profiling': 'Transcriptomics',
 'regulatory element mapping': 'Mapping',
 '10x Genomics Chromium Instrument'

In [45]:
for index, row in data.iterrows():
    if len(row['Hallucinations']) > 0:
        for hallucination in list(row['Hallucinations']):
            if hallucination in matched_topics:
                print(f"'{hallucination}' in row {index} matches topic '{matched_topics[hallucination]}'")
                data.at[index, 'Predictions'].add(matched_topics[hallucination])
                continue

'cell RNA' in row 0 matches topic 'RNA'
'Spatial mapping of developing human brain regions' in row 0 matches topic 'Mapping'
'cell RNA sequencing' in row 1 matches topic 'RNA'
'Transcription factor inhibition' in row 1 matches topic 'Gene expression'
'RNA sequencing' in row 2 matches topic 'RNA'
'Development of light' in row 2 matches topic 'Developmental biology'
'cell RNA sequencing' in row 3 matches topic 'RNA'
'nucleus RNA sequencing' in row 4 matches topic 'RNA'
'Transcriptome analysis' in row 4 matches topic 'Transcriptomics'
'cell RNA' in row 5 matches topic 'RNA'
'Bulk transcriptome analysis' in row 5 matches topic 'Transcriptomics'
'Single cell RNA sequencing (scRNAseq)' in row 6 matches topic 'RNA'
'Gene ontology functional enrichment analysis' in row 7 matches topic 'Ontology and terminology'
'cell RNA sequencing (scRNA' in row 7 matches topic 'RNA'
'cell RNA sequencing' in row 8 matches topic 'RNA'
'Dense genotyping data integration' in row 8 matches topic 'Genotype and phe

In [46]:
data[data['Hallucinations'].apply(len) > 1]

Unnamed: 0,_id,name,description,Model,Predictions,Hallucinations
0,hca_ef1d9888-fa86-47a4-bb72-0ab0f20f7004,Single-cell atlas of early human brain development highlights heterogeneity of human neuroepithelial cells and early radial glia.,"The human cortex comprises diverse cell types that emerge from an initially uniform neuroepithelium that gives rise to radial glia, the neural stem cells of the cortex. To characterize the earliest stages of human brain development, we performed single-cell RNA-sequencing across regions of the developing human brain, including the telencephalon, diencephalon, midbrain, hindbrain and cerebellum. We identify nine progenitor populations physically proximal to the telencephalon, suggesting more heterogeneity than previously described, including a highly prevalent mesenchymal-like population that disappears once neurogenesis begins. Comparison of human and mouse progenitor populations at corresponding stages identifies two progenitor clusters that are enriched in the early stages of human cortical development. We also find that organoid systems display low fidelity to neuroepithelial and early radial glia cell types, but improve as neurogenesis progresses. Overall, we provide a comprehensive molecular and spatial atlas of early stages of human brain and cortical development.",gpt-3.5-turbo,"{RNA, Mapping}","{, sequencing, cell RNA, Comparative analysis with mouse progenitor populations, Single, Spatial mapping of developing human brain regions}"
1,hca_923d3231-7295-4184-b3f6-c3082766a8c7,A Single-Cell Transcriptomic Atlas of Human Skin Aging.,"Skin undergoes constant self-renewal, and its functional decline is a visible consequence of aging. Understanding human skin aging requires in-depth knowledge of the molecular and functional properties of various skin cell types. We performed single-cell RNA sequencing of human eyelid skin from healthy individuals across different ages and identified eleven canonical cell types, as well as six subpopulations of basal cells. Further analysis revealed progressive accumulation of photoaging-related changes and increased chronic inflammation with age. Transcriptional factors involved in the developmental process underwent early-onset decline during aging. Furthermore, inhibition of key transcription factors HES1 in fibroblasts and KLF6 in keratinocytes not only compromised cell proliferation, but also increased inflammation and cellular senescence during aging. Lastly, we found that genetic activation of HES1 or pharmacological treatment with quercetin alleviated cellular senescence of dermal fibroblasts. These findings provide a single-cell molecular framework of human skin aging, providing a rich resource for developing therapeutic strategies against aging-related skin disorders.",gpt-3.5-turbo,"{RNA, Gene expression}","{cell RNA sequencing, 2, Genetic activation, Transcription factor inhibition, 5, 3, Transcriptomic analysis, Pharmacological treatment, 1, 4, Single}"
2,hca_1dddae6e-3753-48af-b20e-fa22abad125d,Cell Types of the Human Retina and Its Organoids at Single-Cell Resolution,"Human organoids recapitulating the cell-type diversity and function of their target organ are valuable for basic and translational research. We developed light-sensitive human retinal organoids with multiple nuclear and synaptic layers and functional synapses. We sequenced the RNA of 285,441 single cells from these organoids at seven developmental time points and from the periphery, fovea, pigment epithelium and choroid of light-responsive adult human retinas, and performed histochemistry. Cell types in organoids matured in vitro to a stable ""developed"" state at a rate similar to human retina development in vivo. Transcriptomes of organoid cell types converged toward the transcriptomes of adult peripheral retinal cell types. Expression of disease-associated genes was cell-type-specific in adult retina, and cell-type specificity was retained in organoids. We implicate unexpected cell types in diseases such as macular degeneration. This resource identifies cellular targets for studying disease mechanisms in organoids and for targeted repair in human retinas.",gpt-3.5-turbo,"{RNA, Developmental biology}","{, RNA sequencing, Development of light, Histochemistry, sensitive human retinal organoids}"
3,hca_95f07e6e-6a73-4e1b-a880-c83996b3aa5c,Immunophenotyping of COVID-19 and influenza highlights the role of type I interferons in development of severe COVID-19,"Although most severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2)–infected individuals experience mild coronavirus disease 2019 (COVID-19), some patients suffer from severe COVID-19, which is accompanied by acute respiratory distress syndrome and systemic inflammation. To identify factors driving severe progression of COVID-19, we performed single-cell RNA sequencing using peripheral blood mononuclear cells (PBMCs) obtained from healthy donors, patients with mild or severe COVID-19, and patients with severe influenza. Patients with COVID-19 exhibited hyperinflammatory signatures across all types of cells among PBMCs, particularly up-regulation of the tumor necrosis factor/interleukin-1β (TNF/IL-1β)–driven inflammatory response as compared with severe influenza. In classical monocytes from patients with severe COVID-19, type I interferon (IFN) response coexisted with the TNF/IL-1β–driven inflammation, and this was not seen in patients with milder COVID-19. We documented type I IFN–driven inflammatory features in patients with severe influenza as well. On the basis of this, we propose that the type I IFN response plays a pivotal role in exacerbating inflammation in severe COVID-19.",gpt-3.5-turbo,{RNA},"{, cell RNA sequencing, Immunophenotyping, Single}"
4,hca_dcc28fb3-7bab-48ce-bc4b-684c00e133ce,A single-cell atlas of entorhinal cortex from individuals with Alzheimer's disease reveals cell-type-specific gene expression regulation.,"There is currently little information available about how individual cell types contribute to Alzheimer's disease. Here we applied single-nucleus RNA sequencing to entorhinal cortex samples from control and Alzheimer's disease brains (n = 6 per group), yielding a total of 13,214 high-quality nuclei. We detail cell-type-specific gene expression patterns, unveiling how transcriptional changes in specific cell subpopulations are associated with Alzheimer's disease. We report that the Alzheimer's disease risk gene APOE is specifically repressed in Alzheimer's disease oligodendrocyte progenitor cells and astrocyte subpopulations and upregulated in an Alzheimer's disease-specific microglial subopulation. Integrating transcription factor regulatory modules with Alzheimer's disease risk loci revealed drivers of cell-type-specific state transitions towards Alzheimer's disease. For example, transcription factor EB, a master regulator of lysosomal function, regulates multiple disease genes in a specific Alzheimer's disease astrocyte subpopulation. These results provide insights into the coordinated control of Alzheimer's disease risk genes and their cell-type-specific contribution to disease susceptibility. These results are available at http://adsn.ddnetbio.com.",gpt-3.5-turbo,"{RNA, Transcriptomics}","{Integrative analysis, 2, nucleus RNA sequencing, 3, Regulatory module analysis, 1, 4, Single, Transcriptome analysis}"
...,...,...,...,...,...,...
452,hca_bd400331-54b9-4fcc-bff6-6bb8b079ee1f,Sex-Specific Control of Human Heart Maturation by the Progesterone Receptor.,"Background: Despite in-depth knowledge of the molecular mechanisms controlling embryonic heart development, little is known about the signals governing postnatal maturation of the human heart. Methods: Single nucleus RNA-sequencing (snRNA-seq) of 54,140 nuclei from 9 human donors was used to profile transcriptional changes in diverse cardiac cell types during maturation from fetal stages to adulthood. Bulk RNA-sequencing and the assay for transposase-accessible chromatin using sequencing (ATAC-seq) were used to further validate transcriptional changes and to profile alterations in the chromatin accessibility landscape in purified cardiomyocyte nuclei from 21 human donors. Functional validation studies of sex steroids implicated in cardiac maturation were performed in human pluripotent stem cell-derived cardiac organoids and mice. Results: Our data identify the progesterone receptor as a key mediator of sex-dependent transcriptional programs during cardiomyocyte maturation. Functional validation studies in human cardiac organoids and mice demonstrate the progesterone receptor drives sex-specific metabolic programs and maturation of cardiac contractile properties. Conclusions: These data provide a blueprint for understanding human heart maturation in both sexes and reveal an important role for the progesterone receptor in human heart development.",gpt-3.5-turbo,{RNA},"{sequencing, 2, 3, Bulk RNA, 1, seq), sequencing (snRNA, accessible chromatin using sequencing (ATAC, Single nucleus RNA, Assay for transposase}"
453,hca_415eb773-cadb-43d1-ab89-7d160d5cfc7d,Single-cell analysis reveals innate lymphoid cell lineage infidelity in atopic dermatitis,"BackgroundAlthough ample knowledge exists about phenotype and function of cutaneous T lymphocytes, much less is known about the lymphocytic components of the skin's innate immune system.ObjectiveTo better understand the biologic role of cutaneous innate lymphoid cells (ILCs), we investigated their phenotypic and molecular features under physiologic (normal human skin [NHS]) and pathologic (lesional skin of patients with atopic dermatitis [AD]) conditions.MethodsSkin punch biopsies and reduction sheets as well as blood specimens were obtained from either patients with AD or healthy individuals. Cell and/or tissue samples were analyzed by flow cytometry, immunohistochemistry, and single-cell RNA sequencing or subjected to in vitro/ex vivo culture.ResultsNotwithstanding substantial quantitative differences between NHS and AD skin, we found that the vast majority of cutaneous ILCs belong to the CRTH2+ subset and reside in the upper skin layers. Single-cell RNA sequencing of cutaneous ILC-enriched cell samples confirmed the predominance of biologically heterogeneous group 2 ILCs and, for the first time, demonstrated considerable ILC lineage infidelity (coexpression of genes typical of either type 2 [GATA3 and IL13] or type 3/17 [RORC, IL22, and IL26] immunity within individual cells) in lesional AD skin, and to a much lesser extent, in NHS. Similar events were demonstrated in ILCs from skin explant cultures and in vitro expanded ILCs from the peripheral blood.ConclusionThese findings support the concept that instead of being a stable entity with well-defined components, the skin immune system consists of a network of highly flexible cellular players that are capable of adjusting their function to the needs and challenges of the environment.",gpt-3.5-turbo,{RNA},"{, cell RNA sequencing, Flow cytometry, In vitro, Immunohistochemistry, Single, ex vivo culture}"
454,hca_a29952d9-925e-40f4-8a1c-274f118f1f51,Bone marrow plasma cells from hip replacement surgeries,"Our aims are to generate a full representation of human hematopoiesis in blood and bone marrow of humans using a multi-tier and iterative collection and analysis of 200,000 cells from ten healthy human bone",gpt-3.5-turbo,{RNA},"{Bone marrow aspiration, cell RNA sequencing, 2, 3, tier collection and analysis, 1, 4, Single, Iterative analysis, Multi}"
455,hca_d2111fac-3fc4-4f42-9b6d-32cd6a828267,Human cerebral organoids recapitulate gene expression programs of fetal neocortex development.,"Cerebral organoids – three-dimensional cultures of human cerebral tissue derived from pluripotent stem cells – have emerged as models of human cortical development. However, the extent to which in vitro organoid systems recapitulate neural progenitor cell proliferation and neuronal differentiation programs observed in vivo remains unclear. Here we use single-cell RNA sequencing (scRNA-seq) to dissect and compare cell composition and progenitor-to-neuron lineage relationships in human cerebral organoids and fetal neocortex. Covariation network analysis using the fetal neocortex data reveals known and novel interactions among genes central to neural progenitor proliferation and neuronal differentiation. In the organoid, we detect diverse progenitors and differentiated cell types of neuronal and mesenchymal lineages, and identify cells that derived from regions resembling the fetal neocortex. We find that these organoid cortical cells use gene expression programs remarkably similar to those of the fetal tissue in order to organize into cerebral cortex-like regions. Our comparison of in vivo and in vitro cortical single cell transcriptomes illuminates the genetic features underlying human cortical development that can be studied in organoid cultures.",gpt-3.5-turbo,{RNA},"{, Covariation network analysis, cell RNA sequencing (scRNA, seq), Single}"


In [47]:
# Add quotes back in for the predictions with commas (i.e, "Data submission, annotation, and curation")
data['Predictions'] = data['Predictions'].apply(lambda preds: [f'"{pred}"' if f'"{pred}"' in quoted_topics else pred for pred in preds])

In [48]:
data[data['Hallucinations'].apply(len) > 0][['Predictions', 'Hallucinations']]

Unnamed: 0,Predictions,Hallucinations
0,"[RNA, Mapping]","{, sequencing, cell RNA, Comparative analysis with mouse progenitor populations, Single, Spatial mapping of developing human brain regions}"
1,"[RNA, Gene expression]","{cell RNA sequencing, 2, Genetic activation, Transcription factor inhibition, 5, 3, Transcriptomic analysis, Pharmacological treatment, 1, 4, Single}"
2,"[RNA, Developmental biology]","{, RNA sequencing, Development of light, Histochemistry, sensitive human retinal organoids}"
3,[RNA],"{, cell RNA sequencing, Immunophenotyping, Single}"
4,"[RNA, Transcriptomics]","{Integrative analysis, 2, nucleus RNA sequencing, 3, Regulatory module analysis, 1, 4, Single, Transcriptome analysis}"
...,...,...
452,[RNA],"{sequencing, 2, 3, Bulk RNA, 1, seq), sequencing (snRNA, accessible chromatin using sequencing (ATAC, Single nucleus RNA, Assay for transposase}"
453,[RNA],"{, cell RNA sequencing, Flow cytometry, In vitro, Immunohistochemistry, Single, ex vivo culture}"
454,[RNA],"{Bone marrow aspiration, cell RNA sequencing, 2, 3, tier collection and analysis, 1, 4, Single, Iterative analysis, Multi}"
455,[RNA],"{, Covariation network analysis, cell RNA sequencing (scRNA, seq), Single}"


In [49]:
print('Number of rows with 0 predictions:', f'{len(data[data["Predictions"].apply(len) == 0])}/{len(data)}')

Number of rows with 0 predictions: 38/457


In [50]:
data['Predictions'] = data['Predictions'].apply(lambda lst: set(lst))

In [51]:
file_name = os.path.basename(DATA_PATH).replace('.', '_processed.')

data.to_csv(f'outputs/{file_name}', index=False)