In [27]:
import requests
from tqdm.notebook import tqdm

In [18]:
# Fetch the records matching "immport" from the api.data.niaid.nih.gov query endpoint.
# requests applies no timeout by default, so one is set explicitly to keep the cell
# from hanging forever on a stalled connection.
api_reply = requests.get(
    'http://api.data.niaid.nih.gov/v1/query?q=immport&fetch_all=true',
    timeout=60,
)
# Fail loudly on a 4xx/5xx status instead of trying to decode an error page as JSON.
api_reply.raise_for_status()
response = api_reply.json()

In [19]:
# Number of records under the 'hits' key of the decoded reply.
len(response['hits'])

500

In [31]:
# Peek at the raw 'description' field of the first hit.
response['hits'][0]['description']

'<p>Host responses to vaccines are complex but important to investigate. To facilitate the study, we have developed a tool called Vaccine Induced Gene Expression Analysis Tool (VIGET), with the aim to provide an interactive online tool for users to efficiently and robustly analyze the host immune response gene expression data collected in the ImmPort database. VIGET allows users to select vaccines, choose ImmPort studies, set up analysis models by choosing confounding variables and two groups of samples having different vaccination times, and then perform differential expression analysis to select genes for pathway enrichment analysis and functional interaction network construction using the Reactome&rsquo;s web services. VIGET provides features for users to compare results from two analyses, facilitating comparative response analysis across different demographic groups. VIGET uses the Vaccine Ontology (VO) to classify various types of vaccines such as live or inactivated flu vaccines,

In [34]:
# Collect the name and description of every hit in parallel lists, substituting a
# placeholder string whenever a record lacks the field.
names, descriptions = [], []

for hit in tqdm(response['hits']):
    names.append(hit['name'] if 'name' in hit else 'No name')
    descriptions.append(hit['description'] if 'description' in hit else 'No description')

names, descriptions


  0%|          | 0/500 [00:00<?, ?it/s]

(['VIGET: A web portal for study of vaccine-induced host responses based on Reactome pathways and ImmPort data',
  'Dataset from T cell responses to H1N1v and a longitudinal study of seasonal influenza vaccination (TIV) SLVP015 2011 (See companion studies SDY311 2010 / SDY312 2009 / SDY314 2008 / SDY315 2012)',
  'Dataset from T cell responses to H1N1v and a longitudinal study of seasonal influenza vaccination (TIV) SLVP015 2012 (See companion studies SDY311 2010 / SDY312 2009 / SDY314 2008 / SDY112 2011)',
  'Dataset from Defective signaling in aging, influenza vaccination 2007 SLVP015 {NCT01827462}',
  'Modeling Viral Immunity and Antagonism in Dendritic Cells',
  'Induction of maturation Marker after combinatorial TNFa and IFNb treatment',
  'Peptide display hierarchy protocol',
  'Gene Expression Study of TLR Responses on Dendritic Cells',
  'RImmPort: an R/Bioconductor package that enables ready-for-analysis immunology research data.',
  'Dataset from B-cell Immunity to Influenza 

In [39]:
import pandas as pd

# Two-column table: one row per hit, built from the parallel lists collected above.
df = pd.DataFrame(data=dict(Name=names, Description=descriptions))
df

Unnamed: 0,Name,Description
0,VIGET: A web portal for study of vaccine-induc...,<p>Host responses to vaccines are complex but ...
1,Dataset from T cell responses to H1N1v and a l...,No description
2,Dataset from T cell responses to H1N1v and a l...,No description
3,"Dataset from Defective signaling in aging, inf...",No description
4,Modeling Viral Immunity and Antagonism in Dend...,A suite of ex vivo perturbations are applied t...
...,...,...
495,Dataset from Correlation of Donor Proinflammat...,Activity of genes in donor tissues that are in...
496,Dataset from Analysis of the Response of Subje...,This study will examine whether administration...
497,The Metabolic Signature of the Placenta in SPTB,The placenta is metabolically active and suppo...
498,CD4+ T cells from COVID-19 mRNA vaccine recipi...,Recent studies have shown that vaccinated indi...


In [40]:
from bs4 import BeautifulSoup

def _strip_html(markup):
    """Parse ``markup`` with html.parser and return only its text content."""
    return BeautifulSoup(markup, 'html.parser').get_text()


# Replace both text columns with their markup-free equivalents.
for column in ('Name', 'Description'):
    df[column] = df[column].apply(_strip_html)

df


  df['Name'] = df['Name'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())
  df['Description'] = df['Description'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())


Unnamed: 0,Name,Description
0,VIGET: A web portal for study of vaccine-induc...,Host responses to vaccines are complex but imp...
1,Dataset from T cell responses to H1N1v and a l...,No description
2,Dataset from T cell responses to H1N1v and a l...,No description
3,"Dataset from Defective signaling in aging, inf...",No description
4,Modeling Viral Immunity and Antagonism in Dend...,A suite of ex vivo perturbations are applied t...
...,...,...
495,Dataset from Correlation of Donor Proinflammat...,Activity of genes in donor tissues that are in...
496,Dataset from Analysis of the Response of Subje...,This study will examine whether administration...
497,The Metabolic Signature of the Placenta in SPTB,The placenta is metabolically active and suppo...
498,CD4+ T cells from COVID-19 mRNA vaccine recipi...,Recent studies have shown that vaccinated indi...


In [45]:
# Persist the table. Line endings are pinned to "\n" because the file is read back
# later with lineterminator='\n'; the platform default (os.linesep) would leave a
# stray "\r" at the end of every row on Windows.
df.to_csv('../datasets/IMMPORT.csv', index=False, lineterminator='\n')

In [107]:
# Load the saved responses, one per line. Every double quote is dropped and the
# surrounding whitespace (including the trailing newline) is trimmed, so `data`
# ends up as a list of clean strings.
with open('../outputs/responses.txt', 'r') as responses_file:
    data = [raw_line.replace('"', '').strip() for raw_line in responses_file]

In [108]:
with open('../EDAM/edam_topics.txt', 'r') as topics_file:
    lines = list(topics_file)

# Keep only the entries that contain a double quote, stored without the quotes and
# without surrounding whitespace.
# NOTE(review): presumably these are the topic names that contain commas — confirm
# against the topics file.
quoted_lines = []
for entry in lines:
    if '"' not in entry:
        continue
    quoted_lines.append(entry.strip().replace('"', ''))

In [121]:
# Wrap every known quoted topic found in a response line in double quotes again,
# so that a topic containing commas can be read back as one CSV field.
out = []
for line in data:
    ln = line
    for topic in quoted_lines:
        # Skip blank topics: '' is a substring of every string, and
        # str.replace('', ...) would insert quotes between all characters.
        if topic and topic in line:
            ln = ln.replace(topic, f'"{topic}"')
    out.append(ln)

out

['Immunology, Data integration and warehousing, Bioinformatics',
 'Infectious disease, Immunology, Vaccinology',
 'Infectious disease, Immunology, Vaccinology',
 'Infectious disease, Immunology, Vaccinology',
 'Immunology, Infectious disease, Computational biology',
 'Immunology, Bioinformatics, Molecular biology',
 'Bioinformatics, Proteomics, Protein structure analysis',
 'Gene expression, Immunology, Cell biology',
 'Bioinformatics, Immunology, Computational biology',
 'Immunology, Infectious disease, Vaccinology',
 'Animal study, Infectious disease, Immunology',
 'Animal study, Immunology, Vaccinology',
 'Cell biology, Immunology, Infectious disease',
 'Immunology, Geriatric medicine, Infectious disease',
 'Immunology, Aging, Infectious disease',
 'Immunology, Infectious disease, Animal study',
 'Immunology, Geriatric medicine, Infectious disease',
 'Animal study, Immunology, Infectious disease',
 'Genetics, Immunology, Rheumatology',
 'Animal study, Infectious disease, Immunology'

In [125]:
import csv
import io

# Split each re-quoted response string into its individual topics.
out_lists = []

for string in out:
    # skipinitialspace=True discards the blank that follows each ", " separator.
    # Without it a field such as ` "Allergy, clinical ..."` starts with a space,
    # so the csv module does not treat the quotes as quoting and splits the topic
    # at its inner commas (and every other topic keeps a leading space).
    reader = csv.reader(io.StringIO(string), skipinitialspace=True)

    # Only the first parsed row is kept. An empty string yields no rows at all,
    # so fall back to an empty list instead of raising IndexError.
    out_lists.append(next(reader, []))

In [129]:
# Reload the saved table, treating only "\n" as the row terminator.
dataset = pd.read_csv(
    filepath_or_buffer='../datasets/IMMPORT.csv',
    lineterminator='\n',
)

In [131]:
# Pair each dataset row with its prediction and tag it with the model name.
# zip stops at the shortest input, so if `out` has fewer entries than `dataset`
# the surplus dataset rows are dropped without an error.
prediction_rows = [
    (name, description, "gpt-3.5-turbo", prediction)
    for name, description, prediction in zip(
        dataset['Name'],
        dataset['Description'],
        out,
    )
]
output_df = pd.DataFrame(
    prediction_rows,
    columns=['Name', 'Description', "Model", "Predictions"],
)

In [144]:
# Every row receives the same reference label string (a scalar is broadcast to the
# whole column). The middle topic contains commas and is wrapped in double quotes.
output_df['Ground Truth'] = 'Infectious disease, \"Allergy, clinical immunology and immunotherapeutics\", Immunology'

In [145]:
# Write the table without the index column, using "\n" line endings.
output_df.to_csv(
    path_or_buf='../outputs/tqvhttfm.csv',
    lineterminator="\n",
    index=False,
)