In [7]:
import requests
from tqdm.notebook import tqdm

In [4]:
# Fetch the search results for the query "lincs" and decode the JSON body.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() surfaces an HTTP error here rather than as a confusing
# JSON/KeyError failure in a later cell.
# NOTE(review): the URL uses plain http — confirm whether the service is
# reachable over https and switch if so.
api_reply = requests.get(
    'http://api.data.niaid.nih.gov/v1/query?q=lincs&fetch_all=true',
    timeout=60,
)
api_reply.raise_for_status()
response = api_reply.json()

In [8]:
# Collect the name and description of every hit in a single pass, substituting
# a placeholder string whenever a record lacks the corresponding key.
names = []
descriptions = []

for record in tqdm(response['hits']):
    names.append(record['name'] if 'name' in record else 'No name')
    descriptions.append(
        record['description'] if 'description' in record else 'No description'
    )

names, descriptions


  0%|          | 0/500 [00:00<?, ?it/s]

(['LINCS - preprocessed',
  'Columbia LINCS Trans-Center project data',
  'LINCS Canvas Browser: interactive web app to query, browse and interrogate LINCS L1000 gene expression signatures.',
  'LINCS MCF10A GRNs calculated with ANANSE',
  'Systematic Quality Control Analysis of LINCS Data.',
  'Compound signature detection on LINCS L1000 big data.',
  'L1000 Dataset -small molecule perturbagens- LINCS Phase 1',
  'L1000 Dataset -small molecule perturbagens- LINCS Joint Project',
  'L1000 Dataset -small molecule perturbagens- LINCS Trans-Center Project',
  'L1000 Connectivity Map perturbational profiles from Broad Institute LINCS Center for Transcriptomics LINCS PHASE *II* (n=354,123; updated March 30, 2017)',
  'Drug-induced adverse events prediction with the LINCS L1000 data.',
  'broadinstitute/lincs-cell-painting: Full release of LINCS Cell Painting dataset',
  'L1000 Connectivity Map perturbational profiles from Broad Institute LINCS Center for Transcriptomics LINCS Pilot PHASE I 

In [9]:
import pandas as pd

# Build a two-column table from the collected lists; position i in each list
# becomes row i of the frame.
columns = {'Name': names, 'Description': descriptions}
df = pd.DataFrame(data=columns)
df

Unnamed: 0,Name,Description
0,LINCS - preprocessed,<p>LINCS - PDRGC - preprocessed</p>
1,Columbia LINCS Trans-Center project data,The Library of Integrated Network-based Cellul...
2,LINCS Canvas Browser: interactive web app to q...,For the Library of Integrated Network-based Ce...
3,LINCS MCF10A GRNs calculated with ANANSE,<p>GRNs calculated using ANANSE presumably act...
4,Systematic Quality Control Analysis of LINCS D...,The Library of Integrated Cellular Signatures ...
...,...,...
495,Homo sapiens,Experimental validation of candidate regulator...
496,TRANSCRIPT drug repurposing dataset,<p>Version 2.0.0 (05/29/2023)</p>\n\n<p>This i...
497,Homo sapiens,Generation of KRAS signatures using immortaliz...
498,STL427944 suppresses FOXM1 via nuclear export ...,FOXM1 transcription factor is an oncogene and ...


In [10]:
import warnings

from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning


def strip_html(text):
    """Return ``text`` with HTML markup removed, keeping only its text content."""
    return BeautifulSoup(text, 'html.parser').get_text()


# Many values here are plain text rather than markup, and some of them can look
# like a file name or URL to Beautiful Soup, which then emits
# MarkupResemblesLocatorWarning. The values are never opened as files or URLs,
# so that one warning category is silenced for this step only.
with warnings.catch_warnings():
    warnings.simplefilter('ignore', category=MarkupResemblesLocatorWarning)
    # Both text columns get the same treatment; each is overwritten in place.
    for column in ('Name', 'Description'):
        df[column] = df[column].apply(strip_html)

df


  df['Name'] = df['Name'].apply(lambda x: BeautifulSoup(x, 'html.parser').get_text())


Unnamed: 0,Name,Description
0,LINCS - preprocessed,LINCS - PDRGC - preprocessed
1,Columbia LINCS Trans-Center project data,The Library of Integrated Network-based Cellul...
2,LINCS Canvas Browser: interactive web app to q...,For the Library of Integrated Network-based Ce...
3,LINCS MCF10A GRNs calculated with ANANSE,GRNs calculated using ANANSE presumably active...
4,Systematic Quality Control Analysis of LINCS D...,The Library of Integrated Cellular Signatures ...
...,...,...
495,Homo sapiens,Experimental validation of candidate regulator...
496,TRANSCRIPT drug repurposing dataset,Version 2.0.0 (05/29/2023)\nThis is a drug rep...
497,Homo sapiens,Generation of KRAS signatures using immortaliz...
498,STL427944 suppresses FOXM1 via nuclear export ...,FOXM1 transcription factor is an oncogene and ...


In [12]:
# Persist the cleaned table without the index column, using "\n" line endings.
df.to_csv(
    path_or_buf='../datasets/LINCS.csv',
    index=False,
    lineterminator="\n",
)

In [14]:
# Read the saved CSV back (same "\n" line terminator as used when writing) and
# display it as the cell output.
pd.read_csv(
    filepath_or_buffer='../datasets/LINCS.csv',
    lineterminator="\n",
)

Unnamed: 0,Name,Description
0,LINCS - preprocessed,LINCS - PDRGC - preprocessed
1,Columbia LINCS Trans-Center project data,The Library of Integrated Network-based Cellul...
2,LINCS Canvas Browser: interactive web app to q...,For the Library of Integrated Network-based Ce...
3,LINCS MCF10A GRNs calculated with ANANSE,GRNs calculated using ANANSE presumably active...
4,Systematic Quality Control Analysis of LINCS D...,The Library of Integrated Cellular Signatures ...
...,...,...
495,Homo sapiens,Experimental validation of candidate regulator...
496,TRANSCRIPT drug repurposing dataset,Version 2.0.0 (05/29/2023)\nThis is a drug rep...
497,Homo sapiens,Generation of KRAS signatures using immortaliz...
498,STL427944 suppresses FOXM1 via nuclear export ...,FOXM1 transcription factor is an oncogene and ...


In [16]:
# Load the CSV whose 'Predictions' column is inspected and rewritten below.
outputs = pd.read_csv(filepath_or_buffer='../outputs/9sroovd8.csv')

In [19]:
# Display the prediction strings as the cell output.
prediction_column = 'Predictions'
outputs[prediction_column]

0      Data integration and warehousing, Computationa...
1                 Cell biology, Genetics, Bioinformatics
2      Bioinformatics, Systems biology, Computational...
3        Bioinformatics, Computational biology, Genomics
4      Bioinformatics, Computational biology, Transcr...
                             ...                        
495    Genetics, Gene expression, Transcription facto...
496      Bioinformatics, Drug discovery, Transcriptomics
497            Cell biology, Genetics, Molecular biology
498                 Biochemistry, Pharmacology, Oncology
499               Animal study, Microbiology, Toxicology
Name: Predictions, Length: 500, dtype: object

In [20]:
import re

with open('../EDAM/edam_topics.txt', 'r') as f:
    lines = f.readlines()

# Keep only the lines that contain a double quote, with surrounding whitespace
# and the quote characters removed. Lines that are empty after that are
# dropped: an empty topic would otherwise match at every position of a string.
quoted_lines = [
    topic
    for topic in (line.strip().replace('"', '') for line in lines if '"' in line)
    if topic
]


def _build_topic_pattern(topics):
    """Compile a pattern matching an existing quoted span or any of ``topics``.

    Returns ``None`` when ``topics`` is empty. Longer topics are tried first so
    that a topic containing another topic is matched as a whole.
    """
    if not topics:
        return None
    by_length = sorted(topics, key=len, reverse=True)
    alternatives = '|'.join(re.escape(topic) for topic in by_length)
    # The first alternative consumes spans that are already wrapped in double
    # quotes, so they are never examined (or quoted) a second time.
    return re.compile(r'"[^"]*"|' + alternatives)


def _wrap_unquoted(match):
    """Return the matched text unchanged if already quoted, else wrap it in quotes."""
    text = match.group(0)
    return text if text.startswith('"') else f'"{text}"'


def quote_topics(text, pattern):
    """Wrap every topic occurrence in ``text`` in double quotes.

    The substitution is a single pass: spans that are already quoted are left
    alone, which makes the operation safe to apply to its own output.
    """
    if pattern is None:
        return text
    return pattern.sub(_wrap_unquoted, text)


topic_pattern = _build_topic_pattern(quoted_lines)
out = [quote_topics(line, topic_pattern) for line in outputs['Predictions']]

out

['Data integration and warehousing, Computational biology, "Data submission, annotation, and curation"',
 'Cell biology, Genetics, Bioinformatics',
 'Bioinformatics, Systems biology, Computational biology',
 'Bioinformatics, Computational biology, Genomics',
 'Bioinformatics, Computational biology, Transcriptomics',
 'Bioinformatics, Drug discovery, Oncology',
 'Cell biology, Drug discovery, Gene expression',
 'Transcriptomics, Small molecules, Cancer',
 'Transcriptomics, Small molecules, Cancer',
 'Transcriptomics, Drug development, Bioinformatics',
 'Bioinformatics, Drug discovery, Machine learning',
 'Cell biology, Bioinformatics, Data management',
 'Bioinformatics, Genomics, Transcriptomics',
 'Gene expression, Computational biology, Bioinformatics',
 'Bioinformatics, Computational biology, Genetics',
 'Bioinformatics, Genomics, Drug discovery',
 'Bioinformatics, Computational biology, Gene expression',
 'Bioinformatics, Proteomics, Transcriptomics',
 'Bioinformatics, Drug discover

In [21]:
# Replace the prediction column with the quoted versions computed in `out`.
prediction_column = 'Predictions'
outputs[prediction_column] = out

In [25]:
# Every row receives the same ground-truth string.
ground_truth_label = '\"Molecular interactions, pathways and networks\", Metabolomics, Protein binding sites'
outputs['Ground Truth'] = [ground_truth_label for _row in range(len(outputs))]

In [26]:
# Write the updated frame back to the same path it was read from, without the
# index column and with "\n" line endings.
outputs.to_csv(
    path_or_buf='../outputs/9sroovd8.csv',
    index=False,
    lineterminator='\n',
)