In [25]:
import torch
import transformers
import sentence_transformers
import faiss
import archs4py
import pandas as pd
import re
import json

def remove_non_alphanumeric(strings):
    # Define a regular expression pattern to match non-alphanumeric characters
    pattern = r'[^a-zA-Z0-9]'
    
    # Use a list comprehension to apply the regex substitution to each string
    cleaned_strings = [re.sub(pattern, '', s) for s in strings]
    
    return cleaned_strings

def split_and_retain_quoted(strings):

    # Regular expression to match quoted strings or words
    pattern = r'"[^"]*"|\S+'
    
    # Find all matches in the text
    matches = re.findall(pattern, strings)
    clean_matches = [element for element in matches if element.startswith('"') and element.endswith('"')]
    clean_matches = [element[1:-1] for element in clean_matches]
    
    return clean_matches

def get_config_data():
    with open("local_data.json") as json_file:
        data = json.load(json_file)
        return data["OPENAI_KEY"]["key"]


In [49]:

OPENAI_API_KEY = get_config_data()

import nest_asyncio

nest_asyncio.apply()

from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")


In [13]:


#file_path = archs4py.download.counts("human", path="", version="latest")

def handle_archs4py_query(query, file, selection_type):

    """
    Accepts a parsed user query and filters the dataset using archs4py.
    
    Parameters:
    - query: str, the user query
    - data: the loaded dataset
    - selection_type: str, the type of selection to perform (samples, series, or terms)
    
    Returns:
    - query_dataset: a subset of the dataset relevant to the query that consists of a list of data objects (e.g. meta and RNAseq data)
    """

    if selection_type == "samples":
        temp_meta = archs4py.meta.samples(file, query, 
            meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"])
        #temp_data = archs4py.data.samples(file, query)
        #matched_samples = [col for col in temp_meta.index if col in temp_data.columns]
        #temp_data = temp_data[matched_samples]
        #temp_meta = temp_meta.loc[matched_samples]
        #query_dataset = {'meta': temp_meta, 'RNAseq': temp_data}
        query_dataset = {'meta': temp_meta}
    elif selection_type == "series":
        temp_meta = archs4py.meta.series(file, query, 
            meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"])
        #temp_data = archs4py.data.series(file, query)
        #matched_samples = [col for col in temp_meta.index if col in temp_data.columns]
        #temp_data = temp_data[matched_samples]
        #temp_meta = temp_meta.loc[matched_samples]
        #query_dataset = {'meta': temp_meta, 'RNAseq': temp_data}
        query_dataset = {'meta': temp_meta}
    elif selection_type == "terms":
        temp_meta = archs4py.meta.meta(file, query, 
            meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"], 
            remove_sc=False)
        #temp_samples = temp_meta["geo_accession"]
        #temp_data = archs4py.data.samples(file, temp_samples)
        #matched_samples = [col for col in temp_meta.index if col in temp_data.columns]
        #temp_data = temp_data[matched_samples]
        #temp_meta = temp_meta.loc[matched_samples]
        #query_dataset = {'meta': temp_meta, 'RNAseq': temp_data}
        query_dataset = {'meta': temp_meta}
    else:
        raise ValueError("Invalid selection type. Please choose 'sample', 'series', or 'term'.")
    
    return query_dataset

def parse_archs4py_query(full_query):
    """
    Parses a user query into chunks suitable for the handle_archs4py_query function.
    
    Parameters:
    - full_query: str, the unfiltereduser query
    
    Returns:
    - query_dataset: a subset of the dataset relevant to the full query that consists of a list of data objects (e.g. meta and RNAseq data)
    """

    # Convert the query to uppercase to ensure case-insensitive matching
    
    if "GSM" in full_query:
        query_upper = full_query.upper()
        # Perform operation for queries containing "GSM"
        # Split the string into a list of elements using spaces as the separator
        temp_elements = query_upper.split()
        temp_elements = remove_non_alphanumeric(temp_elements)
    
        # Filter the list to include only elements containing "GSE"
        temp_samples = [temp_element for temp_element in temp_elements if "GSM" in temp_element]

        # Select data for the samples
        return handle_archs4py_query(temp_samples, archive_file, "samples")

    elif "GSE" in full_query:
        query_upper = full_query.upper()
        # Perform operation for queries containing "GSE"
        # Split the string into a list of elements using spaces as the separator
        temp_elements = query_upper.split()
        temp_elements = remove_non_alphanumeric(temp_elements)
    
        # Filter the list to include only elements containing "GSE"
        temp_series = [temp_element for temp_element in temp_elements if "GSE" in temp_element]

        if len(temp_series) > 1:
            prior_dataset = []
        for temp_index, temp_value in enumerate(temp_series):
            #if len(temp_series) > 1:
            #    prior_dataset = temp_dataset
            temp_dataset = handle_archs4py_query(temp_value, archive_file, "series")
            if temp_index > 0:
                #temp_dataset['meta'] = temp_dataset['meta'][~temp_dataset['meta'].index.isin(prior_dataset['meta'].index)]
                temp_dataset['meta'] = pd.concat([prior_dataset['meta'], 
                                                  temp_dataset['meta']], 
                                                  ignore_index=True)
                #temp_dataset['RNAseq'] = temp_dataset['RNAseq'].drop(columns=prior_dataset['RNAseq'].columns.intersection(temp_dataset['RNAseq'].columns))
                #temp_dataset['RNAseq'] = pd.concat([prior_dataset['RNAseq'], 
                #                                  temp_dataset['RNAseq']], 
                #                                  axis=1)
            if temp_index < len(temp_series) - 1:
                prior_dataset = temp_dataset
        return temp_dataset

    else:
        # Perform default operation for other queries
        temp_terms = split_and_retain_quoted(full_query)
        print(temp_terms)
        if len(temp_terms) > 1:
            prior_dataset = []
        for temp_index, temp_value in enumerate(temp_terms):
            print(temp_value)
            temp_dataset = handle_archs4py_query(temp_value, archive_file, "terms")
            print(temp_dataset['meta'].shape)
            #print(temp_dataset['RNAseq'].shape)
            if temp_index > 0:
                temp_dataset['meta'] = temp_dataset['meta'][temp_dataset['meta'].index.isin(prior_dataset['meta'].index)]
                print(temp_dataset['meta'].shape)
                #temp_dataset['RNAseq'] = temp_dataset['RNAseq'].drop(columns=prior_dataset['RNAseq'].columns.intersection(temp_dataset['RNAseq'].columns))
                #print(temp_dataset['RNAseq'].shape)
                #temp_dataset['RNAseq'] = pd.concat([prior_dataset['RNAseq'], 
                #                                  temp_dataset['RNAseq']], 
                #                                  axis=1)
            if temp_index < len(temp_terms) - 1:
                prior_dataset = temp_dataset
        return temp_dataset

    #if "GSM" in full_query:
    #    return handle_archs4py_query(full_query, archive_file, "sample")
    #elif "GSE" in full_query:






In [14]:

archive_file = "human_gene_v2.5.h5"
test1 = parse_archs4py_query('I want to look up data for the "iPSC" and "neuro" samples')
print(test1['meta'].shape)
print(test1['meta'].head())
#print(test1['RNAseq'].shape)


['iPSC', 'neuro']
iPSC


100%|██████████| 6/6 [00:05<00:00,  1.10it/s]


(48474, 6)
neuro


100%|██████████| 6/6 [00:05<00:00,  1.10it/s]


(34983, 6)
(9101, 6)
(9101, 6)
           geo_accession          series_id  \
GSM1132425    GSM1132425           GSE46562   
GSM1132426    GSM1132426           GSE46562   
GSM1132427    GSM1132427           GSE46562   
GSM1179927    GSM1179927  GSE43526,GSE43900   
GSM1179928    GSM1179928  GSE43526,GSE43900   

                                          characteristics_ch1  \
GSM1132425       SAMPLE GROUP: CONTROL,TISSUE: 14 DAY NEURONS   
GSM1132426       SAMPLE GROUP: CONTROL,TISSUE: 14 DAY NEURONS   
GSM1132427       SAMPLE GROUP: CONTROL,TISSUE: 14 DAY NEURONS   
GSM1179927  CELL TYPE: FOREBRAIN CORTICAL NEURONS,AGENT: V...   
GSM1179928  CELL TYPE: FOREBRAIN CORTICAL NEURONS,AGENT: T...   

                                         extract_protocol_ch1  \
GSM1132425  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   
GSM1132426  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   
GSM1132427  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   
GSM1179927  RNA WAS EXTRACTED USING

In [15]:
print(test1['meta']['characteristics_ch1'].value_counts())

characteristics_ch1
STAGE: 55 DAYS,TISSUE: CONTROL ORGANOID 3,SCRNA-SEQ PROTOCOL: SMARTSEQ2                                                                                           185
CELL TYPE: HUMAN EMBRYONIC STEM CELLS- AND INDUCED PLURIPOTENET STEM CELLS-DERIVED NEURAL PROGENITOR CELLS,DERIVED FROM HESC/IPSC: IPSC,WT/MUT: MUT,HD/ASD: HD    157
CELL TYPE: HUMAN EMBRYONIC STEM CELLS- AND INDUCED PLURIPOTENET STEM CELLS-DERIVED NEURAL PROGENITOR CELLS,DERIVED FROM HESC/IPSC: IPSC,WT/MUT: WT,HD/ASD: HD     142
CELL TYPE: FEEDER FREE IPSC DERIVED NEUROGENIC NS/PCS                                                                                                              91
STAGE: 55 DAYS,TISSUE: DCHS1 ORGANOID 3,SCRNA-SEQ PROTOCOL: SMARTSEQ2                                                                                              91
                                                                                                                                                      

In [73]:

archive_file = "human_gene_v2.5.h5"
test1 = parse_archs4py_query('I want to look up data for the terms "iPSC"')
print(test1['meta'].shape)
print(test1['meta'].head())
print(test1['RNAseq'].shape)


['iPSC']
iPSC
Searches for any occurrence of IPSC as regular expression


KeyboardInterrupt: 

In [46]:
temp_meta = archs4py.meta.meta(archive_file, "neuro", 
            meta_fields=["characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"], 
            remove_sc=False)
print(temp_meta.shape)
print(temp_meta.head())



100%|██████████| 4/4 [00:04<00:00,  1.07s/it]


(34983, 4)
                                          characteristics_ch1  \
GSM1024416  STRAIN: HUES 3HB9::GFP,DAY OF COLLECTION: DAY ...   
GSM1024417  STRAIN: HUES 3HB9::GFP,DAY OF COLLECTION: DAY ...   
GSM1024418  STRAIN: HUES 3HB9::GFP,DAY OF COLLECTION: DAY ...   
GSM1132421      SAMPLE GROUP: PATIENTS,TISSUE: 14 DAY NEURONS   
GSM1132422      SAMPLE GROUP: PATIENTS,TISSUE: 14 DAY NEURONS   

                                         extract_protocol_ch1  \
GSM1024416  TRIZOL EXTRACTION OF TOTAL RNA,NUGEN RNA KIT F...   
GSM1024417  TRIZOL EXTRACTION OF TOTAL RNA,NUGEN RNA KIT F...   
GSM1024418  TRIZOL EXTRACTION OF TOTAL RNA,NUGEN RNA KIT F...   
GSM1132421  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   
GSM1132422  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   

                                      source_name_ch1  \
GSM1024416  HUMAN STEM DERIVED MOTOR NEURONS GFP HIGH   
GSM1024417  HUMAN STEM DERIVED MOTOR NEURONS GFP HIGH   
GSM1024418    HUMAN STEM DERIVED MOT

In [23]:

archive_file = "human_gene_v2.5.h5"
test1 = handle_archs4py_query(["GSM3587529",
                              "GSM3587530",
                              "GSM3587531"], archive_file, "samples")
print(test1['meta'].shape)
print(test1['meta'].head())
print(test1['RNAseq'].shape)

test2 = handle_archs4py_query(["GSM3582063",
                              "GSM3582064",
                              "GSM3582065"], archive_file, "samples")
print(test2['meta'].shape)
print(test2['meta'].head())
print(test2['RNAseq'].shape)

test3 = handle_archs4py_query(["GSM3582063",
                              "GSM3582064",
                              "GSM3582065"], archive_file, "samples")
print(test3['meta'].shape)
print(test3['meta'].head())
print(test3['RNAseq'].shape)

test4 = handle_archs4py_query(["GSM3582063",
                              "GSM3582064",
                              "GSM3582065"], archive_file, "samples")
print(test4['meta'].shape)
print(test4['meta'].head())
print(test4['RNAseq'].shape)

combined_df = pd.concat([df1, df2], ignore_index=True)


100%|██████████| 3/3 [00:01<00:00,  1.68it/s]


(3, 6)
           geo_accession  series_id  \
GSM3587529    GSM3587529  GSE125999   
GSM3587530    GSM3587530  GSE125999   
GSM3587531    GSM3587531  GSE125999   

                                          characteristics_ch1  \
GSM3587529  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587530  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587531  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   

                                         extract_protocol_ch1 source_name_ch1  \
GSM3587529  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3587530  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3587531  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   

            title  
GSM3587529  A_58A  
GSM3587530  A_58B  
GSM3587531  A_67A  
(67186, 3)


100%|██████████| 3/3 [00:01<00:00,  1.68it/s]


(3, 6)
           geo_accession  series_id  \
GSM3582063    GSM3582063  GSE125805   
GSM3582064    GSM3582064  GSE125805   
GSM3582065    GSM3582065  GSE125805   

                                          characteristics_ch1  \
GSM3582063  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE S...   
GSM3582064  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE S...   
GSM3582065  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE S...   

                                         extract_protocol_ch1 source_name_ch1  \
GSM3582063  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3582064  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3582065  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   

                                                       title  
GSM3582063   IPSC-DERIVED INTERNEURONS_SCHIZOPHRENIA (A_58A)  
GSM3582064  IPSC-DERIVED INTERNEURONS_SCHIZOPHRENIA (A_117B)  
GSM3582065  IPSC-DERIVED INTERNEURONS_SCHIZOPHRENIA (A_282A) 

In [26]:
test1 = ["test","test2"]
print(len(test1))

2


In [18]:
print(test['meta'].head())
print(test['meta']['geo_accession'].value_counts())
print(test['meta']['series_id'].value_counts())
print(test['meta']['characteristics_ch1'].value_counts())
print(test['meta']['extract_protocol_ch1'].value_counts())
print(test['meta']['source_name_ch1'].value_counts())
print(test['meta']['title'].value_counts())


           geo_accession  series_id  \
GSM3587529    GSM3587529  GSE125999   
GSM3587530    GSM3587530  GSE125999   
GSM3587531    GSM3587531  GSE125999   
GSM3587532    GSM3587532  GSE125999   
GSM3587533    GSM3587533  GSE125999   

                                          characteristics_ch1  \
GSM3587529  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587530  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587531  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587532  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587533  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   

                                         extract_protocol_ch1 source_name_ch1  \
GSM3587529  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3587530  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3587531  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3587532  SAMPLES WERE HARVESTED USING TRIZOL RE

In [2]:

#path to file
file = "human_gene_v2.5.h5"

#get sample counts
#test_meta = a4.data.meta(file, "GSE125805")
test_meta = archs4py.data.series(file, "GSE125999")
print(test_meta.shape)


100%|██████████| 60/60 [00:05<00:00, 10.08it/s]


(67186, 60)


In [31]:
print(test_meta.head())
print(test_meta['characteristics_ch1'].value_counts())

           geo_accession  series_id  \
GSM3587529    GSM3587529  GSE125999   
GSM3587530    GSM3587530  GSE125999   
GSM3587531    GSM3587531  GSE125999   
GSM3587532    GSM3587532  GSE125999   
GSM3587533    GSM3587533  GSE125999   

                                          characteristics_ch1  \
GSM3587529  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587530  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587531  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587532  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   
GSM3587533  CELL TYPE: IPSC-DERIVED INTERNEURONS,DISEASE: ...   

                                         extract_protocol_ch1 source_name_ch1  \
GSM3587529  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3587530  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3587531  SAMPLES WERE HARVESTED USING TRIZOL REAGENT (I...    INTERNEURONS   
GSM3587532  SAMPLES WERE HARVESTED USING TRIZOL RE

In [32]:

# get sample meta data
#sample_meta = a4.meta.samples(file, ["GSM1158284","GSM1482938","GSM1562817"])
#print(sample_meta["characteristics_ch1"])

meta_meta1 = archs4py.meta.meta(file, "iPSC", meta_fields=["characteristics_ch1", "source_name_ch1"])
#meta_meta2 = a4.meta.meta(file, "neuro", meta_fields=["characteristics_ch1", "source_name_ch1"])
print(meta_meta1.shape)
print(meta_meta1.head())
#print(meta_meta2.shape)

print("First 5 row names of df1:", meta_meta1.index[:5])

# Print the first 5 row names of df2
#print("First 5 row names of df2:", meta_meta2.index[:5])

# Check for overlap between row names
#overlap = set(meta_meta1.index).intersection(set(meta_meta2.index))
#print("Overlap between row names:", len(overlap))



100%|██████████| 2/2 [00:01<00:00,  1.99it/s]

(45545, 2)
                                          characteristics_ch1  \
GSM1057333                                    CELL TYPE: IPSC   
GSM1088317            CELL TYPE: HIPSCS,PASSAGES: 10,CLONE: 7   
GSM1088319  TISSUE: HIPSC-DERIVED EMBRYOID BODIES,PASSAGES...   
GSM1153501                                    CELL TYPE: IPSC   
GSM1153507                                    CELL TYPE: IPSC   

                           source_name_ch1  
GSM1057333                            IPSC  
GSM1088317                             IPS  
GSM1088319                              EB  
GSM1153501  INDUCED PLURIPOTENT STEM CELLS  
GSM1153507  INDUCED PLURIPOTENT STEM CELLS  
First 5 row names of df1: Index(['GSM1057333', 'GSM1088317', 'GSM1088319', 'GSM1153501', 'GSM1153507'], dtype='object')



