In [55]:
import torch
import transformers
import sentence_transformers
import faiss
import archs4py
import pandas as pd
import re
import json

archive_file = "human_gene_v2.5.h5"

def remove_non_alphanumeric(strings):
    # Define a regular expression pattern to match non-alphanumeric characters
    pattern = r'[^a-zA-Z0-9]'
    
    # Use a list comprehension to apply the regex substitution to each string
    cleaned_strings = [re.sub(pattern, '', s) for s in strings]
    
    return cleaned_strings

def split_and_retain_quoted(strings):

    # Regular expression to match quoted strings or words
    pattern = r'"[^"]*"|\S+'
    
    # Find all matches in the text
    matches = re.findall(pattern, strings)
    clean_matches = [element for element in matches if element.startswith('"') and element.endswith('"')]
    clean_matches = [element[1:-1] for element in clean_matches]
    
    return clean_matches

def get_config_data():
    with open("local_data.json") as json_file:
        data = json.load(json_file)
        return data["OPENAI_KEY"]["key"]





In [56]:

#file_path = archs4py.download.counts("human", path="", version="latest")

def handle_archs4py_query(query, file, selection_type):

    """
    Accepts a parsed user query and filters the dataset using archs4py.
    
    Parameters:
    - query: str, the user query
    - file: the loaded dataset
    - selection_type: str, the type of selection to perform (samples, series, or terms)
    
    Returns:
    - query_dataset: a subset of the dataset relevant to the query that consists of a list of data objects (e.g. meta and RNAseq data)
    """
    try:
        if selection_type == "samples":
            temp_meta = archs4py.meta.samples(file, query, 
                meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"])
            #temp_data = archs4py.data.samples(file, query)
            #matched_samples = [col for col in temp_meta.index if col in temp_data.columns]
            #temp_data = temp_data[matched_samples]
            #temp_meta = temp_meta.loc[matched_samples]
            #query_dataset = {'meta': temp_meta, 'RNAseq': temp_data}
            query_dataset = {'meta': temp_meta}
        elif selection_type == "series":
            temp_meta = archs4py.meta.series(file, query, 
                meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"])
            #temp_data = archs4py.data.series(file, query)
            #matched_samples = [col for col in temp_meta.index if col in temp_data.columns]
            #temp_data = temp_data[matched_samples]
            #temp_meta = temp_meta.loc[matched_samples]
            #query_dataset = {'meta': temp_meta, 'RNAseq': temp_data}
            query_dataset = {'meta': temp_meta}
        elif selection_type == "terms":
            temp_meta = archs4py.meta.meta(file, query, 
                meta_fields=["geo_accession", "series_id", "characteristics_ch1", "extract_protocol_ch1", "source_name_ch1", "title"], 
                remove_sc=True)
            #temp_samples = temp_meta["geo_accession"]
            #temp_data = archs4py.data.samples(file, temp_samples)
            #matched_samples = [col for col in temp_meta.index if col in temp_data.columns]
            #temp_data = temp_data[matched_samples]
            #temp_meta = temp_meta.loc[matched_samples]
            #query_dataset = {'meta': temp_meta, 'RNAseq': temp_data}
            query_dataset = {'meta': temp_meta}
        else:
            raise ValueError("Invalid selection type. Please choose 'sample', 'series', or 'term'.")
        return query_dataset
    
    except Exception as e:
        return f"Error retrieving data: {str(e)}"

def parse_archs4py_query(full_query):
    """
    Parses a user query into chunks suitable for the handle_archs4py_query function.
    
    Parameters:
    - full_query: str, the unfiltereduser query
    
    Returns:
    - query_dataset: a subset of the dataset relevant to the full query that consists of a list of data objects (e.g. meta and RNAseq data)
    """
    try:
        # Convert the query to uppercase to ensure case-insensitive matching
        
        if "GSM" in full_query:
            query_upper = full_query.upper()
            # Perform operation for queries containing "GSM"
            # Split the string into a list of elements using spaces as the separator
            temp_elements = query_upper.split()
            temp_elements = remove_non_alphanumeric(temp_elements)
        
            # Filter the list to include only elements containing "GSE"
            temp_samples = [temp_element for temp_element in temp_elements if "GSM" in temp_element]

            # Select data for the samples
            return handle_archs4py_query(temp_samples, archive_file, "samples")

        elif "GSE" in full_query:
            query_upper = full_query.upper()
            # Perform operation for queries containing "GSE"
            # Split the string into a list of elements using spaces as the separator
            temp_elements = query_upper.split()
            temp_elements = remove_non_alphanumeric(temp_elements)
        
            # Filter the list to include only elements containing "GSE"
            temp_series = [temp_element for temp_element in temp_elements if "GSE" in temp_element]

            if len(temp_series) > 1:
                prior_dataset = []
            for temp_index, temp_value in enumerate(temp_series):
                temp_dataset = handle_archs4py_query(temp_value, archive_file, "series")
                if temp_index > 0:
                    temp_dataset['meta'] = pd.concat([prior_dataset['meta'], 
                                                    temp_dataset['meta']], 
                                                    ignore_index=True)
                if temp_index < len(temp_series) - 1:
                    prior_dataset = temp_dataset
            return temp_dataset

        else:
            # Perform default operation for other queries
            temp_terms = split_and_retain_quoted(full_query)
            print(temp_terms)
            if len(temp_terms) > 1:
                prior_dataset = []
            for temp_index, temp_value in enumerate(temp_terms):
                print(temp_value)
                temp_dataset = handle_archs4py_query(temp_value, archive_file, "terms")
                print(temp_dataset['meta'].shape)
                if temp_index > 0:
                    temp_dataset['meta'] = temp_dataset['meta'][temp_dataset['meta'].index.isin(prior_dataset['meta'].index)]
                    print(temp_dataset['meta'].shape)
                if temp_index < len(temp_terms) - 1:
                    prior_dataset = temp_dataset
            return temp_dataset

    except Exception as e:
        return f"Error retrieving data: {str(e)}"


def get_archs4py_expression_counts(query, file):

    """
    Accepts a parsed user query and filters the dataset using archs4py.
    
    Parameters:
    - query: str, the user query
    - file: the loaded dataset
    
    Returns:
    - temp_data: a subset of the dataset relevant to the query that consists of a list of data objects (e.g. meta and RNAseq data)
    """
    try:
        temp_data = archs4py.data.samples(file, query)
        return temp_data

    except Exception as e:
        return f"Error retrieving data: {str(e)}"




In [57]:

#test1 = parse_archs4py_query('I want to look up data for the "iPSC" and "neuro" samples')
test1 = parse_archs4py_query('I want to look up data for GSM1132425, GSM1132426, GSM1132427, GSM1179927, and GSM1179928')
print(test1['meta'].shape)
print(test1['meta'].head())
print(test1['meta']['geo_accession'].value_counts())
print(test1['meta']['series_id'].value_counts())
print(test1['meta']['characteristics_ch1'].value_counts())
print(test1['meta']['extract_protocol_ch1'].value_counts())
print(test1['meta']['source_name_ch1'].value_counts())
print(test1['meta']['title'].value_counts())


(5, 6)
           geo_accession          series_id  \
GSM1132425    GSM1132425           GSE46562   
GSM1132426    GSM1132426           GSE46562   
GSM1132427    GSM1132427           GSE46562   
GSM1179927    GSM1179927  GSE43526,GSE43900   
GSM1179928    GSM1179928  GSE43526,GSE43900   

                                          characteristics_ch1  \
GSM1132425       SAMPLE GROUP: CONTROL,TISSUE: 14 DAY NEURONS   
GSM1132426       SAMPLE GROUP: CONTROL,TISSUE: 14 DAY NEURONS   
GSM1132427       SAMPLE GROUP: CONTROL,TISSUE: 14 DAY NEURONS   
GSM1179927  CELL TYPE: FOREBRAIN CORTICAL NEURONS,AGENT: V...   
GSM1179928  CELL TYPE: FOREBRAIN CORTICAL NEURONS,AGENT: T...   

                                         extract_protocol_ch1  \
GSM1132425  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   
GSM1132426  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   
GSM1132427  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   
GSM1179927  RNA WAS EXTRACTED USING TRIZOL REAGENT.,SRANDE.

In [58]:

###Load counts for selected samples
temp_samples = test1['meta']['geo_accession'].tolist()
temp_data = get_archs4py_expression_counts(temp_samples, archive_file)
print(temp_data.shape)
print(temp_data.head())



100%|██████████| 5/5 [00:01<00:00,  2.54it/s]

(67186, 5)
          GSM1132425  GSM1132426  GSM1132427  GSM1179927  GSM1179928
TSPAN6          1561        3554        1046       10111       11536
TNMD             174         832           8          31          21
DPM1            1057        2367         575        4349        4022
SCYL3            574        1017         374        1713        2168
C1orf112         427         641         170         624         665





In [59]:
OPENAI_API_KEY = get_config_data()

import nest_asyncio

nest_asyncio.apply()




In [63]:

# Step 1: Read the table data into a DataFrame
test_df = test1['meta']

from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI

from langchain_openai import OpenAI

agent = create_pandas_dataframe_agent(
    ChatOpenAI(temperature=0, model="gpt-4o-mini",openai_api_key=OPENAI_API_KEY),
    test_df,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    allow_dangerous_code=True
)

agent.invoke("how many rows are there?")

agent.invoke("What are the main properties of the data?")

agent.invoke('Please summarize the rows based on the values in the columns titles "characteristics_ch1", "extract_protocol_ch1", "title", and "source_name_ch1".')






[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'len(df)'}`


[0m[36;1m[1;3m5[0m[32;1m[1;3mThere are 5 rows in the dataframe `df`.[0m

[1m> Finished chain.[0m


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df.info()'}`


[0m[36;1m[1;3m<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, GSM1132425 to GSM1179928
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   geo_accession         5 non-null      object
 1   series_id             5 non-null      object
 2   characteristics_ch1   5 non-null      object
 3   extract_protocol_ch1  5 non-null      object
 4   source_name_ch1       5 non-null      object
 5   title                 5 non-null      object
dtypes: object(6)
memory usage: 280.0+ bytes
[0m[32;1m[1;3mThe main properties of the data in the dataframe `df` are as

{'input': 'Please summarize the rows based on the values in the columns titles "characteristics_ch1", "extract_protocol_ch1", "title", and "source_name_ch1".',
 'output': 'Here is the summary of the rows based on the specified columns:\n\n| characteristics_ch1                                   | extract_protocol_ch1                                                                                               | title                    | source_name_ch1                                                                                        | count |\n|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------|-------------------------|-------------------------------------------------------------------------------------------------------|-------|\n| CELL TYPE: FOREBRAIN CORTICAL NEURONS,AGENT: TOPOTECAN | RNA WAS EXTRACTED USING TRIZOL REAGENT.,SRANDED LIBRARIES WERE CONSTRUCTED



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df.info()'}`


[0m[36;1m[1;3m<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, GSM1132425 to GSM1179928
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   geo_accession         5 non-null      object
 1   series_id             5 non-null      object
 2   characteristics_ch1   5 non-null      object
 3   extract_protocol_ch1  5 non-null      object
 4   source_name_ch1       5 non-null      object
 5   title                 5 non-null      object
dtypes: object(6)
memory usage: 280.0+ bytes
[0m[32;1m[1;3mThe main properties of the data are as follows:

- The dataframe has 5 entries (rows).
- There are 6 columns in the dataframe: `geo_accession`, `series_id`, `characteristics_ch1`, `extract_protocol_ch1`, `source_name_ch1`, and `title`.
- All columns have non-null values for all entries

{'input': 'What are the main properties of the data?',
 'output': 'The main properties of the data are as follows:\n\n- The dataframe has 5 entries (rows).\n- There are 6 columns in the dataframe: `geo_accession`, `series_id`, `characteristics_ch1`, `extract_protocol_ch1`, `source_name_ch1`, and `title`.\n- All columns have non-null values for all entries, meaning there are no missing values in this dataset.\n- The data type for all columns is `object`, indicating that they contain text or mixed data types.\n- The memory usage of the dataframe is approximately 280.0+ bytes.'}



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': "# Group the dataframe by the specified columns and count the occurrences of each group\ngrouped_summary = df.groupby(['characteristics_ch1', 'extract_protocol_ch1', 'title', 'source_name_ch1']).size().reset_index(name='counts')\ngrouped_summary"}`


[0m[36;1m[1;3m                                 characteristics_ch1  \
0  CELL TYPE: FOREBRAIN CORTICAL NEURONS,AGENT: T...   
1  CELL TYPE: FOREBRAIN CORTICAL NEURONS,AGENT: V...   
2       SAMPLE GROUP: CONTROL,TISSUE: 14 DAY NEURONS   
3       SAMPLE GROUP: CONTROL,TISSUE: 14 DAY NEURONS   
4       SAMPLE GROUP: CONTROL,TISSUE: 14 DAY NEURONS   

                                extract_protocol_ch1  \
0  RNA WAS EXTRACTED USING TRIZOL REAGENT.,SRANDE...   
1  RNA WAS EXTRACTED USING TRIZOL REAGENT.,SRANDE...   
2  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   
3  RNA-SEQ WAS CARRIED OUT ON AND DAY 14 NEURONS ...   
4  RNA-SE

{'input': 'Please summarize the rows based on the values in the columns titles "characteristics_ch1", "extract_protocol_ch1", "title", and "source_name_ch1".',
 'output': 'Here is the summary of the rows based on the values in the columns "characteristics_ch1", "extract_protocol_ch1", "title", and "source_name_ch1":\n\n1. **Characteristics**: CELL TYPE: FOREBRAIN CORTICAL NEURONS, AGENT: TOPOTECAN\n   - **Extract Protocol**: RNA was extracted using TRIzol reagent. Stranded libraries were constructed with Illumina kits for stranded RNAseq.\n   - **Title**: TOPOTECAN_RNASEQ [HUMAN]\n   - **Source Name**: iPSCs were differentiated into forebrain cortical neurons and treated with 10μM topotecan for 6 days.\n   - **Count**: 1\n\n2. **Characteristics**: CELL TYPE: FOREBRAIN CORTICAL NEURONS, AGENT: VEHICLE\n   - **Extract Protocol**: RNA was extracted using TRIzol reagent. Stranded libraries were constructed with Illumina kits for stranded RNAseq.\n   - **Title**: VEHICLE_RNASEQ [HUMAN]\n   

In [30]:


from llama_index.core import Settings
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

Settings.llm = OpenAI(model="gpt-3.5-turbo")
Settings.embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

from llama_index.core import SummaryIndex, VectorStoreIndex

summary_index = SummaryIndex(nodes)
vector_index = VectorStoreIndex(nodes)

summary_query_engine = summary_index.as_query_engine(
    response_mode="tree_summarize",
    use_async=True,
)
vector_query_engine = vector_index.as_query_engine()

from llama_index.core.tools import QueryEngineTool


summary_tool = QueryEngineTool.from_defaults(
    query_engine=summary_query_engine,
    description=(
        "Useful for summarization questions related to test"
    ),
)

vector_tool = QueryEngineTool.from_defaults(
    query_engine=vector_query_engine,
    description=(
        "Useful for retrieving specific context from the test data."
    ),
)

## Report on available data for the user query and propose next steps

# Use the available data to train a RAG agent


ValueError: nodes must be a list of Node objects.