In [2]:
import gzip
import time
import pandas as pd
import os
import gzip
import openai
import tiktoken
import networkx as nx

from collections import defaultdict

In [3]:

# --- L1000 gene symbol -> STRING protein id -------------------------------
# Each line of the mapping file is expected to hold exactly two tab-separated
# fields (gene symbol, STRING id); anything else raises ValueError on unpacking.
with open('../data/L1000_to_STRING.txt', 'r') as file:
    mapping_rows = [line.rstrip().split('\t') for line in file]

l1000_genes = []
string_ids = []
for gene, string_id in mapping_rows:
    l1000_genes.append(gene)
    string_ids.append(string_id)

gene_to_stringId = dict(zip(l1000_genes, string_ids))

# for fast search
_string_ids = set(string_ids)


# --- STRING links restricted to pairs where both proteins are mapped -------
string_interactions = {}

# download: https://stringdb-static.org/download/protein.links.detailed.v11.5/9606.protein.links.detailed.v11.5.txt.gz
with gzip.open('../data/9606.protein.links.detailed.v11.5.txt.gz', 'rt') as f:
    # header
    next(f, None)

    for line in f:
        columns = line.strip().split(' ')
        if columns[0] not in _string_ids or columns[1] not in _string_ids:
            continue
        # The last space-separated column is stored as the score for the ordered pair.
        string_interactions[(columns[0], columns[1])] = float(columns[-1])


# --- BioGRID: number of rows per ordered (column 7, column 8) pair ---------
biogrid_interactions = defaultdict(int)
# download: https://downloads.thebiogrid.org/File/BioGRID/Release-Archive/BIOGRID-4.4.222/BIOGRID-ORGANISM-4.4.222.tab3.zip
#           We created BIOGRID-ORGANISM-Homo_sapiens-4.4.221.tab to include just rows corresponding to Homo Sapiens
# NOTE(review): the URL references release 4.4.222 while the file name says 4.4.221 - confirm which release was used.
with open('../data/BIOGRID-ORGANISM-Homo_sapiens-4.4.221.tab', 'r') as f:
    # header
    next(f, None)

    # 0-based positions of the two interactor columns used as graph node names.
    g1, g2 = 7, 8
    for line in f:
        fields = line.split('\t')
        biogrid_interactions[(fields[g1], fields[g2])] += 1

In [103]:
# Undirected graph with one edge per distinct BioGRID pair; iterating the
# dict yields its (interactor A, interactor B) keys, the counts are not used here.
biogrid_graphg = nx.Graph()
biogrid_graphg.add_edges_from(biogrid_interactions)


def compute(dataset, interaction_type='additive'):
    """Annotate the 100 strongest interactions of a dataset with STRING/BioGRID evidence.

    Reads ``../computed_interactions/{dataset}.csv``, keeps the 100 rows with the
    largest value in the RMST-difference column that corresponds to
    ``interaction_type`` and, for every ``gene1*gene2`` pair, records:

    * the STRING score stored for the ordered pair of mapped STRING ids
      (``None`` when the pair is absent),
    * the number of BioGRID rows for the pair, counting both orientations,
    * how many shortest paths connect the genes in the BioGRID graph and the
      number of nodes on such a path (0 when no path exists).

    The table is written to ``../analyzed_interactions/{dataset}/{interaction_type}.csv``.

    Args:
        dataset: Name of the dataset; used to build input and output paths.
        interaction_type: One of ``'additive'``, ``'competing'`` or ``'xor'``.

    Raises:
        ValueError: If ``interaction_type`` is not one of the supported values.
        KeyError: If a gene has no entry in ``gene_to_stringId``.
    """
    column_by_type = {
        'additive': 'rmst_diff_f1+f2',
        'competing': 'rmst_diff_f1-f2',
        'xor': 'rmst_diff_f1*f2',
    }
    # Validate before touching the file system so a typo fails immediately.
    if interaction_type not in column_by_type:
        raise ValueError('interaction_type must be one of: additive, competing, xor')
    interaction_column = column_by_type[interaction_type]

    computed = pd.read_csv(f'../computed_interactions/{dataset}.csv')
    top_100 = computed.sort_values(by=[interaction_column], ascending=False).head(100)

    results = []
    # Both columns come from the same slice, so names and scores cannot drift apart.
    for interaction_term, rmst_diff in zip(top_100['interaction'], top_100[interaction_column]):
        gene1, gene2 = interaction_term.split('*')

        stringId1 = gene_to_stringId[gene1]
        stringId2 = gene_to_stringId[gene2]

        try:
            paths = list(nx.all_shortest_paths(biogrid_graphg, gene1, gene2))
        except nx.NetworkXException:
            # Covers "no path" and "node not in graph"; anything else should surface.
            paths = []

        # len(path) counts nodes, so directly connected genes yield 2.
        min_path_length = min((len(path) for path in paths), default=0)

        string_interaction_evidence = string_interactions.get((stringId1, stringId2), None)

        # BioGRID rows are keyed by ordered pair; sum both orientations.
        biogrid_interaction_evidence = (
            biogrid_interactions.get((gene1, gene2), 0)
            + biogrid_interactions.get((gene2, gene1), 0)
        )

        results.append((
            f'{gene1}*{gene2}',
            rmst_diff,
            string_interaction_evidence,
            biogrid_interaction_evidence,
            len(paths),
            min_path_length,
        ))

    save_path = f'../analyzed_interactions/{dataset}/'
    # makedirs also creates a missing parent directory and tolerates an existing one.
    os.makedirs(save_path, exist_ok=True)

    analyzed = pd.DataFrame(
        results,
        columns=[
            'interaction',
            'rmst_diff',
            'string_interaction_evidence',
            'biogrid_interaction_evidence',
            'bioGRID_shortest_paths',
            'bioGRID_min_path_length',
        ],
    )
    analyzed.to_csv(f'../analyzed_interactions/{dataset}/{interaction_type}.csv', index=False)


# Datasets to annotate: METABRIC plus the project codes listed below.
DATASETS = [
    "METABRIC",
    "BLCA", "BRCA", "CESC", "COAD", "GBM",
    "HNSC", "KIRC", "KIRP", "LAML", "LGG",
    "LIHC", "LUAD", "LUSC", "OV", "PRAD",
    "READ", "SKCM", "STAD", "THCA", "UCEC",
]

# Run the evidence annotation for every dataset and every interaction type,
# in the same order as before: additive, competing, xor.
for dataset in DATASETS:
    for interaction_type in ('additive', 'competing', 'xor'):
        compute(dataset, interaction_type=interaction_type)

In [None]:

# BioGRID graph used to look up shortest paths that are passed to the LLM as context.
g = nx.Graph()
g.add_edges_from(biogrid_interactions.keys())


# Credentials are read from the environment so they are never stored in the
# notebook. Missing variables fall back to the previous empty-string values.
openai.organization = os.environ.get("OPENAI_ORGANIZATION", "")
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

def generate_prompt(cancer_type: str, gene1: str, gene2: str, paths: list, context: list):
    """Build the LLM prompt asking for functional associations between two genes.

    Args:
        cancer_type: Project code, rendered as ``TCGA-{cancer_type}``.
        gene1: First gene symbol.
        gene2: Second gene symbol.
        paths: Shortest paths between the genes; interpolated with ``str()``.
        context: Extra context entries; interpolated with ``str()``.

    Returns:
        The fully formatted prompt text.
    """
    return f"""
    You are a helpful domain expert with a background in biology. You know the biology of each genes known in the literature. 
    
    Cancer type: TCGA-{cancer_type}
    Genes: {gene1} and {gene2}. 

    BioGRID protein interaction network; shortest path between {gene1} and {gene2}: {paths} .

    Context: {context}

    Based on what you know about these two genes and provided context. Describe briefly what specifically these genes do. 
    Can you reason about any possible functional associations between these two genes in specific biological terms? 
    Use context and your knowledge about biology to answer the question. Be specific in the processes where these genes are involved.

    Be concise. Answer in 2-3 short sentences. Start with possible functional associations. 
    
    """


def _uniprot_description(uniprot_data, gene):
    """Return the joined UniProt function/subunit text for ``gene``, or None if it has no row."""
    rows = uniprot_data.loc[
        uniprot_data['L1000_name'] == gene,
        ['function_description', 'subunit_interactions'],
    ]
    if rows.empty:
        return None
    # Empty CSV cells load as NaN (a float); drop them so str.join cannot fail.
    return ' '.join(str(value) for value in rows.values[0] if pd.notna(value))


def compute(tcga_project, interaction_type='additive', llm='gpt-4', len_prompt=7600, paths_to_include=5):
    """Ask an OpenAI chat model to summarise selected gene-gene interactions.

    NOTE: this definition reuses the name ``compute`` of the evidence-annotation
    function defined in an earlier cell and shadows it once this cell has run.

    Reads ``../analyzed_interactions/{tcga_project}/{interaction_type}.csv`` and
    selects the union of: the 10 interactions with the largest ``rmst_diff``,
    every interaction with a STRING score, and every interaction with a
    positive BioGRID count. For each pair a prompt is built from the UniProt
    descriptions of both genes, up to ``paths_to_include`` BioGRID shortest
    paths and the UniProt descriptions of the genes on those paths. The
    answers are written to
    ``../analyzed_interactions/{tcga_project}/{interaction_type}_{llm}_summary.csv``.

    Args:
        tcga_project: Dataset name used in the paths and in the prompt.
        interaction_type: Suffix of the analyzed-interactions file to read.
        llm: Chat model name passed to the OpenAI API and to tiktoken.
        len_prompt: Maximum number of prompt tokens; longer prompts are truncated.
        paths_to_include: Maximum number of shortest paths placed in the prompt.
    """
    df = pd.read_csv(f'../analyzed_interactions/{tcga_project}/{interaction_type}.csv')
    uniprot_data = pd.read_csv(f'../data/uniprot_data/uniprot_data.csv')

    # `x in series` tests the index, not the values, so membership must be
    # checked against an explicit set of the column values.
    genes_with_uniprot = set(uniprot_data['L1000_name'])

    pairs = set()
    pairs.update(df.sort_values(by=['rmst_diff'], ascending=False)['interaction'].head(10))
    pairs.update(df.loc[df['string_interaction_evidence'].notnull(), 'interaction'])
    pairs.update(df.loc[df['biogrid_interaction_evidence'] > 0, 'interaction'])

    # The tokenizer depends only on the model, so build it once.
    enc = tiktoken.encoding_for_model(llm)

    results = []
    # Sorted so the output row order is reproducible between runs.
    for interaction in sorted(pairs):
        gene1, gene2 = interaction.split('*')

        try:
            paths = list(nx.all_shortest_paths(g, gene1, gene2))
        except nx.NetworkXException:
            # No path, or a gene that is not a node of the BioGRID graph.
            paths = []
        paths = paths[:paths_to_include]

        # Describe each intermediate gene once, even if it lies on several paths.
        all_genes_in_all_paths = []
        described = set()
        for path in paths:
            for gene in path[1:-1]:
                if gene in described or gene not in genes_with_uniprot:
                    continue
                described.add(gene)
                all_genes_in_all_paths.append((gene, _uniprot_description(uniprot_data, gene)))

        # A gene without a UniProt row contributes an empty description instead of crashing.
        gene1_uniprot_data = _uniprot_description(uniprot_data, gene1) or ''
        gene2_uniprot_data = _uniprot_description(uniprot_data, gene2) or ''

        prompt = generate_prompt(
            tcga_project,
            gene1,
            gene2,
            paths,
            [(gene1, gene1_uniprot_data), (gene2, gene2_uniprot_data)] + all_genes_in_all_paths,
        )
        # Truncate to the token budget of the chosen model.
        prompt = enc.decode(enc.encode(prompt)[:len_prompt])

        retries = 3
        openai_response = ''
        for attempt in range(retries):
            try:
                response = openai.ChatCompletion.create(
                    model=llm,
                    messages=[{"role": "user", "content": prompt}])
                openai_response = response.choices[0].message.content
                break
            except Exception as e:
                if attempt < retries - 1:
                    print(f'OpenAI request failed ({e}), retrying...')
                    time.sleep(2)
                else:
                    print(f'OpenAI request failed ({e}), out of retries')
                    # Keep the error text in the output so the failed pair stays visible.
                    openai_response = str(e)

        results.append((interaction, str(openai_response or '').replace('\n', ' ')))
        time.sleep(0.1)

    save_path = f'../analyzed_interactions/{tcga_project}/'
    # makedirs also creates a missing parent directory and tolerates an existing one.
    os.makedirs(save_path, exist_ok=True)

    summary_df = pd.DataFrame(results, columns=['interaction', 'summary'])
    summary_df.to_csv(f'../analyzed_interactions/{tcga_project}/{interaction_type}_{llm}_summary.csv', index=False)



# Datasets to summarise: METABRIC plus the project codes listed below.
DATASETS = [
    "METABRIC",
    "BLCA", "BRCA", "CESC", "COAD", "GBM",
    "HNSC", "KIRC", "KIRP", "LAML", "LGG",
    "LIHC", "LUAD", "LUSC", "OV", "PRAD",
    "READ", "SKCM", "STAD", "THCA", "UCEC",
]

# Keyword overrides per model run: first gpt-3.5-turbo with a 3600-token
# prompt budget, then the function defaults (gpt-4, 7600 tokens).
LLM_RUNS = (
    {'llm': 'gpt-3.5-turbo', 'len_prompt': 3600},
    {},
)

for dataset in DATASETS:
    for llm_kwargs in LLM_RUNS:
        for interaction_type in ('additive', 'competing', 'xor'):
            compute(dataset, interaction_type=interaction_type, paths_to_include=5, **llm_kwargs)

    print(dataset, 'done')




In [5]:

def create_markdown_table(dataset: str, interaction_type: str, llm: str):
    df = pd.read_csv(f'../analyzed_interactions/{dataset}/{interaction_type}.csv')
    df_summary = pd.read_csv(f'../analyzed_interactions/{dataset}/{interaction_type}_{llm}_summary.csv')

    save_path = f'../explained_interactions/{dataset}/'
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    with open(f'../explained_interactions/{dataset}/{interaction_type}_{llm}.md', 'w') as f:

        header = """| Genes&nbsp;&nbsp;&nbsp;&nbsp;| Data&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;| Summary |"""
        header_divider = "|:---|:---|:---|"

        f.write(f'{header}\n{header_divider}\n')

        temp_store = []

        for _, row in df_summary.iterrows():
            interaction = row['interaction']
            gene1, gene2 = interaction.split('*')
            # gene1_link = f"https://www.ncbi.nlm.nih.gov/gene?term={gene1}[Gene+Name]+AND+9606[Taxonomy+ID]"
            # gene2_link = f"https://www.ncbi.nlm.nih.gov/gene?term={gene2}[Gene+Name]+AND+9606[Taxonomy+ID]"

            summary = row['summary']
            # bioGRID_shortest_paths = row['bioGRID_shortest_paths']

            rmst_diff = df.loc[df['interaction'] == interaction]['rmst_diff'].values[0]
            string_score = df.loc[df['interaction'] == interaction]['string_interaction_evidence'].fillna(0).values[0]
            biogrid_count = df.loc[df['interaction'] == interaction]['biogrid_interaction_evidence'].values[0]
            biogrid_count = df.loc[df['interaction'] == interaction]['biogrid_interaction_evidence'].values[0]
            bioGRID_shortest_paths = df.loc[df['interaction'] == interaction]['bioGRID_shortest_paths'].values[0]
            bioGRID_min_path_length = df.loc[df['interaction'] == interaction]['bioGRID_min_path_length'].values[0]

            row = f"""| [{gene1}](https://www.ncbi.nlm.nih.gov/gene?term={gene1}[Gene+Name]+AND+9606[Taxonomy+ID]) </br> and </br> [{gene2}](https://www.ncbi.nlm.nih.gov/gene?term={gene2}[Gene+Name]+AND+9606[Taxonomy+ID]) |  **RMST differance:** {round(rmst_diff, 1)}</br>**STRING score:** {int(string_score)}</br>**BioGRID count:** {biogrid_count}</br>**BioGRID shortest paths:** {bioGRID_shortest_paths} (min len {bioGRID_min_path_length}) | {summary}|"""
            temp_store.append((rmst_diff, row))

        temp_store.sort(key=lambda x: x[0], reverse=True)
        for _, row in temp_store:
            f.write(f'{row}\n')



# Datasets to render: METABRIC plus the project codes listed below.
DATASETS = [
    "METABRIC",
    "BLCA", "BRCA", "CESC", "COAD", "GBM",
    "HNSC", "KIRC", "KIRP", "LAML", "LGG",
    "LIHC", "LUAD", "LUSC", "OV", "PRAD",
    "READ", "SKCM", "STAD", "THCA", "UCEC",
]

# (interaction_type, llm) combinations rendered for every dataset, in order.
# NOTE(review): only 'competing' is rendered for gpt-4 - confirm that leaving
# out 'additive' and 'xor' for that model is intentional.
TABLES_TO_RENDER = (
    ('additive', 'gpt-3.5-turbo'),
    ('competing', 'gpt-3.5-turbo'),
    ('xor', 'gpt-3.5-turbo'),
    ('competing', 'gpt-4'),
)

for dataset in DATASETS:
    for interaction_type, llm in TABLES_TO_RENDER:
        create_markdown_table(dataset, interaction_type=interaction_type, llm=llm)
