# Load CellPhoneDB (CPDB) data and generate a gene interaction table

In [4]:
import pandas as pd
import numpy as np

# Read the three CellPhoneDB v5.0.0 input tables in one pass. The tuple order
# below must match the order of the names on the left-hand side.
complex_input, gene_input, interaction_input = (
    pd.read_csv(csv_path)
    for csv_path in (
        "db/v5.0.0/complex_input.csv",
        "db/v5.0.0/gene_input.csv",
        "db/v5.0.0/interaction_input.csv",
    )
)

# Preview the interaction table (partner_a / partner_b drive everything below).
interaction_input.head()

Unnamed: 0,id_cp_interaction,partner_a,partner_b,protein_name_a,protein_name_b,annotation_strategy,source,is_ppi,curator,reactome_complex,reactome_reaction,reactome_pathway,complexPortal_complex,comments,version,interactors,classification,directionality,modulatory_effect
0,,P12830,integrin_a2b1_complex,CADH1_HUMAN,,curated,PMID:12392763,True,RVentoTormo,,,,,,CellPhoneDBcore<=4.1,CDH1-ITGA2+ITGB1,Adhesion by Cadherin,Adhesion-Adhesion,
1,,P12830,integrin_aEb7_complex,CADH1_HUMAN,,curated,uniprot,True,RVentoTormo,,,,,,CellPhoneDBcore<=4.1,CDH1-ITGAE+ITGB7,Adhesion by Cadherin,Adhesion-Adhesion,
2,,P12830,Q96E93,CADH1_HUMAN,KLRG1_HUMAN,curated,PMC3030123,True,RVentoTormo,,,,,,CellPhoneDBcore<=4.1,CDH1-KLRG1,Adhesion by Cadherin,Ligand-Receptor,
3,,P19022,P19022,CADH2_HUMAN,CADH2_HUMAN,curated,uniprot,True,LGarciaAlonso,,,,,,CellPhoneDBcore<=4.1,CDH2-CDH2,Adhesion by Cadherin,Adhesion-Adhesion,
4,,P19022,P06734,CADH2_HUMAN,FCER2_HUMAN,curated,Shilts screen - VALIDATED,True,JShilts,,,,,,CellPhoneDBcore4.1,CDH2-FCER2,Adhesion by Cadherin,Ligand-Receptor,


In [2]:
# Preview the gene table; it is used below to translate UniProt accessions
# into gene names and Ensembl IDs.
gene_input.head()

Unnamed: 0,gene_name,uniprot,hgnc_symbol,ensembl
0,SULT1A1,P50225,SULT1A1,ENSG00000196502
1,UBASH3B,Q8TF42,UBASH3B,ENSG00000154127
2,SRD5A3,Q9H8P0,SRD5A3,ENSG00000128039
3,SULT2A1,Q06520,SULT2A1,ENSG00000105398
4,SRD5A2,P31213,SRD5A2,ENSG00000277893


In [3]:
# Preview the complex table; each complex lists its member proteins in the
# uniprot_1 .. uniprot_5 columns.
complex_input.head()

Unnamed: 0,complex_name,uniprot_1,uniprot_2,uniprot_3,uniprot_4,uniprot_5,transmembrane,peripheral,secreted,secreted_desc,...,pdb_id,pdb_structure,stoichiometry,comments_complex,reactome_reaction,reactome_complex,complexPortal_complex,rhea_reaction,curator,version
0,Dehydroepiandrosterone_bySTS,Q8TF42,,,,,False,False,True,,...,,False,,,,,,,LGarciaAlonso,CellPhoneDBcore<=4.1
1,DHEAsulfate_bySULT2B,P50225,,,,,False,False,True,,...,,False,,,,,,,LGarciaAlonso,CellPhoneDBcore<=4.1
2,Dihydrotestosterone_bySRD5A3,Q9H8P0,,,,,False,False,True,,...,,False,,,,,,,LGarciaAlonso,CellPhoneDBcore<=4.1
3,Dihydrotestosterone_bySRD5A1,P18405,,,,,False,False,True,,...,,False,,,,,,,LGarciaAlonso,CellPhoneDBcore<=4.1
4,Dihydrotestosterone_bySRD5A2,P31213,,,,,False,False,True,,...,,False,,,,,,,LGarciaAlonso,CellPhoneDBcore<=4.1


In [23]:
def get_genes_from_identifier(identifier):
    """
    Resolve an interaction partner identifier to its gene records.

    The identifier is first looked up as a UniProt accession in the
    module-level ``gene_input`` table. If that fails, it is looked up as a
    complex name in the module-level ``complex_input`` table, and every member
    protein (columns ``uniprot_1`` .. ``uniprot_5``) is mapped through
    ``gene_input``.

    Args:
        identifier: A UniProt accession or a complex name, e.g. a
            ``partner_a`` / ``partner_b`` value from ``interaction_input``.
            Missing values (NaN / None) are accepted.

    Returns:
        list[pandas.DataFrame] | None: One DataFrame per resolved protein,
        each restricted to the ``gene_name`` and ``ensembl`` columns.

        * A UniProt identifier yields a single-element list. The frame holds
          every ``gene_input`` row for that accession, so it can contain more
          than one row.
        * A complex yields one frame per member that could be mapped, in
          ``uniprot_1`` .. ``uniprot_5`` order. If several rows share the
          complex name, only the first is used.
        * ``None`` is returned when the identifier is missing, matches
          neither table, or names a complex none of whose members are present
          in ``gene_input``. Every element of a returned list is therefore a
          DataFrame, so callers can index ``["gene_name"]`` / ``["ensembl"]``
          without type checks.
    """
    # A missing identifier can match neither table.
    if pd.isna(identifier):
        return None

    gene_columns = ["gene_name", "ensembl"]

    # Case 1: the identifier is itself a UniProt accession.
    direct_hits = gene_input.loc[gene_input["uniprot"] == identifier, gene_columns]
    if not direct_hits.empty:
        return [direct_hits]

    # Case 2: the identifier is a complex name.
    complex_rows = complex_input[complex_input["complex_name"] == identifier]
    if complex_rows.empty:
        return None

    uniprot_cols = ["uniprot_1", "uniprot_2", "uniprot_3", "uniprot_4", "uniprot_5"]
    member_uniprots = complex_rows[uniprot_cols].iloc[0].tolist()

    genes = []
    for uniprot in member_uniprots:
        # Unused member slots are NaN and can never match an accession.
        if pd.isna(uniprot):
            continue
        member_hits = gene_input.loc[gene_input["uniprot"] == uniprot, gene_columns]
        if not member_hits.empty:
            genes.append(member_hits)

    # A complex with no mappable member is treated as unresolved rather than
    # returning the bare identifier string, which would break consumers that
    # expect DataFrame elements.
    return genes if genes else None

In [21]:
# Take the first interaction as a worked example for the identifier lookup.
row = interaction_input.iloc[0]
row

id_cp_interaction                          NaN
partner_a                               P12830
partner_b                integrin_a2b1_complex
protein_name_a                     CADH1_HUMAN
protein_name_b                             NaN
annotation_strategy                    curated
source                           PMID:12392763
is_ppi                                    True
curator                            RVentoTormo
reactome_complex                           NaN
reactome_reaction                          NaN
reactome_pathway                           NaN
complexPortal_complex                      NaN
comments                                   NaN
version                   CellPhoneDBcore<=4.1
interactors                   CDH1-ITGA2+ITGB1
classification            Adhesion by Cadherin
directionality               Adhesion-Adhesion
modulatory_effect                          NaN
Name: 0, dtype: object

In [24]:
# Sanity check on the example row: partner_a is a UniProt accession and
# partner_b is a complex name, so this exercises both lookup paths.
print(get_genes_from_identifier(row["partner_a"]))
print(get_genes_from_identifier(row["partner_b"]))


[    gene_name          ensembl
377      CDH1  ENSG00000039068]
[    gene_name          ensembl
271     ITGB1  ENSG00000150093,     gene_name          ensembl
444     ITGA2  ENSG00000164171]


In [25]:
# Expand every partner_a / partner_b interaction into gene-level pairs: each
# gene resolved for partner A is paired with each gene resolved for partner B.
# Interactions where either partner cannot be resolved are skipped.
gene_interaction_columns = ["gene_name_a", "ensembl_a", "gene_name_b", "ensembl_b"]
gene_interactions_list = []

# Iterate over the two partner columns directly. This avoids building a full
# Series per row and, importantly, does not rebind the example `row` variable
# defined in an earlier cell.
for partner_a, partner_b in zip(
    interaction_input["partner_a"], interaction_input["partner_b"]
):
    genes_a = get_genes_from_identifier(partner_a)
    genes_b = get_genes_from_identifier(partner_b)
    if genes_a is None or genes_b is None:
        continue

    # Defensive: only DataFrame entries carry the gene_name / ensembl columns.
    # Skip anything else (e.g. a bare identifier string) instead of failing on
    # the column lookup below.
    genes_a = [gene for gene in genes_a if isinstance(gene, pd.DataFrame)]
    genes_b = [gene for gene in genes_b if isinstance(gene, pd.DataFrame)]

    for gene_a in genes_a:
        # Only the first record of each frame is used.
        gene_name_a = gene_a["gene_name"].values[0]
        ensembl_a = gene_a["ensembl"].values[0]
        for gene_b in genes_b:
            gene_interactions_list.append({
                "gene_name_a": gene_name_a,
                "ensembl_a": ensembl_a,
                "gene_name_b": gene_b["gene_name"].values[0],
                "ensembl_b": gene_b["ensembl"].values[0]
            })

# Passing `columns` keeps the schema stable even if no pair was produced.
gene_interactions = pd.DataFrame(gene_interactions_list, columns=gene_interaction_columns)

# Display the first few rows
gene_interactions.head()

Unnamed: 0,gene_name_a,ensembl_a,gene_name_b,ensembl_b
0,CDH1,ENSG00000039068,ITGB1,ENSG00000150093
1,CDH1,ENSG00000039068,ITGA2,ENSG00000164171
2,CDH1,ENSG00000039068,ITGB7,ENSG00000139626
3,CDH1,ENSG00000039068,ITGAE,ENSG00000083457
4,CDH1,ENSG00000039068,KLRG1,ENSG00000139187


In [26]:
# Number of gene-level pairs produced by the expansion above.
len(gene_interactions)

6003

In [27]:
# Persist the gene-level pairs; index=False keeps the row index out of the CSV.
gene_interactions.to_csv("data/gene_interactions.csv", index=False)