In [15]:
import pandas as pd
import numpy as np
import pickle

def create_embedding_feature_matrix(pair_df, embedding_dict, gene_columns=('Gene_A', 'Gene_B'), label_column='Same_Complex'):
    """
    Efficiently constructs a feature matrix for gene pairs using their embeddings.
    """
    gene_a_col, gene_b_col = gene_columns
    embedding_dim = len(next(iter(embedding_dict.values())))
    
    valid_pairs = []
    features = []
    labels = []

    for row in pair_df.itertuples(index=False):
        gene_a = getattr(row, gene_a_col)
        gene_b = getattr(row, gene_b_col)
        label = getattr(row, label_column)

        emb_a = embedding_dict.get(gene_a)
        emb_b = embedding_dict.get(gene_b)

        if emb_a is None or emb_b is None:
            continue

        combined = np.concatenate([emb_a, emb_b])
        features.append(combined)
        labels.append(label)
        valid_pairs.append(f"{gene_a}_{gene_b}")

    feature_array = np.array(features)
    feature_columns = [f"gene_a_{i}" for i in range(embedding_dim)] + [f"gene_b_{i}" for i in range(embedding_dim)]
    
    feature_df = pd.DataFrame(feature_array, columns=feature_columns, index=valid_pairs)
    feature_df["Label"] = labels
    feature_df.index.name = "Gene_Pair"

    return feature_df

In [24]:
def drop_self_pairs(df, gene_columns=('Gene_A', 'Gene_B')):
    """
    Drops rows where gene_a == gene_b.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame with gene pair columns.
        gene_columns (tuple): Column names for the gene pairs.

    Returns:
        pd.DataFrame: Filtered DataFrame with self-pairs removed.
    """
    gene_a_col, gene_b_col = gene_columns
    return df[df[gene_a_col] == df[gene_b_col]].copy()

In [17]:
label_df1 = pd.read_pickle('complex_label.pkl')

In [18]:
label_df1

Unnamed: 0,GeneAB,Same_Complex
0,ST13P4_WASH2P,0
2,ST13P4_FAM90A12P,0
3,ST13P4_PI4KAP1,0
4,ST13P4_SNX29P2,0
5,ST13P4_ZNF818P,0
...,...,...
325245048,RBMXP1_PPT1,0
325245049,RBMXP1_PLD2,0
325245050,RBMXP1_EIF4EBP1,0
325245051,RBMXP1_TINAG,0


In [21]:
label_df1 = label_df1[label_df1['Same_Complex']==1]

In [22]:
label_df1[['Gene_A', 'Gene_B']] = label_df1['GeneAB'].str.split('_', expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df1[['Gene_A', 'Gene_B']] = label_df1['GeneAB'].str.split('_', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  label_df1[['Gene_A', 'Gene_B']] = label_df1['GeneAB'].str.split('_', expand=True)


In [25]:
drop_self_pairs(label_df1)

Unnamed: 0,GeneAB,Same_Complex,Gene_A,Gene_B
5041889,POP1_POP1,1,POP1,POP1
5513938,POLR1E_POLR1E,1,POLR1E,POLR1E
7444118,ORC2_ORC2,1,ORC2,ORC2
9814439,PDP1_PDP1,1,PDP1,PDP1
11767736,PAN2_PAN2,1,PAN2,PAN2
...,...,...,...,...
281168390,SNRPD1_SNRPD1,1,SNRPD1,SNRPD1
285818871,STAG1_STAG1,1,STAG1,STAG1
288477012,LAMB2_LAMB2,1,LAMB2,LAMB2
291938500,TEP1_TEP1,1,TEP1,TEP1


In [12]:
label_df = drop_self_pairs(label_df[['Gene_A', 'Gene_B', 'Same_Complex']])

In [13]:
label_df

Unnamed: 0,Gene_A,Gene_B,Same_Complex
0,ST13P4,WASH2P,0
2,ST13P4,FAM90A12P,0
3,ST13P4,PI4KAP1,0
4,ST13P4,SNX29P2,0
5,ST13P4,ZNF818P,0
...,...,...,...
325245048,RBMXP1,PPT1,0
325245049,RBMXP1,PLD2,0
325245050,RBMXP1,EIF4EBP1,0
325245051,RBMXP1,TINAG,0


In [14]:
label_df[label_df['Same_Complex']==1]

Unnamed: 0,Gene_A,Gene_B,Same_Complex
1565502,SMC4,NCAPH,1
1573940,SMC4,NCAPD2,1
1577014,SMC4,NCAPG,1
1604616,SMC4,SMC2,1
2303994,PSMD4,PSMD13,1
...,...,...,...
325081137,TUBGCP2,MZT2B,1
325082437,TUBGCP2,MZT1,1
325083284,TUBGCP2,TUBG1,1
325084572,TUBGCP2,TUBGCP6,1


In [13]:
embedding_pkl = '/home/ubuntu/scgpt_embeddings/scGPT_brain_embeddings.pkl'
with open(embedding_pkl, 'rb') as f:
    embedding_dictionary = pickle.load(f)

In [15]:
import numpy as np
import pandas as pd

def create_embedding_feature_matrix(pair_df, embedding_dict, gene_columns=('Gene_A', 'Gene_B'), label_column='Same_Complex'):
    gene_a_col, gene_b_col = gene_columns
    embedding_dim = len(next(iter(embedding_dict.values())))
    print('start map genes to embeddings')
    # Map genes to embeddings
    pair_df['emb_a'] = pair_df[gene_a_col].map(embedding_dict)
    pair_df['emb_b'] = pair_df[gene_b_col].map(embedding_dict)

    print('Drop rows where either embedding is missing')
    pair_df = pair_df.dropna(subset=['emb_a', 'emb_b'])

    print('Convert embedding lists to arrays')
    emb_a_stack = np.stack(pair_df['emb_a'].values)
    emb_b_stack = np.stack(pair_df['emb_b'].values)

    print('Concatenate embeddings')
    features = np.concatenate([emb_a_stack, emb_b_stack], axis=1)

    print('Build column names')
    feature_columns = [f"gene_a_{i}" for i in range(embedding_dim)] + [f"gene_b_{i}" for i in range(embedding_dim)]
    feature_df = pd.DataFrame(features, columns=feature_columns)

    print('Add labels and index')
    feature_df["Label"] = pair_df[label_column].values
    feature_df.index = pair_df[gene_a_col] + "_" + pair_df[gene_b_col]
    feature_df.index.name = "Gene_Pair"

    return feature_df

In [16]:
feature_matrix = create_embedding_feature_matrix(label_df, embedding_dictionary)

start map genes to embeddings
Drop rows where either embedding is missing
Convert embedding lists to arrays


MemoryError: Unable to allocate 383. GiB for an array with shape (200794875, 512) and data type float32

In [17]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def create_embedding_feature_matrix(pair_df, embedding_dict, gene_columns=('Gene_A', 'Gene_B'), label_column='Same_Complex'):
    """
    Efficiently constructs a feature matrix for gene pairs using their embeddings.
    """
    gene_a_col, gene_b_col = gene_columns
    embedding_dim = len(next(iter(embedding_dict.values())))
    
    valid_pairs = []
    features = []
    labels = []

    print('Begin for-loop')
    for row in tqdm(pair_df.itertuples(index=False), total=len(pair_df), desc="Processing gene pairs"):
        gene_a = getattr(row, gene_a_col)
        gene_b = getattr(row, gene_b_col)
        label = getattr(row, label_column)

        emb_a = embedding_dict.get(gene_a)
        emb_b = embedding_dict.get(gene_b)

        if emb_a is None or emb_b is None:
            continue

        combined = np.concatenate([emb_a, emb_b])
        features.append(combined)
        labels.append(label)
        valid_pairs.append(f"{gene_a}_{gene_b}")

    # Convert to DataFrame
    feature_array = np.array(features)
    feature_columns = [f"gene_a_{i}" for i in range(embedding_dim)] + [f"gene_b_{i}" for i in range(embedding_dim)]
    
    feature_df = pd.DataFrame(feature_array, columns=feature_columns, index=valid_pairs)
    feature_df["Label"] = labels
    feature_df.index.name = "Gene_Pair"

    return feature_df

In [None]:
feature_matrix = create_embedding_feature_matrix(label_df, embedding_dictionary)

Begin for-loop


Processing gene pairs:   6%|▌         | 12077078/200794875 [02:14<62:41:31, 836.18it/s] 

In [3]:
label_df = drop_self_pairs(label_df)

NameError: name 'label_df' is not defined