In [1]:
import pandas as pd
import numpy as np
import os

def _build_fake_index(genes, celltypes, n_rows):
    """
    Return exactly ``n_rows`` labels of the form ``"<gene>_at_<celltype>"``.

    Labels are generated gene-major (every cell type for the first gene, then
    the next gene, ...). The list is then repeated or trimmed to ``n_rows``
    entries, so labels are duplicated whenever ``n_rows`` exceeds
    ``len(genes) * len(celltypes)``.
    """
    if n_rows == 0:
        return []
    labels = [f"{gene}_at_{ct}" for gene in genes for ct in celltypes]
    # Repeat one time more than strictly needed, then trim to the exact length
    repeats = n_rows // len(labels) + 1
    return (labels * repeats)[:n_rows]


def create_subdataset(df, n_rows, n_cols, ppi, random_state=None):
    """
    Create a sub-dataset with the requested number of rows and columns.

    Rows are sampled from ``df`` without replacement. The original row names
    are NOT kept: the index of the result is replaced by synthetic
    ``"<gene>_at_<celltype>"`` labels, where the genes are drawn from the
    ``gene1``/``gene2`` columns of ``ppi``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source table to sample from.
    n_rows : int
        Number of rows in the result; ``df.sample`` raises ``ValueError`` if
        it exceeds ``len(df)``.
    n_cols : int
        Number of columns in the result. If it is at most ``df.shape[1]`` a
        random subset of columns is taken; otherwise columns are drawn with
        replacement and renamed ``"<col>_<i>"`` (``i`` starting at 1).
    ppi : pandas.DataFrame
        Table with ``gene1`` and ``gene2`` columns supplying the gene names.
    random_state : int or None, optional
        Seed used both for the row sampling and for the gene/column choices.

    Returns
    -------
    pandas.DataFrame
        Frame of shape ``(n_rows, n_cols)`` indexed by the synthetic labels.

    Raises
    ------
    ValueError
        If ``n_rows > 0`` and ``ppi`` contains no gene names.
    """
    rng = np.random.default_rng(random_state)

    # Sample rows by index
    sampled_rows = df.sample(n=n_rows, replace=False, random_state=random_state)

    # Collect the gene names found in the PPI table. They are sorted because
    # the iteration order of a set of strings differs between interpreter
    # sessions, which would make the seeded choice below non-reproducible.
    # Missing values are dropped so they cannot end up in a label.
    gene_pool = sorted(set(pd.concat([ppi['gene1'], ppi['gene2']]).dropna().astype(str)))

    # Roughly one gene per ten rows, capped by the number of available genes
    n_genes = int(min(n_rows / 10, len(gene_pool)))
    if n_rows > 0:
        if not gene_pool:
            raise ValueError("ppi contains no gene names in columns 'gene1'/'gene2'")
        # Fewer than ten rows used to yield zero genes and a division by zero
        n_genes = max(n_genes, 1)
    selected_genes = rng.choice(gene_pool, size=n_genes, replace=False) if n_genes else []

    # Cell types combined with every selected gene to form the index labels
    celltypes = ['Bcells', 'Tcells', 'CD8cells', 'NKcells', 'Fakecells']

    if n_cols <= df.shape[1]:
        # Randomly pick a subset of columns
        chosen_cols = rng.choice(df.columns, size=n_cols, replace=False)
        result = sampled_rows.loc[:, chosen_cols]
    else:
        # Oversample columns (with replacement) if n_cols > original;
        # the positional suffix keeps the duplicated column names unique
        chosen_cols = rng.choice(df.columns, size=n_cols, replace=True)
        result = pd.concat(
            [sampled_rows[col].rename(f"{col}_{i}") for i, col in enumerate(chosen_cols, 1)],
            axis=1
        )

    result.index = _build_fake_index(selected_genes, celltypes, n_rows)
    return result

In [2]:
# Example usage: read the main data table and the PPI table from disk
data_path = "../../SHISMA_main/temporal_data_with_patient_ready_normalized_full_genes.csv"
ppi_path = "../../SHISMA_main/string_is_0.7_ev_reactome.tsv"

# Comma-separated file; its first column is used as the row index
df = pd.read_csv(data_path, sep=",", index_col=0)
# Tab-separated file; its first line provides the column names
ppi_full = pd.read_csv(ppi_path, sep="\t", header=0)

ppi_full

Unnamed: 0,gene1,gene2,gene1_string_id,gene2_string_id,neighborhood_on_chromosome,gene_fusion,phylogenetic_cooccurrence,homology,coexpression,experimentally_determined_interaction,database_annotated,automated_textmining,combined_score
0,A1BG,SERPINA1,9606.ENSP00000263100,9606.ENSP00000416066,0.0,0.0,0.000,0.000,0.056,0.000,0.4,0.570,0.735
1,A1BG,CRK,9606.ENSP00000263100,9606.ENSP00000300574,0.0,0.0,0.000,0.000,0.000,0.000,0.0,0.745,0.745
2,A1BG,GAB3,9606.ENSP00000263100,9606.ENSP00000399588,0.0,0.0,0.000,0.000,0.000,0.000,0.0,0.821,0.821
3,A1BG,AHSG,9606.ENSP00000263100,9606.ENSP00000273784,0.0,0.0,0.000,0.000,0.100,0.067,0.4,0.736,0.849
4,A1BG,GAB2,9606.ENSP00000263100,9606.ENSP00000354952,0.0,0.0,0.000,0.000,0.000,0.000,0.0,0.856,0.856
...,...,...,...,...,...,...,...,...,...,...,...,...,...
23675,WIPF1,WIPF3,9606.ENSP00000376330,9606.ENSP00000242140,0.0,0.0,0.000,0.619,0.000,0.092,0.9,0.341,0.935
23676,WIPF1,WIPF2,9606.ENSP00000376330,9606.ENSP00000320924,0.0,0.0,0.000,0.790,0.000,0.000,0.9,0.194,0.916
23677,WIPF2,WIPF3,9606.ENSP00000320924,9606.ENSP00000242140,0.0,0.0,0.000,0.693,0.000,0.092,0.9,0.214,0.922
23678,XRCC5,XRCC6,9606.ENSP00000375978,9606.ENSP00000352257,0.0,0.0,0.230,0.000,0.574,0.999,0.9,0.999,0.999


In [8]:
# Distinct gene names appearing on either side of a PPI row
all_gene_names = pd.concat([ppi_full['gene1'], ppi_full['gene2']])
unique_genes = all_gene_names.unique()
len(unique_genes)

1867

In [9]:
# Dimensions of the loaded data table as (number of rows, number of columns)
df.shape

(5488056, 6)

In [15]:
# Randomly draw 50% and 25% of the rows of ppi_full.
# Rows are sampled, not genes, so a gene can still be present in a sample
# through any of its other rows.
# NOTE(review): both draws use the same seed; whether the 25% sample is a
# subset of the 50% sample depends on pandas' sampling internals - confirm
# if nested samples are required.
ppi_half = ppi_full.sample(frac=0.5, random_state=42)
ppi_halfhalf = ppi_full.sample(frac=0.25, random_state=42)

ppi_halfhalf

Unnamed: 0,gene1,gene2,gene1_string_id,gene2_string_id,neighborhood_on_chromosome,gene_fusion,phylogenetic_cooccurrence,homology,coexpression,experimentally_determined_interaction,database_annotated,automated_textmining,combined_score
7115,COL1A1,COL2A1,9606.ENSP00000225964,9606.ENSP00000369889,0.000,0.0,0.054,0.971,0.042,0.075,0.72,0.521,0.867
23489,UBA7,UBE2A,9606.ENSP00000333266,9606.ENSP00000360613,0.043,0.0,0.000,0.000,0.044,0.452,0.50,0.727,0.919
6501,CDC42,TUBA8,9606.ENSP00000497733,9606.ENSP00000333326,0.000,0.0,0.000,0.000,0.116,0.121,0.90,0.150,0.925
8939,DEFA5,DEFB1,9606.ENSP00000329890,9606.ENSP00000297439,0.000,0.0,0.000,0.000,0.042,0.000,0.50,0.861,0.927
14006,IL11RA,IL6,9606.ENSP00000450565,9606.ENSP00000385675,0.000,0.0,0.000,0.000,0.000,0.000,0.65,0.755,0.910
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22201,SAMHD1,TRIM5,9606.ENSP00000493536,9606.ENSP00000369373,0.000,0.0,0.000,0.000,0.087,0.000,0.00,0.884,0.889
23336,TUBB4A,TUBB4B,9606.ENSP00000264071,9606.ENSP00000341289,0.000,0.0,0.047,0.987,0.060,0.421,0.80,0.048,0.883
9994,FBXO27,SKP1,9606.ENSP00000292853,9606.ENSP00000231487,0.000,0.0,0.000,0.000,0.000,0.760,0.40,0.820,0.971
14598,IL1B,LCN2,9606.ENSP00000263341,9606.ENSP00000362108,0.000,0.0,0.000,0.000,0.116,0.000,0.00,0.740,0.761


In [16]:
# Create the output folder. exist_ok=True replaces the separate existence
# check, which avoids the check-then-create race and keeps the cell re-runnable.
newpath = 'datasets'
os.makedirs(newpath, exist_ok=True)

# Grid of sub-dataset sizes to generate
rows = [1000, 5000, 10000, 50000, 100000, 500000, 1000000]
cols = [5, 10, 15]

# File-name label -> PPI table
ppi_variants = {'full': ppi_full, 'half': ppi_half, 'quarter': ppi_halfhalf}

# One CSV per (row count, column count, PPI variant) combination. Column
# counts above df.shape[1] go through create_subdataset's oversampling branch.
for r in rows:
    for c in cols:
        for ppi_name, ppi in ppi_variants.items():
            sub_df = create_subdataset(df, ppi=ppi, n_rows=r, n_cols=c, random_state=42)
            sub_df.to_csv(f'{newpath}/subdataset_{ppi_name}_{r}rows_{c}cols.csv', index=True)
            # The PPI table is written again for every (r, c) so that each
            # sub-dataset file has a PPI file with the same name suffix
            ppi.to_csv(f'{newpath}/ppi_{ppi_name}_{r}rows_{c}cols.tsv', sep="\t", index=False)