In [1]:
import pathlib
import numpy as np
import pandas as pd

from pycytominer import aggregate
from pycytominer.cyto_utils import infer_cp_features

from cytominer_eval import evaluate

import umap

In [2]:
# Load the processed Perturb-seq expression matrix for this GEO series
perturbseq_data_dir = pathlib.Path("../../0.download-data/data/perturbseq")
gse_id = "GSE132080"

file = perturbseq_data_dir / f"{gse_id}_processed_matrix.tsv.gz"

# The matrix is transposed right after reading; the former index (the
# file's column headers) is then exposed as a regular "barcode" column.
df = pd.read_csv(file, sep="\t", index_col=0).T.reset_index()
df = df.rename(columns={"index": "barcode"})

# Every column except the barcode is a measured gene
gene_features = list(df.columns)
gene_features.remove("barcode")

# Keep only the part of the barcode that precedes the first "-"
df = df.assign(sequence=df.barcode.str.split("-").str[0])

print(df.shape)
df.head()

(23537, 1002)


gene,barcode,HES4,TNFRSF18,TNFRSF4,RP5-892K4.1,SMIM1,ESPN,TNFRSF9,SRM,C1orf64,...,LINC01426,AP000692.10,PSMG1,TFF3,FRGCA,AP001055.6,ITGB2,ITGB2-AS1,MT-ND6,sequence
0,AAACCTGAGAGTAATC-1,-0.824355,-0.05538,-0.043239,-0.0417,0.799906,-0.063787,-0.050248,-0.167651,-0.011615,...,-0.056596,-0.17316,0.597234,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,-0.058751,AAACCTGAGAGTAATC
1,AAACCTGAGGGATCTG-1,-0.824355,-0.05538,-0.043239,-0.0417,-0.662634,-0.063787,-0.050248,-0.50047,-0.011615,...,-0.056596,-0.17316,-0.951147,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,-0.775935,AAACCTGAGGGATCTG
2,AAACCTGAGGTCATCT-1,0.264442,-0.05538,-0.043239,-0.0417,-0.662634,-0.063787,-0.050248,1.198212,-0.011615,...,-0.056596,-0.17316,0.908946,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,0.621333,AAACCTGAGGTCATCT
3,AAACCTGCAATGGAGC-1,-0.824355,-0.05538,-0.043239,-0.0417,-0.662634,-0.063787,-0.050248,0.033259,-0.011615,...,-0.056596,-0.17316,-0.300633,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,0.828318,AAACCTGCAATGGAGC
4,AAACCTGCACCAGGCT-1,-0.824355,-0.05538,-0.043239,-0.0417,-0.662634,-0.063787,-0.050248,-0.436949,-0.011615,...,-0.056596,-0.17316,-1.241414,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,0.56361,AAACCTGCACCAGGCT


In [3]:
# Read the guide activity table; its unnamed first column is exposed as "id",
# and a "<gene>_<id>" perturbation key is added to match with other IDs.
file = pathlib.Path("supplementary/Table_S16_perturb-seq_screen_phenotypes.txt")
activity_df = (
    pd.read_csv(file, sep="\t")
    .rename(columns={"Unnamed: 0": "id"})
    .assign(perturbation=lambda table: table.gene + "_" + table.id)
)

print(activity_df.shape)
activity_df.head()

(128, 8)


Unnamed: 0,id,sequence,gene,gamma_day5,gamma_day10,relative_activity_day5,relative_activity_day10,perturbation
0,ALDOA_+_30077139.23-P1P2_00,GGTCACCAGGACCCCTTCTG,ALDOA,-0.412746,-0.366469,1.0,1.0,ALDOA_ALDOA_+_30077139.23-P1P2_00
1,ALDOA_+_30077139.23-P1P2_06,GGTCACCAGGATCCCTTCTG,ALDOA,-0.396687,-0.348503,0.961091,0.950977,ALDOA_ALDOA_+_30077139.23-P1P2_06
2,ALDOA_+_30077139.23-P1P2_07,GGTCACCAGGCCCCCTTCTG,ALDOA,-0.360892,-0.335059,0.874369,0.914291,ALDOA_ALDOA_+_30077139.23-P1P2_07
3,ALDOA_+_30077139.23-P1P2_13,GGTCACCAGGACCCCTTTTG,ALDOA,0.017063,-0.00022,-0.04134,0.000601,ALDOA_ALDOA_+_30077139.23-P1P2_13
4,ALDOA_+_30077139.23-P1P2_14,GGTCACCAGGACCGCTTCTG,ALDOA,-0.175243,-0.156611,0.424579,0.427353,ALDOA_ALDOA_+_30077139.23-P1P2_14


In [4]:
# Read the table that assigns a guide identity to each cell barcode
cell_id_file = perturbseq_data_dir / f"{gse_id}_cell_identities.csv.gz"
cell_id_df = pd.read_csv(cell_id_file)

print(cell_id_df.shape)
cell_id_df.head()

(23608, 8)


Unnamed: 0,cell_barcode,guide_identity,read_count,UMI_count,coverage,gemgroup,good_coverage,number_of_cells
0,GGACAAGTCCCTGACT-3,neg_ctrl_non-targeting_00028,7452,457,16.306346,3,True,1
1,CGACTTCAGAAGGCCT-3,GNB2L1_GNB2L1_+_180670873.23-P1P2_13,6554,361,18.155125,3,True,1
2,TTAGGCAAGAAGGCCT-2,TUBB_TUBB_+_30688126.23-P1_00,4177,165,25.315152,2,True,2
3,CGTAGGCAGCCAGGAT-1,TUBB_TUBB_+_30688126.23-P1_01,4024,218,18.458716,1,True,1
4,GCGCAACTCACGATGT-2,HSPE1_HSPE1_+_198365089.23-P1P2_00,3923,134,29.276119,2,True,1


In [5]:
# Attach the cell identity metadata to the expression profiles.
# A right join keeps every row of the expression table, including barcodes
# that have no entry in the identity table.
sc_df = pd.merge(
    cell_id_df,
    df,
    how="right",
    left_on="cell_barcode",
    right_on="barcode",
)

print(sc_df.shape)
sc_df.head()

(23537, 1010)


Unnamed: 0,cell_barcode,guide_identity,read_count,UMI_count,coverage,gemgroup,good_coverage,number_of_cells,barcode,HES4,...,LINC01426,AP000692.10,PSMG1,TFF3,FRGCA,AP001055.6,ITGB2,ITGB2-AS1,MT-ND6,sequence
0,AAACCTGAGAGTAATC-1,RAN_RAN_+_131356438.23-P1P2_12,544.0,34.0,16.0,1.0,True,1.0,AAACCTGAGAGTAATC-1,-0.824355,...,-0.056596,-0.17316,0.597234,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,-0.058751,AAACCTGAGAGTAATC
1,AAACCTGAGGGATCTG-1,neg_ctrl_non-targeting_00089,267.0,19.0,14.052632,1.0,True,1.0,AAACCTGAGGGATCTG-1,-0.824355,...,-0.056596,-0.17316,-0.951147,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,-0.775935,AAACCTGAGGGATCTG
2,AAACCTGAGGTCATCT-1,POLR2H_POLR2H_+_184081251.23-P1P2_08,622.0,34.0,18.294118,1.0,True,1.0,AAACCTGAGGTCATCT-1,0.264442,...,-0.056596,-0.17316,0.908946,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,0.621333,AAACCTGAGGTCATCT
3,AAACCTGCAATGGAGC-1,TUBB_TUBB_+_30688126.23-P1_03,433.0,20.0,21.65,1.0,True,1.0,AAACCTGCAATGGAGC-1,-0.824355,...,-0.056596,-0.17316,-0.300633,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,0.828318,AAACCTGCAATGGAGC
4,AAACCTGCACCAGGCT-1,CDC23_CDC23_-_137548987.23-P1P2_04,136.0,8.0,17.0,1.0,True,1.0,AAACCTGCACCAGGCT-1,-0.824355,...,-0.056596,-0.17316,-1.241414,-0.010501,-0.024549,-0.015099,-0.187405,-0.113514,0.56361,AAACCTGCACCAGGCT


In [6]:
# Perform single cell aggregation into bulk (median of every gene per guide)
bulk_df = aggregate(
    population_df=sc_df,
    strata=["guide_identity"],
    features=gene_features,
    operation="median"
)

# Some genes have very small variance still, remove these!
# numeric_only=True keeps the string column guide_identity out of the variance
# computation; without it pandas >= 2.0 raises a TypeError here.
min_gene_variance = 0.001
gene_variance = bulk_df.var(numeric_only=True)
genes_to_retain = gene_variance[gene_variance > min_gene_variance].index.tolist()

bulk_df = bulk_df.loc[:, ["guide_identity"] + genes_to_retain]

# create a column for the gene (the text before the first "_" of the guide
# name) and drop the profiles whose gene label is "*"
bulk_df = (
    bulk_df
    .assign(gene_identity=[x.split("_")[0] for x in bulk_df.guide_identity])
    .query("gene_identity != '*'")
)

print(bulk_df.shape)
bulk_df.head()

(138, 420)


Unnamed: 0,guide_identity,HES4,SMIM1,SRM,ID3,RHCE,SH3BGRL3,CD52,NUDC,ATPIF1,...,RPL3,ATF4,GTSE1,CRELD2,RP11-717F1.1,SAMSN1,PSMG1,ITGB2,MT-ND6,gene_identity
1,ALDOA_ALDOA_+_30077139.23-P1P2_00,-0.824355,-0.662634,-0.134945,-0.360075,-0.418661,-0.115687,-0.287396,0.045283,-0.004475,...,0.453441,-0.227842,-0.333797,-0.122447,0.254342,-0.094221,-0.063043,-0.187405,-0.038185,ALDOA
2,ALDOA_ALDOA_+_30077139.23-P1P2_06,-0.008632,-0.007438,-0.081923,-0.360075,-0.418661,-0.141087,-0.287396,0.092071,-0.049865,...,0.545697,-0.292911,0.099303,-0.08438,0.059196,-0.227093,0.12656,-0.187405,0.131256,ALDOA
3,ALDOA_ALDOA_+_30077139.23-P1P2_07,-0.824355,0.417267,0.004172,-0.360075,-0.418661,-0.216798,-0.287396,0.243446,-0.059595,...,0.479342,0.04406,-0.014387,-0.131299,0.184272,-0.025816,-0.091167,-0.187405,0.104615,ALDOA
4,ALDOA_ALDOA_+_30077139.23-P1P2_13,0.070173,-0.662634,0.172592,-0.360075,-0.418661,-0.264907,-0.287396,-0.010452,0.003083,...,0.367993,-0.058491,-0.079764,-0.066345,-0.029146,-0.227368,-0.028629,-0.187405,-0.032706,ALDOA
5,ALDOA_ALDOA_+_30077139.23-P1P2_14,-0.047386,-0.662634,0.014727,-0.360075,-0.418661,-0.190056,-0.287396,-0.09365,0.146223,...,0.424871,0.022834,-0.057425,-0.007199,0.153753,-0.329577,0.006557,-0.187405,0.270352,ALDOA


## Calculate Grit

In [7]:
# Column roles for the grit calculation: each guide is a replicate,
# and guides are grouped by the gene they are named after.
barcode_col = "guide_identity"
gene_col = "gene_identity"

# Guides whose name contains "neg_ctrl" serve as the control perturbations
neg_controls = [guide for guide in bulk_df[barcode_col] if "neg_ctrl" in guide]

replicate_group_grit = dict(replicate_id=barcode_col, group_id=gene_col)

result = evaluate(
    profiles=bulk_df,
    features=genes_to_retain,
    meta_features=[barcode_col, gene_col],
    replicate_groups=replicate_group_grit,
    operation="grit",
    grit_control_perts=neg_controls,
)

# Drop rows with missing values and rank from highest to lowest grit
result = result.dropna()
result = result.sort_values(by="grit", ascending=False)
result = result.reset_index(drop=True)

print(result.shape)
result.head(3)

(138, 3)


Unnamed: 0,perturbation,group,grit
0,HSPA5_HSPA5_+_128003624.23-P1P2_01,HSPA5,27.978637
1,GATA1_GATA1_-_48645022.23-P1P2_00,GATA1,27.381778
2,HSPA5_HSPA5_+_128003624.23-P1P2_04,HSPA5,27.111261


In [8]:
# Merge with activity results and output file
output_results_file = pathlib.Path(f"results/{gse_id}_grit.tsv")

# Create the output directory when it is missing so that to_csv cannot fail
# with a FileNotFoundError on a fresh checkout.
output_results_file.parent.mkdir(parents=True, exist_ok=True)

# Inner join on the shared "perturbation" key: guides without an activity
# entry are dropped (in the recorded run the table goes from 138 to 128 rows).
result = result.merge(activity_df, on="perturbation")

result.to_csv(output_results_file, sep="\t", index=False)

print(result.shape)
result.head(3)

(128, 10)


Unnamed: 0,perturbation,group,grit,id,sequence,gene,gamma_day5,gamma_day10,relative_activity_day5,relative_activity_day10
0,HSPA5_HSPA5_+_128003624.23-P1P2_01,HSPA5,27.978637,HSPA5_+_128003624.23-P1P2_01,GAACCGAGTAGGCGACGGTG,HSPA5,-0.637327,-0.374808,0.852461,0.877397
1,GATA1_GATA1_-_48645022.23-P1P2_00,GATA1,27.381778,GATA1_-_48645022.23-P1P2_00,GTGAGCTTGCCACATCCCCA,GATA1,-0.962732,-0.615306,1.0,1.0
2,HSPA5_HSPA5_+_128003624.23-P1P2_04,HSPA5,27.111261,HSPA5_+_128003624.23-P1P2_04,GAGCCGAGAAGGCGACGGTG,HSPA5,-0.754402,-0.422481,1.009055,0.988996


## Single Cell Grit

In [9]:
# Prepare single cell data frame for grit calculation
# gene_identity is the text before the first "_" of the guide name; str() turns
# a missing guide into the label "nan".
sc_df = sc_df.assign(gene_identity=[str(x).split("_")[0] for x in sc_df.guide_identity])

# Expose the row index as a per-cell identifier. The guard keeps the cell safe
# to re-run: a second reset_index would add a duplicate cell_identity column.
if "cell_identity" not in sc_df.columns:
    sc_df = sc_df.reset_index().rename({"index": "cell_identity"}, axis="columns")

# Subsample 20% of the negative control cells. The seed is fixed so that the
# control set, and every grit value computed against it, is reproducible.
neg_control_seed = 0
neg_controls_df = (
    sc_df
    .query("guide_identity in @neg_controls")
    .sample(frac=0.2, random_state=neg_control_seed)
)

# Identifiers of the sampled control cells
sc_neg_controls = neg_controls_df.cell_identity.tolist()

replicate_group_grit = {'replicate_id': 'cell_identity', 'group_id': 'guide_identity'}

In [10]:
# Per-gene UMAP embeddings and per-guide single cell grit, collected as lists
# of data frames that are concatenated once the loop has finished.
all_sc_grit_results = []
all_sc_umap_embeddings = []

# Fixed seed so that the UMAP coordinates are reproducible between runs.
# NOTE(review): umap-learn may run single-threaded when a seed is set - confirm
# the runtime is acceptable.
umap_seed = 0

# gene_identity labels that are not analyzed as a target gene
skipped_gene_labels = ["neg", "*", "nan"]

# The control cell ids are passed to evaluate() as strings; convert them once
# instead of once per guide.
sc_neg_control_ids = [str(x) for x in sc_neg_controls]

genes = sc_df.gene_identity.unique()
for gene in genes:
    if gene in skipped_gene_labels:
        continue

    print(f"Now analyzing {gene}...")
    gene_sc_df = sc_df.query("gene_identity == @gene")

    # Guides targeting this gene (taken before the control cells are appended)
    guides = gene_sc_df.guide_identity.unique()

    subset_sc_df = pd.concat([gene_sc_df, neg_controls_df]).reset_index(drop=True)

    # Apply UMAP to single cell profiles
    embedding = umap.UMAP(random_state=umap_seed).fit_transform(
        subset_sc_df.loc[:, genes_to_retain]
    )

    # Combine results with single cell dataframe (metadata columns only)
    embedding_df = pd.concat(
        [
            subset_sc_df.drop(gene_features, axis="columns").reset_index(drop=True),
            pd.DataFrame(embedding, columns=["umap_0", "umap_1"])
        ],
        axis="columns"
    )

    # Append to list; cell_identity is stored as a string so that it can later
    # be matched against the "perturbation" column of the grit results
    embedding_df.cell_identity = embedding_df.cell_identity.astype(str)
    all_sc_umap_embeddings.append(embedding_df.assign(grit_gene=gene))

    # Now calculate sc-Grit per guide
    for guide in guides:
        subset_guide_df = pd.concat(
            [
                subset_sc_df.query("guide_identity == @guide"),
                neg_controls_df
            ]
        ).reset_index(drop=True)

        # Calculate Grit
        sc_grit_result = evaluate(
            profiles=subset_guide_df,
            features=genes_to_retain,
            meta_features=["guide_identity", "cell_identity"],
            replicate_groups=replicate_group_grit,
            operation="grit",
            grit_control_perts=sc_neg_control_ids
        )

        all_sc_grit_results.append(
            sc_grit_result.assign(grit_gene=gene, grit_guide=guide)
        )

Now analyzing RAN...
Now analyzing POLR2H...
Now analyzing TUBB...
Now analyzing CDC23...
Now analyzing POLR1D...
Now analyzing DUT...
Now analyzing HSPA5...
Now analyzing MTOR...
Now analyzing GATA1...
Now analyzing GINS1...
Now analyzing HSPE1...
Now analyzing RPS14...
Now analyzing EIF2S1...
Now analyzing DBR1...
Now analyzing CAD...
Now analyzing SEC61A1...
Now analyzing RPL9...
Now analyzing HSPA9...
Now analyzing RPS18...
Now analyzing ALDOA...
Now analyzing RPS15...
Now analyzing ATP5E...
Now analyzing COX11...
Now analyzing BCR...
Now analyzing GNB2L1...


In [11]:
# Stack the per-guide grit tables into one frame with a fresh 0..n-1 index
all_sc_grit_results = pd.concat(all_sc_grit_results, ignore_index=True)

print(all_sc_grit_results.shape)
all_sc_grit_results.head()

(83105, 5)


Unnamed: 0,perturbation,group,grit,grit_gene,grit_guide
0,0,RAN_RAN_+_131356438.23-P1P2_12,-0.14267,RAN,RAN_RAN_+_131356438.23-P1P2_12
1,1,neg_ctrl_non-targeting_00089,-0.041876,RAN,RAN_RAN_+_131356438.23-P1P2_12
2,10030,RAN_RAN_+_131356438.23-P1P2_12,0.052259,RAN,RAN_RAN_+_131356438.23-P1P2_12
3,10046,neg_ctrl_non-targeting_00054,0.107413,RAN,RAN_RAN_+_131356438.23-P1P2_12
4,10051,neg_ctrl_non-targeting_00406,0.157619,RAN,RAN_RAN_+_131356438.23-P1P2_12


In [12]:
# Stack the per-gene UMAP tables into one frame with a fresh 0..n-1 index
all_sc_umap_embeddings = pd.concat(all_sc_umap_embeddings, ignore_index=True)

print(all_sc_umap_embeddings.shape)
all_sc_umap_embeddings.head()

(32944, 15)


Unnamed: 0,cell_identity,cell_barcode,guide_identity,read_count,UMI_count,coverage,gemgroup,good_coverage,number_of_cells,barcode,sequence,gene_identity,umap_0,umap_1,grit_gene
0,0,AAACCTGAGAGTAATC-1,RAN_RAN_+_131356438.23-P1P2_12,544.0,34.0,16.0,1.0,True,1.0,AAACCTGAGAGTAATC-1,AAACCTGAGAGTAATC,RAN,3.926448,9.162159,RAN
1,165,AACGTTGAGAGTAATC-1,RAN_RAN_+_131356438.23-P1P2_00,625.0,36.0,17.361111,1.0,True,2.0,AACGTTGAGAGTAATC-1,AACGTTGAGAGTAATC,RAN,-0.330427,8.224433,RAN
2,264,AACTTTCTCTAAGCCA-1,RAN_RAN_+_131356438.23-P1P2_04,334.0,19.0,17.578947,1.0,True,1.0,AACTTTCTCTAAGCCA-1,AACTTTCTCTAAGCCA,RAN,1.142994,8.285683,RAN
3,311,AAGGAGCCATGCGCAC-1,RAN_RAN_+_131356438.23-P1P2_02,405.0,22.0,18.409091,1.0,True,1.0,AAGGAGCCATGCGCAC-1,AAGGAGCCATGCGCAC,RAN,3.871725,8.64783,RAN
4,314,AAGGAGCTCCTGTAGA-1,RAN_RAN_+_131356438.23-P1P2_04,926.0,41.0,22.585366,1.0,True,1.0,AAGGAGCTCCTGTAGA-1,AAGGAGCTCCTGTAGA,RAN,1.223484,10.063598,RAN


In [13]:
# Step 1: attach the UMAP coordinates and cell metadata to every grit score.
# The right join keeps one row per grit result.
sc_grit_umap_df = pd.merge(
    all_sc_umap_embeddings,
    all_sc_grit_results,
    how="right",
    left_on=["cell_identity", "grit_gene"],
    right_on=["perturbation", "grit_gene"],
)

# Step 2: add the guide activity measurements. Overlapping column names coming
# from the activity table receive the "_activity" suffix.
embedding_df = pd.merge(
    sc_grit_umap_df,
    activity_df,
    how="outer",
    left_on=["guide_identity", "gene_identity"],
    right_on=["perturbation", "gene"],
    suffixes=["", "_activity"],
)

print(embedding_df.shape)
embedding_df.head()

(83105, 27)


Unnamed: 0,cell_identity,cell_barcode,guide_identity,read_count,UMI_count,coverage,gemgroup,good_coverage,number_of_cells,barcode,...,grit,grit_guide,id,sequence_activity,gene,gamma_day5,gamma_day10,relative_activity_day5,relative_activity_day10,perturbation_activity
0,0,AAACCTGAGAGTAATC-1,RAN_RAN_+_131356438.23-P1P2_12,544.0,34.0,16.0,1.0,True,1.0,AAACCTGAGAGTAATC-1,...,-0.14267,RAN_RAN_+_131356438.23-P1P2_12,RAN_+_131356438.23-P1P2_12,GGCGGTCGCTGCGCTTAGGT,RAN,-0.046549,-0.034259,0.235823,0.212155,RAN_RAN_+_131356438.23-P1P2_12
1,10030,ATTCTACCATGCAACT-2,RAN_RAN_+_131356438.23-P1P2_12,643.0,22.0,29.227273,2.0,True,1.0,ATTCTACCATGCAACT-2,...,0.052259,RAN_RAN_+_131356438.23-P1P2_12,RAN_+_131356438.23-P1P2_12,GGCGGTCGCTGCGCTTAGGT,RAN,-0.046549,-0.034259,0.235823,0.212155,RAN_RAN_+_131356438.23-P1P2_12
2,10094,ATTTCTGCAGGTCGTC-2,RAN_RAN_+_131356438.23-P1P2_12,694.0,37.0,18.756757,2.0,True,1.0,ATTTCTGCAGGTCGTC-2,...,0.119537,RAN_RAN_+_131356438.23-P1P2_12,RAN_+_131356438.23-P1P2_12,GGCGGTCGCTGCGCTTAGGT,RAN,-0.046549,-0.034259,0.235823,0.212155,RAN_RAN_+_131356438.23-P1P2_12
3,10164,CAAGATCAGGTTACCT-2,RAN_RAN_+_131356438.23-P1P2_12,317.0,12.0,26.416667,2.0,True,2.0,CAAGATCAGGTTACCT-2,...,0.057124,RAN_RAN_+_131356438.23-P1P2_12,RAN_+_131356438.23-P1P2_12,GGCGGTCGCTGCGCTTAGGT,RAN,-0.046549,-0.034259,0.235823,0.212155,RAN_RAN_+_131356438.23-P1P2_12
4,10280,CACAGGCCAAGGTGTG-2,RAN_RAN_+_131356438.23-P1P2_12,818.0,29.0,28.206897,2.0,True,1.0,CACAGGCCAAGGTGTG-2,...,-0.057149,RAN_RAN_+_131356438.23-P1P2_12,RAN_+_131356438.23-P1P2_12,GGCGGTCGCTGCGCTTAGGT,RAN,-0.046549,-0.034259,0.235823,0.212155,RAN_RAN_+_131356438.23-P1P2_12


In [14]:
# Output file
output_results_file = pathlib.Path(f"results/{gse_id}_single_cell_grit.tsv.gz")

# Create the output directory when it is missing so that the write cannot fail
# with a FileNotFoundError.
output_results_file.parent.mkdir(parents=True, exist_ok=True)

embedding_df.to_csv(output_results_file, sep="\t", compression="gzip", index=False)