In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import seaborn as sns
import anndata
import string
import gc
from anndata import read_h5ad
from anndata import read_csv
from pandas import DataFrame
import h5py
import os

In [2]:
# --- Input locations ---------------------------------------------------------
# Sub-directory (below data/shared) that holds the full single-cell object.
SC_TRANSCRIPTOMICS_DIR = (
    "singlecell_transcriptomics/TSP1_30_Paper2_Version2d_Oct2024/full_object"
)
# File names of the metadata table and of the full .h5ad object.
METADATA = "TSP1_30_metadata_min200_2500_decontx_scvi_donorassay_version2d_20241112.csv"
FULLDATA = "TSP1_30_min200_2500_decontx_scvi_donorassay_version2d_20241126.h5ad"
# Resolved against the current working directory:
#   <cwd>/../data/shared/<SC_TRANSCRIPTOMICS_DIR>
DATA_DIR = os.path.join(
    os.getcwd(),
    "..",
    "data/shared",
    SC_TRANSCRIPTOMICS_DIR,
)

# --- Switches ----------------------------------------------------------------
# When True, plate data is not used.
TenXOnly = True

## LOAD DATA

In [3]:
# Build the location of the full object from the configured directory and
# file name, then read it from disk.
full_object_path = os.path.join(DATA_DIR, FULLDATA)
adata = read_h5ad(full_object_path)
adata

AnnData object with n_obs × n_vars = 1136218 × 61806
    obs: 'donor', 'tissue', 'anatomical_position', 'method', 'cdna_plate', 'library_plate', 'notes', 'cdna_well', 'old_index', 'assay', 'sample_id', 'replicate', '10X_run', '10X_barcode', 'ambient_removal', 'donor_method', 'donor_assay', 'donor_tissue', 'donor_tissue_assay', 'cell_ontology_class', 'cell_ontology_id', 'compartment', 'broad_cell_class', 'free_annotation', 'manually_annotated', 'published_2022', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ercc', 'pct_counts_ercc', '_scvi_batch', '_scvi_labels', 'scvi_leiden_donorassay_full', 'age', 'sex', 'ethnicity', 'sample_number'
    var: 'ensembl_id', 'gene_symbol', 'genome', 'mt', 'ercc', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std'
    uns: '_scvi_manager_uuid', '_scvi_uuid', '_training_mode', 'age_colors', 'assay_colors', 'compartment_colors', 'donor_colors', 'leiden', 'log1p', 'method_color

In [4]:
# Optionally restrict the object to observations whose `method` is '10X'.
# The subset is taken by boolean indexing (no explicit copy is made here).
if TenXOnly:
    is_10x = adata.obs['method'] == '10X'
    adata = adata[is_10x]

# Show the 'raw_counts' layer of the (possibly filtered) object.
adata.layers['raw_counts']

<1093048x61806 sparse matrix of type '<class 'numpy.int32'>'
	with 3542059822 stored elements in Compressed Sparse Row format>

In [5]:
def mapTFsbyEnsemblIDlist(TF_data, var_table=None):
    """Return the genes of a var table whose Ensembl ID is in a TF list.

    Parameters
    ----------
    TF_data : pandas.DataFrame
        Table with a ``TF_names`` column. Its values are compared against
        the version-stripped Ensembl IDs, i.e. the text before the first
        ``'.'`` of each ``ensembl_id``.
    var_table : pandas.DataFrame, optional
        Gene table providing ``ensembl_id`` and ``gene_symbol`` columns.
        Defaults to ``adata.var`` of the notebook-level ``adata`` object,
        which is what the function always used before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Columns ``ensembl_id`` (version suffix removed) and ``gene_symbol``,
        restricted to rows whose ``ensembl_id`` is in ``TF_data.TF_names``
        and sorted by ``ensembl_id``. The index labels of ``var_table`` are
        kept.
    """
    if var_table is None:
        # Fall back to the global AnnData object loaded earlier in the notebook.
        var_table = adata.var

    adata_table = pd.DataFrame()
    # Drop the '.<version>' suffix so IDs can be matched against the TF list.
    adata_table['ensembl_id'] = var_table['ensembl_id'].str.split('.', expand=True)[0]
    adata_table['gene_symbol'] = var_table['gene_symbol']

    # Slice to just the TFs.
    TF_adata_table = adata_table[adata_table.ensembl_id.isin(TF_data.TF_names)]

    # sort_values returns a new frame; the original discarded that result, so
    # the sort never took effect. The previous bare reset_index() call was a
    # no-op for the same reason and is dropped, so the returned frame keeps
    # the index it has always had.
    TF_adata_table = TF_adata_table.sort_values('ensembl_id', kind='stable')

    return TF_adata_table

In [6]:
# Read the transcription-factor reference list and map it onto the gene table
# of `adata` in one step, so `tf_df` only ever holds the mapped result.
tf_df = mapTFsbyEnsemblIDlist(
    pd.read_csv('../data/HumanTranscriptionFactorsEnsembl.csv')
)
tf_df

Unnamed: 0_level_0,ensembl_id,gene_symbol
index,Unnamed: 1_level_1,Unnamed: 2_level_1
NFYA,ENSG00000001167,NFYA
ARX,ENSG00000004848,ARX
HOXA11,ENSG00000005073,HOXA11
MEOX1,ENSG00000005102,MEOX1
SOX8,ENSG00000005513,SOX8
...,...,...
PCGF2,ENSG00000277258,PCGF2
ZNF670,ENSG00000277462,ZNF670
ZNF8,ENSG00000278129,ZNF8
ZNF229,ENSG00000278318,ZNF229


In [7]:
# Work on the `ensembl_id` column as a standalone Series; `adata.var` itself
# is only read, never assigned to.
ensembl_ids_with_version = adata.var['ensembl_id']

# Keep the part before the first '.', i.e. the ID without its version suffix.
base_ensembl_ids = ensembl_ids_with_version.str.split('.').str.get(0)

# Reference set of (version-less) Ensembl IDs taken from the TF table.
ensembl_ids_to_keep = set(tf_df.ensembl_id)

# True for every variable whose base ID is in the reference set.
mask = base_ensembl_ids.isin(ensembl_ids_to_keep)

# Column-subset the object and materialise it as an independent copy.
adata_tf = adata[:, mask].copy()

# Persist the subset next to the other notebook data.
adata_tf.write("../data/adata_tf.h5ad")

In [8]:
# Display the summary of the subset object created in the previous cell.
adata_tf

AnnData object with n_obs × n_vars = 1093048 × 1637
    obs: 'donor', 'tissue', 'anatomical_position', 'method', 'cdna_plate', 'library_plate', 'notes', 'cdna_well', 'old_index', 'assay', 'sample_id', 'replicate', '10X_run', '10X_barcode', 'ambient_removal', 'donor_method', 'donor_assay', 'donor_tissue', 'donor_tissue_assay', 'cell_ontology_class', 'cell_ontology_id', 'compartment', 'broad_cell_class', 'free_annotation', 'manually_annotated', 'published_2022', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ercc', 'pct_counts_ercc', '_scvi_batch', '_scvi_labels', 'scvi_leiden_donorassay_full', 'age', 'sex', 'ethnicity', 'sample_number'
    var: 'ensembl_id', 'gene_symbol', 'genome', 'mt', 'ercc', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'mean', 'std'
    uns: '_scvi_manager_uuid', '_scvi_uuid', '_training_mode', 'age_colors', 'assay_colors', 'compartment_colors', 'donor_colors', 'leiden', 'log1p', 'method_colors