In [None]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import matplotlib.pyplot as plt

# Load your two AnnData objects
adata1 = sc.read_10x_h5("cellrangerX2024/SSTX-HEK293T-2XDASH_trimmed/outs/filtered_feature_bc_matrix.h5")
# Load the second matrix (unfiltered smallRNA matrix)
adata2 = sc.read_10x_h5("cellrangerX2025-smallRNA/SSTX-HEK293T-2XDASH_trimmed/outs/raw_feature_bc_matrix.h5")
# Filter adata2 to retain only cell barcodes present in adata1
common_barcodes = adata1.obs.index.intersection(adata2.obs.index)
adata2 = adata2[common_barcodes, :]

# Identify genes in adata2
genes_to_remove = set(adata2.var_names)

# Filter adata1 to exclude genes present in adata2
adata1_filtered = adata1[:, ~adata1.var_names.isin(genes_to_remove)].copy()

# Merge datasets based on common gene_ids
adata = anndata.concat([adata1_filtered, adata2], axis=1, join='inner')

# Check the result
print(adata.shape)  # Should be (5, 1) if only 'gene2' is common
print(adata.var['gene_ids'])  # Should print ['gene2']


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


(27961, 46173)
DDX11L2            ENSG00000290825
MIR1302-2HG        ENSG00000243485
FAM138A            ENSG00000237613
ENSG00000290826    ENSG00000290826
OR4F5              ENSG00000186092
                        ...       
U1                 ENSG00000275987
U6                 ENSG00000277890
U6                 ENSG00000277927
U6                 ENSG00000278625
U1                 ENSG00000277374
Name: gene_ids, Length: 46173, dtype: object


  utils.warn_names_duplicates("var")


In [2]:
# Remove 'GRCh38_' prefix from the index and create a new column GeneName
adata.var['GeneName'] = adata.var.index
adata.var

Unnamed: 0,gene_ids,feature_types,genome,GeneName
DDX11L2,ENSG00000290825,Gene Expression,GRCh38,DDX11L2
MIR1302-2HG,ENSG00000243485,Gene Expression,GRCh38,MIR1302-2HG
FAM138A,ENSG00000237613,Gene Expression,GRCh38,FAM138A
ENSG00000290826,ENSG00000290826,Gene Expression,GRCh38,ENSG00000290826
OR4F5,ENSG00000186092,Gene Expression,GRCh38,OR4F5
...,...,...,...,...
U1,ENSG00000275987,Gene Expression,GRCh38-smallRNA,U1
U6,ENSG00000277890,Gene Expression,GRCh38-smallRNA,U6
U6,ENSG00000277927,Gene Expression,GRCh38-smallRNA,U6
U6,ENSG00000278625,Gene Expression,GRCh38-smallRNA,U6


In [3]:
# Specify the path to the GTF file
gtf_file = "/oak/stanford/projects/kibr/Alina/Analysis/F1.method_comparison/filtered_gene_id_biotypes.csv"

# Read the CSV file into a DataFrame
gtf_data = pd.read_csv(gtf_file)

gtf_data = gtf_data.drop_duplicates(subset='gene_id', keep='first')

# Function to append biotype to adata
def append_biotype(adata, gtf_data):
    # Ensure compatibility of gene_ids column
    adata.var = adata.var.copy()  # Avoid potential warnings about chained assignments
    adata.var["gene_ids"] = adata.var["gene_ids"].astype(str)
    gtf_data["gene_id"] = gtf_data["gene_id"].astype(str)
    
    # Merge on gene_ids and gene_id
    merged_var = adata.var.merge(
        gtf_data[["gene_id", "biotype"]],  # Only include relevant columns
        left_on="gene_ids",                # Match adata.var['gene_ids'] with gtf_data['gene_id']
        right_on="gene_id",
        how="left"
    )
    
    # Add biotype back to adata.var
    adata.var = merged_var
    
    # Fill missing biotype entries with "unknown"
    adata.var["biotype"] = adata.var["biotype"].fillna("unknown")
    
    return adata


# Apply the function to each AnnData object in the list
adata = append_biotype(adata, gtf_data)

adata.var

Unnamed: 0,gene_ids,feature_types,genome,GeneName,gene_id,biotype
0,ENSG00000290825,Gene Expression,GRCh38,DDX11L2,ENSG00000290825,lncRNA
1,ENSG00000243485,Gene Expression,GRCh38,MIR1302-2HG,ENSG00000243485,lncRNA
2,ENSG00000237613,Gene Expression,GRCh38,FAM138A,ENSG00000237613,lncRNA
3,ENSG00000290826,Gene Expression,GRCh38,ENSG00000290826,ENSG00000290826,lncRNA
4,ENSG00000186092,Gene Expression,GRCh38,OR4F5,ENSG00000186092,protein_coding
...,...,...,...,...,...,...
46168,ENSG00000275987,Gene Expression,GRCh38-smallRNA,U1,ENSG00000275987,snRNA
46169,ENSG00000277890,Gene Expression,GRCh38-smallRNA,U6,ENSG00000277890,snRNA
46170,ENSG00000277927,Gene Expression,GRCh38-smallRNA,U6,ENSG00000277927,snRNA
46171,ENSG00000278625,Gene Expression,GRCh38-smallRNA,U6,ENSG00000278625,snRNA


In [4]:
#adata.var.index = adata.var["GeneName"]

In [5]:
adata.write("SSTX_counts_adata_2XDASH.h5ad")