# Build Data Notebook

This notebook performs the following steps:

- **Process the `bam_file`** for each dataset to generate a **counts DataFrame**.
- **Construct AnnData matrices** from the counts DataFrame, representing the naive counts for \( u[k] \) across all \( k \).
- **Generate predicted counts** using:
  - The **uniform model**
  - The **non-uniform model**


## 1. Processing the Bam_file

In [1]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import sklearn
import anndata as ad 
from tqdm import tqdm as tqdm
import pysam
import scanpy as sc

In [1]:
import pysam
import pandas as pd

# 1) INPUT: Path to the filtered BAM file (choose one dataset)
# filtered_bam_path = '/data/dagyeman/cellranger/bam_file_analysis/1k_PBMCs/filtered_1k_PBMCS_bam.bam'   # BAM file for 1k dataset
# filtered_bam_path = '/data/dagyeman/cellranger/bam_file_analysis/10k_PBMCs/filtered_10k_PBMCS_bam.bam' # BAM file for 10k dataset
# filtered_bam_path = '/data/dagyeman/cellranger/bam_file_analysis/500_PBMCs/filtered_500_PBMCS_bam.bam'    # BAM file for 500 dataset
filtered_bam_path = '/data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/filtered_5k_PBMCS_bam.bam'    # BAM file for 5k dataset

# 2) data: list to hold extracted (barcode, gene, UMI) triplets
data = []

# 3) EXTRACT: Read BAM file and pull out barcode, gene, and UMI tags
with pysam.AlignmentFile(filtered_bam_path, "rb") as bam_file:
    for read in bam_file:
        if read.has_tag('CB') and read.has_tag('GN') and read.has_tag('UB'):
            barcode = read.get_tag('CB')  # string: cell barcode
            gene = read.get_tag('GN')     # string: gene name
            umi = read.get_tag('UB')      # string: unique molecular identifier
            data.append([barcode, gene, umi])

# 4) df: pandas DataFrame containing all reads with columns (barcode, gene, UMI)
df = pd.DataFrame(data, columns=['barcode', 'gene', 'UMI'])
print("First few raw rows:")
print(df.head())

# 5) CLEAN: Remove any rows where UMI contains ambiguous base 'N'
df = df[~df['UMI'].str.contains('N')]

# 6) deduplicated_df: DataFrame containing only unique (barcode, gene, UMI) combinations
#    This is the deduplicated set of observed UMIs before truncating
deduplicated_df = df.drop_duplicates(subset=['barcode', 'gene', 'UMI']).reset_index(drop=True)
print("First few deduplicated rows:")
print(deduplicated_df.head())

# 7) LOOP: For UMI lengths k = 1 to 12, compute number of unique UMIs per (barcode, gene)
for k in range(1, 13):
    deduplicated_df[f'UMI_{k}'] = deduplicated_df['UMI'].str[:k]  # first k bases
    grouped_df = (
        deduplicated_df.groupby(['barcode', 'gene'])[f'UMI_{k}']
                       .nunique()
                       .reset_index()
                       .rename(columns={f'UMI_{k}': f'unique_UMI_count_{k}'})
    )
    print(f"Unique UMI counts for UMI length {k}:")
    print(grouped_df.head())

# 8) final_df: merged table with unique UMI counts for all lengths 1–12
final_df = (
    deduplicated_df.groupby(['barcode', 'gene'])
                   .agg({f'UMI_{k}': 'nunique' for k in range(1, 13)})
                   .reset_index()
)
final_df.columns = ['barcode', 'gene'] + [f'unique_UMI_count_{k}' for k in range(1, 13)]
print("Final merged table:")
print(final_df.head())


First few raw rows:
              barcode         gene           UMI
0  GTGACGTTCATGGTTG-1  MIR1302-2HG  GCCTTACGGCTC
1  GGTATATTCGATGTCA-1  MIR1302-2HG  TACCGCCAACAG
2  ATGTCCCCAGGCATTG-1  MIR1302-2HG  GTCGTCATTCGT
3  GTCAATGTCGTCTCTG-1  MIR1302-2HG  AATATGAGTAGC
4  ATCAAGCCAACCCTAC-1  MIR1302-2HG  CGAACGCACCCT
First few deduplicated rows:
              barcode         gene           UMI
0  GTGACGTTCATGGTTG-1  MIR1302-2HG  GCCTTACGGCTC
1  GGTATATTCGATGTCA-1  MIR1302-2HG  TACCGCCAACAG
2  ATGTCCCCAGGCATTG-1  MIR1302-2HG  GTCGTCATTCGT
3  GTCAATGTCGTCTCTG-1  MIR1302-2HG  AATATGAGTAGC
4  ATCAAGCCAACCCTAC-1  MIR1302-2HG  CGAACGCACCCT
Unique UMI counts for UMI length 1:
              barcode   gene  unique_UMI_count_1
0  AAACCAAAGGTGACGA-1  AAGAB                   1
1  AAACCAAAGGTGACGA-1   AAK1                   4
2  AAACCAAAGGTGACGA-1  AAMDC                   1
3  AAACCAAAGGTGACGA-1   AAMP                   1
4  AAACCAAAGGTGACGA-1   AAR2                   1
Unique UMI counts for UMI length 

## 2. Adata Objects: Naive Method - created from final_df dataframe

In [2]:
import pandas as pd
import anndata as ad

# Dictionary to store the AnnData objects for each UMI length
adata_dict = {}

# Loop over UMI lengths from 1 to 12
for k in range(1, 13):
    # Extract the relevant columns for the current UMI length
    # Contains: barcode, gene, and the deduplicated count for UMI length k
    adata_matrix = final_df[['barcode', 'gene', f'unique_UMI_count_{k}']]

    # Pivot the DataFrame:
    # - Rows = barcodes (cells)
    # - Columns = genes
    # - Values = deduplicated UMI counts for the current UMI length
    matrix_df = adata_matrix.pivot(
        index='barcode',
        columns='gene',
        values=f'unique_UMI_count_{k}'
    ).fillna(0)

    # Create an AnnData object from the counts matrix
    adata = ad.AnnData(X=matrix_df.values)

    # Assign observation names (rows) to the barcodes
    adata.obs_names = matrix_df.index

    # Assign variable names (columns) to the gene names
    adata.var_names = matrix_df.columns

    # Store the AnnData object in the dictionary with the UMI length as the key
    adata_dict[k] = adata

# Example: View the AnnData object for UMI length 1
print(adata_dict[1])


AnnData object with n_obs × n_vars = 5709 × 31131


### Saving naive adata objects

In [3]:
dataset = "5k_PBMCs"

# file = "/data/dagyeman/cellranger/bam_file_analysis/1k_PBMCs/ub_objects/adata_matrices/" # saving path for 1k dataset
filepath = f"/data/dagyeman/cellranger/bam_file_analysis/{dataset}/ub_objects/adata_matrices/" # saving path for 10k dataset 

for k in range(1, 13):
    # Save each AnnData object to a file
    adata_dict[k].write_h5ad(f'{filepath}adata_UMI_length_{k}.h5ad')
    print(f'Saved AnnData object for UMI length {k} to adata_UMI_length_{k}.h5ad')

Saved AnnData object for UMI length 1 to adata_UMI_length_1.h5ad
Saved AnnData object for UMI length 2 to adata_UMI_length_2.h5ad
Saved AnnData object for UMI length 3 to adata_UMI_length_3.h5ad
Saved AnnData object for UMI length 4 to adata_UMI_length_4.h5ad
Saved AnnData object for UMI length 5 to adata_UMI_length_5.h5ad
Saved AnnData object for UMI length 6 to adata_UMI_length_6.h5ad
Saved AnnData object for UMI length 7 to adata_UMI_length_7.h5ad
Saved AnnData object for UMI length 8 to adata_UMI_length_8.h5ad
Saved AnnData object for UMI length 9 to adata_UMI_length_9.h5ad
Saved AnnData object for UMI length 10 to adata_UMI_length_10.h5ad
Saved AnnData object for UMI length 11 to adata_UMI_length_11.h5ad
Saved AnnData object for UMI length 12 to adata_UMI_length_12.h5ad


## 3. Adata Objects: Uniform Estimator 

#### Generating AnnData matrix for uniform estimator at length k

In [2]:
import scanpy as sc
import anndata as ad
import pandas as pd
from umi_utils import mom_estimator_unif

dataset = "5k_PBMCs"  # Change this to "1k_PBMCs" or "500_PBMCs" as needed
filepath = f"/data/dagyeman/cellranger/bam_file_analysis/{dataset}/ub_objects" #10k

# Load naive AnnData counts (UMI lengths 1–12)
adata_dict = {}
for i in range(1, 13):
    adata_dict[i] = sc.read_h5ad(f"{filepath}/adata_matrices/adata_{i}.h5ad")
    

# Build the predicted AnnData for a chosen UMI length k 
chosen_k = 6  # set this as needed

adata = adata_dict[chosen_k]                 # naive counts AnnData for k

# Dense matrix view of counts
matrix = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X
original_matrix = matrix.copy()              # keep a copy if you want to compare later

# Predict counts with your estimator
predicted_matrix = mom_estimator_unif(matrix, 4**chosen_k)
# AnnData with predicted counts (cells x genes align with the naive object)
predicted_adata = ad.AnnData(X=predicted_matrix)
predicted_adata.obs_names = adata.obs_names
predicted_adata.var_names = adata.var_names

# Example check
print(predicted_adata)

AnnData object with n_obs × n_vars = 5709 × 31131


### Saving adata matrices for unif estimator


In [3]:
import scanpy as sc
import anndata as ad
import pandas as pd
import os
from umi_utils import mom_estimator_unif

dataset = "5k_PBMCs"  # Change this to "1k_PBMCs" or "500_PBMCs" as needed
filepath = f"/data/dagyeman/cellranger/bam_file_analysis/{dataset}/ub_objects"   # 10k

# Load naive AnnData counts (UMI lengths 1–12)
adata_dict = {}
for i in range(1, 13):
    adata_dict[i] = sc.read_h5ad(f"{filepath}/adata_matrices/adata_{i}.h5ad")

# Output directory for uniform-estimator predictions
save_dir = f"{filepath}/col_aware_unif_matrices"
os.makedirs(save_dir, exist_ok=True)

# Build and save predicted AnnData for each UMI length k
for k in range(1, 13):
    adata = adata_dict[k]
    matrix = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X

    predicted_matrix = mom_estimator_unif(matrix, 4**k)

    predicted_adata = ad.AnnData(X=predicted_matrix)
    predicted_adata.obs_names = adata.obs_names
    predicted_adata.var_names = adata.var_names

    out_path = os.path.join(save_dir, f"adata_{k}.h5ad")
    predicted_adata.write_h5ad(out_path)
    print(f"Saved: {out_path}")

# Example check
print(sc.read_h5ad(os.path.join(save_dir, "adata_6.h5ad")))


Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_unif_matrices/adata_1.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_unif_matrices/adata_2.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_unif_matrices/adata_3.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_unif_matrices/adata_4.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_unif_matrices/adata_5.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_unif_matrices/adata_6.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_unif_matrices/adata_7.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_unif_matrices/adata_8.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_unif_matrices/adata_9.h5ad
Saved: /data/dagyeman/cellra

## 4. Adata Objects: Non-uniform Model

#### Generating UMI probs

In [None]:
import itertools
import pandas as pd
import numpy as np

## nucleotide probabilities
nt_probs = {'A': 0.23, 'C': 0.24, 'G': 0.21, 'T': 0.32}

def generate_umis_and_probs(max_len=12):
    umi_dfs = {}
    for length in range(1, max_len + 1):
        
        # generate all UMIs for this length
        umis = [''.join(p) for p in itertools.product(nt_probs.keys(), repeat=length)]
        
        # compute probabilities by multiplying nucleotide probs
        probs = [np.prod([nt_probs[base] for base in umi]) for umi in umis]
        # Store in a DataFrame
        df = pd.DataFrame({'UMI': umis, 'Probability': probs})
        umi_dfs[length] = df
    return umi_dfs

# Example usage
umis_dict = generate_umis_and_probs(12)

## view UMIs of length 4
print(umis_dict[4].head(10))
print("Total UMIs for length 4:", len(umis_dict[4]))


#### Generating AnnData matrix for non-uniform estimator at length k


In [4]:
import scanpy as sc
import anndata as ad
import pandas as pd
from umi_utils import generate_nonunif_estimator
dataset = "5k_PBMCs"  # Change this to "1k_PBMCs" or "500_PBMCs" as needed
# filepath = "/data/dagyeman/cellranger/bam_file_analysis/1k_PBMCs/ub_objects" #1k
filepath = f"/data/dagyeman/cellranger/bam_file_analysis/{dataset}/ub_objects" #10k

# Load naive AnnData counts (UMI lengths 1–12)
adata_dict = {}
for i in range(1, 13):
    adata_dict[i] = sc.read_h5ad(f"{filepath}/adata_matrices/adata_{i}.h5ad")

# Load non-uniform UMI probability distributions (UMI lengths 1–12)
umi_prob_dict = {}
for i in range(1, 13):
    umi_prob_dict[i] = pd.read_csv(f"/data/dagyeman/cellranger/bam_file_analysis/data/umi_probs/umi_probs_{i}.csv")

# Build the predicted AnnData for a chosen UMI length k
chosen_k = 5  # set this as needed

adata = adata_dict[chosen_k]                 # naive counts AnnData for k
umi_probs = umi_prob_dict[chosen_k]['prob']  # non-uniform probs for k

# Dense matrix view of counts
matrix = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X
original_matrix = matrix.copy()              # keep a copy if you want to compare later


# Build the estimator 
Y_max = int(min(original_matrix.max(), 4**chosen_k))
estimator = generate_nonunif_estimator(umi_probs.values, chosen_k, Y_max, verbose=True)

# Predict counts with your estimator
predicted_matrix = estimator(original_matrix)

# AnnData with predicted counts (cells x genes align with the naive object)
predicted_adata = ad.AnnData(X=predicted_matrix)
predicted_adata.obs_names = adata.obs_names
predicted_adata.var_names = adata.var_names

# Example check
print(predicted_adata)


Found n_max = 8192 where f(n_max)= 1021.2942848258443 > 1020
f(n_max/2)= 986.0597505200279 < 1020
Generating estimator lookup table for Y_max = 1020 using n_max = 8192...
Estimator generated successfully.
AnnData object with n_obs × n_vars = 5709 × 31131


### Saving collision_aware matrices

In [5]:
import scanpy as sc
import anndata as ad
import pandas as pd
import os
from umi_utils import generate_nonunif_estimator  

dataset = "5k_PBMCs"  # Change this to "1k_PBMCs" or "500_PBMCs" as needed
filepath = f"/data/dagyeman/cellranger/bam_file_analysis/{dataset}/ub_objects"   # 10k

# Load naive AnnData counts (UMI lengths 1–12)
adata_dict = {}
for i in range(1, 13):
    adata_dict[i] = sc.read_h5ad(f"{filepath}/adata_matrices/adata_{i}.h5ad")
    
print(adata_dict[1].shape)

# Load non-uniform UMI probability distributions (UMI lengths 1–12)
umi_prob_dict = {}
for i in range(1, 13):
    umi_prob_dict[i] = pd.read_csv(f"/data/dagyeman/cellranger/bam_file_analysis/data/umi_probs/umi_probs_{i}.csv")

# Output directory for non-uniform estimator predictions
save_dir = f"{filepath}/col_aware_nunif_matrices"
os.makedirs(save_dir, exist_ok=True)


for k in range(1, 13):
    adata = adata_dict[k]
    umi_probs = umi_prob_dict[k]['prob']

    # Get the counts matrix
    matrix = adata.X.toarray() if hasattr(adata.X, "toarray") else adata.X

    # Compute Y_max for this matrix
    Y_max = int(min(matrix.max(), 4**k))

    # Build the estimator for this k
    estimator = generate_nonunif_estimator(umi_probs.values, k, Y_max, verbose=False)

    # Apply it to the matrix
    predicted_matrix = estimator(matrix.astype(int))

    # Wrap back into AnnData
    predicted_adata = ad.AnnData(X=predicted_matrix)
    predicted_adata.obs_names = adata.obs_names
    predicted_adata.var_names = adata.var_names

    # Save
    out_path = os.path.join(save_dir, f"adata_{k}.h5ad")
    predicted_adata.write_h5ad(out_path)
    print(f"Saved: {out_path}")


(5709, 31131)
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_nunif_matrices/adata_1.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_nunif_matrices/adata_2.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_nunif_matrices/adata_3.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_nunif_matrices/adata_4.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_nunif_matrices/adata_5.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_nunif_matrices/adata_6.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_nunif_matrices/adata_7.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_nunif_matrices/adata_8.h5ad
Saved: /data/dagyeman/cellranger/bam_file_analysis/5k_PBMCs/ub_objects/col_aware_nunif_matrices/adata_9.h5ad
Saved