## Introduction

Single-cell technologies have revolutionized biology by allowing us to study the transcriptome, epigenome, or proteome of individual cells, rather than averaging over bulk populations. However, single-cell datasets are inherently high-dimensional, sparse, and noisy. Each cell is represented by thousands of features (genes), but many of these features are zero in any given cell due to dropout events. Machine learning (ML) offers powerful tools to handle these challenges: dimensionality reduction, clustering, and predictive modeling can reveal hidden patterns, trajectories, and gene programs in single-cell data.

In this tutorial, we will explore the workflow of analyzing single-cell transcriptomic data using ML.

The data is from https://www.10xgenomics.com/datasets/pbmc-from-a-healthy-donor-no-cell-sorting-3-k-1-standard-2-0-0, containing the PBMC 3k of ATAC and expression.

In [None]:
import scanpy as sc
import pandas as pd


h5_file = "data_singlecell/pbmc_unsorted_3k_filtered_feature_bc_matrix.h5"

adata = sc.read_10x_h5(h5_file)

adata.var_names_make_unique()

print(adata)


AnnData object with n_obs × n_vars = 3009 × 36601
    var: 'gene_ids', 'feature_types', 'genome', 'interval'


  utils.warn_names_duplicates("var")
  utils.warn_names_duplicates("var")


In [13]:
print(adata.X.shape) 

(3009, 36601)


In [34]:
import pandas as pd
import pyranges as pr

# Read fragments
fragments = pd.read_csv(
    "data_singlecell/pbmc_unsorted_3k_atac_fragments.tsv.gz.tsv",
    sep='\t',
    header=None,
    names=['chrom','start','end','barcode','count','strand'],
    compression='gzip',
    comment='#'
)

# Filter for RNA cells
cell_barcodes = set(adata.obs_names)
fragments = fragments[fragments['barcode'].isin(cell_barcodes)]

# Rename columns for PyRanges
fragments = fragments.rename(columns={
    'chrom': 'Chromosome',
    'start': 'Start',
    'end': 'End',
    'strand': 'Strand'  # optional
})

# Create PyRanges object
fr = pr.PyRanges(fragments[['Chromosome','Start','End','barcode','count','Strand']])


In [40]:
import pandas as pd
import pyranges as pr

# Read peaks BED, skipping comment lines
peaks = pd.read_csv(
    "data_singlecell/pbmc_unsorted_3k_atac_peaks.bed",
    sep='\t',
    header=None,
    names=['chrom','start','end'],
    comment='#'   # <-- skip any lines starting with #
)

# Rename columns for PyRanges
peaks = peaks.rename(columns={
    'chrom': 'Chromosome',
    'start': 'Start',
    'end': 'End'
})

# Now create PyRanges object
pk = pr.PyRanges(peaks)
overlaps = fr.join(pk)

In [42]:
overlaps

Unnamed: 0,Chromosome,Start,End,barcode,count,Strand,Start_b,End_b
0,GL000009.2,114064,114295,GACCTCAAGGAACGGT-1,1,,113930,114840
1,GL000009.2,114247,114474,GCTAGCCAGGCGAATA-1,1,,113930,114840
2,GL000009.2,114267,114440,TATTCGTTCAATCTCT-1,1,,113930,114840
3,GL000194.1,28130,28352,TACGCACCAATAGCAA-1,1,,27950,28827
4,GL000194.1,28146,28381,GTCCATTGTGGATGTC-1,1,,27950,28827
...,...,...,...,...,...,...,...,...
19178517,chrY,56847220,56847315,GAAGGAACATTCAGCA-1,2,,56846844,56847757
19178518,chrY,56847423,56847562,GTTGGCGGTTCGCGCT-1,1,,56846844,56847757
19178519,chrY,56847435,56847577,AGTAATCGTGGATGTC-1,2,,56846844,56847757
19178520,chrY,56847512,56847679,GACCTCAAGGAACGGT-1,1,,56846844,56847757


In [44]:
import pandas as pd
import pyranges as pr
import numpy as np
from scipy.sparse import coo_matrix
import scanpy as sc


In [46]:
print(overlaps.df.head())
print(overlaps.df.columns)


   Chromosome   Start     End             barcode  count Strand  Start_b  \
0  GL000009.2  114064  114295  GACCTCAAGGAACGGT-1      1    NaN   113930   
1  GL000009.2  114247  114474  GCTAGCCAGGCGAATA-1      1    NaN   113930   
2  GL000009.2  114267  114440  TATTCGTTCAATCTCT-1      1    NaN   113930   
3  GL000194.1   28130   28352  TACGCACCAATAGCAA-1      1    NaN    27950   
4  GL000194.1   28146   28381  GTCCATTGTGGATGTC-1      1    NaN    27950   

    End_b  
0  114840  
1  114840  
2  114840  
3   28827  
4   28827  
Index(['Chromosome', 'Start', 'End', 'barcode', 'count', 'Strand', 'Start_b',
       'End_b'],
      dtype='object')


In [47]:
# Assign peak indices
peaks_df = pk.df.reset_index(drop=True)
peaks_df['peak_idx'] = np.arange(len(peaks_df))

# Merge overlaps with peak indices
overlaps_df = overlaps.df.merge(
    peaks_df[['Chromosome','Start','End','peak_idx']],
    left_on=['Chromosome','Start_b','End_b'],  # notice Chromosome alone
    right_on=['Chromosome','Start','End'],
    how='left'
)

# Map barcodes to RNA cell indices
cell_to_idx = {cell:i for i, cell in enumerate(adata.obs_names)}
overlaps_df['cell_idx'] = overlaps_df['barcode'].map(cell_to_idx)

# Build sparse matrix
from scipy.sparse import coo_matrix

rows = overlaps_df['cell_idx'].to_numpy()
cols = overlaps_df['peak_idx'].to_numpy()
data = overlaps_df['count'].to_numpy()

X_atac = coo_matrix((data, (rows, cols)), shape=(adata.n_obs, len(peaks_df)))


In [49]:
X_atac.shape

(3009, 81156)

In [58]:
X_atac

<COOrdinate sparse matrix of dtype 'int64'
	with 19178522 stored elements and shape (3009, 81156)>

In [50]:
import scipy.sparse as sp
import scanpy as sc
atac_adata = sc.AnnData(X=X_atac, obs=adata.obs.copy(), var=peaks)




In [52]:
atac_adata.shape, adata.shape

((3009, 81156), (3009, 36601))

In [59]:
print(atac_adata)


AnnData object with n_obs × n_vars = 3009 × 81156
    var: 'Chromosome', 'Start', 'End'
