We compare:
- Raw gene-level counts generated using STAR + featureCounts
- The published RPKM expression matrix provided by the authors

In [None]:
suppressPackageStartupMessages({
  library(tidyverse)
})


In [None]:
# Input locations used by the cells below.
# counts_file: tab-separated count table that is read with read.table()
#   (described in the notebook header as STAR + featureCounts output).
# paper_rpkm_file: the published expression table that is read with fread().
counts_file <- "results/gene_counts.txt"
paper_rpkm_file <- "data/pancreas_refseq_rpkms_counts_3514sc.txt"


In [None]:
# Load the count table. Lines starting with "#" are skipped as comments and
# column names are kept verbatim (check.names = FALSE) so they are not mangled
# into syntactic R names.
counts_raw <- read.table(
  file = counts_file,
  sep = "\t",
  header = TRUE,
  check.names = FALSE,
  comment.char = "#"
)

# Drop the annotation columns: keep the gene identifier plus every column
# whose name begins with "ERR".
# NOTE(review): this assumes the sample columns are named by run accession;
# confirm against the actual header of the count table.
counts <- select(counts_raw, Geneid, starts_with("ERR"))

# Quick look at the shape and the first rows
dim(counts)
head(counts)


In [None]:
library(data.table)

# Load the published expression table with data.table::fread().
# The path comes from `paper_rpkm_file`, defined once in the configuration cell
# near the top of the notebook; it is deliberately not repeated here so the two
# copies cannot drift apart.
paper_rpkm <- fread(paper_rpkm_file)

# Shape of the table and a peek at its top-left corner
dim(paper_rpkm)
paper_rpkm[1:5, 1:5]


In [None]:
# Read the gene annotation (GTF) so gene_id / gene_name pairs can be extracted
gtf_file <- "data/genome/Homo_sapiens.GRCh38.109.gtf.gz"

cat("Reading GTF file...\n")

# The file is read as a header-less, tab-separated table of nine columns.
# Quoting is disabled (quote = "") because the attributes column contains
# double quotes that must be kept as literal text; "#" lines are skipped.
gtf_data <- read.table(
  gzfile(gtf_file),
  sep = "\t",
  quote = "",
  comment.char = "#",
  col.names = c(
    "chr", "source", "feature",
    "start", "end",
    "score", "strand", "frame", "attributes"
  ),
  colClasses = c(
    "character", "character", "character",
    "integer", "integer",
    "character", "character", "character", "character"
  )
)

# Keep only the rows whose feature type is "gene"
gtf_genes <- filter(gtf_data, feature == "gene")

cat("Extracting gene IDs and names...\n")

# Extract the quoted value of one attribute (e.g. gene_id, gene_name) from
# GTF-style attribute strings of the form `key "value"; key "value"; ...`.
#
# Arguments:
#   attr_string  character vector of attribute strings (one per record)
#   attr_name    name of the attribute key to look up
# Returns:
#   A character vector the same length as `attr_string`, holding the value of
#   the attribute for each record, or NA_character_ where the key is absent.
extract_attribute <- function(attr_string, attr_name) {
  # Anchor the key at the start of the string or right after a ";" separator so
  # that e.g. "gene_id" does not match inside a longer key like "ref_gene_id".
  pattern <- paste0("(^|;[[:space:]]*)", attr_name, " \"([^\"]+)\"")
  matches <- regmatches(attr_string, regexec(pattern, attr_string))

  # Each match is c(full match, separator group, value); no match is length 0.
  # vapply() guarantees a character result even for empty or all-missing input,
  # where sapply() would return a list or a logical vector.
  vapply(
    matches,
    function(m) if (length(m) >= 3L) m[3L] else NA_character_,
    character(1)
  )
}

# Build the Ensembl ID -> gene symbol lookup in a single pipeline:
#   1. pull gene_id and gene_name out of the attributes column,
#   2. drop records where either value is missing,
#   3. de-duplicate identical pairs,
#   4. keep only the genes that are present in the count table.
gene_mapping <- tibble(
  ensembl_gene_id = extract_attribute(gtf_genes$attributes, "gene_id"),
  external_gene_name = extract_attribute(gtf_genes$attributes, "gene_name")
) %>%
  filter(!is.na(ensembl_gene_id), !is.na(external_gene_name)) %>%
  distinct() %>%
  filter(ensembl_gene_id %in% counts$Geneid)

# Report how many genes were mapped and show a few examples
cat("Mapped", nrow(gene_mapping), "genes\n")
cat("Sample mappings:\n")
head(gene_mapping, 10)


In [None]:
# Attach gene symbols to the count table and discard genes that have none
counts_with_symbols <- counts %>%
  left_join(gene_mapping, by = c("Geneid" = "ensembl_gene_id")) %>%
  filter(!is.na(external_gene_name))

# Gene symbols on each side of the comparison. The first column of the
# published table is used as its gene identifier.
genes_counts <- counts_with_symbols[["external_gene_name"]]
genes_paper <- paper_rpkm[[1]]

# Symbols present in both data sets (intersect() returns unique values)
common_genes <- intersect(genes_counts, genes_paper)

cat("Genes in featureCounts with symbols:", length(genes_counts), "\n")
cat("Genes in paper:", length(genes_paper), "\n")
cat("Common genes:", length(common_genes), "\n")

In [None]:
# Percentage of our gene symbols that also occur in the published table.
# `common_genes` comes from intersect() and therefore holds unique symbols, so
# the denominator must also count unique symbols; `genes_counts` can contain
# the same symbol more than once (several Ensembl IDs sharing one name), which
# would otherwise understate the overlap.
unique_symbols <- unique(genes_counts)
overlap_pct <- if (length(unique_symbols) > 0) {
  length(common_genes) / length(unique_symbols) * 100
} else {
  NA_real_ # no symbols at all: report NA instead of dividing by zero
}
cat(sprintf("Overlap: %.1f%%\n", overlap_pct))

In [None]:
# Per-sample library size: column totals over every gene in the count table
# (the first column is the gene identifier and is excluded).
library_sizes <- colSums(counts[, -1])

# Long-format table for plotting: one row per sample
library_sizes_df <- tibble(
  sample = names(library_sizes),
  total_counts = library_sizes
)

# Bar chart of library sizes; x labels are rotated so sample names stay legible
ggplot(library_sizes_df, aes(x = sample, y = total_counts)) +
  geom_col(fill = "steelblue") +
  ggtitle("Library sizes from featureCounts") +
  xlab("Sample") +
  ylab("Total assigned reads") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

In [None]:
# Gene lengths as reported in the count table's `Length` column, keyed by
# gene symbol
gene_lengths <- counts_raw %>%
  select(Geneid, Length) %>%
  inner_join(gene_mapping, by = c("Geneid" = "ensembl_gene_id")) %>%
  select(external_gene_name, Length)

# Several Ensembl IDs can share one symbol: collapse them to one row per symbol
# by summing their counts, and move the symbol into the row names.
counts_mat <- counts_with_symbols %>%
  select(external_gene_name, starts_with("ERR")) %>%
  group_by(external_gene_name) %>%
  summarise(across(starts_with("ERR"), sum)) %>%
  ungroup() %>%
  column_to_rownames("external_gene_name")

# For collapsed symbols the length is the mean of the contributing genes.
# NOTE(review): counts are summed while lengths are averaged; confirm this is
# the intended treatment of duplicated symbols.
gene_lengths_agg <- gene_lengths %>%
  group_by(external_gene_name) %>%
  summarise(Length = mean(Length)) %>%
  ungroup()

# RPKM calculation: (counts / gene_length_kb) / (total_counts / 1e6)
# Step 1: Normalization by library size.
# The library size is the total over ALL genes in the count table, not only
# the symbol-mapped subset held in `counts_mat`; using the subset would shrink
# the denominator and inflate every RPKM value. Columns are matched by name so
# the totals line up with the columns of `counts_mat`.
total_assigned <- colSums(counts[, colnames(counts_mat), drop = FALSE])
rpkm_like <- sweep(counts_mat, 2, total_assigned, FUN = "/") * 1e6

# Step 2: Normalization by gene length (convert to RPKM)
gene_lengths_vec <- gene_lengths_agg$Length
names(gene_lengths_vec) <- gene_lengths_agg$external_gene_name

# Index the length vector by row name so each gene is divided by its own
# length (in kilobases) regardless of row order.
rpkm_like <- sweep(rpkm_like, 1, gene_lengths_vec[rownames(rpkm_like)] / 1000, FUN = "/")

cat("RPKM matrix dimensions:", dim(rpkm_like), "\n")
cat("Number of unique genes:", nrow(rpkm_like), "\n")
cat("\nSample of RPKM values:\n")
print(rpkm_like[1:5, 1:3])

In [None]:
# Restrict our RPKM table to the gene symbols shared with the published table.
# drop = FALSE keeps the two-dimensional shape even if a single row/column
# remains.
cat("Step 1: Filtering our RPKM data...\n")
rpkm_common <- rpkm_like[rownames(rpkm_like) %in% common_genes, , drop = FALSE]
cat("  Done. Common genes in RPKM matrix:", nrow(rpkm_common), "\n")

# Flatten to a numeric vector and move to log10(x + 1) scale
cat("Step 2: Converting our data to log scale...\n")
log_expr_ours <- log10(as.numeric(as.matrix(rpkm_common)) + 1)
cat("  Done. Our expression values:", length(log_expr_ours), "\n")

cat("  Paper RPKM dimensions:", dim(paper_rpkm), "\n")

# Keep the published rows whose gene symbol (first column) is shared
paper_filtered <- paper_rpkm[paper_rpkm[[1]] %in% common_genes, ]
cat("  Done. Filtered to:", nrow(paper_filtered), "rows\n")

# Remove duplicates (simple way): keep the first row for each symbol
cat("Step 5: Removing duplicates...\n")
dup_count <- sum(duplicated(paper_filtered[[1]]))
cat("  Found", dup_count, "duplicates\n")
paper_unique <- paper_filtered[!duplicated(paper_filtered[[1]]), ]
cat("  Unique rows:", nrow(paper_unique), "\n")

# Set rownames.
# `paper_unique` comes from fread() and is a data.table, which does not keep
# row names, so the gene symbols are attached to the matrix instead of the
# table.
cat("Step 6: Setting rownames...\n")
paper_mat <- as.matrix(paper_unique[, -1])
rownames(paper_mat) <- paper_unique[[1]]
cat("  Done. Matrix dimensions:", dim(paper_mat), "\n")

# Convert to log scale.
# NOTE(review): if any remaining column is non-numeric, as.matrix() yields a
# character matrix and as.numeric() turns those entries into NA (with a
# warning), which makes the mean below NA. Confirm the column layout of the
# published file; the next cell converts column by column and drops NAs.
cat("Step 7: Converting paper data to log scale...\n")
log_expr_paper <- log10(as.numeric(paper_mat) + 1)
cat("  Done. Paper expression values:", length(log_expr_paper), "\n")

# Summary
cat("\nSummary:\n")
cat("Mean expression (ours):", round(mean(log_expr_ours), 2), "\n")
cat("Mean expression (paper):", round(mean(log_expr_paper), 2), "\n")

In [None]:
# Our data: shared genes only, flattened and moved to log10(x + 1) scale
rpkm_common <- rpkm_like[rownames(rpkm_like) %in% common_genes, , drop = FALSE]
log_expr_ours <- log10(as.numeric(as.matrix(rpkm_common)) + 1)

# Paper data - subsample
# Keep shared genes and the first row per gene symbol (first column).
paper_filtered <- paper_rpkm[paper_rpkm[[1]] %in% common_genes, ]
paper_unique <- paper_filtered[!duplicated(paper_filtered[[1]]), ]

# Take up to 100 columns following the gene-symbol column (columns 2..101).
# The range is clamped to the table width so a narrower table does not raise
# an out-of-range error, and `with = FALSE` makes data.table treat the vector
# as column positions.
# (No row names are set here: `paper_unique` is a data.table, which does not
# keep them, and the values are flattened immediately below anyway.)
n_value_cols <- max(0L, min(100L, ncol(paper_unique) - 1L))
subsample_cols <- 1L + seq_len(n_value_cols)
paper_mat <- paper_unique[, subsample_cols, with = FALSE]

# Coerce column by column so non-numeric columns become NA instead of turning
# the whole matrix into character
paper_mat_numeric <- as.data.frame(lapply(paper_mat, as.numeric))
log_expr_paper <- log10(as.numeric(as.matrix(paper_mat_numeric)) + 1)

# Remove NAs
log_expr_paper <- log_expr_paper[!is.na(log_expr_paper)]
log_expr_ours <- log_expr_ours[!is.na(log_expr_ours)]

cat("Our expression values:", length(log_expr_ours), "\n")
cat("Paper expression values:", length(log_expr_paper), "\n")
cat("Mean expression (ours):", round(mean(log_expr_ours), 2), "\n")
cat("Mean expression (paper):", round(mean(log_expr_paper), 2), "\n")

In [None]:
# Stack both sets of log-scale values into one long table, labelling each
# value with the data set it came from
expr_df <- tibble(
  expression = c(log_expr_ours, log_expr_paper),
  source = c(
    rep("This study", length(log_expr_ours)),
    rep("Published", length(log_expr_paper))
  )
)

# Overlaid, semi-transparent density curves, one per source
ggplot(expr_df, aes(x = expression, fill = source)) +
  geom_density(alpha = 0.5) +
  scale_fill_manual(values = c("This study" = "steelblue", "Published" = "coral")) +
  labs(
    title = "Expression distribution comparison (log10 scale)",
    x = "log10(RPKM + 1)",
    y = "Density",
    fill = "Source"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

In [None]:
# Print the R version, platform and loaded package versions so the analysis
# environment is recorded alongside the results.
sessionInfo()
