In [1]:
############################### Load libraries  ################################ 
suppressPackageStartupMessages({
  library(Seurat)
  library(SeuratDisk)
  library(tidyr)
  library(stringr)
  library(patchwork)
  library(ggplot2)
})

############################### PATHS ##########################################
# Inputs are read from <base_dir>/Data; outputs of this script are written to
# <base_dir>/Results/script4.
base_dir    <- "/home/jovyan/work"
data_dir    <- file.path(base_dir, "Data")
results_dir <- file.path(base_dir, "Results", "script4")

# 10x-style input triplet (in features.tsv.gz, column 1 holds the gene id)
mtx_file      <- file.path(data_dir, "matrix.mtx.gz")
barcodes_file <- file.path(data_dir, "barcodes.tsv.gz")
features_file <- file.path(data_dir, "features.tsv.gz")

# Create the output folder; no warning if it already exists
dir.create(results_dir, showWarnings = FALSE, recursive = TRUE)

######################### Load scRNA-seq data into R ###########################

# Read the .mtx (10x) matrix, using the first column of the features file as
# the gene identifier.
obj <- ReadMtx(
  mtx            = mtx_file,
  cells          = barcodes_file,
  features       = features_file,
  feature.column = 1
)

# Wrap the count matrix in a Seurat object, applying two basic filters:
#   - min.cells = 3      : keep only genes detected in at least 3 cells
#   - min.features = 200 : keep only cells with at least 200 detected genes
seurat_obj <- CreateSeuratObject(
  counts       = obj,
  project      = "myProject",
  min.cells    = 3,
  min.features = 200
)

# Report the size of the filtered object (rows = genes, columns = cells)
cat(sprintf(
  "✅ Seurat object: %d genes x %d cells\n",
  nrow(seurat_obj),
  ncol(seurat_obj)
))

# --------- SAVE OUTPUTS ----------
# Build the destination path once and reuse it for saving and reporting
seurat_rds_path <- file.path(results_dir, "seurat_obj_script4.rds")
saveRDS(seurat_obj, seurat_rds_path)
cat("✅ Salvato: ", seurat_rds_path, "\n")


✅ Seurat object: 18164 genes x 798 cells
✅ Salvato:  /home/jovyan/work/Results/script4/seurat_obj_script4.rds 


In [2]:

# Show number of cells/genes
# The object is laid out as genes (rows) x cells (columns), so ncol() gives
# the number of cells and nrow() the number of genes.
ncol(seurat_obj)
nrow(seurat_obj)

In [3]:
# After filtering low-quality cells and genes from each sample, the next step 
# is data normalization.
# Sample-specific differences, such as variations in UMI counts that can inflate
# the mean and variance, are properly corrected because normalization is 
# performed independently on each layer.
# Since raw counts are influenced by library size, normalization ensures that 
# expression levels are comparable across cells and variable feature selection 
# is more accurate. 

# show raw counts 

In [4]:
# Visualize the per-cell metadata table stored in the Seurat object
# (one row per cell barcode; columns include orig.ident, nCount_RNA and
# nFeature_RNA).
View(seurat_obj@meta.data)

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA
Unnamed: 0_level_1,<fct>,<dbl>,<int>
AAATGGATCAAGATAG-1_A549-A549,myProject,13369,3080
AACAAAGTCGGTCGGT-1_A549-A549,myProject,14773,2372
AACCATGTCCATTGCC-1_A549-A549,myProject,19406,4152
AACCTTTGTAAGAACT-1_A549-A549,myProject,6326,1876
AAGCATCAGTACAGAT-1_A549-A549,myProject,7935,1672
AAGCGTTGTTGGGACA-1_A549-A549,myProject,8971,1771
AAGGTAAGTCTCCTGT-1_A549-A549,myProject,9551,2047
ACCAAACTCGCAACAT-1_A549-A549,myProject,18387,3315
ACCATTTCAACTGAAA-1_A549-A549,myProject,10065,2030
ACCGTTCCAATCTCGA-1_A549-A549,myProject,12568,2808


In [5]:
# Extract the raw count matrix (genes x cells) of the RNA assay.
# The accessor is used instead of chained slot access (@assays$RNA@counts) so
# that 'raw_counts' is only ever bound to the matrix itself, and so that this
# cell reads the counts the same way as the histogram cells further down.
raw_counts <- GetAssayData(seurat_obj, assay = "RNA", slot = "counts")

# Preview the first rows of the matrix
head(raw_counts)

  [[ suppressing 798 column names ‘AAATGGATCAAGATAG-1_A549-A549’, ‘AACAAAGTCGGTCGGT-1_A549-A549’, ‘AACCATGTCCATTGCC-1_A549-A549’ ... ]]



6 x 798 sparse Matrix of class "dgCMatrix"
                                                                               
ENSG00000237491 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
ENSG00000228794 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
ENSG00000272438 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
ENSG00000223764 . . . . . . . . . . . . . . . . 1 . . . . . . . . . . . . . . .
ENSG00000187634 1 . . 1 . 2 . 4 . . . . . 1 . 2 . . . . . 1 . . . 1 . . . . . .
ENSG00000188976 . . . . . 1 . . . . . . . 1 1 1 . . . . . . . . . 1 . . . . . .
                                                                               
ENSG00000237491 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
ENSG00000228794 . . . . . . . . . . . . . . . . 1 . . . . . . 1 . . . . . . . .
ENSG00000272438 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
ENSG00000223764 . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

In [6]:
# ======== SCRIPT 4 (part) — Histogram of raw counts (per cell) ========
# Plots the distribution of total raw counts per cell and saves it as a PNG
# in the script4 results folder.
suppressPackageStartupMessages(library(Matrix))

base_dir    <- "/home/jovyan/work"
results_dir <- file.path(base_dir, "Results", "script4")
dir.create(results_dir, showWarnings = FALSE, recursive = TRUE)

# If 'raw_counts' does not exist, compute it: sum of the counts per cell
if (!exists("raw_counts")) {
  if (exists("seurat_obj")) {
    raw_counts <- Matrix::colSums(GetAssayData(seurat_obj, slot = "counts"))
  } else if (exists("obj")) {  # count matrix read with ReadMtx()
    raw_counts <- Matrix::colSums(obj)
  } else {
    stop("Non trovo né 'seurat_obj' né 'obj' per calcolare i raw counts.")
  }
}

# 'raw_counts' may already exist as a full genes x cells matrix (for example
# when the counts matrix was extracted directly from the assay). In that case
# the guard above is skipped, and flattening the matrix would histogram every
# single matrix entry instead of one total per cell. Collapse a 2-D input to
# per-cell totals so the plotted quantity always matches the axis label and
# the fallback above.
counts_per_cell <- if (length(dim(raw_counts)) == 2L) {
  Matrix::colSums(raw_counts)
} else {
  raw_counts
}

# Histogram (bin frequencies are log1p-transformed before plotting)
raw_hist <- hist(as.vector(counts_per_cell), breaks = 50, plot = FALSE)
raw_hist$counts <- log1p(raw_hist$counts)

# 1500 x 1500 px at 300 dpi
out_png <- file.path(results_dir, "raw_counts_hist.png")
png(out_png, width = 1500, height = 1500, res = 300, units = "px")
plot(raw_hist,
     col  = "#bc91a5",
     main = NULL,
     xlab = "Raw counts per cell",
     ylab = "Log-transformed frequency")
dev.off()

cat("✅ Istogramma salvato in: ", out_png, "\n")


✅ Istogramma salvato in:  /home/jovyan/work/Results/script4/raw_counts_hist.png 


In [7]:

# Seurat's "LogNormalize" method is a global-scaling normalization: for each
# cell, every gene count is divided by the total UMI count of that cell,
# multiplied by a scale factor (1e4 by default), and then log-transformed,
# i.e. log1p(count / total_counts_per_cell * scale.factor).
# - NOTE: log1p computes ln(1 + x)
# - NOTE: Raw counts are stored in the layer named 'counts' and normalized
# counts are stored in the layer named 'data' (the code in this notebook
# refers to these layers as "slots").

In [8]:
# Log-normalize the counts with a scale factor of 10,000; the object is
# overwritten with the result returned by NormalizeData().
seurat_obj <- NormalizeData(
  object               = seurat_obj,
  normalization.method = "LogNormalize",
  scale.factor         = 1e4
)

In [9]:
# Extract the normalized expression matrix ("data") of the RNA assay.
# The accessor is used instead of chained slot access (@assays$RNA@data) so
# that 'normalized_counts' is only ever bound to the matrix itself, and so
# that it is read the same way as in the histogram cell that follows.
normalized_counts <- GetAssayData(seurat_obj, assay = "RNA", slot = "data")

In [11]:
# ======== SCRIPT 4 (part) — Histogram of log-normalized counts ========
# Plots the distribution of the values in 'normalized_counts' and saves it as
# a PNG in the script4 results folder.
suppressPackageStartupMessages(library(Matrix))

base_dir    <- "/home/jovyan/work"
results_dir <- file.path(base_dir, "Results", "script4")
dir.create(results_dir, showWarnings = FALSE, recursive = TRUE)

# Make sure 'normalized_counts' is available; otherwise pull it from the
# "data" slot of the Seurat object, or stop with an explanatory message.
if (!exists("normalized_counts")) {
  if (!exists("seurat_obj")) {
    if (exists("obj")) {
      # Only the raw matrix is loaded: there is nothing normalized to plot
      stop("Hai caricato solo la matrice grezza (obj). Serve un Seurat object normalizzato.")
    }
    stop("Non trovo 'seurat_obj' per estrarre i log-normalized counts.")
  }
  normalized_counts <- GetAssayData(seurat_obj, slot = "data")
}

# Bin the values without drawing, then replace the bin frequencies with
# their log1p transform.
normalized_hist <- hist(
  as.vector(normalized_counts),
  breaks = 50,
  plot   = FALSE
)
normalized_hist$counts <- log1p(normalized_hist$counts)

# Draw into a 1500 x 1500 px PNG at 300 dpi
out_png <- file.path(results_dir, "normalized_counts_hist.png")
png(out_png, width = 1500, height = 1500, res = 300, units = "px")
plot(
  normalized_hist,
  col  = "#bc91a5",
  main = NULL,
  xlab = "Log-normalized counts",
  ylab = "Log-transformed frequency"
)
dev.off()

cat("✅ Istogramma salvato in: ", out_png, "\n")


✅ Istogramma salvato in:  /home/jovyan/work/Results/script4/normalized_counts_hist.png 


In [12]:
# ======== SCRIPT 4 (part) — CPM (RC) + histogram ========
# Computes relative counts scaled to one million on a copy of the Seurat
# object, then saves a histogram of the resulting values as a PNG.
suppressPackageStartupMessages(library(Seurat))
suppressPackageStartupMessages(library(Matrix))

base_dir    <- "/home/jovyan/work"
results_dir <- file.path(base_dir, "Results", "script4")
dir.create(results_dir, showWarnings = FALSE, recursive = TRUE)

stopifnot(exists("seurat_obj"))

# Normalize with the "RC" (Relative Counts) method and a scale factor of 1e6.
# The result is assigned to a separate object, so 'seurat_obj' itself is left
# untouched.
cpm_obj <- NormalizeData(
  object               = seurat_obj,
  normalization.method = "RC",
  scale.factor         = 1e6,
  verbose              = FALSE
)

# Pull the CPM matrix out of the 'data' slot
cpm_counts <- GetAssayData(cpm_obj, slot = "data")

# Bin the values without drawing, then replace the bin frequencies with
# their log1p transform.
cpm_hist <- hist(
  as.vector(cpm_counts),
  breaks = 50,
  plot   = FALSE
)
cpm_hist$counts <- log1p(cpm_hist$counts)

# Draw into a 1500 x 1500 px PNG at 300 dpi
out_png <- file.path(results_dir, "cpm_counts_hist.png")
png(out_png, width = 1500, height = 1500, res = 300, units = "px")
plot(
  cpm_hist,
  col  = "#bc91a5",
  main = NULL,
  xlab = "Counts per million (CPM)",
  ylab = "Log-transformed frequency"
)
dev.off()

cat("✅ Istogramma CPM salvato in: ", out_png, "\n")

# (Optional) To inspect the first rows and columns:
# print(as.matrix(cpm_counts[1:5, 1:5]))


✅ Istogramma CPM salvato in:  /home/jovyan/work/Results/script4/cpm_counts_hist.png 


In [13]:
# ======== SCRIPT 4 (part) — Variable genes (HVGs) + plot ========
# Selects highly variable genes, exports the per-gene table, saves a labelled
# variable-feature plot and stores the updated Seurat object.
suppressPackageStartupMessages({
  library(Seurat)
  library(ggplot2)
})

base_dir    <- "/home/jovyan/work"
results_dir <- file.path(base_dir, "Results", "script4")
dir.create(results_dir, showWarnings = FALSE, recursive = TRUE)

stopifnot(exists("seurat_obj"))

# Select 2000 highly variable genes with the "vst" method
seurat_obj <- FindVariableFeatures(
  object           = seurat_obj,
  selection.method = "vst",
  nfeatures        = 2000,
  verbose          = FALSE
)

# Write the per-gene meta-feature table to CSV (instead of opening it with
# View())
meta_csv  <- file.path(results_dir, "RNA_meta_features.csv")
meta_feat <- seurat_obj@assays[["RNA"]]@meta.features
write.csv(meta_feat, meta_csv)
cat("✅ Meta-features salvate in: ", meta_csv, "\n")

# Variable-feature plot with the first 10 variable genes labelled
top_10_genes <- head(VariableFeatures(seurat_obj), n = 10)
p <- LabelPoints(
  plot   = VariableFeaturePlot(seurat_obj),
  points = top_10_genes,
  repel  = TRUE,
  xnudge = 0,
  ynudge = 0
)

# Save the plot: 2000/300 inches per side at 300 dpi, i.e. 2000 x 2000 px
out_png <- file.path(results_dir, "top_10_variable_genes.png")
ggsave(
  filename = out_png,
  plot     = p,
  width    = 2000/300,
  height   = 2000/300,
  dpi      = 300
)
cat("✅ Plot salvato in: ", out_png, "\n")

# Save the updated object
out_rds <- file.path(results_dir, "seurat_obj_after_HVGs.rds")
saveRDS(seurat_obj, out_rds)
cat("✅ Seurat object salvato in: ", out_rds, "\n")


✅ Meta-features salvate in:  /home/jovyan/work/Results/script4/RNA_meta_features.csv 
✅ Plot salvato in:  /home/jovyan/work/Results/script4/top_10_variable_genes.png 
✅ Seurat object salvato in:  /home/jovyan/work/Results/script4/seurat_obj_after_HVGs.rds 
