In [1]:
# Attach a package, installing it first when it is not available.
#
# Args:
#   pkg: Name of the package, as a single character string.
#
# Returns:
#   The value of `library()` for the attached package.
include <- function(pkg) {
  # Probe for the package without attaching it; `quietly = TRUE` keeps the
  # check silent when the package is missing.
  if (!requireNamespace(pkg, quietly = TRUE)) {
    # `character.only` is an argument of library()/require(), not of
    # install.packages(), so it is not passed here.
    install.packages(pkg)
  }
  # `pkg` holds the name as a string, hence `character.only = TRUE`.
  suppressMessages(library(pkg, character.only = TRUE))
}

include("alakazam")
include("shazam")
include("dplyr")
include("ggplot2")

In [2]:
# Helper
# Write a table to a tab-separated file and report where it went.
#
# Args:
#   table: Object to write; handed to write.table() as-is.
#   filepath: Destination file path.
#
# Returns:
#   The status message string, as returned by print().
writeAnalysisTable <- function(table, filepath) {
    # Tab-delimited, header row kept, no quoting and no row names.
    write.table(table, filepath, sep = "\t", row.names = FALSE, quote = FALSE)
    status_msg <- paste0("Output is written to ", filepath)
    print(status_msg)
}

# Pipeline
# Run the clonal analysis for one sample and write three TSV tables.
#
# Reads `<clones_path>/<name>_novj_with_clones.tsv` and writes, into the same
# directory:
#   <name>_collapse_clones.tsv
#       one row per clone with summed counts, number of distinct sequences and
#       the columns returned by shazam::collapseClones
#   <name>_novj_with_clones_and_muts.tsv
#       the input table as returned by shazam::observedMutations
#   <name>_collapse_clones_with_selection_pressure.tsv
#       the `stats` slot of the shazam::calcBaseline result
#
# The input table must contain the columns used below: duplicate_count,
# sequence_id, clone_id, sequence, sequence_alignment, germline_alignment and
# germline_alignment_d_mask.
#
# Args:
#   clones_path: Directory holding the input table; outputs are written here.
#   name: Sample name used as the file-name prefix.
#   nproc: Number of processes passed to collapseClones and observedMutations.
#          Defaults to 12, the value that was previously hard-coded.
#
# Returns:
#   The value of the last writeAnalysisTable() call.
clonal_analysis <- function(clones_path, name, nproc = 12) {

    clones_file <- file.path(clones_path, paste0(name, "_novj_with_clones.tsv"))
    if (!file.exists(clones_file)) {
        stop("Clones table not found: ", clones_file, call. = FALSE)
    }
    repertoire <- read.csv(clones_file, sep = "\t")

    # Sort rows by decreasing duplicate_count.
    repertoire <- repertoire[order(-repertoire[, "duplicate_count"]), ]

    # Collapse clones
    clonal_sequences <- shazam::collapseClones(
        db = repertoire[, c("sequence_alignment", "germline_alignment", "clone_id")],
        cloneColumn = "clone_id",
        sequenceColumn = "sequence_alignment",
        germlineColumn = "germline_alignment",
        regionDefinition = NULL, # shazam::IMGT_V,
        method = "thresholdedFreq", minimumFrequency = 0.6,
        includeAmbiguous = FALSE, breakTiesStochastic = FALSE,
        nproc = nproc
    )
    clonal_sequences_dt <- data.table::data.table(clonal_sequences)
    # Coerce clone_id to integer before it is used as the merge key below.
    clonal_sequences_dt$clone_id <- as.integer(clonal_sequences_dt$clone_id)

    # Augment clones data.table with extra informative columns: per-clone
    # total duplicate_count and number of distinct sequence_id values, joined
    # (inner join on clone_id) with the collapsed clonal sequences and sorted
    # by decreasing total count.
    augment_clones <- function(repertoire_dt, clonal_sequences_dt) {
        clones_dt <- repertoire_dt[,
                      list(
                          counts = sum(duplicate_count),
                          num_seqs = length(unique((sequence_id)))
                      ),
                      by = "clone_id"][order(-counts)]

        merge(clones_dt, clonal_sequences_dt, by = "clone_id", all = FALSE)[order(-counts)]
    }

    repertoire_dt <- data.table::data.table(repertoire)
    clones_dt <- augment_clones(repertoire_dt, clonal_sequences_dt)
    writeAnalysisTable(clones_dt, file.path(clones_path, paste0(name, "_collapse_clones.tsv")))

    # Observed mutations (sequences)
    # NOTE(review): this step reads the "sequence" column, whereas clone
    # collapsing above reads "sequence_alignment" - confirm that "sequence"
    # is the intended input here.
    repertoire_obs <- shazam::observedMutations(
        repertoire_dt,
        sequenceColumn = "sequence",
        germlineColumn = "germline_alignment_d_mask",  # d_mask
#                             regionDefinition=shazam::IMGT_VDJ_BY_REGIONS,
        frequency = TRUE,
        combine = FALSE,
        nproc = nproc)
    writeAnalysisTable(repertoire_obs, file.path(clones_path, paste0(name, "_novj_with_clones_and_muts.tsv")))

    # Selection pressure
    # No sequence/germline columns are passed, so calcBaseline uses its own
    # defaults on the collapsed clones table. nproc stays at 1 as in the
    # original code (reason not visible here).
    baseline <- shazam::calcBaseline(
        clones_dt,
        testStatistic = "focused",
        regionDefinition = shazam::IMGT_V,
        nproc = 1,
        calcStats = TRUE)
    writeAnalysisTable(baseline@stats, file.path(clones_path, paste0(name, "_collapse_clones_with_selection_pressure.tsv")))

}

In [3]:
# clones_path = "/data/samples/AIRR-Seq/OURS/S5205Nr1/S5205Nr1-P1_IgG1Fc_H/clones"
# name = "S5205Nr1-P1_IgG1Fc_H"
# clonal_analysis(clones_path, name)

In [4]:
# Load the dashboard configuration and collect the names of the entries
# under its `samples` key.
config <- yaml::read_yaml("/data/sources/immune-repertoires-dash/config.yml")
sample_names <- names(config$samples)

“incomplete final line found on '/data/sources/immune-repertoires-dash/config.yml'”


In [None]:
# Run the clonal analysis for every sample listed in the configuration.
print(paste0("Number of samples: ", length(sample_names)))
# seq_along() yields an empty sequence when there are no samples, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(sample_names)) {
    sample_name <- sample_names[i]
    sample_path <- config$samples[[sample_name]]$sample_path
    clones_path <- file.path(sample_path, "clones")

    print(paste0("Clonal analysis for: ", i, " - ", sample_name))
    clonal_analysis(clones_path, sample_name)
    # cat() emits an actual blank line; print("\n") showed the escaped
    # string literal instead.
    cat("\n")
}

[1] "Number of samples: 13"
[1] "Clonal analysis for: 1 - S3987Nr1-PBMC1_heavy"
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-PBMC1_heavy/clones/S3987Nr1-PBMC1_heavy_collapse_clones.tsv"
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-PBMC1_heavy/clones/S3987Nr1-PBMC1_heavy_novj_with_clones_and_muts.tsv"


calcBaseline will calculate observed and expected mutations for clonal_sequence using clonal_germline as a reference.



Calculating BASELINe probability density functions...
Calculating BASELINe statistics...
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-PBMC1_heavy/clones/S3987Nr1-PBMC1_heavy_collapse_clones_with_selection_pressure.tsv"
[1] "\n"
[1] "Clonal analysis for: 2 - S3987Nr1-PBMC1_light"
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-PBMC1_light/clones/S3987Nr1-PBMC1_light_collapse_clones.tsv"
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-PBMC1_light/clones/S3987Nr1-PBMC1_light_novj_with_clones_and_muts.tsv"


calcBaseline will calculate observed and expected mutations for clonal_sequence using clonal_germline as a reference.



Calculating BASELINe probability density functions...
Calculating BASELINe statistics...
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-PBMC1_light/clones/S3987Nr1-PBMC1_light_collapse_clones_with_selection_pressure.tsv"
[1] "\n"
[1] "Clonal analysis for: 3 - S3987Nr1-RAMOS_heavy"
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-RAMOS_heavy/clones/S3987Nr1-RAMOS_heavy_collapse_clones.tsv"
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-RAMOS_heavy/clones/S3987Nr1-RAMOS_heavy_novj_with_clones_and_muts.tsv"


calcBaseline will calculate observed and expected mutations for clonal_sequence using clonal_germline as a reference.



Calculating BASELINe probability density functions...
Calculating BASELINe statistics...
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-RAMOS_heavy/clones/S3987Nr1-RAMOS_heavy_collapse_clones_with_selection_pressure.tsv"
[1] "\n"
[1] "Clonal analysis for: 4 - S3987Nr1-RAMOS_light"
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-RAMOS_light/clones/S3987Nr1-RAMOS_light_collapse_clones.tsv"
[1] "Output is written to /data/samples/AIRR-Seq/OURS/S3987Nr1/S3987Nr1-RAMOS_light/clones/S3987Nr1-RAMOS_light_novj_with_clones_and_muts.tsv"


calcBaseline will calculate observed and expected mutations for clonal_sequence using clonal_germline as a reference.

