<a href="https://colab.research.google.com/github/anl078/ECE204Repo/blob/Solana/project2/R_dNdScv.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# BiocManager is used below as the installer for the analysis dependencies.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

# Install required dependencies through BiocManager::install()
BiocManager::install(c("Rcpp", "dplyr", "ggplot2", "data.table", "magrittr"))

# devtools supplies install_github(); install it first when it is missing so
# that library(devtools) does not fail on a fresh runtime.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
library(devtools)

# dndscv is installed from its GitHub repository.
install_github("im3sanger/dndscv")

# Attach order is kept as in the original cell so that function masking
# between packages is unchanged.
library(dndscv)
library(dplyr)
library(magrittr)
library(ggplot2)
library(Rcpp)
library(data.table)

'getOption("repos")' replaces Bioconductor standard repositories, see
'help("repositories", package = "BiocManager")' for details.
Replacement repositories:
    CRAN: https://cran.rstudio.com

Bioconductor version 3.21 (BiocManager 1.30.25), R 4.5.0 (2025-04-11)

“package(s) not installed when version(s) same as or greater than current; use
  `force = TRUE` to re-install: 'Rcpp' 'dplyr' 'ggplot2' 'data.table'
  'magrittr'”
Old packages: 'systemfonts', 'textshaping', 'utf8'

Skipping install of 'dndscv' from a github remote, the SHA1 (dab351ae) has not changed since last install.
  Use `force = TRUE` to force installation



In [None]:
### READ MUTATION DATA ###
# Tab-delimited mutation table. The columns used below are patient_id,
# Hugo_Symbol, Variant_Type, Chromosome, Start_Position, Reference_Allele,
# Tumor_Seq_Allele1 and Tumor_Seq_Allele2.
df <- read.table("TCGA.STAD.mutations.txt", header = TRUE, sep = "\t")

### PREPROCESSING ###

## Remove hypermutators (patients above the 99th percentile of mutation count)
mutation_counts <- df %>%
  count(patient_id, name = "mut_count")

cutoff <- quantile(mutation_counts$mut_count, 0.99)

reg_mutaters <- df %>%
  inner_join(mutation_counts, by = "patient_id") %>%
  filter(mut_count <= cutoff) %>%
  select(-mut_count)

## Remove genes with fewer than 10 mutations.
## The count is taken here, before the SNP filter, so it includes every
## variant type present in the table.
# NOTE(review): dNdScv fits its background model on the mutations it is given;
# confirm that pre-filtering genes before running it is intended.
filtered_mutations <- reg_mutaters %>%
  filter(!is.na(Hugo_Symbol)) %>%
  group_by(Hugo_Symbol) %>%
  filter(n() >= 10) %>%
  ungroup()

## Only keep SNPs
filtered_mutations <- filtered_mutations %>%
  filter(Variant_Type == "SNP")

## Retain pertinent columns
filtered_mutations <- filtered_mutations %>%
  select(all_of(c(
    "patient_id", "Hugo_Symbol", "Chromosome", "Start_Position",
    "Reference_Allele", "Tumor_Seq_Allele1", "Tumor_Seq_Allele2"
  )))

## Build the five-column dNdScv input (sampleID, chr, pos, ref, mut).
## `mut` is the first tumour allele that differs from the reference allele.
dndscv_data <- filtered_mutations %>%
  mutate(
    mut = case_when(
      Tumor_Seq_Allele1 != Reference_Allele & !is.na(Tumor_Seq_Allele1) ~ Tumor_Seq_Allele1,
      Tumor_Seq_Allele2 != Reference_Allele & !is.na(Tumor_Seq_Allele2) ~ Tumor_Seq_Allele2,
      TRUE ~ NA_character_
    ),
    chr = as.character(Chromosome), # Ensure chromosome is a character type
    pos = Start_Position,
    ref = Reference_Allele
  ) %>%
  # Rows where neither tumour allele differs from the reference carry no
  # mutation; drop them instead of writing NA alleles to the input file.
  filter(!is.na(mut)) %>%
  select(sampleID = patient_id, chr, pos, ref, mut)

# Save the dNdScv input table
write.table(dndscv_data, file = "dndscv_data.tsv", sep = "\t", row.names = FALSE, quote = FALSE)

In [None]:
### RUN dNdScv
# NOTE(review): this loads the example dataset bundled with the dndscv
# package; the call below does not reference it -- confirm it is still needed.
data(list = "dataset_simbreast", package = "dndscv")

# Run dNdScv with default settings on the table built during preprocessing.
dndsout <- dndscv(mutations = dndscv_data)

[1] Loading the environment...

[2] Annotating the mutations...

“Mutations observed in contiguous sites within a sample. Please annotate or remove dinucleotide or complex substitutions for best results.”
“Same mutations observed in different sampleIDs. Please verify that these are independent events and remove duplicates otherwise.”
    Note: 173 mutations removed for exceeding the limit of mutations per gene per sample (see the max_muts_per_gene_per_sample argument in dndscv)

    30% ...

    60% ...

    90% ...

“27 (0.081%) mutations have a wrong reference base (see the affected mutations in dndsout$wrongmuts). Please identify the causes and rerun dNdScv.”
[3] Estimating global rates...

[4] Running dNdSloc...

[5] Running dNdScv...

“iteration limit reached”
“NaNs produced”
“iteration limit reached”
“NaNs produced”
    Regression model for substitutions (theta = 4.11e+04).



In [None]:
# Per-gene results table from the dNdScv run; show its first rows with
# two significant digits.
sel_cv <- dndsout$sel_cv
print(head(sel_cv, n = 6L), digits = 2)

      gene_name n_syn n_mis n_non n_spl n_ind wmis_cv wnon_cv wspl_cv wind_cv
18057      TP53     2    70    15     8    21    87.8   363.0   363.0   167.4
9768      LRP1B    20    69     6     3     6    10.7    13.2    13.2     4.1
13955     PTPRT    16    28     3     1     2     9.6    15.6    15.6     4.3
12566    PCDH15    10    31     3     1     1    10.8    14.4    14.4     1.6
17024     SYNE1    20    62     5     2     5     4.4     4.9     4.9     1.8
16660     SPTA1     9    48     1     2     5    12.1     6.4     6.4     6.5
      pmis_cv ptrunc_cv pallsubs_cv pind_cv qmis_cv qtrunc_cv qallsubs_cv
18057       0   0.0e+00           0 7.5e-13       0   0.00000           0
9768        0   4.8e-08           0 7.9e-02       0   0.00016           0
13955       0   1.4e-04           0 1.1e-01       0   0.10878           0
12566       0   1.9e-04           0 2.8e-01       0   0.12924           0
17024       0   8.7e-04           0 1.9e-01       0   0.37186           0
16660     

Note that qglobal_cv is not available here: because the input contains SNVs only, no gene has enough truncating and splice-site mutations to fit the 3-parameter model that generates it. We therefore use qallsubs_cv (the FDR-adjusted p-value for all non-synonymous substitutions) to rank genes.

In [None]:
# Keep the 30 genes with the smallest qallsubs_cv. top_n() keeps every tied
# row, so more than 30 genes can be returned when q-values are tied.
results <- top_n(sel_cv, -30, qallsubs_cv)
# top_drivers <- top_n(results,10,pglobal_cv) # breaking tie with p-value if ties with q
# top_drivers <- top_n(top_drivers,10,n_ind) # breaking tie with number of

# Total number of distinct patients remaining after filtering; computed once
# because it is the same for every gene.
n_patients <- length(unique(filtered_mutations$patient_id))

# Fraction (0-1, not a percentage) of the filtered patients that carry at
# least one retained mutation in each gene. vapply() guarantees a numeric
# result of length one per gene.
results$percentage_samples <- vapply(
  results$gene_name,
  function(gene) {
    in_gene <- filtered_mutations$Hugo_Symbol == gene
    length(unique(filtered_mutations$patient_id[in_gene])) / n_patients
  },
  numeric(1)
)

results

gene_name,n_syn,n_mis,n_non,n_spl,wmis_cv,wnon_cv,wspl_cv,pmis_cv,ptrunc_cv,pallsubs_cv,qmis_cv,qtrunc_cv,qallsubs_cv,percentage_samples
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
TP53,2,70,15,8,87.700652,362.775791,362.775791,0.0,0.0,0.0,0.0,0.0,0.0,0.22332506
LRP1B,20,69,6,3,10.713671,13.17442,13.17442,0.0,4.855618e-08,0.0,0.0,0.0001625904,0.0,0.15880893
PTPRT,16,28,3,1,9.57023,15.569526,15.569526,0.0,0.0001419561,0.0,0.0,0.1090793823,0.0,0.10173697
PCDH15,10,31,3,1,10.743529,14.400634,14.400634,0.0,0.0001934795,0.0,0.0,0.1295732503,0.0,0.1191067
SYNE1,20,63,5,2,4.504588,4.886639,4.886639,0.0,0.0008748673,0.0,0.0,0.3739778447,0.0,0.15632754
SPTA1,9,48,1,2,12.048457,6.439626,6.439626,0.0,0.01346934,0.0,0.0,0.9255890694,0.0,0.12406948
FLG,20,56,3,0,8.165875,6.279775,6.279775,0.0,0.01447476,0.0,0.0,0.9255890694,0.0,0.14888337
CSMD1,16,50,2,1,7.992299,5.37737,5.37737,0.0,0.02247234,0.0,0.0,0.9255890694,0.0,0.13647643
PREX2,8,28,1,1,10.745717,8.400943,8.400943,0.0,0.0255025,0.0,0.0,0.9255890694,0.0,0.08933002
RYR2,15,44,1,2,5.283529,3.511346,3.511346,0.0,0.07165783,0.0,0.0,0.9255890694,0.0,0.10918114


In [None]:
# Genes whose q-value for all non-synonymous substitutions is below 0.1,
# reduced to the three columns reported downstream.
is_signif <- results$qallsubs_cv < 0.1
report_cols <- c("gene_name", "qallsubs_cv", "percentage_samples")
signif_genes <- results[is_signif, report_cols]
rownames(signif_genes) <- NULL

# The 10 smallest q-values (tied rows are all kept by top_n()).
top_drivers <- signif_genes %>%
  top_n(-10, qallsubs_cv)

write.table(top_drivers, file = "dndscv_package_drivers.tsv", sep = "\t", row.names = FALSE, quote = FALSE)

# Among those, the 10 genes with the largest patient fraction.
top_10 <- top_drivers %>%
  top_n(10, percentage_samples)
top_10

gene_name,qallsubs_cv,percentage_samples
<chr>,<dbl>,<dbl>
TP53,0,0.2233251
LRP1B,0,0.1588089
SYNE1,0,0.1563275
FLG,0,0.1488834
CSMD1,0,0.1364764
TTN,0,0.3275434
FAT4,0,0.1315136
CSMD3,0,0.1538462
MUC16,0,0.2084367
HMCN1,0,0.1265509


`dndsout$globaldnds` is a table with the global MLEs of the dN/dS ratios across all genes. dN/dS ratios with associated confidence intervals are calculated for missense, nonsense and essential splice site substitutions separately, as well as for all non-synonymous substitutions (wall) and for all truncating substitutions together (wtru), which include nonsense and essential splice site mutations.


In [None]:
# Global dN/dS estimates: one row per ratio with columns mle, cilow and cihigh.
print(dndsout$globaldnds)

     name       mle     cilow    cihigh
wmis wmis 1.0506913 1.0242809 1.0777828
wnon wnon 1.1509126 1.0786538 1.2280121
wspl wspl 0.7060628 0.6377090 0.7817433
wtru wtru 0.9860714 0.9321425 1.0431204
wall wall 1.0487036 1.0226258 1.0754463


`dndsout$annotmuts` is the annotated table of coding mutations.

In [None]:
# Preview the first rows of the annotated coding-mutation table.
head(dndsout$annotmuts)

sampleID,chr,pos,ref,mut,gene,strand,ref_cod,mut_cod,ref3_cod,mut3_cod,aachange,ntchange,codonsub,impact,pid
<chr>,<chr>,<dbl>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr[1d]>,<chr[1d]>,<chr[1d]>,<chr[1d]>,<chr[1d]>,<chr[1d]>,<chr>
TCGA-3M-AB46,10,71021034,C,T,HKDC1,1,C,T,CCT,CTT,L786L,C2356T,CTG>TTG,Synonymous,ENSP00000346643
TCGA-3M-AB46,10,89692887,G,C,PTEN,1,G,C,TGT,TCT,C124S,G371C,TGT>TCT,Missense,ENSP00000361021
TCGA-3M-AB46,11,6661679,G,A,DCHS1,-1,C,T,TCT,TTT,S389F,C1166T,TCT>TTT,Missense,ENSP00000299441
TCGA-3M-AB46,11,9445412,G,A,IPO7,1,G,A,CGC,CAC,R377H,G1130A,CGC>CAC,Missense,ENSP00000369042
TCGA-3M-AB46,11,78516529,T,G,TENM4,-1,A,C,CAT,CCT,M663L,A1987C,ATG>CTG,Missense,ENSP00000278550
TCGA-3M-AB46,2,113940668,G,T,PSD4,1,G,T,GGG,GTG,G212V,G635T,GGG>GTG,Missense,ENSP00000245796


In [None]:
# Mutations flagged by dNdScv as having a wrong reference base.
dndsout$wrongmuts

sampleID,chr,pos,ref,mut
<chr>,<chr>,<dbl>,<chr>,<chr>
TCGA-B7-5818,19,22939044,C,A
TCGA-BR-4292,19,10943787,G,A
TCGA-BR-4368,6,38702284,A,G
TCGA-BR-6455,11,33106623,C,T
TCGA-BR-7704,6,32017337,A,G
TCGA-BR-7707,17,80059604,G,A
TCGA-BR-7851,3,56657581,A,G
TCGA-BR-8078,3,64184616,T,C
TCGA-BR-8078,6,32015785,C,T
TCGA-BR-8589,19,55453089,T,G
