In [4]:
library(dplyr)
source("~/sc-online/utils.R")

BASE_PATH = "/mnt/accessory/seq_data/calico"
no_subset_basename = 'vireo_outs/no_subset/donor_ids.tsv'
donor_list_basename = 'vireo_outs/donor_list/donor_ids.tsv'

calico_libs_long = readLines("~/calico-libs-long.txt")
calico_libs_long = calico_libs_long[calico_libs_long != ""]
calico_libs = lapply(calico_libs_long, function(x) {
    split = strsplit(x, split = "_")[[1]]
    return(paste(split[2:length(split)], collapse = "_"))
})
names(calico_libs_long) = calico_libs

manifest = read.table("~/sc-online/notebook_data/pd/calico_donor_corrected_manifest_20240306.tsv", header=TRUE, sep="\t")
head(manifest)

no_subset_list = list()
donor_list_list = list()

for (name in calico_libs){
    no_subset_list[[name]] = read.table(file.path(BASE_PATH, name, no_subset_basename), sep="\t", header=T)
    donor_list_list[[name]] = read.table(file.path(BASE_PATH, name, donor_list_basename), sep="\t", header=T)
}

Unnamed: 0_level_0,participant_id,chip_well_barcode,age,sex,case_control
Unnamed: 0_level_1,<chr>,<chr>,<int>,<chr>,<chr>
1,PDC110,206954930010_R05C02,85,Male,ctr
2,PDC091,206954930060_R11C02,94,Female,ctr
3,PDC078,206954930093_R05C02,91,Male,ctr
4,PDC078,207762950055_R07C02,91,Male,ctr
5,2019-102,207762950055_R11C01,58,Male,pd
6,PD0878,207762950086_R02C01,71,Male,pd


In [2]:
# A few questions
# 1. How many change ID overall? For each donor?
# 2. How many go from unassingned to assigned? For each donor?
# 3. How many go from assigned to unassigned? For each donor?

for (name in calico_libs){

    nosub = no_subset_list[[name]]
    yessub = donor_list_list[[name]]

    print(paste("Lib:", name))
    print(paste("Total cells:", nrow(nosub)))
    print(paste("frac assig WITH donor subsetting", round(getFracAssignableVireo(yessub), 2)))
    print(paste("frac assig NO donor subsetting", round(getFracAssignableVireo(nosub), 2)))
    print("::::::::::::::::::::::::::::::::::")
}



[1] "Lib: pCalicoPDsHSrSNSN_VTAiPoold230719FD"
[1] "Total cells: 7701"
[1] "frac assig WITH donor subsetting 0.96"
[1] "frac assig NO donor subsetting 0.95"
[1] "::::::::::::::::::::::::::::::::::"
[1] "Lib: pCalicoPDsHSrSNSN_VTAiPoold230719G1"
[1] "Total cells: 6565"
[1] "frac assig WITH donor subsetting 0.81"
[1] "frac assig NO donor subsetting 0.77"
[1] "::::::::::::::::::::::::::::::::::"
[1] "Lib: pCalicoPDsHSrSNSN_VTAiPoold230719G2"
[1] "Total cells: 7065"
[1] "frac assig WITH donor subsetting 0.77"
[1] "frac assig NO donor subsetting 0.74"
[1] "::::::::::::::::::::::::::::::::::"
[1] "Lib: pCalicoPDsHSrSNSN_VTAiPoold230719GD"
[1] "Total cells: 7798"
[1] "frac assig WITH donor subsetting 0.92"
[1] "frac assig NO donor subsetting 0.92"
[1] "::::::::::::::::::::::::::::::::::"
[1] "Lib: pCalicoPDsHSrSNSN_VTAiPoold230719I1"
[1] "Total cells: 4684"
[1] "frac assig WITH donor subsetting 0.77"
[1] "frac assig NO donor subsetting 0.53"
[1] "::::::::::::::::::::::::::::::::::"
[1] "Lib: 

In [16]:
manifest$donor_id = manifest$chip_well_barcode
donor_to_participant_map = setNames(
    manifest$participant_id, 
    sapply(
        manifest$donor_id,
        function(x) {paste0(x, "_1")})
    )

summary_df_list = list()

donor_list_grouped_list = list()
no_subset_grouped_list = list()

for (name in calico_libs){
    imputed_nosub_orig = no_subset_list[[name]] 
    imputed_sub_orig = donor_list_list[[name]]

    imputed_nosub_orig = imputed_nosub_orig %>% filter(cell %in% imputed_sub_orig$cell)
    imputed_sub_orig = imputed_sub_orig %>% filter(cell %in% imputed_nosub_orig$cell)

    donor_list = readLines(file.path(BASE_PATH, name, "vireo_outs/donor_list/donor_list.txt"))

    n_donors = length(donor_list)-1

    imputed_sub = imputed_sub_orig[(imputed_sub_orig$prob_max >= 0.9) & (!imputed_sub_orig$donor_id %in% c("unassigned", "doublet")),]
    imputed_nosub = imputed_nosub_orig[(imputed_nosub_orig$prob_max >= 0.9) & (!imputed_nosub_orig$donor_id %in% c("unassigned", "doublet")),]
    imputed_sub$participant_id = donor_to_participant_map[imputed_sub$donor_id]
    imputed_nosub$participant_id = donor_to_participant_map[imputed_nosub$donor_id]

    imputed_sub_grouped = imputed_sub %>% group_by(participant_id) %>% summarize(n_cells = n()) %>% arrange(desc(n_cells))
    imputed_nosub_grouped = imputed_nosub %>% group_by(participant_id) %>% summarize(n_cells = n()) %>% arrange(desc(n_cells))
    imputed_nosub_grouped = imputed_nosub_grouped[1:n_donors,]

    print(name)
    print(paste("Same Donors?", all(sort(imputed_sub_grouped$participant_id) == sort(imputed_nosub_grouped$participant_id))))
    print(paste('WITH donor subsetting', round(getFracAssignableVireo(imputed_sub_orig), 2)))
    print(imputed_sub_grouped)
    print(paste('NO donor subsetting', round(getFracAssignableVireo(imputed_nosub_orig), 2)))
    print(imputed_nosub_grouped)
    print(":::::::::::::::::::::::::::::::::::::::::::")

    frac_assignable_sub = round(getFracAssignableVireo(imputed_sub_orig), 2)
    frac_assignable_nosub = round(getFracAssignableVireo(imputed_nosub_orig), 2)
    only_in_sub = paste(sort(setdiff(imputed_sub_grouped$participant_id, imputed_nosub_grouped$participant_id)), collapse = ' -- ')
    only_in_nosub = paste(sort(setdiff(imputed_nosub_grouped$participant_id, imputed_sub_grouped$participant_id)), collapse = ' -- ')

    summary_df = data.frame(
        frac_assignable_sub = frac_assignable_sub,
        frac_assignable_nosub = frac_assignable_nosub,
        same_donors = all(sort(imputed_sub_grouped$participant_id) == sort(imputed_nosub_grouped$participant_id)),
        in_donor_list_but_not_top_donor_in_whitelist = only_in_sub,
        top_donor_in_whitelist_but_not_in_donor_list = only_in_nosub
        # donors_sub = paste(sort(imputed_sub_grouped$donor_id), collapse = ' -- '),
        # donors_nosub = paste(sort(imputed_nosub_grouped$donor_id), collapse = ' -- ')
    )
    summary_df_list[[name]] = summary_df
    donor_list_grouped_list[[name]] = imputed_sub_grouped
    no_subset_grouped_list[[name]] = imputed_nosub_grouped
}
summary_df = do.call(rbind, summary_df_list)
summary_df

[1] "pCalicoPDsHSrSNSN_VTAiPoold230719FD"
[1] "Same Donors? TRUE"
[1] "WITH donor subsetting 0.96"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m 2019-114          [4m1[24m697
[90m2[39m 2018-130          [4m1[24m604
[90m3[39m PDC092            [4m1[24m324
[90m4[39m 2017-037          [4m1[24m084
[90m5[39m [31mNA[39m                 928
[90m6[39m PD0809             719
[1] "NO donor subsetting 0.95"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m 2019-114          [4m1[24m694
[90m2[39m 2018-130          [4m1[24m606
[90m3[39m PDC092            [4m1[24m316
[90m4[39m 2017-037          [4m1[24m080
[90m5[39m [31mNA[39m                 920
[90m6[39m PD0809             711
[1] ":::::::::::::::::::::::::::::::::::::::::::"
[1] "pCalicoPDsHSrSNSN_VTAiPoold230719G1"
[1] "Same Donors? TRUE"
[1] "WITH 

“longer object length is not a multiple of shorter object length”


[1] "Same Donors? FALSE"
[1] "WITH donor subsetting 0.43"
[90m# A tibble: 5 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793            [4m1[24m357
[90m2[39m PD0903             803
[90m3[39m PD0968             242
[90m4[39m [31mNA[39m                 182
[90m5[39m PD0940              13
[1] "NO donor subsetting 0.36"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793             918
[90m2[39m PD0903             670
[90m3[39m PD0954             400
[90m4[39m [31mNA[39m                 105
[90m5[39m PD0968              38
[90m6[39m 2019-082             3
[1] ":::::::::::::::::::::::::::::::::::::::::::"


“longer object length is not a multiple of shorter object length”


[1] "pCalicoPDsHSrSNSN_VTAiPoold230719CD"


“longer object length is not a multiple of shorter object length”


[1] "Same Donors? FALSE"
[1] "WITH donor subsetting 0.67"
[90m# A tibble: 5 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793            [4m1[24m463
[90m2[39m PD0903            [4m1[24m281
[90m3[39m [31mNA[39m                 647
[90m4[39m PD0968             545
[90m5[39m PD0940               9
[1] "NO donor subsetting 0.69"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793            [4m1[24m427
[90m2[39m PD0903            [4m1[24m239
[90m3[39m PD0954             705
[90m4[39m [31mNA[39m                 553
[90m5[39m PD0968             121
[90m6[39m 2007-020             2
[1] ":::::::::::::::::::::::::::::::::::::::::::"


“longer object length is not a multiple of shorter object length”


[1] "pCalicoPDsHSrSNSN_VTAiPoold230719D1"


“longer object length is not a multiple of shorter object length”


[1] "Same Donors? FALSE"
[1] "WITH donor subsetting 0.49"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793             316
[90m2[39m PD0903              95
[90m3[39m PD0968              73
[90m4[39m PD0940              14
[90m5[39m [31mNA[39m                  13
[90m6[39m PDC091              12
[1] "NO donor subsetting 0.47"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793             207
[90m2[39m PD0954             185
[90m3[39m PD0903              68
[90m4[39m PDC110              19
[90m5[39m PD0968               6
[90m6[39m 2008-028             2
[1] ":::::::::::::::::::::::::::::::::::::::::::"


“longer object length is not a multiple of shorter object length”


[1] "pCalicoPDsHSrSNSN_VTAiPoold230719D2"


“longer object length is not a multiple of shorter object length”


[1] "Same Donors? FALSE"
[1] "WITH donor subsetting 0.55"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793            [4m1[24m525
[90m2[39m PD0903             888
[90m3[39m PD0968             377
[90m4[39m [31mNA[39m                  29
[90m5[39m PD0940              18
[90m6[39m PDC091              10
[1] "NO donor subsetting 0.63"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793            [4m1[24m465
[90m2[39m PD0903             844
[90m3[39m PD0954             527
[90m4[39m PDC110             280
[90m5[39m PD0968              92
[90m6[39m PD0981               3
[1] ":::::::::::::::::::::::::::::::::::::::::::"


“longer object length is not a multiple of shorter object length”


[1] "pCalicoPDsHSrSNSN_VTAiPoold230719DD"


“longer object length is not a multiple of shorter object length”


[1] "Same Donors? FALSE"
[1] "WITH donor subsetting 0.54"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793            [4m1[24m339
[90m2[39m PD0903            [4m1[24m154
[90m3[39m PD0968             636
[90m4[39m [31mNA[39m                  14
[90m5[39m PDC091               8
[90m6[39m PD0940               6
[1] "NO donor subsetting 0.8"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PD0793            [4m1[24m313
[90m2[39m PDC110            [4m1[24m229
[90m3[39m PD0903            [4m1[24m110
[90m4[39m PD0954             797
[90m5[39m PD0968             186
[90m6[39m 2007-032             1
[1] ":::::::::::::::::::::::::::::::::::::::::::"


“longer object length is not a multiple of shorter object length”


[1] "pCalicoPDsHSrSNSN_VTAiPoold230719E1"
[1] "Same Donors? TRUE"
[1] "WITH donor subsetting 0.89"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PDC092            [4m1[24m097
[90m2[39m 2019-114           687
[90m3[39m 2018-130           629
[90m4[39m 2008-028           547
[90m5[39m 2017-037           298
[90m6[39m PD0809             108
[1] "NO donor subsetting 0.87"
[90m# A tibble: 6 × 2[39m
  participant_id n_cells
  [3m[90m<chr>[39m[23m            [3m[90m<int>[39m[23m
[90m1[39m PDC092            [4m1[24m080
[90m2[39m 2019-114           682
[90m3[39m 2018-130           619
[90m4[39m 2008-028           544
[90m5[39m 2017-037           285
[90m6[39m PD0809              99
[1] ":::::::::::::::::::::::::::::::::::::::::::"
[1] "pCalicoPDsHSrSNSN_VTAiPoold230719E2"
[1] "Same Donors? TRUE"
[1] "WITH donor subsetting 0.92"
[90m# A tibble: 6 × 2[39m
  participant_id n_cell

Unnamed: 0_level_0,frac_assignable_sub,frac_assignable_nosub,same_donors,in_donor_list_but_not_top_donor_in_whitelist,top_donor_in_whitelist_but_not_in_donor_list
Unnamed: 0_level_1,<dbl>,<dbl>,<lgl>,<chr>,<chr>
pCalicoPDsHSrSNSN_VTAiPoold230719FD,0.96,0.95,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719G1,0.81,0.77,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719G2,0.77,0.74,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719GD,0.92,0.92,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719I1,0.77,0.53,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719I2,0.76,0.53,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719ID,0.95,0.82,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719J1,0.48,0.41,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719J2,0.56,0.47,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719K1,0.68,0.86,False,PDC110,PDC091


In [17]:
head(summary_df)

Unnamed: 0_level_0,frac_assignable_sub,frac_assignable_nosub,same_donors,in_donor_list_but_not_top_donor_in_whitelist,top_donor_in_whitelist_but_not_in_donor_list
Unnamed: 0_level_1,<dbl>,<dbl>,<lgl>,<chr>,<chr>
pCalicoPDsHSrSNSN_VTAiPoold230719FD,0.96,0.95,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719G1,0.81,0.77,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719G2,0.77,0.74,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719GD,0.92,0.92,True,,
pCalicoPDsHSrSNSN_VTAiPoold230719I1,0.77,0.53,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719I2,0.76,0.53,False,PD0878,PDC078


In [24]:
summary_df[summary_df$same_donors == FALSE,]

Unnamed: 0_level_0,frac_assignable_sub,frac_assignable_nosub,same_donors,in_donor_list_but_not_top_donor_in_whitelist,top_donor_in_whitelist_but_not_in_donor_list
Unnamed: 0_level_1,<dbl>,<dbl>,<lgl>,<chr>,<chr>
pCalicoPDsHSrSNSN_VTAiPoold230719I1,0.77,0.53,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719I2,0.76,0.53,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719ID,0.95,0.82,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719K1,0.68,0.86,False,PDC110,PDC091
pCalicoPDsHSrSNSN_VTAiPoold230719K2,0.68,0.87,False,PDC110,PDC091
pCalicoPDsHSrSNSN_VTAiPoold230719C,0.43,0.36,False,PD0940,2019-082 -- PD0954
pCalicoPDsHSrSNSN_VTAiPoold230719CD,0.67,0.69,False,PD0940,2007-020 -- PD0954
pCalicoPDsHSrSNSN_VTAiPoold230719D1,0.49,0.47,False,PD0940 -- PDC091,2008-028 -- PD0954 -- PDC110
pCalicoPDsHSrSNSN_VTAiPoold230719D2,0.55,0.63,False,PD0940 -- PDC091,PD0954 -- PD0981 -- PDC110
pCalicoPDsHSrSNSN_VTAiPoold230719DD,0.54,0.8,False,PD0940 -- PDC091,2007-032 -- PD0954 -- PDC110


In [25]:
write.table(summary_df[summary_df$same_donors == FALSE,], file="~/sc-online/notebook_data/pd/calico_lib_donor_mismatches.tsv", sep="\t", quote=FALSE)

In [26]:
summary_df[summary_df$same_donors == FALSE,]

Unnamed: 0_level_0,frac_assignable_sub,frac_assignable_nosub,same_donors,in_donor_list_but_not_top_donor_in_whitelist,top_donor_in_whitelist_but_not_in_donor_list
Unnamed: 0_level_1,<dbl>,<dbl>,<lgl>,<chr>,<chr>
pCalicoPDsHSrSNSN_VTAiPoold230719I1,0.77,0.53,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719I2,0.76,0.53,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719ID,0.95,0.82,False,PD0878,PDC078
pCalicoPDsHSrSNSN_VTAiPoold230719K1,0.68,0.86,False,PDC110,PDC091
pCalicoPDsHSrSNSN_VTAiPoold230719K2,0.68,0.87,False,PDC110,PDC091
pCalicoPDsHSrSNSN_VTAiPoold230719C,0.43,0.36,False,PD0940,2019-082 -- PD0954
pCalicoPDsHSrSNSN_VTAiPoold230719CD,0.67,0.69,False,PD0940,2007-020 -- PD0954
pCalicoPDsHSrSNSN_VTAiPoold230719D1,0.49,0.47,False,PD0940 -- PDC091,2008-028 -- PD0954 -- PDC110
pCalicoPDsHSrSNSN_VTAiPoold230719D2,0.55,0.63,False,PD0940 -- PDC091,PD0954 -- PD0981 -- PDC110
pCalicoPDsHSrSNSN_VTAiPoold230719DD,0.54,0.8,False,PD0940 -- PDC091,2007-032 -- PD0954 -- PDC110


In [50]:
donor_list_grouped_list[["pCalicoPDsHSrSNB8id230921B8"]]
no_subset_grouped_list[["pCalicoPDsHSrSNB8id230921B8"]]

participant_id,n_cells
<chr>,<int>
PD0899,456
PD0825,453
PDC094,193
PD0926,125
PD0905,2
PD0938,2


participant_id,n_cells
<chr>,<int>
PD0899,423
PD0825,422
PDC094,185
PD0784,119
PD0926,89
PDC090,19


In [20]:
donor_list_grouped_list[["pCalicoPDsHSrSNC8id230921C8"]]
no_subset_grouped_list[["pCalicoPDsHSrSNC8id230921C8"]]

participant_id,n_cells
<chr>,<int>
PD0825,734
PD0899,622
PDC094,260
PD0926,178
PD0905,11
PD0938,9


participant_id,n_cells
<chr>,<int>
PD0825,620
PD0899,548
PDC094,243
PD0784,188
PD0926,100
PDC090,24


In [21]:
donor_list_grouped_list[["pCalicoPDsHSrSND8id230921D8"]]
no_subset_grouped_list[["pCalicoPDsHSrSND8id230921D8"]]

participant_id,n_cells
<chr>,<int>
PDC094,1626
PD0926,1499
PD0825,1172
PD0899,1088
PD0905,15
PD0938,10


participant_id,n_cells
<chr>,<int>
PD0784,1993
PDC090,1727
PDC094,1527
PD0926,1415
PD0825,1097
PD0899,1002


In [22]:

donor_list_grouped_list[["pCalicoPDsHSrSND9id230921D9"]]
no_subset_grouped_list[["pCalicoPDsHSrSND9id230921D9"]]

participant_id,n_cells
<chr>,<int>
PD0938,1399
PD0906,1176
PD0985,1082
PD0730,1009
PDC089,727
PDC110,11


participant_id,n_cells
<chr>,<int>
PD0938,1357
PD0906,1068
PD0985,972
PDC091,963
PD0730,938
PDC089,678


In [None]:
donor_list_grouped_list

In [None]:
for (name in diff_donor_libs){
    imputed_sub = imputed_tsvs_sub[[name]] %>% filter(prob_max >= 0.9 & !donor_id %in% c("unassigned", "doublet"))
    imputed_nosub = imputed_nosub_tsvs[[name]] %>% filter(prob_max >= 0.9 & !donor_id %in% c("unassigned", "doublet"))
    imputed_sub$participant_id = donor_to_participant_map[imputed_sub$donor_id]
    imputed_nosub$participant_id = donor_to_participant_map[imputed_nosub$donor_id]
    this_summary = summary_df[rownames(summary_df) == name,]
    
    only_in_sub = strsplit(this_summary$only_in_sub, ' -- ')[[1]]
    only_in_nosub = strsplit(this_summary$only_in_nosub, ' -- ')[[1]]

    imputed_sub = imputed_sub %>% filter(participant_id %in% only_in_sub)
    imputed_nosub = imputed_nosub %>% filter(participant_id %in% only_in_nosub)
    print(name)
    print("Only In Sub")
    print(imputed_sub %>% group_by(participant_id) %>% summarize(n_cells = n()) %>% arrange(desc(n_cells)))
    print("Only In No-Sub")
    print(imputed_nosub %>% group_by(participant_id) %>% summarize(n_cells = n()) %>% arrange(desc(n_cells)))
    print(":::::::::::::::::::::::::::::::::::::::::::")

}

In [5]:
kf_master = read.csv("~/sc-online/notebook_data/pd/calico_kf_master_20240307.csv", header=TRUE)

In [9]:
head(kf_master[,
    c("Collaborator.Participate.ID", "Collaborator.Sample.ID", "Lab.Donor.ID", "Chipwell.barcode", "Correct.Donor.ID")
])

Unnamed: 0_level_0,Collaborator.Participate.ID,Collaborator.Sample.ID,Lab.Donor.ID,Chipwell.barcode,Correct.Donor.ID
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
1,10.24.23_GTEx_SN_TD2,10.24.23_GTEx_SN_TD2,MQVOT,,GTEX-Y8DK
2,10.24.23_GTEx_SN_TD2,10.24.23_GTEx_SN_TD2,MQVOI,,GTEX-1RDX4
3,10.24.23_GTEx_SN_TD2,10.24.23_GTEx_SN_TD2,MQVOC,,GTEX-1QEPI
4,10.24.23_GTEx_SN_TD2,10.24.23_GTEx_SN_TD2,MQVOF,,GTEX-1POEN
5,10.24.23_GTEx_SN_TD2,10.24.23_GTEx_SN_TD2,MQVOG,,GTEX-1POEN
6,10.24.23_GTEx_SN_TD2,10.24.23_GTEx_SN_TD2,MQVO1,,GTEX-11DZ1


In [19]:
kf_master$chip_well_barcode = kf_master$Chipwell.barcode
kf_master$participant_id = kf_master$Collaborator.Sample.ID

In [20]:
kf_master_calico = kf_master[kf_master$Correct.Donor.ID %in% unique(manifest$participant_id),]

In [29]:
tail(kf_master_calico)

Unnamed: 0_level_0,PDO,Flowcell,SK.ID,SM.ID,Number.of.Donors.in.Pool,Index,Collaborator.Participate.ID,Collaborator.Sample.ID,Lab.Pool.ID,Lab.Donor.ID,Correct.Donor.ID,Brain.Bank,Chipwell.barcode,Macosko.Sequencing.ID,Brain.Region,Stain,Disease.Status,chip_well_barcode,participant_id
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
874,PDO-34065,22JHCCLT3,SK-67BJ,SM-N57KL,6,SI-TT-D10,2.5.24_UK_NBB_SN_NURR_Z1,2.5.24_UK_NBB_SN_NURR_Z1,Z1,114-VTA,PDC114,UKPD,206954930093_R08C01,pPDsHSrSNNURRid240224Z1,SN_VTA,NURR,-,206954930093_R08C01,2.5.24_UK_NBB_SN_NURR_Z1
875,PDO-34065,22JHCCLT3,SK-67BJ,SM-N57KM,6,SI-TT-C10,2.5.24_UK_NBB_SN_NURR_Z2,2.5.24_UK_NBB_SN_NURR_Z2,Z2,114-VTA,PDC114,UKPD,206954930093_R08C01,pPDsHSrSNNURRid240224Z2,SN_VTA,NURR,-,206954930093_R08C01,2.5.24_UK_NBB_SN_NURR_Z2
876,PDO-34065,22JHCCLT3,SK-67BJ,SM-N57KU,7,SI-TT-B11,2.5.24_UK_NBB_SN_DAPI_YD1,2.5.24_UK_NBB_SN_DAPI_YD1,YD1,112-VTA,PDC112,UKPD,206954930093_R06C02,pPDsHSrSNDAPIid240224YD1,SN_VTA,DAPI,-,206954930093_R06C02,2.5.24_UK_NBB_SN_DAPI_YD1
877,PDO-34065,22JHCCLT3,SK-67BJ,SM-N57KP,7,SI-TT-A11,2.5.24_UK_NBB_SN_DAPI_YD2,2.5.24_UK_NBB_SN_DAPI_YD2,YD2,112-VTA,PDC112,UKPD,206954930093_R06C02,pPDsHSrSNDAPIid240224YD2,SN_VTA,DAPI,-,206954930093_R06C02,2.5.24_UK_NBB_SN_DAPI_YD2
878,PDO-34065,22JHCCLT3,SK-67BJ,SM-N57KQ,7,SI-TT-H10,2.5.24_UK_NBB_SN_NURR_Y1,2.5.24_UK_NBB_SN_NURR_Y1,Y1,112-VTA,PDC112,UKPD,206954930093_R06C02,pPDsHSrSNNURRid240224Y1,SN_VTA,NURR,-,206954930093_R06C02,2.5.24_UK_NBB_SN_NURR_Y1
879,PDO-34065,22JHCCLT3,SK-67BJ,SM-N57KR,7,SI-TT-G10,2.5.24_UK_NBB_SN_NURR_Y2,2.5.24_UK_NBB_SN_NURR_Y2,Y2,112-VTA,PDC112,UKPD,206954930093_R06C02,pPDsHSrSNNURRid240224Y2,SN_VTA,NURR,-,206954930093_R06C02,2.5.24_UK_NBB_SN_NURR_Y2


In [22]:
kf_map = setNames(
    kf_master_calico$Correct.Donor.ID,
    kf_master_calico$chip_well_barcode
)

In [25]:
length(kf_map)

In [28]:
nrow(kf_master_calico[, c("Correct.Donor.ID", "chip_well_barcode")] %>% distinct())

In [30]:
length(unique(kf_master_calico$chip_well_barcode))

In [31]:
length(unique(manifest$chip_well_barcode))

In [25]:
# confirm number of donors detected in each library
clean_no_subset_list = list()
donor_counts_list = list()
n_donors_list = list()
for (name in names(no_subset_list)){
    df = no_subset_list[[name]]
    df_clean = df %>% filter(donor_id != "unassigned" & donor_id != "doublet")

    # group by donor_id and get all donors with at least 50 cells
    donor_counts = df_clean %>% group_by(donor_id) %>% summarize(n_cells = n()) %>% filter(n_cells >= 15) %>% arrange(desc(n_cells))
    donor_counts_list[[name]] = donor_counts
    df_clean = df_clean %>% filter(donor_id %in% donor_counts$donor_id)
    clean_no_subset_list[[name]] = df_clean
    n_donors_list[[name]] = length(unique(df_clean$donor_id))
}

In [26]:
n_donors_list

In [27]:
donor_counts_list[["pCalicoPDsHSrSNSN_VTAiPoold230719D1"]]

donor_id,n_cells
<chr>,<int>
206954930010_R08C01_1,207
207762960003_R05C02_1,185
207762950055_R02C01_1,68
206954930010_R05C02_1,19


In [28]:
donor_counts_list[["pCalicoPDsHSrSNSN_VTAiPoold230719DD"]]

donor_id,n_cells
<chr>,<int>
206954930010_R08C01_1,1313
206954930010_R05C02_1,1229
207762950055_R02C01_1,1110
207762960003_R05C02_1,797
207762950086_R05C02_1,94
206954930010_R07C01_1,92


In [23]:
getFracAssignableVireo(no_subset_list[["pCalicoPDsHSrSNSN_VTAiPoold230719DD"]])
getFracAssignableVireo(donor_list_list[["pCalicoPDsHSrSNSN_VTAiPoold230719DD"]])

In [29]:
donor_counts_list[["pCalicoPDsHSrSNSN_VTAiPoold230719ID"]]
donor_counts_list[["pCalicoPDsHSrSNSN_VTAiPoold230719I1"]]
donor_counts_list[["pCalicoPDsHSrSNSN_VTAiPoold230719I2"]]

donor_id,n_cells
<chr>,<int>
206954930093_R01C02_1,1616
206954930010_R03C01_1,1578
207762950108_R03C02_1,1532
206954930093_R03C01_1,1329
206954930010_R05C01_1,568
206954930010_R04C01_1,73


donor_id,n_cells
<chr>,<int>
207762950108_R03C02_1,883
206954930093_R01C02_1,590
206954930093_R03C01_1,463
206954930010_R03C01_1,377
206954930010_R05C01_1,151


donor_id,n_cells
<chr>,<int>
207762950108_R03C02_1,800
206954930093_R01C02_1,560
206954930093_R03C01_1,383
206954930010_R03C01_1,373
206954930010_R05C01_1,148


In [31]:
donor_counts_list[["pCalicoPDsHSrSNE8id230921E8"]]
donor_counts_list[["pCalicoPDsHSrSNG8id230921G8"]]
donor_counts_list[["pCalicoPDsHSrSNF8id230921F8"]]


donor_id,n_cells
<chr>,<int>
206954930060_R03C01_1,1380
206954930011_R02C02_1,1304
207762950108_R01C02_1,1296
207762950055_R04C01_1,1265
207762960003_R05C02_1,1009


donor_id,n_cells
<chr>,<int>
206954930011_R12C01_1,927
206954930011_R02C02_1,837
207762950108_R01C02_1,830
207762950055_R04C01_1,811
207762960003_R05C02_1,764
206954930060_R03C01_1,535


donor_id,n_cells
<chr>,<int>
206954930060_R03C01_1,2624
206954930011_R02C02_1,2593
207762950055_R04C01_1,1530
207762950108_R01C02_1,1332
206954930011_R12C01_1,1035
207762960003_R05C02_1,231


In [33]:
getFracAssignableVireo(donor_list_list[["pCalicoPDsHSrSNE8id230921E8"]])