# Import packages and setup

In [1]:
%matplotlib inline
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 400)

# Read in IgBLAST BCR data

In [2]:
source_folder = "/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/IgBLAST_output_BCR"

file_list = [x for x in os.listdir(f'{source_folder}') if x.endswith('pass.tsv')]

file_dfs = []

for file in file_list:    
    file_df = pd.read_csv(f'{source_folder}/{file}', sep='\t')
    file_df['pool'] =  file.split("_")[0]
    file_dfs.append(file_df)
    
bcr_df = pd.concat(file_dfs)

In [3]:
bcr_df.pool.nunique()

68

In [4]:
len(bcr_df)

229108

# Read in sample, pool and batch info

In [5]:
batch_info = pd.read_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/1_Metadata/slemap_cellranger_to_pool.txt", sep = "\t", header = None)
batch_info.columns = ["sample", "pool", "scSeq_batch"]

In [6]:
bcr_df = pd.merge(bcr_df, batch_info)

In [7]:
bcr_df["contig"] = bcr_df["sequence_id"].str.split("_", n = 1).str[1]
bcr_df["chain"] = np.where(bcr_df['locus'].str.contains('IGH'), "Heavy" , "Light")
bcr_df["sample_cell_id"] = bcr_df["sample"] + "_" + bcr_df["cell_id"]

In [8]:
chain_counts_bcr = bcr_df.groupby(['pool','chain'])['sample_cell_id'].agg(number_chains = "value_counts").reset_index()
chain_counts_heavy = chain_counts_bcr.query("chain == 'Heavy'").copy()

In [35]:
chain_counts_heavy

Unnamed: 0,pool,chain,sample_cell_id,number_chains,BCR_doublet,barcode
0,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,2,Yes,ACAGCCGGTCAAGCGA-1
1,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,2,Yes,ACGTCAAAGCACACAG-1
2,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,2,Yes,ACTGAACCACCCAGTG-1
3,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,2,Yes,AGAGTGGCAAGAGGCT-1
4,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,2,Yes,AGCAGCCTCTGCTTGC-1
...,...,...,...,...,...,...
221288,LDP69,Heavy,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,No,TTTGTCAAGACAAGCC-1
221289,LDP69,Heavy,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,No,TTTGTCAAGCTACCTA-1
221290,LDP69,Heavy,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,No,TTTGTCAAGTGAAGTT-1
221291,LDP69,Heavy,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,No,TTTGTCAGTACCCAAT-1


In [9]:
chain_counts_heavy["BCR_doublet"] = np.where(chain_counts_heavy['number_chains'] == 1, "No" , "Yes")

In [10]:
chain_counts_heavy[["sample_cell_id", "pool", "BCR_doublet"]]

Unnamed: 0,sample_cell_id,pool,BCR_doublet
0,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
1,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
2,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
3,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
4,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
...,...,...,...
221288,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
221289,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
221290,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
221291,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No


In [11]:
chain_counts_heavy["barcode"] = chain_counts_heavy["sample_cell_id"].str.rsplit("_", n=1).str[1]

chain_counts_heavy.to_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/bcr_doublet_barcodes.csv", index = False)

In [12]:
chain_counts_heavy.BCR_doublet.value_counts()

BCR_doublet
No     105952
Yes      1536
Name: count, dtype: int64

In [44]:
has_single_bcr = chain_counts_heavy.query("number_chains == 1").copy()
has_single_bcr["VDJ"] = "BCR"

# Read in IgBLAST TCR data

In [13]:
source_folder = "/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/IgBLAST_output_TCR"

file_list = [x for x in os.listdir(f'{source_folder}') if x.endswith('pass.tsv')]

file_dfs = []

for file in file_list:    
    file_df = pd.read_csv(f'{source_folder}/{file}', sep='\t')
    file_df['pool'] =  file.split("_")[0]
    file_dfs.append(file_df)
    
tcr_df = pd.concat(file_dfs)

In [14]:
tcr_df.pool.nunique()

69

In [15]:
len(tcr_df)

1177627

# Read in sample, pool and batch info

In [16]:
tcr_df = pd.merge(tcr_df, batch_info)

In [17]:
tcr_df["contig"] = tcr_df["sequence_id"].str.split("_", n = 1).str[1]
tcr_df["chain"] = np.where(tcr_df['locus'].str.contains('TRB'), "Beta" , "Alpha")
tcr_df["sample_cell_id"] = tcr_df["sample"] + "_" + tcr_df["cell_id"]

In [18]:
tcr_df.locus.value_counts()

locus
TRB    629316
TRA    548311
Name: count, dtype: int64

In [19]:
tcr_df.chain.value_counts()

chain
Beta     629316
Alpha    548311
Name: count, dtype: int64

In [20]:
chain_counts_tcr = tcr_df.groupby(['pool', 'chain'])['sample_cell_id'].agg(number_chains = "value_counts").reset_index()
chain_counts_beta = chain_counts_tcr.query("chain == 'Beta'").copy()

In [21]:
chain_counts_beta["tcr_doublet"] = np.where(chain_counts_beta['number_chains'] == 1, "No" , "Yes")

In [22]:
chain_counts_beta[["sample_cell_id", "pool", "tcr_doublet"]]

Unnamed: 0,sample_cell_id,pool,tcr_doublet
2708,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
2709,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
2710,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
2711,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
2712,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
...,...,...,...
1091760,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
1091761,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
1091762,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
1091763,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No


In [23]:
chain_counts_beta.tcr_doublet.value_counts()

tcr_doublet
No     559978
Yes     34669
Name: count, dtype: int64

In [24]:
chain_counts_beta["barcode"] = chain_counts_beta["sample_cell_id"].str.rsplit("_", n=1).str[1]

In [25]:
chain_counts_beta.to_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/tcr_doublet_barcodes.csv", index = False)

In [50]:
has_single_tcr = chain_counts_beta.query("number_chains == 1").copy()
has_single_tcr["VDJ"] = "TCR"

# Get TCR/BCR doublets

In [26]:
has_trb = list(chain_counts_beta.sample_cell_id)

In [27]:
has_igh = list(chain_counts_heavy.sample_cell_id)

In [28]:
len(list(set(has_trb) & set(has_igh)))

15262

In [29]:
tcr_bcr_doublet = pd.DataFrame(list(set(has_trb) & set(has_igh)))

In [30]:
tcr_bcr_doublet = tcr_bcr_doublet.rename({0:"sample_cell_id"}, axis = 1)

In [31]:
tcr_bcr_doublet["sample"] = tcr_bcr_doublet["sample_cell_id"].str.rsplit("_", n=1).str[0]
tcr_bcr_doublet["barcode"] = tcr_bcr_doublet["sample_cell_id"].str.rsplit("_", n=1).str[1]

In [32]:
tcr_bcr_doublet = pd.merge(tcr_bcr_doublet, batch_info)

In [33]:
tcr_bcr_doublet.to_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/tcr_bcr_doublet_barcodes.csv", index = False)

In [34]:
tcr_bcr_doublet

Unnamed: 0,sample_cell_id,sample,barcode,pool,scSeq_batch
0,cellranger700_multi_45469_SLE_map12928662_and_...,cellranger700_multi_45469_SLE_map12928662_and_...,GACTACAGTGCAACGA-1,LDP08,2
1,cellranger700_multi_4e757ffa82c83e064b611aeb10...,cellranger700_multi_4e757ffa82c83e064b611aeb10...,AGCCTAACAGCCACCA-1,LDP64,9
2,cellranger700_multi_4aa86417eee207b8497d71521f...,cellranger700_multi_4aa86417eee207b8497d71521f...,ATTGGACGTTCAGGCC-1,LDP13,3
3,cellranger700_multi_b4f011068f70444b8985ca3169...,cellranger700_multi_b4f011068f70444b8985ca3169...,GCGCAGTAGTGAACAT-1,LDP30,6
4,cellranger700_multi_e0b0dbb966d0c07fba9ee7428c...,cellranger700_multi_e0b0dbb966d0c07fba9ee7428c...,GTCGTAAGTTCTGTTT-1,LDP62,9
...,...,...,...,...,...
15257,cellranger700_multi_8c7bd37d89d698c31a0f0bdd1a...,cellranger700_multi_8c7bd37d89d698c31a0f0bdd1a...,CAGCATAGTGGGTATG-1,LDP32,6
15258,cellranger700_multi_473638366415e11e383ee33cb2...,cellranger700_multi_473638366415e11e383ee33cb2...,GGCGTGTCAGTTCATG-1,LDP51,8
15259,cellranger700_multi_473638366415e11e383ee33cb2...,cellranger700_multi_473638366415e11e383ee33cb2...,GACTGCGCAGACGCAA-1,LDP51,8
15260,cellranger700_multi_b4f011068f70444b8985ca3169...,cellranger700_multi_b4f011068f70444b8985ca3169...,AGCTCCTTCGGCTTGG-1,LDP30,6


# Get cells with a single heavy or beta chain
Not checking for paired receptors yet

In [51]:
has_single_tcr = has_single_tcr.drop("tcr_doublet",  axis =1)
has_single_tcr

Unnamed: 0,pool,chain,sample_cell_id,number_chains,barcode,VDJ
2956,LDP01,Beta,cellranger700_multi_21212877c397b6975eaf35c723...,1,AAACCTGAGATGTGTA-1,TCR
2957,LDP01,Beta,cellranger700_multi_21212877c397b6975eaf35c723...,1,AAACCTGCACCGGAAA-1,TCR
2958,LDP01,Beta,cellranger700_multi_21212877c397b6975eaf35c723...,1,AAACCTGGTTACGGAG-1,TCR
2959,LDP01,Beta,cellranger700_multi_21212877c397b6975eaf35c723...,1,AAACCTGTCCTAAGTG-1,TCR
2960,LDP01,Beta,cellranger700_multi_21212877c397b6975eaf35c723...,1,AAACCTGTCCTCATTA-1,TCR
...,...,...,...,...,...,...
1091760,LDP69,Beta,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCAGTTTAGCTG-1,TCR
1091761,LDP69,Beta,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCATCGCAAACT-1,TCR
1091762,LDP69,Beta,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCATCTCGTTTA-1,TCR
1091763,LDP69,Beta,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCATCTTAGCCC-1,TCR


In [55]:
has_single_bcr = has_single_bcr.drop("BCR_doublet",  axis =1)
has_single_bcr

Unnamed: 0,pool,chain,sample_cell_id,number_chains,barcode,VDJ
57,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGAGACCGGAT-1,BCR
58,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGAGGGAACGG-1,BCR
59,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGCAAACGCGA-1,BCR
60,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGCAATCTGCA-1,BCR
61,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGGTAGAAGGA-1,BCR
...,...,...,...,...,...,...
221288,LDP69,Heavy,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCAAGACAAGCC-1,BCR
221289,LDP69,Heavy,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCAAGCTACCTA-1,BCR
221290,LDP69,Heavy,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCAAGTGAAGTT-1,BCR
221291,LDP69,Heavy,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCAGTACCCAAT-1,BCR


In [65]:
has_single_bcr_tcr = pd.concat([has_single_bcr, has_single_tcr])

In [70]:
has_single_bcr_or_tcr = has_single_bcr_tcr.copy()

has_single_bcr_or_tcr = has_single_bcr_or_tcr[~has_single_bcr_or_tcr.sample_cell_id.isin(list(set(has_trb) & set(has_igh)))]



In [71]:
has_single_bcr_or_tcr.to_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/has_tcr_bcr_barcodes.csv", index = False)

In [72]:
check_a = list(has_single_bcr_or_tcr.sample_cell_id)

In [73]:
check_b = list(tcr_bcr_doublet.sample_cell_id)

In [74]:
len(list(set(check_a) & set(check_b)))

0

In [None]:
AAACCTGAGCTCAACT-1-SLE_WGS13446469_LDP04__donor

In [75]:
metadata = pd.read_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/slemap/metadata/SLEmap_metadata_191224.csv")

In [88]:
has_single_bcr_or_tcr["cellranger_file"] = has_single_bcr_or_tcr["sample_cell_id"].str.rsplit("_", n=1).str[0]

In [89]:
has_single_bcr_or_tcr.iloc[1]['cellranger_file']

'cellranger700_multi_dbbf8652dc66faa6a5365083574e5bf4'

In [None]:
AAACCTGAGCTCAACT-1-SLE_WGS13446469_LDP04__donor

In [82]:
metadata.iloc[1]['cellranger_file']

'cellranger700_multi_21212877c397b6975eaf35c723c5a5da'

In [None]:
pd.merge(has_single_bcr_or_tcr, metadata)

In [90]:
has_single_bcr_or_tcr

Unnamed: 0,pool,chain,sample_cell_id,number_chains,barcode,VDJ,cellranger_file
57,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGAGACCGGAT-1,BCR,cellranger700_multi_dbbf8652dc66faa6a536508357...
58,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGAGGGAACGG-1,BCR,cellranger700_multi_dbbf8652dc66faa6a536508357...
60,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGCAATCTGCA-1,BCR,cellranger700_multi_dbbf8652dc66faa6a536508357...
61,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGGTAGAAGGA-1,BCR,cellranger700_multi_dbbf8652dc66faa6a536508357...
62,LDP02,Heavy,cellranger700_multi_dbbf8652dc66faa6a536508357...,1,AAACCTGTCACCCTCA-1,BCR,cellranger700_multi_dbbf8652dc66faa6a536508357...
...,...,...,...,...,...,...,...
1091760,LDP69,Beta,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCAGTTTAGCTG-1,TCR,cellranger700_multi_47347f8bd01f6c23c63871b334...
1091761,LDP69,Beta,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCATCGCAAACT-1,TCR,cellranger700_multi_47347f8bd01f6c23c63871b334...
1091762,LDP69,Beta,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCATCTCGTTTA-1,TCR,cellranger700_multi_47347f8bd01f6c23c63871b334...
1091763,LDP69,Beta,cellranger700_multi_47347f8bd01f6c23c63871b334...,1,TTTGTCATCTTAGCCC-1,TCR,cellranger700_multi_47347f8bd01f6c23c63871b334...


In [76]:
metadata

Unnamed: 0,externalID,experiment_id,SC_duplicate,SC_included,condition,recruitment_centre,recruitment_date,ancestry.l2,ancestry.l1,internalID,date_processed,post_thaw_count.10.6.,post_thaw_viability,pre.pool_count.10.6.,pre.pool_viability,pre_facs_count.10.6.,pre_facs_viability,facs_events.10.4.,post_facs_count.10.4.,pre_10X_viability,poolID,numberID,pool_cdna_conc_ngul,donor_cell_recovery,scSeq_run,scSeq_batch,cellranger_file,DNA_conc_totmicrogram,DNA_conc_QC,WGS_ID,WGS_EGAN_ID,WGS_batch,WGS_duplicate,vireo_assignment
0,EQTL083P,SLE_WGS13446475_LDP01,,use,SLE,Kings,11/01/2022,Mix-any other,Mixed ancestry,D802,19/04/2022,3.66,56.0,3.0,59.4,0.96,63.0,270.0,200.0,96.5,LDP01,802,1.7,1178.0,45271,1,cellranger700_multi_21212877c397b6975eaf35c723...,,passed,SLE_WGS13446475,EGAN00004257507,1.0,,LDP01donor3
1,L11,SLE_WGS13446476_LDP01,,use,SLE,Imperial,21/08/2020,White British,European ancestry,D803,19/04/2022,3.61,84.0,1.63,67.2,0.96,63.0,270.0,200.0,96.5,LDP01,803,1.7,770.0,45271,1,cellranger700_multi_21212877c397b6975eaf35c723...,,passed,SLE_WGS13446476,EGAN00004257510,1.0,,LDP01donor2
2,L150,SLE_WGS13446477_LDP01,,use,SLE,Imperial,06/07/2021,White Other,European ancestry,D804,19/04/2022,3.31,89.9,1.46,67.0,0.96,63.0,270.0,200.0,96.5,LDP01,804,1.7,1266.0,45271,1,cellranger700_multi_21212877c397b6975eaf35c723...,,passed,SLE_WGS13446477,EGAN00004257512,1.0,,LDP01donor1
3,L219,SLE_map14783129_LDP01,,use,SLE,Imperial,08/03/2022,White Other,European ancestry,D805,19/04/2022,1.3,82.9,0.58,62.0,0.96,63.0,270.0,200.0,96.5,LDP01,805,1.7,798.0,45271,1,cellranger700_multi_21212877c397b6975eaf35c723...,,passed,SLE_map14783129,EGAN00004528084,4.0,,LDP01donor0
4,EQTL058P,SLE_WGS13446479_LDP02,,use,SLE,Kings,05/08/2021,White other European,European ancestry,D807,20/04/2022,5.34,71.7,3.07,55.8,1.96,70.0,283.0,209.0,95.5,LDP02,807,4.0,2652.0,45271,1,cellranger700_multi_dbbf8652dc66faa6a536508357...,,passed,SLE_WGS13446479,EGAN00004257513,1.0,,LDP02donor3
5,EQTL086P,SLE_WGS13446480_LDP02,,use,SLE,Kings,25/01/2022,Black Caribbean,Afro-Caribbean ancestry,D808,20/04/2022,4.8,71.8,2.87,57.6,1.96,70.0,283.0,209.0,95.5,LDP02,808,4.0,2833.0,45271,1,cellranger700_multi_dbbf8652dc66faa6a536508357...,,passed,SLE_WGS13446480,EGAN00004257514,1.0,,LDP02donor0
6,L143,SLE_WGS13446482_LDP02,,use,SLE,Imperial,15/06/2021,Black Caribbean,Afro-Caribbean ancestry,D810,20/04/2022,4.76,88.2,2.78,73.0,1.96,70.0,283.0,209.0,95.5,LDP02,810,4.0,2343.0,45271,1,cellranger700_multi_dbbf8652dc66faa6a536508357...,,passed,SLE_WGS13446482,EGAN00004257518,1.0,,LDP02donor2
7,L53,SLE_map14783135_LDP02,,use,SLE,Imperial,20/10/2020,White Other,European ancestry,D809,20/04/2022,1.51,92.9,1.06,76.9,1.96,70.0,283.0,209.0,95.5,LDP02,809,4.0,2738.0,45271,1,cellranger700_multi_dbbf8652dc66faa6a536508357...,,passed,SLE_map14783135,EGAN00004528078,4.0,,LDP02donor1
8,EQTL060P,SLE_donor3_LDP03,scDup7_drop,drop,SLE,Kings,07/09/2021,White other European,European ancestry,D813,26/04/2022,14.66,76.0,9.13,72.8,2.19,72.5,283.0,242.0,90.4,LDP03,813,3.5,1468.0,45271,1,cellranger700_multi_e03d5d0c36933757ee86bec398...,0.55,passed,,,,,LDP03donor3
9,L15,SLE_WGS13446486_LDP03,,use,SLE,Imperial,25/08/2020,White Other,European ancestry,D816,26/04/2022,6.7,87.4,9.13,65.7,2.19,72.5,283.0,242.0,90.4,LDP03,816,3.5,1823.0,45271,1,cellranger700_multi_e03d5d0c36933757ee86bec398...,,passed,SLE_WGS13446486,EGAN00004257520,1.0,,LDP03donor2


In [None]:
AAACCTGAGCTCAACT-1-SLE_WGS13446469_LDP04__donor