# Import packages and setup

In [1]:
%matplotlib inline
import os
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 400)

# Read in IgBLAST BCR data

In [2]:
source_folder = "/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/IgBLAST_output_BCR"

file_list = [x for x in os.listdir(f'{source_folder}') if x.endswith('pass.tsv')]

file_dfs = []

for file in file_list:    
    file_df = pd.read_csv(f'{source_folder}/{file}', sep='\t')
    file_df['pool'] =  file.split("_")[0]
    file_dfs.append(file_df)
    
bcr_df = pd.concat(file_dfs)


In [3]:
bcr_df.pool.nunique()

68

In [4]:
len(bcr_df)

229108

# Read in sample, pool and batch info

In [5]:
batch_info = pd.read_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/1_Metadata/slemap_cellranger_to_pool.txt", sep = "\t", header = None)
batch_info.columns = ["sample", "pool", "scSeq_batch"]

In [6]:
bcr_df = pd.merge(bcr_df, batch_info)

In [7]:
bcr_df["contig"] = bcr_df["sequence_id"].str.split("_", n = 1).str[1]
bcr_df["chain"] = np.where(bcr_df['locus'].str.contains('IGH'), "Heavy" , "Light")
bcr_df["sample_cell_id"] = bcr_df["sample"] + "_" + bcr_df["cell_id"]

In [8]:
chain_counts_bcr = bcr_df.groupby(['pool','chain'])['sample_cell_id'].agg(number_chains = "value_counts").reset_index()
chain_counts_heavy = chain_counts_bcr.query("chain == 'Heavy'").copy()

In [9]:
chain_counts_heavy["BCR_doublet"] = np.where(chain_counts_heavy['number_chains'] == 1, "No" , "Yes")

In [10]:
chain_counts_heavy[["sample_cell_id", "pool", "BCR_doublet"]]

Unnamed: 0,sample_cell_id,pool,BCR_doublet
0,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
1,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
2,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
3,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
4,cellranger700_multi_dbbf8652dc66faa6a536508357...,LDP02,Yes
...,...,...,...
221288,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
221289,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
221290,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
221291,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No


In [11]:
chain_counts_heavy["barcode"] = chain_counts_heavy["sample_cell_id"].str.rsplit("_", n=1).str[1]

chain_counts_heavy.to_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/bcr_doublet_barcodes.csv", index = False)

In [12]:
chain_counts_heavy.BCR_doublet.value_counts()

BCR_doublet
No     105952
Yes      1536
Name: count, dtype: int64

# Read in IgBLAST TCR data

In [13]:
source_folder = "/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/IgBLAST_output_TCR"

file_list = [x for x in os.listdir(f'{source_folder}') if x.endswith('pass.tsv')]

file_dfs = []

for file in file_list:    
    file_df = pd.read_csv(f'{source_folder}/{file}', sep='\t')
    file_df['pool'] =  file.split("_")[0]
    file_dfs.append(file_df)
    
tcr_df = pd.concat(file_dfs)


In [14]:
tcr_df.pool.nunique()

69

In [15]:
len(tcr_df)

1177627

# Read in sample, pool and batch info

In [16]:
tcr_df = pd.merge(tcr_df, batch_info)

In [17]:
tcr_df["contig"] = tcr_df["sequence_id"].str.split("_", n = 1).str[1]
tcr_df["chain"] = np.where(tcr_df['locus'].str.contains('TRB'), "Beta" , "Alpha")
tcr_df["sample_cell_id"] = tcr_df["sample"] + "_" + tcr_df["cell_id"]

In [18]:
tcr_df.locus.value_counts()

locus
TRB    629316
TRA    548311
Name: count, dtype: int64

In [19]:
tcr_df.chain.value_counts()

chain
Beta     629316
Alpha    548311
Name: count, dtype: int64

In [20]:
chain_counts_tcr = tcr_df.groupby(['pool', 'chain'])['sample_cell_id'].agg(number_chains = "value_counts").reset_index()
chain_counts_beta = chain_counts_tcr.query("chain == 'Beta'").copy()

In [21]:
chain_counts_beta["tcr_doublet"] = np.where(chain_counts_beta['number_chains'] == 1, "No" , "Yes")

In [22]:
chain_counts_beta[["sample_cell_id", "pool", "tcr_doublet"]]

Unnamed: 0,sample_cell_id,pool,tcr_doublet
2708,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
2709,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
2710,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
2711,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
2712,cellranger700_multi_21212877c397b6975eaf35c723...,LDP01,Yes
...,...,...,...
1091760,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
1091761,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
1091762,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No
1091763,cellranger700_multi_47347f8bd01f6c23c63871b334...,LDP69,No


In [23]:
chain_counts_beta.tcr_doublet.value_counts()

tcr_doublet
No     559978
Yes     34669
Name: count, dtype: int64

In [24]:
chain_counts_beta["barcode"] = chain_counts_beta["sample_cell_id"].str.rsplit("_", n=1).str[1]

In [25]:
chain_counts_beta.to_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/tcr_doublet_barcodes.csv", index = False)

In [26]:
has_trb = list(chain_counts_beta.sample_cell_id)

In [27]:
has_igh = list(chain_counts_heavy.sample_cell_id)

In [28]:
len(list(set(has_trb) & set(has_igh)))

15262

In [29]:
tcr_bcr_doublet = pd.DataFrame(list(set(has_trb) & set(has_igh)))

In [30]:
tcr_bcr_doublet = tcr_bcr_doublet.rename({0:"sample_cell_id"}, axis = 1)

In [31]:
tcr_bcr_doublet["sample"] = tcr_bcr_doublet["sample_cell_id"].str.rsplit("_", n=1).str[0]
tcr_bcr_doublet["barcode"] = tcr_bcr_doublet["sample_cell_id"].str.rsplit("_", n=1).str[1]

In [32]:
tcr_bcr_doublet = pd.merge(tcr_bcr_doublet, batch_info)

In [33]:
tcr_bcr_doublet.to_csv("/lustre/scratch126/opentargets/opentargets/OTAR2064/working/users/cs54/final_data/2_BCR_TCR/1_IgBLAST_assignment/tcr_bcr_doublet_barcodes.csv", index = False)

In [34]:
tcr_bcr_doublet

Unnamed: 0,sample_cell_id,sample,barcode,pool,scSeq_batch
0,cellranger700_multi_e6491bbae1405a9f6c7f93c2fb...,cellranger700_multi_e6491bbae1405a9f6c7f93c2fb...,TGGCCAGAGGTCATCT-1,LDP39,6
1,cellranger700_multi_b23b7edb25e76f2a7c2d8665e1...,cellranger700_multi_b23b7edb25e76f2a7c2d8665e1...,TTAACTCAGCGTGTCC-1,LDP35,6
2,cellranger700_multi_45469_SLE_map12928664_and_...,cellranger700_multi_45469_SLE_map12928664_and_...,TGAGGGAGTTAAGAAC-1,LDP10,2
3,cellranger700_multi_7c15960418bdf52c2de92caec2...,cellranger700_multi_7c15960418bdf52c2de92caec2...,ACTTTCAAGATTACCC-1,LDP60,9
4,cellranger700_multi_1ed40ccf3d8f2014f59925d3ff...,cellranger700_multi_1ed40ccf3d8f2014f59925d3ff...,CGATGGCAGGAATTAC-1,LDP41,6
...,...,...,...,...,...
15257,cellranger700_multi_7fbaf95e765fb929dedd7e0f93...,cellranger700_multi_7fbaf95e765fb929dedd7e0f93...,GGACAAGCACGTAAGG-1,LDP24,4
15258,cellranger700_multi_210f420afdef510e028f5b16a8...,cellranger700_multi_210f420afdef510e028f5b16a8...,CTCGAAACACTGAAGG-1,LDP26,5
15259,cellranger700_multi_fa31044dd699a5ba7bbceefb16...,cellranger700_multi_fa31044dd699a5ba7bbceefb16...,GTTTCTACAGGACCCT-1,LDP53,8
15260,cellranger700_multi_9fae3f5f9f97ac0265c62429f8...,cellranger700_multi_9fae3f5f9f97ac0265c62429f8...,ATTGGACGTTGTTTGG-1,LDP38,6
