In [1]:
import os
import re
import pandas as pd
from subprocess import Popen, PIPE
from multiprocessing import Pool
from matplotlib import pyplot as plt
import statsmodels.stats.multitest as multitest

In [2]:
# override IPython's default %%bash to not buffer all output
from IPython.core.magic import register_cell_magic
@register_cell_magic
def bash(line, cell): get_ipython().system(cell)

# Filter non-functional clonotypes (from vdjtools-formated clonotype tabels)

#### Adaptive

In [8]:
#make metadata file
input_folder="/projects/fmba_covid/adaptive_new/corr"
output_folder="/home/taumata/COV_AdaptiveBiotech/AB_corr"


try:
    os.mkdir(output_folder)
except:
    pass

with (str(output_folder)+"/metadata.tsv","w") as meta_file :
    meta_file.write("#file.name\tsample.id")
    files=os.listdir(input_folder)
    for file_name in files:
        if re.search(".TCRB.tsv.txt",file_name): #check "extension" of file
            meta_file.write("\n"+str(input_folder)+"/"+str(file_name)+"\t"+str(file_name))


In [None]:
%%bash
vdjtools FilterNonFunctional -m /home/taumata/COV_AdaptiveBiotech/AB_corr/metadata.tsv /home/taumata/COV_AdaptiveBiotech/AB_corr_functional/



#### HIP

In [66]:
#make metadata file
with open("/home/taumata/COV_AdaptiveBiotech/hip_full_metadata.txt","w") as out_hip_meta:
    with open("/projects/fmba_covid/hip_full/metadata.txt","r") as in_hip_meta:
        for line in in_hip_meta:
            line=re.sub(r"../mixcr/../corr/",r"/projects/fmba_covid/hip_full/corr/",line)
            out_hip_meta.write(line)

In [None]:
%%bash
vdjtools FilterNonFunctional -m /home/taumata/COV_AdaptiveBiotech/hip_full_metadata.txt /home/taumata/COV_AdaptiveBiotech/hip_corr_func/


#### FMBA

In [84]:
#make metadata file
with open("/home/taumata/COV_AdaptiveBiotech/fmba_new_corr_TRB_metadata.txt","w") as out_fmba_meta:
    with open("/projects/fmba_covid/fmba_new/corr/metadata.txt","r") as in_fmba_meta:
        out_fmba_meta.write("file_name\tsample_id\t..filter..\n")
        for line in in_fmba_meta:
            if re.search("Undetermined",line):
                continue
            elif re.search("clonotypes.TRB",line):
                out_fmba_meta.write("/projects/fmba_covid/fmba_new/corr/"+str(line))
               

In [None]:
%%bash
vdjtools FilterNonFunctional -m /home/taumata/COV_AdaptiveBiotech/fmba_new_corr_TRB_metadata.txt /home/taumata/COV_AdaptiveBiotech/fmba_new_corr_TRB_func/


# Describe feature

## (1) top 1000 from adaptive_new vs hip 

In [86]:
clones=pd.read_csv("/home/taumata/COV_AdaptiveBiotech/AB_prediction/covid_status_top.txt",sep="\t", index_col=0)

In [87]:
print("fdr")
print(sum(multitest.multipletests(clones["p_value"],method="fdr_bh")[0]))
print("bonferroni")
print(sum(multitest.multipletests(clones["p_value"],method="bonferroni")[0]))

fdr
20212
bonferroni
105


In [88]:
#with FDR correction all 20212 clones are significant, with bonferroni - only top 105, but we will take top 1000

In [89]:
COVID_associated_clones=set(clones.head(1000)["cdr3aa"]) #check for duplicates
COVID_associated_clones=list(COVID_associated_clones)

In [90]:
len(COVID_associated_clones) #some of them are non-functional, but i will apply them to functional-filtred clonotypes

1000

### (1.1) find fraction of each of these clonotypes with ONE mismatch/indel in each sample 

#### Adaptive

In [96]:
import multiprocessing as mp
from polyleven import levenshtein

#clones_in_samples={} #store fractions of clones of interes in each sample

treshold=100 #less than this treshold

#----------read sample names----------
samples=pd.read_csv("/home/taumata/COV_AdaptiveBiotech/AB_corr_functional/ncfilter.summary.txt",sep="\t")
samples=list(samples["sample_id"])
#-------------------------------------

def get_clonotypes_frequences(sample_name, COVID_associated_clones = COVID_associated_clones, treshold=100):
    sample_short_name=sample_name.split("_TCRB")[0]
    #print(sample_short_name)
    local_clones_in_samples=dict([(xT,0) for xT in COVID_associated_clones]) #initialize by all zeroes
    #------read clonotype tabel    
    with open("/home/taumata/COV_AdaptiveBiotech/AB_corr_functional/"+str(sample_name),"r") as clonotype_file:
        total_counts=0
        for clone in clonotype_file.readlines()[1:]: #skip header
            clone=clone.split("\t")
            total_counts+=int(clone[0]) #clone[0] is clone count
            #zero or one mismatch/indel (clone[3] is amino acid CDR3 sequence)
            for clone_of_interest in local_clones_in_samples:
                if levenshtein(clone_of_interest, clone[3], 1)<=1:
                    local_clones_in_samples[clone_of_interest]+=float(clone[1]) #clone[1] is fraction
        if total_counts<treshold: #we will remember names of too small samples
            return (sample_short_name,"too_small")
        return (sample_short_name, local_clones_in_samples)


print("START")    
pool = mp.Pool(30) #SET THE NUMBER OF CORES THAT YOU GOING TO USE
results=pool.map(get_clonotypes_frequences, samples)
pool.close()
clones_in_samples=dict(results)
print("DONE")

START
DONE


In [98]:
count=0
with open("/home/taumata/COV_AdaptiveBiotech/AB_prediction/AB_freq_of_top1000_leven1_v2.tsv","w") as out_file:
    out_file.write("\t".join(COVID_associated_clones))
    for sample_name in clones_in_samples:
        if clones_in_samples[sample_name]=="too_small":
            print(sample_name)
            count+=1
            continue
        else:
            out_file.write("\n")
            out_file.write(str(sample_name)+"\t")
            out_file.write("\t".join([str(clones_in_samples[sample_name][clone]) for clone in COVID_associated_clones]))
            
print(count)



INCOV067-AC-3
1


#### HIP

In [99]:
import multiprocessing as mp
from polyleven import levenshtein

#clones_in_samples={} #store fractions of clones of interes in each sample

treshold=100 #less than this treshold

#----------read sample names----------
samples=pd.read_csv("/home/taumata/COV_AdaptiveBiotech/hip_corr_func/ncfilter.summary.txt",sep="\t")
samples=list(samples["sample_id"])
#-------------------------------------

def get_clonotypes_frequences(sample_name, COVID_associated_clones = COVID_associated_clones, treshold=100):
    sample_short_name=sample_name
    #print(sample_short_name)
    local_clones_in_samples=dict([(xT,0) for xT in COVID_associated_clones]) #initialize by all zeroes
    #------read clonotype tabel    
    with open("/home/taumata/COV_AdaptiveBiotech/hip_corr_func/"+str(sample_name)+".txt","r") as clonotype_file:
        total_counts=0
        for clone in clonotype_file.readlines()[1:]: #skip header
            clone=clone.split("\t")
            total_counts+=int(clone[0]) #clone[0] is clone count
            #zero or one mismatch/indel (clone[3] is amino acid CDR3 sequence)
            for clone_of_interest in local_clones_in_samples:
                if levenshtein(clone_of_interest, clone[3], 1)<=1:
                    local_clones_in_samples[clone_of_interest]+=float(clone[1]) #clone[1] is fraction
        if total_counts<treshold: #we will remember names of too small samples
            return (sample_short_name,"too_small")
        return (sample_short_name, local_clones_in_samples)


print("START")    
pool = mp.Pool(30) #SET THE NUMBER OF CORES THAT YOU GOING TO USE
results=pool.map(get_clonotypes_frequences, samples)
pool.close()
clones_in_samples=dict(results)
print("DONE")

START
DONE


In [100]:
count=0
with open("/home/taumata/COV_AdaptiveBiotech/AB_prediction/HIP_freq_of_top1000_leven1_v2.tsv","w") as out_file:
    out_file.write("\t".join(COVID_associated_clones))
    for sample_name in clones_in_samples:
        if clones_in_samples[sample_name]=="too_small":
            print(sample_name)
            count+=1
            continue
        else:
            out_file.write("\n")
            out_file.write(str(sample_name)+"\t")
            out_file.write("\t".join([str(clones_in_samples[sample_name][clone]) for clone in COVID_associated_clones]))
            
print(count)




0


#### FMBA

In [93]:
import multiprocessing as mp
from polyleven import levenshtein

#clones_in_samples={} #store fractions of clones of interes in each sample

treshold=100 #less than this treshold

#----------read sample names----------
samples=pd.read_csv("/home/taumata/COV_AdaptiveBiotech/fmba_new_corr_TRB_func/ncfilter.summary.txt",sep="\t")
samples=list(samples["sample_id"])
#-------------------------------------

def get_clonotypes_frequences(sample_name, COVID_associated_clones = COVID_associated_clones, treshold=100):
    sample_short_name=sample_name.split("_S")[0]
    #print(sample_short_name)
    local_clones_in_samples=dict([(xT,0) for xT in COVID_associated_clones]) #initialize by all zeroes
    #------read clonotype tabel    
    with open("/home/taumata/COV_AdaptiveBiotech/fmba_new_corr_TRB_func/"+str(sample_name)+".txt","r") as clonotype_file:
        total_counts=0
        for clone in clonotype_file.readlines()[1:]: #skip header
            clone=clone.split("\t")
            total_counts+=int(clone[0]) #clone[0] is clone count
            #zero or one mismatch/indel via levensthein clone[3] is amino acid CDR3 sequence
            for clone_of_interest in local_clones_in_samples:
                if levenshtein(clone_of_interest, clone[3], 1)<=1:
                    local_clones_in_samples[clone_of_interest]+=float(clone[1]) #clone[1] is fraction
        if total_counts<treshold: #we will remember names of too small samples
            return (sample_short_name,"too_small")
        return (sample_short_name, local_clones_in_samples)


print("START")    
pool = mp.Pool(30) #SET THE NUMBER OF CORES THAT YOU GOING TO USE
results=pool.map(get_clonotypes_frequences, samples)
pool.close()
clones_in_samples=dict(results)
print("DONE")

START
DONE


In [94]:
count=0
with open("/home/taumata/COV_AdaptiveBiotech/AB_prediction/FMBA_freq_of_top1000_leven1_v2.tsv","w") as out_file:
    out_file.write("\t".join(COVID_associated_clones))
    for sample_name in clones_in_samples:
        if clones_in_samples[sample_name]=="too_small":
            print(sample_name)
            count+=1
            continue
        else:
            out_file.write("\n")
            out_file.write(str(sample_name)+"\t")
            out_file.write("\t".join([str(clones_in_samples[sample_name][clone]) for clone in COVID_associated_clones]))
            
print(count)





020000330808
020003830808
050000520808
050000710808
050000880808
050000930808
050001200808
050001350808
050001690807
050001700808
050001780808
050001790808
050001820808
050002010808
050002170808
140003720808
140003810808
210000170808
210000620808
210003270808
210003330808
210003340808
250002250808
260002780807
260002860808
260002910808
260002940808
260003000808
260003040807
260004150808
260004200808
260004290808
330001570808
330001670808
330001860808
330001910808
330002550808
440000050808
440000240808
440003270808
460001920808
500000220808
500000930808
500003850808
500003870808
500003990808
500004180808
520000800808
520001220808
540003110808
550001550808
550001860808
550001890808
590001290808
590001680808
590002670808
590002820808
590002830808
590002840808
590004410808
640000670808
640000850808
640002170808
640003040808
640003530808
670002310808
670004740808
670004750808
670004880808
670005040808
670005090808
670005400808
690000060808
690000770808
690000780808
690000970808
690000990808

### (1.2) find fraction of each of these clonotypes without mismatch in each sample 



#### Adaptive

In [None]:
import multiprocessing as mp
import distance

#clones_in_samples={} #store fractions of clones of interes in each sample

treshold=100 #less than this treshold

#----------read sample names----------
samples=pd.read_csv("/home/taumata/COV_AdaptiveBiotech/AB_corr_functional/ncfilter.summary.txt",sep="\t")
samples=list(samples["sample_id"])
#-------------------------------------

def get_clonotypes_frequences(sample_name, COVID_associated_clones = COVID_associated_clones, treshold=100):
    sample_short_name=sample_name.split("_TCRB")[0]
    #print(sample_short_name)
    local_clones_in_samples=dict([(xT,0) for xT in COVID_associated_clones]) #initialize by all zeroes
    #------read clonotype tabel    
    with open("/home/taumata/COV_AdaptiveBiotech/AB_corr_functional/"+str(sample_name),"r") as clonotype_file:
        total_counts=0
        for clone in clonotype_file.readlines()[1:]: #skip header
            clone=clone.split("\t")
            total_counts+=int(clone[0]) #clone[0] is clone count
            if clone[3] in COVID_associated_clones: #exact match check via hash (clone[3] is amino acid CDR3 sequence)
                    local_clones_in_samples[clone[3]]+=float(clone[1])
        if total_counts<treshold: #we will remember names of too small samples
            return (sample_short_name,"too_small")
        return (sample_short_name, local_clones_in_samples)


print("START")    
pool = mp.Pool(30) #SET THE NUMBER OF CORES THAT YOU GOING TO USE
results=pool.map(get_clonotypes_frequences, samples)
pool.close()
clones_in_samples=dict(results)
print("DONE")

In [None]:
count=0
with open("/home/taumata/COV_AdaptiveBiotech/AB_prediction/AB_freq_of_top1000_exact.tsv","w") as out_file:
    out_file.write("\t".join(COVID_associated_clones))
    for sample_name in clones_in_samples:
        if clones_in_samples[sample_name]=="too_small":
            print(sample_name)
            count+=1
            continue
        else:
            out_file.write("\n")
            out_file.write(str(sample_name)+"\t")
            out_file.write("\t".join([str(clones_in_samples[sample_name][clone]) for clone in COVID_associated_clones]))
            
print(count)




#### HIP

In [None]:
import multiprocessing as mp
import distance

#clones_in_samples={} #store fractions of clones of interes in each sample

treshold=100 #less than this treshold

#----------read sample names----------
samples=pd.read_csv("/home/taumata/COV_AdaptiveBiotech/AB_corr_functional/ncfilter.summary.txt",sep="\t")
samples=list(samples["sample_id"])
#-------------------------------------

def get_clonotypes_frequences(sample_name, COVID_associated_clones = COVID_associated_clones, treshold=100):
    sample_short_name=sample_name
    #print(sample_short_name)
    local_clones_in_samples=dict([(xT,0) for xT in COVID_associated_clones]) #initialize by all zeroes
    #------read clonotype tabel    
    with open("/home/taumata/COV_AdaptiveBiotech/AB_corr_functional/"+str(sample_name)+".txt","r") as clonotype_file:
        total_counts=0
        for clone in clonotype_file.readlines()[1:]: #skip header
            clone=clone.split("\t")
            total_counts+=int(clone[0]) #clone[0] is clone count
            if clone[3] in COVID_associated_clones: #exact match check via hash (clone[3] is amino acid CDR3 sequence)
                    local_clones_in_samples[clone[3]]+=float(clone[1])
        if total_counts<treshold: #we will remember names of too small samples
            return (sample_short_name,"too_small")
        return (sample_short_name, local_clones_in_samples)


print("START")    
pool = mp.Pool(30) #SET THE NUMBER OF CORES THAT YOU GOING TO USE
results=pool.map(get_clonotypes_frequences, samples)
pool.close()
clones_in_samples=dict(results)
print("DONE")

In [None]:
count=0
with open("/home/taumata/COV_AdaptiveBiotech/AB_prediction/HIP_freq_of_top1000_exact.tsv","w") as out_file:
    out_file.write("\t".join(COVID_associated_clones))
    for sample_name in clones_in_samples:
        if clones_in_samples[sample_name]=="too_small":
            print(sample_name)
            count+=1
            continue
        else:
            out_file.write("\n")
            out_file.write(str(sample_name)+"\t")
            out_file.write("\t".join([str(clones_in_samples[sample_name][clone]) for clone in COVID_associated_clones]))
            
print(count)





## (2) Use clusters based on TCRnet (?)

### load file with clusters

In [4]:
clusters_TRA={}
clusters_TRA_back={}
clusters_TRB={}
clusters_TRB_back={}
cluster_file=pd.read_csv("clusters_final.txt",sep="\t")
for index, row in cluster_file.loc[cluster_file.chain=="TRA",].iterrows():
    if row["cid"] not in clusters_TRA:
        clusters_TRA[row["cid"]]=set()
    clusters_TRA[row["cid"]].add(row["cdr3aa"])
    clusters_TRA_back[row["cdr3aa"]]=row["cid"]
    
for index, row in cluster_file.loc[cluster_file.chain=="TRB",].iterrows():
    if row["cid"] not in clusters_TRB:
        clusters_TRB[row["cid"]]=set()
    clusters_TRB[row["cid"]].add(row["cdr3aa"])
    clusters_TRB_back[row["cdr3aa"]]=row["cid"]

### check frequencies of clones from clusters in samples

In [23]:
#TRB first
too_small_beta=set()
all_TRB_clusters=list(clusters_TRB.keys())

TRB_samples_clusters={}

TRB_samples=os.listdir("/home/taumata/COV_AdaptiveBiotech/FMBA_vdjtools_functional/")
for TRB_sample in TRB_samples:
    if TRB_sample.startswith("Undetermined"):
        continue
    elif re.search("clonotypes.TRB.txt",TRB_sample):
        sample_name=TRB_sample.split("_S")[0]
        TRB_samples_clusters[sample_name]=dict([(xT,0) for xT in all_TRB_clusters])
        with open("/home/taumata/COV_AdaptiveBiotech/FMBA_vdjtools_functional/"+str(TRB_sample),"r") as clonotypes:
            total_counts=0
            for clone in clonotypes.readlines()[1:]:
                temp=clone.split("\t")
                total_counts+=int(temp[0])
                if temp[3] in clusters_TRB_back:
                    TRB_samples_clusters[sample_name][clusters_TRB_back[temp[3]]]+=float(temp[1])
            if total_counts<100:
                too_small_beta.add(sample_name)


In [24]:
#now TRA
too_small_alpha=set()
all_TRA_clusters=list(clusters_TRA.keys())

TRA_samples_clusters={}

TRA_samples=os.listdir("/home/taumata/COV_AdaptiveBiotech/FMBA_vdjtools_TRA/")
for TRA_sample in TRA_samples:
    if TRA_sample.startswith("Undetermined"):
        continue
    elif re.search("clonotypes.TRA.txt",TRA_sample):
        sample_name=TRA_sample.split("_S")[0]
        TRA_samples_clusters[sample_name]=dict([(xT,0) for xT in all_TRA_clusters])
        with open("/home/taumata/COV_AdaptiveBiotech/FMBA_vdjtools_TRA/"+str(TRA_sample),"r") as clonotypes:
            total_counts=0
            for clone in clonotypes.readlines()[1:]:
                temp=clone.split("\t")
                total_counts+=int(temp[0])
                if temp[3] in clusters_TRA_back:
                    TRA_samples_clusters[sample_name][clusters_TRA_back[temp[3]]]+=float(temp[1])
                if total_counts<100:
                    too_small_alpha.add(sample_name)



In [31]:
print(len(too_small_alpha))
print(len(too_small_beta))
print(len(too_small_alpha|too_small_beta))
print(len(too_small_alpha&too_small_beta))


99
75
120
54


### save results

In [29]:
sample_names=list(TRB_samples_clusters.keys())
data_matrix=[]
for sample_name in sample_names:
    sample_row=[sample_name]
    for TRB in all_TRB_clusters:
        sample_row.append(TRB_samples_clusters[sample_name][TRB])
    for TRA in all_TRA_clusters:
        sample_row.append(TRA_samples_clusters[sample_name][TRA])
    data_matrix.append(sample_row)
sample_clusters=pd.DataFrame(data_matrix, columns=["sample_name"]+list(all_TRB_clusters)+
                            list(all_TRA_clusters))        
    

In [30]:
sample_clusters.to_csv("count_of_clusters_in_samples.tsv",sep="\t",index=False)

### save results only for samples with number of counts greater than treshold (100)

In [33]:
sample_names=set(TRB_samples_clusters.keys())
print(len(sample_names))
sample_names=sample_names-(too_small_alpha&too_small_beta)
sample_names=list(sample_names)
print(len(sample_names))
data_matrix=[]
for sample_name in sample_names:
    sample_row=[sample_name]
    for TRB in all_TRB_clusters:
        sample_row.append(TRB_samples_clusters[sample_name][TRB])
    for TRA in all_TRA_clusters:
        sample_row.append(TRA_samples_clusters[sample_name][TRA])
    data_matrix.append(sample_row)
sample_clusters=pd.DataFrame(data_matrix, columns=["sample_name"]+list(all_TRB_clusters)+
                            list(all_TRA_clusters))        
    

572
518


In [34]:
sample_clusters.to_csv("count_of_clusters_in_samples_filt100.tsv",sep="\t",index=False)

In [None]:
#TO DO:
#Downsample all samples to 10000 clonotypes and repeat work with them