# Generate background CDR3 regions kmer count from Naive T-cell donors

In [1]:
import itertools
import collections
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Apply seaborn's "whitegrid" style to the plots drawn in this notebook
sns.set_style("whitegrid")

## Functions

Returns a vector with the summed k-mer weight of each cluster (one entry per cluster column of `weights`)

In [3]:
def make_cluster_profile_weighted(line, weights):
    """Return the summed 3-mer weight of every cluster column for one sequence.

    A 3-character window is slid over ``line`` with start positions
    ``1 .. len(line) - 4``, so the first and the last character of ``line``
    are never part of a window. For every window and every column of
    ``weights`` the cell ``weights.loc[window, column]`` is passed through
    ``weight_func`` and accumulated per column.

    Parameters
    ----------
    line : str
        Sequence to profile.
    weights : pandas.DataFrame
        Table looked up by 3-mer row label and cluster column label.

    Returns
    -------
    list
        One accumulated value per column, in ``weights.columns`` order.
        All zeros when ``line`` is too short to contain a window.

    Raises
    ------
    KeyError
        If a window of ``line`` is not a row label of ``weights``.

    Notes
    -----
    ``weight_func`` is not defined in this function; it must exist at module
    level before the function is called.
    """
    kmer_profile = collections.OrderedDict((column, 0) for column in weights.columns)
    for start in range(1, len(line) - 3):
        kmer = line[start:start + 3]
        for column in weights.columns:
            # Label-based lookup; DataFrame.ix was removed in pandas 1.0.
            kmer_profile[column] += weight_func(weights.loc[kmer, column])
    return list(kmer_profile.values())

Returns vector with counts for each cluster

In [15]:
def make_cluster_profile(line, cluster_dict, cluster_n):
    """Count, per cluster, how many 3-mers of ``line`` belong to that cluster.

    Windows start at positions ``1 .. len(line) - 4``, so the first and the
    last character of ``line`` never take part in a 3-mer. Windows that are
    not keys of ``cluster_dict`` are ignored.

    Parameters
    ----------
    line : str
        Sequence to profile.
    cluster_dict : dict
        Maps a 3-mer to its cluster id, given as a string ("1", "2", ...).
    cluster_n : int
        Number of clusters; ids run from "1" to str(cluster_n).

    Returns
    -------
    list of int
        Counts ordered by cluster id 1..cluster_n.

    Raises
    ------
    KeyError
        If ``cluster_dict`` maps a 3-mer to an id outside "1".."cluster_n".
    """
    counts = collections.OrderedDict(
        (str(cluster_id), 0) for cluster_id in range(1, cluster_n + 1)
    )
    for start in range(1, len(line) - 3):
        kmer = line[start:start + 3]
        if kmer in cluster_dict:
            counts[cluster_dict[kmer]] += 1
    return list(counts.values())

Returns vector with count for each kmer

In [13]:
def make_kmer_profile(line, all_kmers):
    """Count how often each k-mer of ``all_kmers`` occurs as a 3-mer of ``line``.

    Windows start at positions ``1 .. len(line) - 4``, so the first and the
    last character of ``line`` never take part in a 3-mer. Windows that are
    not listed in ``all_kmers`` are ignored.

    Parameters
    ----------
    line : str
        Sequence to profile.
    all_kmers : iterable of str
        K-mers to count; defines the order of the returned vector.

    Returns
    -------
    list of int
        One count per k-mer, in the iteration order of ``all_kmers``.
    """
    kmer_profile = collections.OrderedDict((kmer, 0) for kmer in all_kmers)
    for start in range(1, len(line) - 3):
        kmer = line[start:start + 3]
        # Look the window up in the dict built above rather than in
        # ``all_kmers``: that is a hash lookup instead of a linear scan for
        # lists, and it stays correct when ``all_kmers`` is a one-shot
        # iterator or a pandas Series (where ``in`` tests the index labels).
        if kmer in kmer_profile:
            kmer_profile[kmer] += 1
    return list(kmer_profile.values())

## background
Calculation of weights in background:  
for each of the four samples, calculate the mean weights of the classes, then compare them  
Samples: CMV+/CMV-; for each, memory and naive (pos_M, pos_N, neg_M, neg_N)

Reading data from all samples

In [3]:
sample_names=["pos_M", "pos_N", "neg_M", "neg_N"]
sample_paths=["Donor7.CD8.M.1.txt", "Donor7.CD8.N.1.txt", "Donor9.CD8.M.1.txt", "Donor9.CD8.N.1.txt"]
# zip() below would silently drop unmatched entries, so check the pairing first.
assert len(sample_names)==len(sample_paths), "each sample name needs exactly one path"

# Read every sample table, keyed by its name; names and paths are paired
# by position instead of looping over a hard-coded count.
samples={name: pd.read_table(path) for name, path in zip(sample_names, sample_paths)}

In [4]:
# Preview the first rows of the "pos_M" sample table
samples["pos_M"].head()

Unnamed: 0,count,freq,cdr3nt,cdr3aa,v,d,j,VEnd,DStart,DEnd,JStart
0,34875,0.144025,TGCAGTGCTAGCCGGGACGAATGGGGCGGCTACACCTTC,CSASRDEWGGYTF,TRBV20-1,TRBD1,TRBJ1-2,11,13,18,27
1,10747,0.044382,TGTGCCAGCAGCTTAGTGGGGGGGTCCACCGGGGAGCTGTTTTTT,CASSLVGGSTGELFF,TRBV7-8,TRBD2,TRBJ2-2,16,17,24,26
2,10374,0.042842,TGTGCCAGCAGCGTAGGTTCAGGGGGCACGAACGAGCAGTACTTC,CASSVGSGGTNEQYF,TRBV9,TRBD1,TRBJ2-7,16,19,27,31
3,7862,0.032468,TGTGCCAGCAGTTTAGATGATCAGCCCCAGCATTTT,CASSLDDQPQHF,TRBV12-3,.,TRBJ1-5,16,-1,-1,19
4,7510,0.031014,TGCAGTGCTAGATCCTCGGGACTAAACTACAATGAGCAGTTCTTC,CSARSSGLNYNEQFF,TRBV20-1,TRBD2,TRBJ2-1,12,17,24,26


In [5]:
# Pool the two naive samples ("pos_N" and "neg_N") by stacking their rows into one table
naive=pd.concat([samples["pos_N"], samples["neg_N"]])

In [6]:
# Sum the frequency of every distinct CDR3 amino-acid sequence across the
# pooled samples and return "cdr3aa" as a regular column. Doing this in one
# non-mutating statement keeps the cell idempotent: the previous separate
# reset_index(inplace=True) cell added a spurious "index" column when re-run.
naive_unique=naive.groupby(by="cdr3aa")["freq"].sum().reset_index()

In [12]:
# Preview the first rows of naive_unique
naive_unique.head()

Unnamed: 0,cdr3aa,freq
0,CAAAALRNTQYF,7e-06
1,CAAACMG_HVDEQYF,3e-06
2,CAAACPG_ESYEQYF,3e-06
3,CAAAGGAPYEQYF,1.3e-05
4,CAAAGGATNEKLFF,3e-06


In [13]:
# Save the table to CSV without writing the row index
naive_unique.to_csv("naive_unique_cdr3.csv", index=False)

Reading tables with 100 clusters for kmers

In [9]:
# Map each k-mer (first tab-separated column) to its cluster id (second column).
clusters100={}
with open("vdjam_clusters100_ids.txt") as f:
    for raw_line in f:
        # Skip blank lines (e.g. a trailing empty line at the end of the file).
        if not raw_line.strip():
            continue
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would truncate the cluster id when the final line
        # has no newline, and would leave "\r" behind on CRLF files.
        fields=raw_line.rstrip("\r\n").split("\t")
        clusters100[fields[0]]=fields[1]

Calculating vector with presence of each cluster frequency in the bulk data:  
1) count for each cdr3 how many times each class of kmers is present  
2) multiply by the frequency of each cdr3 in the population  
3) sum for each class  
4) normalize by total sum of frequencies

In [16]:
# Per-CDR3 cluster count vectors (one list of 100 counts per sequence) ...
samp_vec=naive_unique["cdr3aa"].apply(
    lambda cdr3: make_cluster_profile(cdr3, clusters100, 100)
)
# ... expanded into a table with one column per cluster, named c1..c100.
samp_vec_df=pd.DataFrame.from_records(
    list(samp_vec),
    columns=["c{}".format(cluster_id) for cluster_id in range(1, 101)],
)

In [17]:
# Preview the first rows of the per-CDR3 cluster count table
samp_vec_df.head()

Unnamed: 0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,...,c91,c92,c93,c94,c95,c96,c97,c98,c99,c100
0,2,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,1,0,0,2,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,2,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [18]:
freqs=naive_unique["freq"].tolist()
# Weight every CDR3's cluster counts by that CDR3's frequency. The product is
# computed once and reused instead of being rebuilt for the normalization term.
weighted_counts=samp_vec_df.mul(freqs, axis=0)
# Frequency-weighted total per cluster.
cluster_totals=weighted_counts.sum(axis=0)
# Normalize by the grand total. It equals the sum of the per-row sums used
# before, but is taken from the vector itself with a vectorized sum.
vec_of_means_clusters_bg=cluster_totals/cluster_totals.sum()

In [20]:
# Sanity check: the vector was divided by its grand total, so it should sum to ~1
sum(vec_of_means_clusters_bg)

0.9999999999995277