# Data
Before getting started, let's take a look at the provided data.

## Provided datasets

### GILGFVFTL_data
This dataset contains positive examples of TCR cells (alpha and beta parts; sometimes both, sometimes only one of them) that react with the GILGFVFTL epitope.

In [40]:
# Read the GILGFVFTL positives into a dataframe
import pandas as pd
# NOTE: despite its .tsv extension this file is comma-separated, so it is read
# with the default separator rather than sep='\t'.
gilgfvftl_path = 'data/GILGFVFTL_data.tsv'
df_g = pd.read_csv(gilgfvftl_path)
df_g.head()

Unnamed: 0.1,Unnamed: 0,GeneA,CDR3_alfa,TRAV,TRAJ,MHC A_alfa,Epitope,Score_alfa,GeneB,CDR3_beta,TRBV,TRBJ,MHC A_beta,Epitope.1,Score_beta
0,0,TRA,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,HLA-A*02:01:48,GILGFVFTL,3.0,TRB,CASSSRSSYEQYF,TRBV19*01,TRBJ2-7*01,HLA-A*02:01:48,GILGFVFTL,3.0
1,1,TRA,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,HLA-A*02:01:48,GILGFVFTL,3.0,TRB,CASSSRSSYEQYF,TRBV19*01,TRBJ2-7*01,HLA-A*02:01:48,GILGFVFTL,3.0
2,2,TRA,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,HLA-A*02:01:48,GILGFVFTL,3.0,TRB,CASSSRSSYEQYF,TRBV19*01,TRBJ2-7*01,HLA-A*02:01:48,GILGFVFTL,3.0
3,3,TRA,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,HLA-A*02:01:48,GILGFVFTL,3.0,TRB,CASSSRASYEQYF,TRBV19*01,TRBJ2-7*01,HLA-A*02:01:48,GILGFVFTL,3.0
4,4,TRA,CAGPGGSSNTGKLIF,TRAV35*01,TRAJ37*01,HLA-A*02:01:48,GILGFVFTL,3.0,TRB,CASSLIYPGELFF,TRBV27*01,TRBJ2-2*01,HLA-A*02:01:48,GILGFVFTL,3.0


In [41]:
# Get, per column, the number of distinct values (helps to spot constant columns)
df_g.nunique()

Unnamed: 0    5192
GeneA            1
CDR3_alfa     1093
TRAV            44
TRAJ            50
MHC A_alfa       3
Epitope          1
Score_alfa       3
GeneB            1
CDR3_beta     3473
TRBV            51
TRBJ            13
MHC A_beta       3
Epitope.1        1
Score_beta       4
dtype: int64

In [42]:
# Count the rows that carry an alpha part, i.e. rows where GeneA equals 'TRA'
has_alpha_g = df_g['GeneA'] == 'TRA'
alpha_count_g = int(has_alpha_g.sum())
f"Number of genes containing alpha: {alpha_count_g} ({alpha_count_g/len(df_g)*100:.2f}%)"

'Number of genes containing alpha: 2161 (41.62%)'

In [43]:
# Count the rows that carry a beta part, i.e. rows where GeneB equals 'TRB'
has_beta_g = df_g['GeneB'] == 'TRB'
beta_count_g = int(has_beta_g.sum())
f"Number of genes containing beta: {beta_count_g} ({beta_count_g/len(df_g)*100:.2f}%)"

'Number of genes containing beta: 5190 (99.96%)'

In [44]:
# Columns that are ignored (for now?): the MHC A columns and the confidence
# scores of the alpha and beta parts (Score_alfa / Score_beta).
# NOTE(review): the nunique() output above reports 3 distinct values for both
# MHC A columns, so they are not constant - confirm that dropping them is intended.
ignored_columns = ['MHC A_alfa', 'Score_alfa', 'MHC A_beta', 'Score_beta']
df_g = df_g.drop(columns=ignored_columns)
df_g.head()

Unnamed: 0.1,Unnamed: 0,GeneA,CDR3_alfa,TRAV,TRAJ,Epitope,GeneB,CDR3_beta,TRBV,TRBJ,Epitope.1
0,0,TRA,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,GILGFVFTL,TRB,CASSSRSSYEQYF,TRBV19*01,TRBJ2-7*01,GILGFVFTL
1,1,TRA,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,GILGFVFTL,TRB,CASSSRSSYEQYF,TRBV19*01,TRBJ2-7*01,GILGFVFTL
2,2,TRA,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,GILGFVFTL,TRB,CASSSRSSYEQYF,TRBV19*01,TRBJ2-7*01,GILGFVFTL
3,3,TRA,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,GILGFVFTL,TRB,CASSSRASYEQYF,TRBV19*01,TRBJ2-7*01,GILGFVFTL
4,4,TRA,CAGPGGSSNTGKLIF,TRAV35*01,TRAJ37*01,GILGFVFTL,TRB,CASSLIYPGELFF,TRBV27*01,TRBJ2-2*01,GILGFVFTL


### background
This dataset contains some alpha and beta parts from TCR cells, without any epitope. You can use them as a negative dataset (after filtering out the alphas and betas that also occur in the positive dataset).

In [45]:
import pandas as pd
# Unlike the GILGFVFTL file, background.tsv really is tab-separated,
# so the separator has to be passed explicitly.
background_path = 'data/background.tsv'
df_b = pd.read_csv(background_path, sep='\t')
df_b.head()

Unnamed: 0,CDR3_alfa,TRAV,TRAJ,CDR3_beta,TRBV,TRBJ
0,CAYGSTYNTDKLIF,TRAV38-2DV8,TRAJ34,CASSQEGSGVTDTQYF,TRBV4-3,TRBJ2-3
1,CILRDEGGGADGLTF,TRAV26-2,TRAJ45,CSAREGLAEFNEQFF,TRBV20-1,TRBJ2-1
2,CAGGSGYSTLTF,TRAV12-2,TRAJ11,CASSLGHYGYTF,TRBV12-3,TRBJ1-2
3,CAVRDLIVGANNLFF,TRAV3,TRAJ36,CASSQSFRDDEQYF,TRBV18,TRBJ2-7
4,CATYGGSQGNLIF,TRAV21,TRAJ42,CASSQAVGYNEQFF,TRBV4-1,TRBJ2-1


In [46]:
# Number of distinct values per column of the background data
df_b.nunique()

CDR3_alfa    429396
TRAV             56
TRAJ             56
CDR3_beta    474559
TRBV             61
TRBJ             13
dtype: int64

## Combined Dataset

In [47]:
# The GILGFVFTL rows are the positives; the background rows are the pool from
# which negatives are drawn. This is plain assignment: both names refer to the
# same objects as df_g / df_b, no copy is made.
positive_dataset, negative_dataset = df_g, df_b

In [48]:
# Drop the rows from the negative dataset whose CDR3_alfa or CDR3_beta occurs in the positive dataset.
# TODO: check separately or together? (e.g. alpha in positive, but alpha + beta combination not)
# For now each part is checked separately: one matching part is enough to drop the row.
#
# Missing values are removed from the lookup values first. `isin` treats NaN as
# equal to NaN, so without dropna() a missing part in the positive dataset
# would also remove every background row in which that part is missing.
positive_alphas = positive_dataset['CDR3_alfa'].dropna()
positive_betas = positive_dataset['CDR3_beta'].dropna()
overlaps_positive = (
    negative_dataset['CDR3_alfa'].isin(positive_alphas)
    | negative_dataset['CDR3_beta'].isin(positive_betas)
)
# .copy() makes the result an independent frame: it is modified in place later
# on (rows are dropped from it while sampling), which should not happen on a slice.
negative_dataset = negative_dataset.loc[~overlaps_positive].copy()

In [49]:
# Break df_g down by which parts are present (alpha only, beta only, both,
# neither) and print the size and share of every group.
chain_a = df_g['GeneA'] == 'TRA'
chain_b = df_g['GeneB'] == 'TRB'
total_g = df_g.shape[0]

alpha_only_count_g = int((chain_a & ~chain_b).sum())
beta_only_count_g = int((~chain_a & chain_b).sum())
both_count_g = int((chain_a & chain_b).sum())
non_count_g = int((~chain_a & ~chain_b).sum())

print(f"Number of genes containing alpha only: {alpha_only_count_g} ({alpha_only_count_g/total_g*100:.2f}%)")
print(f"Number of genes containing beta only: {beta_only_count_g} ({beta_only_count_g/total_g*100:.2f}%)")
print(f"Number of genes containing both: {both_count_g} ({both_count_g/total_g*100:.2f}%)")
print(f"Number of genes containing neither: {non_count_g} ({non_count_g/total_g*100:.2f}%)")

Number of genes containing alpha only: 2 (0.04%)
Number of genes containing beta only: 3031 (58.38%)
Number of genes containing both: 2159 (41.58%)
Number of genes containing neither: 0 (0.00%)


In [50]:
def sample_and_drop(df, n, random_state=None):
    """Sample n rows from df and remove them from df in place.

    Args:
        df: DataFrame to sample from. It is modified in place: the sampled
            rows are dropped from it.
        n: Number of rows to sample (without replacement).
        random_state: Optional seed or random generator forwarded to
            DataFrame.sample. Pass a value to make the draw reproducible;
            the default (None) keeps the previous, unseeded behaviour.

    Returns:
        A DataFrame holding the n sampled rows with their original index labels.

    Raises:
        ValueError: If df has duplicate index labels, or (raised by
            DataFrame.sample) if n is larger than the number of rows in df.
    """
    # src: https://stackoverflow.com/questions/39835021/pandas-random-sample-with-remove
    # The sampled rows are removed by index label, so with duplicate labels
    # more rows than were sampled would silently disappear from df.
    if not df.index.is_unique:
        raise ValueError(
            "sample_and_drop requires a unique index; "
            "call reset_index(drop=True) on the dataframe first"
        )
    df_subset = df.sample(n, random_state=random_state)
    df.drop(df_subset.index, inplace=True)
    return df_subset

In [51]:
# Sanity check of sample_and_drop on a small toy dataframe
df_ex = pd.DataFrame({column: list(range(1, 11)) for column in ('a', 'b')})
rows_before = len(df_ex)  # number of rows before sampling
n = 2
df_subset = sample_and_drop(df_ex, n)
rows_after = len(df_ex)  # number of rows left after the sampled rows were removed
# exactly n rows must have been taken out of df_ex
assert rows_before - rows_after == n
df_subset

Unnamed: 0,a,b
8,9,9
9,10,10


In [52]:
# Now sample the same amount of negative examples from the negative dataset,
# one group per group of the positive dataset (alpha only / beta only / both / neither).
alpha_only_rows_b = sample_and_drop(negative_dataset, alpha_only_count_g)
beta_only_rows_b = sample_and_drop(negative_dataset, beta_only_count_g)
both_rows_b = sample_and_drop(negative_dataset, both_count_g)
non_rows_b = sample_and_drop(negative_dataset, non_count_g)

# The sampled rows are full background rows, so every group still has both
# parts filled in. Blank out the part each group is supposed to lack; otherwise
# only positive examples would ever have a missing part, and the missingness
# alone would give away the label.
# NOTE(review): assumes a missing part in the positive dataset shows up as NaN
# in these columns - confirm against the data.
alpha_columns = ['CDR3_alfa', 'TRAV', 'TRAJ']
beta_columns = ['CDR3_beta', 'TRBV', 'TRBJ']
# assign() returns new frames, so the sampled frames are not written to in place
alpha_only_rows_b = alpha_only_rows_b.assign(**{column: float('nan') for column in beta_columns})
beta_only_rows_b = beta_only_rows_b.assign(**{column: float('nan') for column in alpha_columns})
non_rows_b = non_rows_b.assign(**{column: float('nan') for column in alpha_columns + beta_columns})

In [53]:
# Stack the four sampled groups into one negative dataframe
sampled_groups = [alpha_only_rows_b, beta_only_rows_b, both_rows_b, non_rows_b]
negative_dataset_same_proportion = pd.concat(sampled_groups)
# Sanity check: there must be exactly as many negatives as positives
assert len(negative_dataset_same_proportion) == len(positive_dataset)

In [54]:
# Add the label column 'reaction': 1 for positive_dataset, 0 for negative_dataset_same_proportion.
# assign() returns a new dataframe instead of writing into the existing one.
# positive_dataset is the same object as df_g, so an in-place column
# assignment would silently add 'reaction' to df_g as well.
positive_dataset = positive_dataset.assign(reaction=1)
negative_dataset_same_proportion = negative_dataset_same_proportion.assign(reaction=0)

In [55]:
# Restrict the positive dataset to the columns of the negative dataset
# (in the same order), so that both frames line up when they are combined.
shared_columns = list(negative_dataset_same_proportion.columns)
positive_dataset = positive_dataset.loc[:, shared_columns]

In [56]:
# Combine the two datasets.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it the
# positives and the sampled negatives keep their own index labels, which can
# collide and make label-based selection or dropping hit more than one row.
combined_dataset = pd.concat(
    [positive_dataset, negative_dataset_same_proportion],
    ignore_index=True,
)
combined_dataset.head()

Unnamed: 0,CDR3_alfa,TRAV,TRAJ,CDR3_beta,TRBV,TRBJ,reaction
0,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,CASSSRSSYEQYF,TRBV19*01,TRBJ2-7*01,1
1,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,CASSSRSSYEQYF,TRBV19*01,TRBJ2-7*01,1
2,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,CASSSRSSYEQYF,TRBV19*01,TRBJ2-7*01,1
3,CAGAGSQGNLIF,TRAV27*01,TRAJ42*01,CASSSRASYEQYF,TRBV19*01,TRBJ2-7*01,1
4,CAGPGGSSNTGKLIF,TRAV35*01,TRAJ37*01,CASSLIYPGELFF,TRBV27*01,TRBJ2-2*01,1
