In [2]:
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
#from Bio import SeqIO
from glob import glob
import os

from mito.genotyping import nucleotide_mutation_prob, mutation_prob, COUNTS_COLUMNS


# Plotting style
sns.set_style('white')
sns.set_context('notebook')
#pd.set_option('max_rows', 1000)
#pd.set_option('max_columns', 100)

def plot_style(figsize=(12, 6), labelsize=20, titlesize=24, ticklabelsize=14, **kwargs):
   basic_style = {
       'figure.figsize': figsize,
       'axes.labelsize': labelsize,
       'axes.titlesize': titlesize,
       'xtick.labelsize': ticklabelsize,
       'ytick.labelsize': ticklabelsize,
       'axes.spines.top': False,
       'axes.spines.right': False,
       'axes.spines.left': False,
       'axes.grid': False,
       'axes.grid.axis': 'y',
   }
   basic_style.update(kwargs)
   return plt.rc_context(rc=basic_style)

blue = sns.xkcd_rgb['ocean blue']

In [3]:
import scanpy as sc

# Load metadata and expression matrix and combine them in a neat AnnData object

## Load cell metadata

In [4]:
def get_meta(experiment=None, filename='', report=True, invivo=True, bulks=False):
    df = pd.read_csv(filename, sep=',', index_col=1, header=0, low_memory=False)
    print(f'Data loaded,  {df.shape[0]} samples.')
    df = df.loc[df['Sample_ID'].notnull()] # Only keep cells with sample ID.
    print('Discarding samples without Sample ID.  {} samples remaining.'.format(len(df)))

    df['Project_ID'] = [sid[:5] for sid in df.index] 
    if experiment:
        df = df[ df['Project_ID'] == experiment] # Only cells from the experiment.
        print('{} samples found in experiment {}.'.format(len(df), experiment))
    else:
        print('{} samples found in experiments {}.'.format(len(df), 
                                                          list(df['Project_ID'].unique())))
    df = df.loc[df['Clone_ID'].notnull()] # Only keep cells with Clone ID.
    print('Discarding samples without Clone ID.  {} samples remaining.'.format(len(df)))
    if invivo:
        df = df.loc[df['Condition'] == 'In_Vivo'] 
        print('Discarding samples that are not in-vivo.  {} samples remaining'.format(len(df)))
    df.drop('Sample_ID', axis=1, inplace=True)
    df.index = [cn[:-1] for cn in df.index]
    if bulks:
        df = df[ df['Cell_Number'] > 1] # Include only bulks.
        print('Discarding single cells.  {} bulks remaining'.format(len(df)))
    else:
        df = df[ df['Cell_Number'] == 1] # Get rid of bulks.
        print('Discarding bulks.  {} single cells remaining'.format(len(df)))
    if report:
        if bulks:
            print('{} bulks found in experiment {}'.format(len(df), experiment))
        else:
            print('{} single cells found in experiment.'.format(len(df)))
        clones = df['Clone_ID'].unique()
        print('{} Clones: {}'.format(len(clones), ', '.join(clones)))
        print('The first five rows of the dataframe are below')
        display(df[:5])
    return df

In [5]:
meta_df = get_meta(filename = '../../data/YFV2001_gene_expression/metadata_marty_NFCORE_Sept2019.csv')
cells = list(meta_df.index)

Data loaded,  2364 samples.
Discarding samples without Sample ID.  2345 samples remaining.
2345 samples found in experiments ['P1299', 'P1902', 'P3128'].
Discarding samples without Clone ID.  1978 samples remaining.
Discarding samples that are not in-vivo.  1236 samples remaining
Discarding bulks.  1236 single cells remaining
1236 single cells found in experiment.
179 Clones: 0, 153, 156, 176, 159, 322, 160, 162, 164, 130, 150, 167, 132, 244, 170, 112, 188, 107, 547, 174, 222, 177, 178, 121, 149, 134, 183, 193, 595, 189, 192, 194, 289, 291, 293, 294, 301, 280, 356, 109, 304, 307, 131, 308, 224, 312, 313, 316, 321, 325, 326, 225, 329, 287, 277, 391, 2, 389, 428, 15, 410, 226, 137, 414, 282, 419, 420, 456, 241, 459, 122, 382, 283, 477, 255, 264, 98, 95, 499, 501, 142, 509, 104, 100, 364, 504, 111, 129, 240, 369, 376, 92, 94, 508, 353, 242, 118, 615, 230, 515, 248, 234, 127, 525, 228, 247, 359, 387, 383, 97, 147, 106, 140, 259, 548, 552, 115, 281, 102, 143, 55, 57, 59, 74, 60, 61, 64, 67,

Unnamed: 0,Condition,Cell_Number,Clone_ID,Day,In_Vivo_Clone_ID,Project_ID
P1299_1097,In_Vivo,1.0,0,15.0,,P1299
P1299_1098,In_Vivo,1.0,0,15.0,,P1299
P1299_1099,In_Vivo,1.0,153,15.0,,P1299
P1299_1100,In_Vivo,1.0,0,15.0,,P1299
P1299_1101,In_Vivo,1.0,153,15.0,,P1299


## Loading gene metadata

Now we load relevant information about the genes. We've downloaded the [GRCh37 human gene data set](https://grch37.ensembl.org/index.html) set from the biomart.

In [6]:
def get_genes(filename):
    df = pd.read_csv(filename, sep='\t',
                     index_col=0, header=0, low_memory=False)
    df.columns = ['Transcript_length', 'Gene_name', 'GC_content','Transcript stable ID', 'TSS']
    print('{} genes found in {}.'.format(len(df), filename))
    return df

In [7]:
gene_raw = get_genes('../../data/YFV2001_gene_expression/GRCh37_Biomart.txt')
gene_raw.head()

215404 genes found in ../../data/YFV2001_gene_expression/GRCh37_Biomart.txt.


Unnamed: 0_level_0,Transcript_length,Gene_name,GC_content,Transcript stable ID,TSS
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSG00000261657,2673,SLC25A26,40.0,ENST00000566782,66119285
ENSG00000261657,1096,SLC25A26,40.0,ENST00000562780,66320895
ENSG00000261657,1840,SLC25A26,40.0,ENST00000569579,66320895
ENSG00000261657,3405,SLC25A26,40.0,ENST00000568242,66320895
ENSG00000261657,375,SLC25A26,40.0,ENST00000565530,66339287


In [8]:
gene_lengths = gene_raw['Transcript_length'].groupby(gene_raw.index).mean()
gene_GCs = gene_raw['GC_content'].groupby(gene_raw.index).mean()
gene_df = pd.DataFrame(gene_lengths, index=gene_lengths.index)
gene_df['GC_content'] = gene_GCs
gene_df['Gene_name'] = gene_raw['Gene_name'].groupby(gene_raw.index).first()
print('{} distinct Ensembl gene IDs found.'.format(len(gene_df)))
gene_df.head()

63677 distinct Ensembl gene IDs found.


Unnamed: 0_level_0,Transcript_length,GC_content,Gene_name
Gene stable ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSG00000000003,1350.333333,40.87,TSPAN6
ENSG00000000005,940.5,40.8,TNMD
ENSG00000000419,974.714286,39.85,DPM1
ENSG00000000457,3274.4,40.14,SCYL3
ENSG00000000460,2197.2,39.22,C1orf112


In [9]:
genes = list(gene_df.index)
gene_names = gene_df['Gene_name']
print('{} ENSG gene ids loaded with {} unique gene names.'.format(len(genes), len(gene_names.unique())))
gene_names.head()

63677 ENSG gene ids loaded with 56638 unique gene names.


Gene stable ID
ENSG00000000003      TSPAN6
ENSG00000000005        TNMD
ENSG00000000419        DPM1
ENSG00000000457       SCYL3
ENSG00000000460    C1orf112
Name: Gene_name, dtype: object

## Load expression counts

Now we load the expression counts.  This might take a minute.

In [10]:
def get_counts(filename, gdf):
    df = pd.read_csv(filename, sep='\t',
                     index_col=0, header=0, low_memory=False)
    df = df[df.columns[1:]]
    cols = list(df.columns)
    cols = ['_'.join(col.split('_')[3:5]) for col in cols]
    df.columns = cols
    return df

In [11]:
foldername = '../../data/YFV2001_gene_expression/NFCore_Joanna_Mapped_Sept2019/'
counts_df = get_counts(foldername+'P1299_merged_gene_counts.txt', gene_df)
counts_df = counts_df.join(get_counts(foldername+'P1902_merged_gene_counts.txt', gene_df), how='left')
counts_df = counts_df.join(get_counts(foldername+'P3128_merged_gene_counts.txt', gene_df), how='left')
counts_df = counts_df[cells]
counts_df.head()

Unnamed: 0_level_0,P1299_1097,P1299_1098,P1299_1099,P1299_1100,P1299_1101,P1299_1102,P1299_1103,P1299_1104,P1299_1105,P1299_1107,...,P3128_1182,P3128_1183,P3128_1184,P3128_1185,P3128_1186,P3128_1187,P3128_1188,P3128_1189,P3128_1190,P3128_1191
Geneid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000223972,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000227232,0,0,0,1,0,0,1,1,0,0,...,0,14,3,0,0,0,0,6,0,18
ENSG00000243485,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000237613,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000268020,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We check to make sure that the genes in the count-dataframe are also found in our gene metadata.

In [12]:
genes = [gene for gene in counts_df.index if gene in gene_df.index]
print('{} genes contained in counts and also in gene metadata.'.format(len(genes)))
gene_diff = len(counts_df) - len(genes)
if gene_diff == 0:
    emoticon = ':)'
else:
    emoticon = ':('
print('{} genes have been lost {}'.format(gene_diff, emoticon))

counts_df = counts_df.loc[genes]
gene_df = gene_df.loc[genes]

63677 genes contained in counts and also in gene metadata.
0 genes have been lost :)


## Removing T-cell receptor genes

If desired, we remove TCR genes right away, so they don't confuse things later on.  We load these genes from a file, and drop all the suspect ENSG IDs.

In [13]:
TR_df = pd.read_csv('../../data/YFV2001_gene_expression/TR_genelist.tsv', sep='\t')
TR_symbols = TR_df['Approved symbol'].unique()
TR_ENSG = gene_names[gene_names.isin(TR_symbols)].index # Work from symbols, not ENSG here.
print('{} TCRs will be excluded.'.format(len(TR_ENSG)))

291 TCRs will be excluded.


In [14]:
TR_drops = [gene for gene in TR_ENSG if gene in counts_df.index]
print('Dropping {} TCR genes.'.format(len(TR_drops)))
counts_df.drop(TR_drops, axis=0, inplace=True)

Dropping 291 TCR genes.


In [15]:
for greek in ['A','B','G','J']:
    leftgenes = [gene for gene in gene_names[counts_df.index].values if gene[:2] == 'TR'+greek]
    print('{} genes remaining that begin with TR{}'.format(len(leftgenes), greek))

0 genes remaining that begin with TRA
0 genes remaining that begin with TRB
0 genes remaining that begin with TRG
0 genes remaining that begin with TRJ


## Loading mutation data

### Add memory score to metadata

In [16]:
clones_map = pd.read_csv('../../data/YFV2001_gene_expression/YFV2001_metadata_with_tree_id.csv')
# Remove bulk
clones_map = clones_map[clones_map.tree_id != 's0'].copy()

In [17]:
clones_map.shape

(373, 6)

In [18]:
meta_df.reset_index().rename({'index': 'cell_id'}, axis=1).head()

Unnamed: 0,cell_id,Condition,Cell_Number,Clone_ID,Day,In_Vivo_Clone_ID,Project_ID
0,P1299_1097,In_Vivo,1.0,0,15.0,,P1299
1,P1299_1098,In_Vivo,1.0,0,15.0,,P1299
2,P1299_1099,In_Vivo,1.0,153,15.0,,P1299
3,P1299_1100,In_Vivo,1.0,0,15.0,,P1299
4,P1299_1101,In_Vivo,1.0,153,15.0,,P1299


In [19]:
meta_df.shape

(1236, 6)

In [20]:
meta_df = meta_df.reset_index().rename({'index': 'cell_id'}, axis=1).merge(
    clones_map[['cell_id', 'mem_score']], 
    how='left', 
    on='cell_id',
    validate='1:1',
)

In [21]:
meta_df.shape

(1236, 8)

## Add number of mutations on tree

In [22]:
cells_info = pd.read_csv('../../data/YFV2001_gene_expression/YFV2001_202105_metadata_with_tree_mutations.csv')
cells_info.head()

Unnamed: 0,cell_id,clone_id,day,n_mutations_tree
0,P1299_1099,153,15.0,7
1,P1299_1101,153,15.0,6
2,P1299_1103,156,15.0,11
3,P1299_1105,176,15.0,4
4,P1299_1108,159,15.0,3


In [23]:
meta_df = meta_df.merge(
    cells_info[['cell_id', 'n_mutations_tree']], 
    how='left', 
    on='cell_id',
    validate='1:1',
)

In [24]:
meta_df.head()

Unnamed: 0,cell_id,Condition,Cell_Number,Clone_ID,Day,In_Vivo_Clone_ID,Project_ID,mem_score,n_mutations_tree
0,P1299_1097,In_Vivo,1.0,0,15.0,,P1299,,
1,P1299_1098,In_Vivo,1.0,0,15.0,,P1299,,
2,P1299_1099,In_Vivo,1.0,153,15.0,,P1299,,7.0
3,P1299_1100,In_Vivo,1.0,0,15.0,,P1299,,
4,P1299_1101,In_Vivo,1.0,153,15.0,,P1299,,6.0


In [25]:
meta_df.shape

(1236, 9)

## Loading things into ScanPy

Here we move our data into ScanPy's AnnData structure.  See [the AnnData docs](https://anndata.readthedocs.io/en/latest/index.html) for more.

In [26]:
import scanpy as sc
sc.settings.verbosity = 1             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.1 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.0 pandas==1.5.3 scikit-learn==1.2.1 statsmodels==0.13.5 python-igraph==0.10.3 pynndescent==0.5.8


In [27]:
meta_df.head()

Unnamed: 0,cell_id,Condition,Cell_Number,Clone_ID,Day,In_Vivo_Clone_ID,Project_ID,mem_score,n_mutations_tree
0,P1299_1097,In_Vivo,1.0,0,15.0,,P1299,,
1,P1299_1098,In_Vivo,1.0,0,15.0,,P1299,,
2,P1299_1099,In_Vivo,1.0,153,15.0,,P1299,,7.0
3,P1299_1100,In_Vivo,1.0,0,15.0,,P1299,,
4,P1299_1101,In_Vivo,1.0,153,15.0,,P1299,,6.0


In [28]:
## Expression matrix
adata = sc.AnnData(counts_df.T) # Plunk it into an AnnData structure.

## Gene metadata
adata.var['transcript_length'] = gene_df['Transcript_length']
adata.var['ENSG_ID'] = adata.var.index
adata.var['plain_name'] = gene_names[adata.var.index]
adata.var.index = adata.var['plain_name'] # Reindex by gene name
adata.var_names_make_unique() # Use common gene names, but uniquify them.
adata.var.index.name = 'unique_name'

## Cell metadata.
meta_df_idx = meta_df.set_index('cell_id')
adata.obs['Day'] = meta_df_idx['Day'].astype(int)
adata.obs['Project_ID'] = meta_df_idx['Project_ID']+'_Day_'+adata.obs['Day'].astype(str)
adata.obs['Clone_ID'] = meta_df_idx['Clone_ID']
adata.obs['mem_score'] = meta_df_idx['mem_score']
adata.obs['n_mutations_tree'] = meta_df_idx['n_mutations_tree']
adata.obs

adata

  adata = sc.AnnData(counts_df.T) # Plunk it into an AnnData structure.


AnnData object with n_obs × n_vars = 1236 × 63386
    obs: 'Day', 'Project_ID', 'Clone_ID', 'mem_score', 'n_mutations_tree'
    var: 'transcript_length', 'ENSG_ID', 'plain_name'

In [32]:
adata.write('../../data/YFV2001_gene_expression/results/YFV2001_230127_scanpy_data.h5ad')