In [1]:
import scanpy as sc
from scipy import sparse
import numpy as np
import anndata
import pandas as pd
import matplotlib.pyplot as plt
import random

# Seed both random number generators used by this notebook.
# pandas' .sample() draws from NumPy's global generator when no random_state is
# passed, so seeding only the stdlib `random` module leaves the subsampling
# further down unseeded.
random.seed(1432)
np.random.seed(1432)

In [2]:
# Load the AnnData object used throughout the notebook.
# NOTE(review): absolute, machine-specific path; the notebook will not run on another machine as-is.
adata = sc.read_h5ad("/Users/alex/Documents/BIOL0041-Project/OAC_masters_project/data/adata_with_rounded.h5ad")

In [4]:
# Number of cells per 'ident' label.
adata.obs["ident"].value_counts()

T               24998
B                6856
Fibroblast       4772
Myeloid          3055
NK               3013
Endothelial      2781
Plasmablast      2334
Mast             1631
Undetermined     1550
Epithelial        961
Cycling           436
Name: ident, dtype: int64


#### subsample from other categories for normal cell reference

# Deprecated - converting count matrices to sparse -- didn't reduce file size

In [None]:
# Replace X with the result of .todense() so it can be rounded in the next cell.
# NOTE(review): assumes adata.X is a scipy sparse matrix at this point — confirm.
adata.X = adata.X.todense()

In [None]:
# Store a rounded copy of X as the 'rounded_raw_counts' layer.
adata.layers["rounded_raw_counts"] = adata.X.round()

In [None]:
# Convert the rounded layer to CSR sparse storage.
rounded_layer = adata.layers["rounded_raw_counts"]
adata.layers["rounded_raw_counts"] = sparse.csr_matrix(rounded_layer)

In [None]:
# Convert X to CSR sparse storage.
adata.X = sparse.csr_matrix(adata.X)

In [None]:
# Write the object back to disk.
# NOTE(review): this is the same path the notebook loads from at the top, so running this cell overwrites the input file.
adata.write("/Users/alex/Documents/BIOL0041-Project/OAC_masters_project/data/adata_with_rounded.h5ad")

In [None]:
# Display the AnnData summary.
adata

# Data exploration

In [None]:
# Number of cells per 'highLevelType' label.
adata.obs.highLevelType.value_counts()

In [None]:
# Number of cells per 'ident' label.
adata.obs.ident.value_counts()

In [None]:
# Number of cells per 'tissueType' value.
adata.obs.tissueType.value_counts()

In [None]:
# Number of cells per 'orig.ident' value (bracket/.loc access needed: the name contains a dot).
adata.obs.loc[:, 'orig.ident'].value_counts()

# Exporting/subsetting data

In [5]:
# Duplicate the observation index into an ordinary column named 'cell_ids'.
adata.obs['cell_ids'] = adata.obs_names

In [6]:
# Show the new 'cell_ids' column.
adata.obs.cell_ids

AAACCCAAGGAGACCT-1_1      AAACCCAAGGAGACCT-1_1
AAACCCAGTAGACAGC-1_1      AAACCCAGTAGACAGC-1_1
AAACCCAGTAGATCGG-1_1      AAACCCAGTAGATCGG-1_1
AAACCCAGTATCACCA-1_1      AAACCCAGTATCACCA-1_1
AAACCCAGTTGGAGGT-1_1      AAACCCAGTTGGAGGT-1_1
                                 ...          
TTTGGTTCATTGAAGA-1_10    TTTGGTTCATTGAAGA-1_10
TTTGGTTGTTGTCCCT-1_10    TTTGGTTGTTGTCCCT-1_10
TTTGGTTGTTTGACAC-1_10    TTTGGTTGTTTGACAC-1_10
TTTGTTGAGGGTCAAC-1_10    TTTGTTGAGGGTCAAC-1_10
TTTGTTGCATGGAGAC-1_10    TTTGTTGCATGGAGAC-1_10
Name: cell_ids, Length: 52387, dtype: object

## Raw epithelial counts for use w/ SCEVAN

In [7]:
# Number of cells per 'highLevelType' label.
adata.obs.highLevelType.value_counts()

T               24998
B                6856
Fibroblast       4772
Myeloid          3055
NK               3013
Endothelial      2781
Plasmablast      2334
Mast             1631
Undetermined     1550
Epithelial        961
Cycling           436
Name: highLevelType, dtype: int64

In [3]:
# Select the cells whose 'highLevelType' is "Epithelial".
epithelial_adata = adata[adata.obs["highLevelType"].eq("Epithelial")]

In [20]:
# Drop epithelial cells whose tissueType is 'N'.
# .copy() turns the sliced view into an independent AnnData: the following cells
# assign to .layers and .X, and doing that on a view forces an implicit
# view-to-copy conversion (with a warning).
epithelial_adata = epithelial_adata[epithelial_adata.obs['tissueType'] != 'N'].copy()

In [21]:
# Keep the current X available under the 'raw_counts' layer before X is replaced in the next cell.
# NOTE(review): this stores a reference to the same matrix object as X, not a copy.
epithelial_adata.layers['raw_counts'] = epithelial_adata.X

  epithelial_adata.layers['raw_counts'] = epithelial_adata.X


In [22]:
# Make the rounded counts layer the active matrix for export.
epithelial_adata.X = epithelial_adata.layers['rounded_raw_counts']

In [23]:
# Largest value in the active matrix (quick sanity check on the rounded counts).
epithelial_adata.X.max()

23007.0

In [24]:
# Variable (gene) names of the epithelial subset.
epithelial_adata.var_names

Index(['AL627309.1', 'AL669831.5', 'LINC00115', 'FAM41C', 'AL645608.1',
       'SAMD11', 'NOC2L', 'KLHL17', 'PLEKHN1', 'PERM1',
       ...
       'AC004471.1', 'IGLV11-55', 'RNF185-AS1', 'IGLCOR22-2', 'APOL5',
       'AP000322.2', 'LINC00322', 'LINC00319', 'LINC00315', 'LINC00316'],
      dtype='object', length=25815)

In [25]:
# Export the matrix currently held in .X as a genes x cells CSV table.
gene_labels = epithelial_adata.var.index
cell_labels = epithelial_adata.obs.cell_ids
genes_by_cells = epithelial_adata.X.toarray().T  # transpose so that genes become rows
raw_counts = pd.DataFrame(genes_by_cells, index=gene_labels, columns=cell_labels)
raw_counts.to_csv("epi_raw_counts_matrix.csv", sep=",", index=True, header=True)

In [None]:
# Take an independent copy of the per-cell metadata.
# pd.DataFrame(<DataFrame>) does not copy its input by default, so columns
# written to epi_df in the following cells could otherwise end up in
# epithelial_adata.obs as well.
epi_df = pd.DataFrame(epithelial_adata.obs, copy=True)

In [123]:
# Build a combined label by concatenating the 'donor' and 'tumor' columns as strings.
# NOTE(review): the column is named 'donor_chemo' but is built from 'tumor', not 'chemo' — confirm which is intended.
# NOTE(review): 'tumor' is only assigned to obs in the "Integrating SCEVAN output" section further down,
# so this cell depends on that section having been run first.
epi_df['donor_chemo'] = epi_df['donor'].astype(str)+ epi_df['tumor'].astype(str)

In [124]:
# Number of cells per combined label.
epi_df.donor_chemo.value_counts()

Pt2normal      212
Pt7tumor       209
Pt3normal      114
Pt7filtered    108
Pt8normal       42
Pt1tumor        41
Pt6tumor        33
Pt6normal       29
Pt4tumor        26
Pt4filtered     21
Pt5tumor        21
Pt8tumor        14
Pt1normal       13
Pt6filtered     10
Pt2tumor        10
Pt4normal        8
Pt7normal        8
Pt2filtered      8
Pt1filtered      8
Pt5filtered      7
Pt5normal        7
Pt3tumor         7
Pt8filtered      5
Name: donor_chemo, dtype: int64

## Subsampling for healthy reference

In [4]:
# Restrict to cells whose tissueType is 'N'.
temp_adata = adata[adata.obs['tissueType'].eq('N')]

In [5]:
# Cells per 'ident' label within the tissueType 'N' subset.
temp_adata.obs["ident"].value_counts()

B               2141
Endothelial     1702
T               1580
Fibroblast       965
Myeloid          365
Plasmablast      355
Undetermined     299
NK               230
Mast             170
Epithelial        71
Cycling           10
Name: ident, dtype: int64

In [6]:
# Exclude cells labelled 'Cycling'.
temp_adata = temp_adata[temp_adata.obs['ident'].ne('Cycling')]

In [7]:
# Exclude cells labelled "Undetermined".
temp_adata = temp_adata[temp_adata.obs["highLevelType"].ne("Undetermined")]

In [8]:
# Per-cell metadata as a plain table; reset_index() moves the cell IDs into an 'index' column.
normal_obs = pd.DataFrame(temp_adata.obs)
adata_df = normal_obs.reset_index()

In [9]:
# Draw 50 cells per 'highLevelType' group and keep their cell IDs.
# random_state pins the draw: without it the selection changes on every run
# (seeding the stdlib `random` module does not affect pandas sampling).
normal_ref = adata_df.groupby('highLevelType').sample(50, random_state=1432)['index']

In [10]:
# Number of sampled reference cells.
len(normal_ref)

450

In [11]:
# Save the sampled cell IDs (comma is to_csv's default separator).
normal_ref.to_csv("normal_cell_ids.csv")

In [12]:
# Cell IDs of the epithelial subset, and how many there are.
epi_cell_ids = epithelial_adata.obs.index.tolist()
len(epi_cell_ids)

961

In [13]:
# Sampled reference IDs followed by the epithelial IDs.
normal_epi_ids = [*normal_ref, *epi_cell_ids]

In [14]:
# Total number of IDs in the combined list.
len(normal_epi_ids)

1411

## Raw counts w/ healthy cells

In [15]:
# Boolean mask over all cells: True where the cell ID is in the combined list.
mask = adata.obs_names.isin(normal_epi_ids)

In [21]:
# Independent copy of the selected cells, so later assignments do not touch `adata`.
raw_counts_to_export = adata[mask].copy()

In [17]:
# Make the rounded counts layer the active matrix for export.
raw_counts_to_export.X = raw_counts_to_export.layers['rounded_raw_counts']

In [19]:
# Export the active matrix as a genes x cells CSV table.
# The table gets its own name instead of overwriting `raw_counts_to_export`:
# rebinding the AnnData variable to a DataFrame made this cell fail when re-run
# (a DataFrame has no .X / .var / .obs).
raw_counts_export_df = pd.DataFrame(
    raw_counts_to_export.X.toarray().T,  # transpose so that genes become rows
    index=raw_counts_to_export.var.index,
    columns=raw_counts_to_export.obs.index,
)
raw_counts_export_df.to_csv("epi_normal_raw_counts.csv", sep=",", index=True, header=True)

# Integrating SCEVAN output

In [9]:
# Select the cells whose 'ident' is "Epithelial".
# .copy() makes the subset an independent AnnData: the cells below add columns
# to .obs, and writing to a sliced view forces an implicit view-to-copy
# conversion (with a warning).
epithelial_adata = adata[adata.obs["ident"] == "Epithelial"].copy()

In [11]:
# Read the SCEVAN results table. With header=None the file's first row is read as data
# (row label 0) and columns get integer labels; that first row is dropped in the next cell.
# NOTE(review): absolute, machine-specific path.
SCEVAN_results = pd.read_csv('/Users/alex/Documents/BIOL0041-Project/OAC_masters_project/data/SCEVAN_results.csv', header=None)

In [13]:
# Remove the row labelled 0 (the file's first row, read as data because of header=None).
# errors="ignore" keeps the cell safe to re-run: once the row is gone, a plain
# drop(0) raises KeyError.
SCEVAN_results = SCEVAN_results.drop(0, errors="ignore")

In [14]:
# Keep only columns 0, 1 and 3 (used below as cell ID, tumour status and subclone).
SCEVAN_results = SCEVAN_results.loc[:, [0, 1, 3]]

In [19]:
# Create mapping dictionaries: cell ID -> tumoural status, and cell ID -> subclone.
# Built column-wise rather than with `for i in range(1, n+1): SCEVAN_results.loc[i][k]`,
# which only worked while the row labels were exactly 1..n and built a new row
# object on every lookup.

# Cell IDs in the results file use '.' where the AnnData index uses '-'.
scevan_cell_ids = SCEVAN_results[0].str.replace('.', '-', regex=False)

tum_dict = dict(zip(scevan_cell_ids, SCEVAN_results[1]))

# Cells without a subclone (missing value) are assigned subclone 0.
# NOTE(review): non-missing subclone values are kept exactly as parsed from the CSV;
# because the file was read with header=None they are presumably strings, so the
# resulting column may mix string labels with the integer 0 — confirm.
clone_dict = {
    cell_id: 0 if pd.isna(clone) else clone
    for cell_id, clone in zip(scevan_cell_ids, SCEVAN_results[3])
}

print(tum_dict)
print(clone_dict)

{'AAACGCTTCATCTACT-1_1': 'tumor', 'AAAGTGATCAACGCTA-1_1': 'tumor', 'AACCATGTCGAATGCT-1_1': 'tumor', 'AACGGGATCTTCCTAA-1_1': 'tumor', 'AAGGAATAGAAATCCA-1_1': 'normal', 'AAGTTCGAGCCGGATA-1_1': 'tumor', 'AATGGAACATCACGGC-1_1': 'tumor', 'ACCGTTCTCCTATGGA-1_1': 'filtered', 'ACTCTCGTCGTTGTAG-1_1': 'tumor', 'AGCATCACACGGAAGT-1_1': 'tumor', 'ATACCGATCCTTCTTC-1_1': 'normal', 'ATACCTTGTCTGATCA-1_1': 'normal', 'ATCGGATGTCCCTAAA-1_1': 'filtered', 'ATGCCTCAGTGCTCGC-1_1': 'tumor', 'ATTCCATGTAGAATGT-1_1': 'tumor', 'ATTCTTGCATGACCCG-1_1': 'normal', 'ATTTCACGTCCTACGG-1_1': 'normal', 'CACGGGTGTGTAGTGG-1_1': 'filtered', 'CATAAGCGTACGCTAT-1_1': 'tumor', 'CATGAGTTCTCAGTCC-1_1': 'tumor', 'CATGCGGCACATGACT-1_1': 'tumor', 'CCACACTGTGGCTTAT-1_1': 'normal', 'CCTTGTGCAACTGGTT-1_1': 'tumor', 'CCTTTGGGTTGAAGTA-1_1': 'tumor', 'CTAGACACACAGCTTA-1_1': 'tumor', 'CTATAGGCAGTCGTTA-1_1': 'normal', 'CTCATCGCACATTACG-1_1': 'filtered', 'CTCCCTCTCCCAAGTA-1_1': 'tumor', 'CTGCCATAGGGTTAAT-1_1': 'tumor', 'CTTTCGGCACTCCACT-1_1':

In [18]:
# Attach tumoural status and subclone to each cell by looking its ID up in the two dictionaries.
epi_cell_index = epithelial_adata.obs.index
epithelial_adata.obs["tumor"] = epi_cell_index.map(tum_dict)
epithelial_adata.obs["subclone"] = epi_cell_index.map(clone_dict)

  epithelial_adata.obs["tumor"] = epithelial_adata.obs.index.map(tum_dict)


In [20]:
# Cells per 'tumor' value.
epithelial_adata.obs["tumor"].value_counts()

normal      433
tumor       361
filtered    167
Name: tumor, dtype: int64

In [21]:
# Cells per subclone label.
epithelial_adata.obs["subclone"].value_counts()

0    600
1    119
4    101
5     63
2     47
3     18
6     13
Name: subclone, dtype: int64

# Integrating QuieScore output

In [22]:
# Read the G0 scoring table. With header=None the file's first row is read as data (row label 0);
# column names are assigned and that row is dropped in the cells below.
# NOTE(review): absolute, machine-specific path.
G0_results = pd.read_csv('/Users/alex/Documents/BIOL0041-Project/OAC_masters_project/data/G0_scored_tumorCells.csv', header=None)

In [32]:
# Inspect the table.
# NOTE(review): the recorded output already shows named columns and no row 0, i.e. it was
# produced after the two cells below had been run.
print(G0_results)

                   cell_id                G0_up             G0_down  \
1     AAACGCTTCATCTACT.1_1   -0.620118099015498  -0.367043788087705   
2     AAAGTGATCAACGCTA.1_1    -0.21423569170912   -0.11197447228909   
3     AACCATGTCGAATGCT.1_1   -0.131672020641099  -0.185609325374955   
4     AACGGGATCTTCCTAA.1_1  -0.0924287866796572  -0.161304266618039   
5     AAGTTCGAGCCGGATA.1_1    -0.15833299661878  -0.339731122195867   
..                     ...                  ...                 ...   
357   TTACAGGGTAGTCTGT.1_9  0.00428020378771471  -0.366214977347612   
358   TTCCTTCTCGATTGAC.1_9  -0.0802393738551048  -0.529142846919358   
359   TTCTCTCCAACCGTAT.1_9  -0.0746800601007133  -0.447225823320825   
360  AAAGAACGTTGCGTAT.1_10   -0.447551458994797  -0.456122141424817   
361  CGATCGGAGACCGCCT.1_10  -0.0006916509123297  -0.278141666276647   

                    Sample            Prolif_cap             Prolif_z  \
1     AAACGCTTCATCTACT.1_1     0.253074310927793    0.931074044198043   
2

In [29]:
# Give the columns descriptive names (the file was read with header=None).
G0_results.columns = [
    'cell_id',
    'G0_up',
    'G0_down',
    'Sample',
    'Prolif_cap',
    'Prolif_z',
    'split_class',
    'final_class',
]

In [31]:
# Remove the row labelled 0 (the file's first row, read as data because of header=None).
# errors="ignore" keeps the cell safe to re-run: once the row is gone, a plain
# drop(0) raises KeyError.
G0_results = G0_results.drop(0, errors="ignore")

In [33]:
# Relabel the classes as per the lab's paper:
# 'inter' -> 'slow cycling', 'slow' -> 'G0 arrested', 'fast' -> 'fast cycling'.
# Done as one column-wise replace. The previous per-row form,
# `G0_results.loc[i]['final_class'] = ...`, is chained assignment: it writes to a
# temporary row object and only reaches the table when that row happens to be a
# view; it also assumed row labels 1..n.
G0_CLASS_LABELS = {
    'inter': 'slow cycling',
    'slow': 'G0 arrested',
    'fast': 'fast cycling',
}
G0_results['final_class'] = G0_results['final_class'].replace(G0_CLASS_LABELS)

In [None]:
# Cells per relabelled class.
G0_results["final_class"].value_counts()

slow cycling    179
G0 arrested      91
fast cycling     91
Name: final_class, dtype: int64

In [37]:
# Create mapping dictionary: cell ID -> final G0 classification.
# Built column-wise rather than with `for i in range(1, n+1): G0_results.loc[i][...]`,
# which only worked while the row labels were exactly 1..n.

# Cell IDs in the results file use '.' where the AnnData index uses '-'.
g0_cell_ids = G0_results['cell_id'].str.replace('.', '-', regex=False)
G0_dict = dict(zip(g0_cell_ids, G0_results['final_class']))

print(G0_dict)

{'AAACGCTTCATCTACT-1_1': 'G0 arrested', 'AAAGTGATCAACGCTA-1_1': 'slow cycling', 'AACCATGTCGAATGCT-1_1': 'slow cycling', 'AACGGGATCTTCCTAA-1_1': 'slow cycling', 'AAGTTCGAGCCGGATA-1_1': 'slow cycling', 'AATGGAACATCACGGC-1_1': 'fast cycling', 'ACTCTCGTCGTTGTAG-1_1': 'slow cycling', 'AGCATCACACGGAAGT-1_1': 'fast cycling', 'ATGCCTCAGTGCTCGC-1_1': 'fast cycling', 'ATTCCATGTAGAATGT-1_1': 'G0 arrested', 'CATAAGCGTACGCTAT-1_1': 'slow cycling', 'CATGAGTTCTCAGTCC-1_1': 'G0 arrested', 'CATGCGGCACATGACT-1_1': 'G0 arrested', 'CCTTGTGCAACTGGTT-1_1': 'G0 arrested', 'CCTTTGGGTTGAAGTA-1_1': 'slow cycling', 'CTAGACACACAGCTTA-1_1': 'slow cycling', 'CTCCCTCTCCCAAGTA-1_1': 'G0 arrested', 'CTGCCATAGGGTTAAT-1_1': 'slow cycling', 'GAATCGTGTTCTATCT-1_1': 'slow cycling', 'GACCAATTCGCAACAT-1_1': 'slow cycling', 'GACTCAACAAGAGAGA-1_1': 'G0 arrested', 'GAGTCTAAGCAATTCC-1_1': 'slow cycling', 'GCACTAATCACCGGTG-1_1': 'G0 arrested', 'GCAGCCAGTGAGAACC-1_1': 'G0 arrested', 'GCATTAGGTTGCTCAA-1_1': 'slow cycling', 'GTCGTAA

In [38]:
# Attach the G0 class to each cell by ID; IDs missing from G0_dict map to NaN.
epithelial_adata.obs["G0_class"] = epithelial_adata.obs_names.map(G0_dict)

In [39]:
# Cells per G0 class (missing values are not counted by value_counts).
epithelial_adata.obs["G0_class"].value_counts()

slow cycling    179
G0 arrested      91
fast cycling     91
Name: G0_class, dtype: int64

In [40]:
# Display the per-cell metadata, including the newly added tumor, subclone and G0_class columns.
epithelial_adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,sample_ID,donor,chemo,tissueType,dataType,Mandard,percent.mt,...,seurat_clusters,SingleR_hpca_coarse,SingleR_hpca_fine,SingleR_monaco,SingleR_immCellExp,highLevelType,ident,tumor,subclone,G0_class
AAACGCTTCATCTACT-1_1,SeuratProject,1781.981523,1169,s1,Pt1,post,T,scRNA,3,3.177287,...,16,Neurons,Neurons:Schwann_cell,Classical monocytes,"Monocytes, CD14+",Epithelial,Epithelial,tumor,1,G0 arrested
AAAGTGATCAACGCTA-1_1,SeuratProject,5444.449078,1702,s1,Pt1,post,T,scRNA,3,3.376161,...,16,Smooth_muscle_cells,Smooth_muscle_cells:vascular,Intermediate monocytes,"Monocytes, CD14+",Epithelial,Epithelial,tumor,2,slow cycling
AACCATGTCGAATGCT-1_1,SeuratProject,4589.532619,1740,s1,Pt1,post,T,scRNA,3,4.062440,...,16,Neurons,Neurons:Schwann_cell,Non-switched memory B cells,"B cells, naive",Epithelial,Epithelial,tumor,1,slow cycling
AACGGGATCTTCCTAA-1_1,SeuratProject,2711.870050,1204,s1,Pt1,post,T,scRNA,3,3.834255,...,16,Tissue_stem_cells,Tissue_stem_cells:BM_MSC:TGFb3,Classical monocytes,"T cells, CD4+, naive, stimulated",Epithelial,Epithelial,tumor,2,slow cycling
AAGGAATAGAAATCCA-1_1,SeuratProject,8078.330132,2206,s1,Pt1,post,T,scRNA,3,3.262934,...,16,Tissue_stem_cells,Tissue_stem_cells:BM_MSC:BMP2,Non-switched memory B cells,"B cells, naive",Epithelial,Epithelial,normal,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GAAGGACGTGATTCAC-1_10,SeuratProject,9198.930998,3165,s10,Pt8,post,T,scRNA,1,9.162426,...,16,Tissue_stem_cells,Tissue_stem_cells:BM_MSC:TGFb3,Progenitor cells,"Monocytes, CD16+",Epithelial,Epithelial,normal,0,
GAATCACGTTCAAACC-1_10,SeuratProject,4993.783054,1449,s10,Pt8,post,T,scRNA,1,4.111648,...,16,Tissue_stem_cells,Tissue_stem_cells:BM_MSC:BMP2,Myeloid dendritic cells,"Monocytes, CD14+",Epithelial,Epithelial,normal,0,
GACCAATGTATGGGAC-1_10,SeuratProject,21167.510738,3407,s10,Pt8,post,T,scRNA,1,9.597434,...,16,Epithelial_cells,Epithelial_cells:bronchial,Progenitor cells,"Monocytes, CD14+",Epithelial,Epithelial,normal,0,
TCAGTCCCACTGGCGT-1_10,SeuratProject,4916.686930,1302,s10,Pt8,post,T,scRNA,1,8.450583,...,16,Tissue_stem_cells,Tissue_stem_cells:iliac_MSC,Myeloid dendritic cells,"Monocytes, CD14+",Epithelial,Epithelial,normal,0,


In [213]:
# Cells per tissueType in the epithelial subset.
# NOTE(review): the recorded output lists only 'T' (890 cells), yet the nearest assignment of
# epithelial_adata above selects on ident == "Epithelial" with no tissue filter. Presumably a
# tissueType filter was run in a cell that is not in the notebook at this point — confirm,
# otherwise a top-to-bottom run will carry tissueType 'N' cells into the plots below.
epithelial_adata.obs.tissueType.value_counts()

T    890
Name: tissueType, dtype: int64

# Plots

## Epithelial by patient and treatment stage

In [214]:
# Per-cell metadata as a plain table; reset_index() moves the cell IDs into an 'index' column.
epi_obs = pd.DataFrame(epithelial_adata.obs)
epi_df = epi_obs.reset_index()

In [215]:
# Combined donor + chemo-stage label, e.g. 'Pt1' + 'post' -> 'Pt1post'.
donor_labels = epi_df['donor'].astype(str)
chemo_labels = epi_df['chemo'].astype(str)
epi_df['donor_chemo'] = donor_labels + chemo_labels

In [216]:
# Inspect the table (plain-text dump of the whole frame).
print(epi_df)

                     index     orig.ident    nCount_RNA  nFeature_RNA  \
0     AAACGCTTCATCTACT-1_1  SeuratProject   1781.981523          1169   
1     AAAGTGATCAACGCTA-1_1  SeuratProject   5444.449078          1702   
2     AACCATGTCGAATGCT-1_1  SeuratProject   4589.532619          1740   
3     AACGGGATCTTCCTAA-1_1  SeuratProject   2711.870050          1204   
4     AAGGAATAGAAATCCA-1_1  SeuratProject   8078.330132          2206   
..                     ...            ...           ...           ...   
885  GAAGGACGTGATTCAC-1_10  SeuratProject   9198.930998          3165   
886  GAATCACGTTCAAACC-1_10  SeuratProject   4993.783054          1449   
887  GACCAATGTATGGGAC-1_10  SeuratProject  21167.510738          3407   
888  TCAGTCCCACTGGCGT-1_10  SeuratProject   4916.686930          1302   
889  TCATGGATCCACGAAT-1_10  SeuratProject   4336.881505          1582   

    sample_ID donor chemo tissueType dataType  Mandard  ...  \
0          s1   Pt1  post          T    scRNA        3  ... 

In [217]:
# Cells without a G0 class get the label 'normal epithelial'.
# One column-wise fillna replaces the row-by-row loop, which built a row object
# per cell and detected missing values by comparing str(value) to 'nan'.
# NOTE(review): G0_class is missing for every cell absent from the G0 scoring table;
# presumably that includes cells SCEVAN labelled 'filtered' as well as 'normal' —
# confirm that 'normal epithelial' is the intended label for both.
epi_df['G0_class'] = epi_df['G0_class'].fillna('normal epithelial')


In [218]:
# Count cells per (donor_chemo, G0_class) pair.
# NOTE(review): this replaces the per-cell table held in epi_df with the aggregated one.
donor_class_sizes = epi_df.groupby(["donor_chemo", "G0_class"]).size()
epi_df = donor_class_sizes.reset_index(name="nb_by_treatment")

In [219]:
# Total number of cells per donor_chemo label, repeated on every row of that label.
donor_chemo_totals = epi_df.groupby("donor_chemo")["nb_by_treatment"].transform("sum")
epi_df["count_by_condition"] = donor_chemo_totals

In [220]:
# Share of each G0 class within its donor_chemo label, as a percentage.
epi_df["%_by_treatment"] = epi_df["nb_by_treatment"].div(epi_df["count_by_condition"]).mul(100)

In [221]:
# Export for plotting in R (comma separator, index and header are the to_csv defaults).
epi_df.to_csv("epi_by_tumor.csv")

## G0 arrested plots

In [246]:
# Cells per tissueType in the epithelial subset.
epithelial_adata.obs["tissueType"].value_counts()

T    890
Name: tissueType, dtype: int64

In [239]:
# Keep only cells with tumor == "tumor" and take their metadata as a plain table.
tumor_adata = epithelial_adata[epithelial_adata.obs["tumor"] == "tumor"]
# copy=True: pd.DataFrame(<DataFrame>) does not copy its input by default, so the
# column added to tumor_plot_df in the next cell could otherwise be written into
# the obs table of the sliced AnnData.
tumor_plot_df = pd.DataFrame(tumor_adata.obs, copy=True)

### Epithelial by patient and stage

### G0 by patient and chemo stage

In [232]:
# Combined donor + chemo-stage label for the tumour cells.
tumor_donor = tumor_plot_df['donor'].astype(str)
tumor_chemo = tumor_plot_df['chemo'].astype(str)
tumor_plot_df['donor_chemo'] = tumor_donor + tumor_chemo

In [233]:
# Tumour cells per donor + chemo-stage label.
tumor_plot_df.donor_chemo.value_counts()

Pt7pre     199
Pt1post     41
Pt6post     33
Pt4pre      26
Pt5pre      21
Pt2pre      10
Pt3post      7
Pt8post      2
Name: donor_chemo, dtype: int64

In [234]:
# Inspect the per-cell tumour table (plain-text dump of the whole frame).
print(tumor_plot_df)

                          orig.ident   nCount_RNA  nFeature_RNA sample_ID  \
AAACGCTTCATCTACT-1_1   SeuratProject  1781.981523          1169        s1   
AAAGTGATCAACGCTA-1_1   SeuratProject  5444.449078          1702        s1   
AACCATGTCGAATGCT-1_1   SeuratProject  4589.532619          1740        s1   
AACGGGATCTTCCTAA-1_1   SeuratProject  2711.870050          1204        s1   
AAGTTCGAGCCGGATA-1_1   SeuratProject  1978.297462           890        s1   
...                              ...          ...           ...       ...   
TTGCATTGTGGTAACG-1_8   SeuratProject  3402.708304          1089        s8   
TTGTTCATCGACATAC-1_8   SeuratProject  1655.730503           828        s8   
TTTCATGGTACCGTGC-1_8   SeuratProject  1671.330581           730        s8   
AAAGAACGTTGCGTAT-1_10  SeuratProject  3035.604661           720       s10   
CGATCGGAGACCGCCT-1_10  SeuratProject  3193.383325          1175       s10   

                      donor chemo tissueType dataType     Mandard  percent.

In [235]:
# Count tumour cells per (donor, chemo, G0_class) combination.
# NOTE(review): this replaces the per-cell table held in tumor_plot_df with the aggregated one.
donor_stage_sizes = tumor_plot_df.groupby(["donor", "chemo", "G0_class"]).size()
tumor_plot_df = donor_stage_sizes.reset_index(name="nb_by_treatment")

In [236]:
# Total number of tumour cells per donor, repeated on every row of that donor.
# NOTE(review): the counts are keyed by donor AND chemo, but this denominator groups by donor
# only. The two agree while each donor has cells from a single chemo stage; if a donor had
# both 'pre' and 'post' cells they would be pooled here — confirm this is intended.
per_donor_totals = tumor_plot_df.groupby("donor")["nb_by_treatment"].transform("sum")
tumor_plot_df["count_by_condition"] = per_donor_totals

In [237]:
# Share of each G0 class relative to count_by_condition, as a percentage.
tumor_plot_df["%_by_treatment"] = tumor_plot_df["nb_by_treatment"].div(tumor_plot_df["count_by_condition"]).mul(100)

In [238]:
# Export for plotting in R (comma separator, index and header are the to_csv defaults).
tumor_plot_df.to_csv("tumour_plot_data.csv")

## G0 by chemo stage pooled

In [244]:
# Inspect the pooled table.
# NOTE(review): the recorded output has the columns produced by the three cells below
# (chemo, G0_class, nb_by_treatment, ...), so this print was run after them; in
# top-to-bottom order it would show the per-donor table instead.
print(tumor_plot_df)

  chemo      G0_class  nb_by_treatment  count_by_condition  %_by_treatment
0  post   G0 arrested               31                  83       37.349398
1  post  fast cycling               18                  83       21.686747
2  post  slow cycling               34                  83       40.963855
3   pre   G0 arrested               59                 256       23.046875
4   pre  fast cycling               60                 256       23.437500
5   pre  slow cycling              137                 256       53.515625


In [241]:
# Count tumour cells per (chemo, G0_class), pooled across donors.
# Start again from the per-cell metadata of tumor_adata: by this point in the
# notebook tumor_plot_df already holds the per-donor aggregate computed above,
# so grouping it again would count rows of that summary table, not cells.
tumor_plot_df = (
    pd.DataFrame(tumor_adata.obs)
    .groupby(["chemo", "G0_class"])
    .size()
    .reset_index(name="nb_by_treatment")
)

In [242]:
# Total number of tumour cells per chemo stage, repeated on every row of that stage.
per_stage_totals = tumor_plot_df.groupby("chemo")["nb_by_treatment"].transform("sum")
tumor_plot_df["count_by_condition"] = per_stage_totals

In [243]:
# Share of each G0 class within its chemo stage, as a percentage.
tumor_plot_df["%_by_treatment"] = tumor_plot_df["nb_by_treatment"].div(tumor_plot_df["count_by_condition"]).mul(100)

In [245]:
# Export for plotting in R (comma separator, index and header are the to_csv defaults).
tumor_plot_df.to_csv("G0_treatment_plotData.csv")