In [2]:
import scanpy as sc
from scipy import sparse
import numpy as np
import anndata
import pandas as pd
import matplotlib.pyplot as plt
import random

random.seed(1432)

In [None]:
adata = sc.read_h5ad("/Users/alex/Documents/BIOL0041-Project/OAC_masters_project/data/adata_with_rounded.h5ad")

In [5]:
adata

AnnData object with n_obs × n_vars = 52387 × 25815
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample_ID', 'donor', 'chemo', 'tissueType', 'dataType', 'Mandard', 'percent.mt', 'S.Score', 'G2M.Score', 'Phase', 'old.ident', 'CC.Difference', 'nCount_SCT', 'nFeature_SCT', 'doublet', 'pANN', 'integrated_snn_res.0.7', 'seurat_clusters', 'SingleR_hpca_coarse', 'SingleR_hpca_fine', 'SingleR_monaco', 'SingleR_immCellExp', 'highLevelType', 'ident'
    uns: 'X_name'
    layers: 'logcounts', 'rounded_raw_counts', 'scaledata'

In [None]:
adata.obs.ident.value_counts()

# Data exploration

In [None]:
adata.obs['highLevelType'].value_counts()

In [None]:
adata.obs['ident'].value_counts()

In [None]:
adata.obs['tissueType'].value_counts()

In [None]:
adata.obs['orig.ident'].value_counts()

# Exporting/subsetting data

In [6]:
adata.obs['cell_ids'] = adata.obs.index

In [7]:
adata.obs['cell_ids']

AAACCCAAGGAGACCT-1_1      AAACCCAAGGAGACCT-1_1
AAACCCAGTAGACAGC-1_1      AAACCCAGTAGACAGC-1_1
AAACCCAGTAGATCGG-1_1      AAACCCAGTAGATCGG-1_1
AAACCCAGTATCACCA-1_1      AAACCCAGTATCACCA-1_1
AAACCCAGTTGGAGGT-1_1      AAACCCAGTTGGAGGT-1_1
                                 ...          
TTTGGTTCATTGAAGA-1_10    TTTGGTTCATTGAAGA-1_10
TTTGGTTGTTGTCCCT-1_10    TTTGGTTGTTGTCCCT-1_10
TTTGGTTGTTTGACAC-1_10    TTTGGTTGTTTGACAC-1_10
TTTGTTGAGGGTCAAC-1_10    TTTGTTGAGGGTCAAC-1_10
TTTGTTGCATGGAGAC-1_10    TTTGTTGCATGGAGAC-1_10
Name: cell_ids, Length: 52387, dtype: object

## Raw epithelial counts for use w/ SCEVAN

In [8]:
epithelial_adata = adata[adata.obs["highLevelType"] == "Epithelial"]

In [48]:
epithelial_adata = epithelial_adata[epithelial_adata.obs['tissueType'] != 'N']

In [10]:
epithelial_adata.layers['raw_counts'] = epithelial_adata.X

  epithelial_adata.layers['raw_counts'] = epithelial_adata.X


In [None]:
epithelial_adata.X = epithelial_adata.layers['rounded_raw_counts']

In [None]:
#Extract raw counts from adata.X or adata.layers["raw"]
raw_counts = pd.DataFrame(epithelial_adata.X.toarray().T, index=epithelial_adata.var.index, columns=epithelial_adata.obs.cell_ids)
raw_counts.to_csv("epi_raw_counts_matrix.csv", sep=",", index=True, header=True)

In [None]:
epi_df = pd.DataFrame(epithelial_adata.obs)

In [None]:
epi_df['donor_chemo'] = epi_df['donor'].astype(str)+ epi_df['tumor'].astype(str)

## Subsampling for healthy reference

In [11]:
temp_adata = adata[adata.obs['tissueType'] == 'N']

In [12]:
temp_adata.obs.ident.value_counts()

B               2141
Endothelial     1702
T               1580
Fibroblast       965
Myeloid          365
Plasmablast      355
Undetermined     299
NK               230
Mast             170
Epithelial        71
Cycling           10
Name: ident, dtype: int64

In [13]:
temp_adata = temp_adata[temp_adata.obs['ident'] != 'Cycling']

In [14]:
temp_adata = temp_adata[temp_adata.obs["highLevelType"] != "Undetermined"]

In [15]:
adata_df = pd.DataFrame(temp_adata.obs).reset_index()

In [16]:
normal_ref = adata_df.groupby('highLevelType').sample(50)['index']

In [17]:
len(list(normal_ref))

450

In [18]:
normal_ref.to_csv("normal_cell_ids.csv", sep=",")

In [19]:
epi_cell_ids = list(epithelial_adata.obs.index)
len(epi_cell_ids)

890

In [20]:
normal_epi_ids = list(normal_ref) + epi_cell_ids

In [21]:
len(normal_epi_ids)

1340

## Raw counts w/ healthy cells

In [22]:
mask = adata.obs.index.isin(normal_epi_ids)

In [23]:
raw_counts_to_export = adata[mask].copy()

In [24]:
raw_counts_to_export.X = raw_counts_to_export.layers['rounded_raw_counts']

In [25]:
raw_counts_to_export = pd.DataFrame(raw_counts_to_export.X.toarray().T, index=raw_counts_to_export.var.index, columns=raw_counts_to_export.obs.index)
raw_counts_to_export.to_csv("epi_normal_raw_counts.csv", sep=",", index=True, header=True)

# Integrating SCEVAN output

In [26]:
epithelial_adata = adata[adata.obs["ident"] == "Epithelial"]

In [27]:
SCEVAN_results = pd.read_csv('/Users/alex/Documents/BIOL0041-Project/OAC_masters_project/data/SCEVAN_results.csv', header=None)

In [32]:
SCEVAN_results

Unnamed: 0,0,1,3
1,AAACGCTTCATCTACT.1_1,tumor,1
2,AAAGTGATCAACGCTA.1_1,tumor,2
3,AACCATGTCGAATGCT.1_1,tumor,1
4,AACGGGATCTTCCTAA.1_1,tumor,2
5,AAGGAATAGAAATCCA.1_1,normal,
...,...,...,...
957,GAAGGACGTGATTCAC.1_10,normal,
958,GAATCACGTTCAAACC.1_10,normal,
959,GACCAATGTATGGGAC.1_10,normal,
960,TCAGTCCCACTGGCGT.1_10,normal,


In [29]:
SCEVAN_results = SCEVAN_results.drop(0)

In [31]:
SCEVAN_results = SCEVAN_results[[0,1,3]]

In [33]:
# create mapping dictionary - cell ID matched to tumoural status and subclone (if tumoural)
tum_dict = {}
clone_dict = {}

for i in range(1, len(SCEVAN_results)+1):
    cellID = SCEVAN_results.loc[i][0].replace('.','-')
    tum_type = SCEVAN_results.loc[i][1]

    #non tumoural sublcones set to 0
    if str(SCEVAN_results.loc[i][3]) == 'nan':
        subclone = 0
    else:
        subclone = SCEVAN_results.loc[i][3]
    #print(SCEVAN_results.loc[i][0])
    #print(SCEVAN_results.loc[i][1])
    tum_dict[cellID] = tum_type
    clone_dict[cellID] = subclone


print(tum_dict)
print(clone_dict)

{'AAACGCTTCATCTACT-1_1': 'tumor', 'AAAGTGATCAACGCTA-1_1': 'tumor', 'AACCATGTCGAATGCT-1_1': 'tumor', 'AACGGGATCTTCCTAA-1_1': 'tumor', 'AAGGAATAGAAATCCA-1_1': 'normal', 'AAGTTCGAGCCGGATA-1_1': 'tumor', 'AATGGAACATCACGGC-1_1': 'tumor', 'ACCGTTCTCCTATGGA-1_1': 'filtered', 'ACTCTCGTCGTTGTAG-1_1': 'tumor', 'AGCATCACACGGAAGT-1_1': 'tumor', 'ATACCGATCCTTCTTC-1_1': 'normal', 'ATACCTTGTCTGATCA-1_1': 'normal', 'ATCGGATGTCCCTAAA-1_1': 'filtered', 'ATGCCTCAGTGCTCGC-1_1': 'tumor', 'ATTCCATGTAGAATGT-1_1': 'tumor', 'ATTCTTGCATGACCCG-1_1': 'normal', 'ATTTCACGTCCTACGG-1_1': 'normal', 'CACGGGTGTGTAGTGG-1_1': 'filtered', 'CATAAGCGTACGCTAT-1_1': 'tumor', 'CATGAGTTCTCAGTCC-1_1': 'tumor', 'CATGCGGCACATGACT-1_1': 'tumor', 'CCACACTGTGGCTTAT-1_1': 'normal', 'CCTTGTGCAACTGGTT-1_1': 'tumor', 'CCTTTGGGTTGAAGTA-1_1': 'tumor', 'CTAGACACACAGCTTA-1_1': 'tumor', 'CTATAGGCAGTCGTTA-1_1': 'normal', 'CTCATCGCACATTACG-1_1': 'filtered', 'CTCCCTCTCCCAAGTA-1_1': 'tumor', 'CTGCCATAGGGTTAAT-1_1': 'tumor', 'CTTTCGGCACTCCACT-1_1':

In [34]:
epithelial_adata.obs["tumor"] = epithelial_adata.obs.index.map(tum_dict)
#epithelial_adata.obs["subclone"] = epithelial_adata.obs.index.map(clone_dict)

  epithelial_adata.obs["tumor"] = epithelial_adata.obs.index.map(tum_dict)


In [35]:
epithelial_adata.obs.tumor.value_counts()

normal      433
tumor       361
filtered    167
Name: tumor, dtype: int64

In [None]:
epithelial_adata.obs.subclone.value_counts()

# Integrating QuieScore output

In [36]:
G0_results = pd.read_csv('/Users/alex/Documents/BIOL0041-Project/OAC_masters_project/data/G0_scored_tumorCells.csv', header=None)

In [40]:
print(G0_results)

                   cell_id                G0_up             G0_down  \
1     AAACGCTTCATCTACT.1_1   -0.620118099015498  -0.367043788087705   
2     AAAGTGATCAACGCTA.1_1    -0.21423569170912   -0.11197447228909   
3     AACCATGTCGAATGCT.1_1   -0.131672020641099  -0.185609325374955   
4     AACGGGATCTTCCTAA.1_1  -0.0924287866796572  -0.161304266618039   
5     AAGTTCGAGCCGGATA.1_1    -0.15833299661878  -0.339731122195867   
..                     ...                  ...                 ...   
357   TTACAGGGTAGTCTGT.1_9  0.00428020378771471  -0.366214977347612   
358   TTCCTTCTCGATTGAC.1_9  -0.0802393738551048  -0.529142846919358   
359   TTCTCTCCAACCGTAT.1_9  -0.0746800601007133  -0.447225823320825   
360  AAAGAACGTTGCGTAT.1_10   -0.447551458994797  -0.456122141424817   
361  CGATCGGAGACCGCCT.1_10  -0.0006916509123297  -0.278141666276647   

                    Sample            Prolif_cap             Prolif_z  \
1     AAACGCTTCATCTACT.1_1     0.253074310927793    0.931074044198043   
2

In [38]:
G0_results.columns = ['cell_id', 'G0_up', 'G0_down', 'Sample', 'Prolif_cap', 'Prolif_z', 'split_class', 'final_class']

In [39]:
G0_results = G0_results.drop(0)

In [42]:
G0_results.final_class.value_counts()

cycling         179
fast cycling     91
G0 arrested      91
Name: final_class, dtype: int64

In [43]:
# create mapping dictionary - cell ID matched to final G0 classification
G0_dict = {}

for i in range(1, len(G0_results)+1):
    cellID = G0_results.loc[i]['cell_id'].replace('.','-')
    G0_class = G0_results.loc[i]['final_class']

    G0_dict[cellID] = G0_class

print(G0_dict)

{'AAACGCTTCATCTACT-1_1': 'fast cycling', 'AAAGTGATCAACGCTA-1_1': 'cycling', 'AACCATGTCGAATGCT-1_1': 'cycling', 'AACGGGATCTTCCTAA-1_1': 'cycling', 'AAGTTCGAGCCGGATA-1_1': 'cycling', 'AATGGAACATCACGGC-1_1': 'G0 arrested', 'ACTCTCGTCGTTGTAG-1_1': 'cycling', 'AGCATCACACGGAAGT-1_1': 'G0 arrested', 'ATGCCTCAGTGCTCGC-1_1': 'G0 arrested', 'ATTCCATGTAGAATGT-1_1': 'fast cycling', 'CATAAGCGTACGCTAT-1_1': 'cycling', 'CATGAGTTCTCAGTCC-1_1': 'fast cycling', 'CATGCGGCACATGACT-1_1': 'fast cycling', 'CCTTGTGCAACTGGTT-1_1': 'fast cycling', 'CCTTTGGGTTGAAGTA-1_1': 'cycling', 'CTAGACACACAGCTTA-1_1': 'cycling', 'CTCCCTCTCCCAAGTA-1_1': 'fast cycling', 'CTGCCATAGGGTTAAT-1_1': 'cycling', 'GAATCGTGTTCTATCT-1_1': 'cycling', 'GACCAATTCGCAACAT-1_1': 'cycling', 'GACTCAACAAGAGAGA-1_1': 'fast cycling', 'GAGTCTAAGCAATTCC-1_1': 'cycling', 'GCACTAATCACCGGTG-1_1': 'fast cycling', 'GCAGCCAGTGAGAACC-1_1': 'fast cycling', 'GCATTAGGTTGCTCAA-1_1': 'cycling', 'GTCGTAAAGATCCCGC-1_1': 'fast cycling', 'GTCTAGAGTTGCGTAT-1_1': 'fa

In [44]:
epithelial_adata.obs["G0_class"] = epithelial_adata.obs.index.map(G0_dict)

In [45]:
epithelial_adata.obs.G0_class.value_counts()

cycling         179
fast cycling     91
G0 arrested      91
Name: G0_class, dtype: int64

In [49]:
epithelial_adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,sample_ID,donor,chemo,tissueType,dataType,Mandard,percent.mt,...,seurat_clusters,SingleR_hpca_coarse,SingleR_hpca_fine,SingleR_monaco,SingleR_immCellExp,highLevelType,ident,cell_ids,tumor,G0_class
AAACGCTTCATCTACT-1_1,SeuratProject,1781.981523,1169,s1,Pt1,post,T,scRNA,3,3.177287,...,16,Neurons,Neurons:Schwann_cell,Classical monocytes,"Monocytes, CD14+",Epithelial,Epithelial,AAACGCTTCATCTACT-1_1,tumor,fast cycling
AAAGTGATCAACGCTA-1_1,SeuratProject,5444.449078,1702,s1,Pt1,post,T,scRNA,3,3.376161,...,16,Smooth_muscle_cells,Smooth_muscle_cells:vascular,Intermediate monocytes,"Monocytes, CD14+",Epithelial,Epithelial,AAAGTGATCAACGCTA-1_1,tumor,cycling
AACCATGTCGAATGCT-1_1,SeuratProject,4589.532619,1740,s1,Pt1,post,T,scRNA,3,4.062440,...,16,Neurons,Neurons:Schwann_cell,Non-switched memory B cells,"B cells, naive",Epithelial,Epithelial,AACCATGTCGAATGCT-1_1,tumor,cycling
AACGGGATCTTCCTAA-1_1,SeuratProject,2711.870050,1204,s1,Pt1,post,T,scRNA,3,3.834255,...,16,Tissue_stem_cells,Tissue_stem_cells:BM_MSC:TGFb3,Classical monocytes,"T cells, CD4+, naive, stimulated",Epithelial,Epithelial,AACGGGATCTTCCTAA-1_1,tumor,cycling
AAGGAATAGAAATCCA-1_1,SeuratProject,8078.330132,2206,s1,Pt1,post,T,scRNA,3,3.262934,...,16,Tissue_stem_cells,Tissue_stem_cells:BM_MSC:BMP2,Non-switched memory B cells,"B cells, naive",Epithelial,Epithelial,AAGGAATAGAAATCCA-1_1,normal,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
GAAGGACGTGATTCAC-1_10,SeuratProject,9198.930998,3165,s10,Pt8,post,T,scRNA,1,9.162426,...,16,Tissue_stem_cells,Tissue_stem_cells:BM_MSC:TGFb3,Progenitor cells,"Monocytes, CD16+",Epithelial,Epithelial,GAAGGACGTGATTCAC-1_10,normal,
GAATCACGTTCAAACC-1_10,SeuratProject,4993.783054,1449,s10,Pt8,post,T,scRNA,1,4.111648,...,16,Tissue_stem_cells,Tissue_stem_cells:BM_MSC:BMP2,Myeloid dendritic cells,"Monocytes, CD14+",Epithelial,Epithelial,GAATCACGTTCAAACC-1_10,normal,
GACCAATGTATGGGAC-1_10,SeuratProject,21167.510738,3407,s10,Pt8,post,T,scRNA,1,9.597434,...,16,Epithelial_cells,Epithelial_cells:bronchial,Progenitor cells,"Monocytes, CD14+",Epithelial,Epithelial,GACCAATGTATGGGAC-1_10,normal,
TCAGTCCCACTGGCGT-1_10,SeuratProject,4916.686930,1302,s10,Pt8,post,T,scRNA,1,8.450583,...,16,Tissue_stem_cells,Tissue_stem_cells:iliac_MSC,Myeloid dendritic cells,"Monocytes, CD14+",Epithelial,Epithelial,TCAGTCCCACTGGCGT-1_10,normal,


In [50]:
epithelial_adata.obs.tissueType.value_counts()

T    890
Name: tissueType, dtype: int64

# Plots

## Epithelial by patient and treatment stage

In [None]:
epi_df = pd.DataFrame(epithelial_adata.obs).reset_index()

In [None]:
epi_df['donor_chemo'] = epi_df['donor'].astype(str)+ epi_df['chemo'].astype(str)

In [None]:
print(epi_df)

In [None]:
for i in range(0, len(epi_df)):
    G0 = epi_df.loc[i]['G0_class']

    if str(G0) == 'nan':
        epi_df.at[i,'G0_class'] = 'normal epithelial'


In [None]:
epi_df = epi_df.groupby(["donor_chemo", "G0_class"]).size().reset_index(name="nb_by_treatment")

In [None]:
epi_df["count_by_condition"] = epi_df.groupby("donor_chemo")["nb_by_treatment"].transform("sum")

In [None]:
epi_df["%_by_treatment"] = (epi_df["nb_by_treatment"] / epi_df["count_by_condition"]) * 100

In [None]:
#export for plotting in R
epi_df.to_csv("epi_by_tumor.csv", sep=",", index=True, header=True)

## G0 arrested plots

In [None]:
epithelial_adata.obs.tissueType.value_counts()

In [None]:
tumor_adata = epithelial_adata[epithelial_adata.obs["tumor"] == "tumor"]
tumor_plot_df = pd.DataFrame(tumor_adata.obs)

### Epithelial by patient and stage

### G0 by patient and chemo stage

In [None]:
tumor_plot_df['donor_chemo'] = tumor_plot_df['donor'].astype(str) + tumor_plot_df['chemo'].astype(str)

In [None]:
tumor_plot_df['donor_chemo'].value_counts()

In [None]:
print(tumor_plot_df)

In [None]:
tumor_plot_df = tumor_plot_df.groupby(["donor", "chemo", "G0_class"]).size().reset_index(name="nb_by_treatment")

In [None]:
tumor_plot_df["count_by_condition"] = tumor_plot_df.groupby("donor")["nb_by_treatment"].transform("sum")

In [None]:
tumor_plot_df["%_by_treatment"] = (tumor_plot_df["nb_by_treatment"] / tumor_plot_df["count_by_condition"]) * 100

In [None]:
#export for plotting in R
tumor_plot_df.to_csv("tumour_plot_data.csv", sep=",", index=True, header=True)

## G0 by chemo stage pooled

In [None]:
print(tumor_plot_df)

In [None]:
tumor_plot_df = tumor_plot_df.groupby(["chemo", "G0_class"]).size().reset_index(name="nb_by_treatment")

In [None]:
tumor_plot_df["count_by_condition"] = tumor_plot_df.groupby("chemo")["nb_by_treatment"].transform("sum")

In [None]:
tumor_plot_df["%_by_treatment"] = (tumor_plot_df["nb_by_treatment"] / tumor_plot_df["count_by_condition"]) * 100

In [None]:
#export for plotting in R
tumor_plot_df.to_csv("G0_treatment_plotData.csv", sep=",", index=True, header=True)

# pyDESEQ2

In [51]:
tumor_adata = epithelial_adata[epithelial_adata.obs['tumor'] == 'tumor']

In [52]:
tumor_adata

View of AnnData object with n_obs × n_vars = 339 × 25815
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'sample_ID', 'donor', 'chemo', 'tissueType', 'dataType', 'Mandard', 'percent.mt', 'S.Score', 'G2M.Score', 'Phase', 'old.ident', 'CC.Difference', 'nCount_SCT', 'nFeature_SCT', 'doublet', 'pANN', 'integrated_snn_res.0.7', 'seurat_clusters', 'SingleR_hpca_coarse', 'SingleR_hpca_fine', 'SingleR_monaco', 'SingleR_immCellExp', 'highLevelType', 'ident', 'cell_ids', 'tumor', 'G0_class'
    uns: 'X_name'
    layers: 'logcounts', 'rounded_raw_counts', 'scaledata'

In [53]:
tumor_adata.write("tum_adata_G0.h5ad")

  df[key] = c
  df[key] = c


## SCEVAN no subclone