Notebook to create datasets

In [None]:
# Installing packages
#!pip install scanpy[skmisc]
#!pip install tables

In [1]:
from main_functions import *

import os
from pathlib import Path
import pandas as pd
import numpy as np
import scanpy as sc

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Training dataset - Qian 2020, EMTAB8107
#adata_train = create_adata_train('EMTAB8107_2102-Breastcancer_counts','BRCA_EMTAB8107_expression.h5','EMTAB8107_2103-Breastcancer_metadata.csv.gz')

# Save adata to h5ad
#adata_train.write_h5ad('adata_train.h5ad')

In [2]:
# Test dataset - Gao 2021, GSE148673_RAW

# For each .txt.gz file in directory, read in the count matrix and corresponding label
dir_list = Path('../GSE148673_RAW').glob('**/*.txt.gz')
obj_dict = {}

for i, curr_file_path in enumerate(dir_list):
  curr_filename = Path(curr_file_path.name).stem
  base_name = curr_filename.split('_')[0]
  print(curr_filename)
  curr_df = sc.read_text(curr_file_path, dtype = 'str').to_df().transpose()
  curr_df = curr_df.add_prefix(f'{base_name}@', axis=0)
  obj_dict[i] = curr_df
#  display(obj_dict[i].head())
#  print()

# Concatenate counts and labels from all AnnData objects
combined_df_10k = pd.concat(obj_dict.values(), ignore_index=False)
print(combined_df_10k.shape)
display(combined_df_10k.head())

GSM4476486_combined_UMIcount_CellTypes_TNBC1.txt
GSM4476485_combined_UMIcount_CellTypes_DCIS1.txt
GSM4476489_combined_UMIcount_CellTypes_TNBC4.txt
GSM4476490_combined_UMIcount_CellTypes_TNBC5.txt
GSM4476488_combined_UMIcount_CellTypes_TNBC3.txt
GSM4476487_combined_UMIcount_CellTypes_TNBC2.txt
(10424, 45085)


Unnamed: 0,copykat.pred,cluster.pred,RP11-34P13.3,FAM138A,OR4F5,RP11-34P13.7,RP11-34P13.8,RP11-34P13.14,RP11-34P13.9,FO538757.3,...,AP001468.1,AP001469.1,AP001469.2,AP001469.3,AC136352.3,AC136352.2,AC171558.3,BX004987.1,AC145212.1,MAFIP
GSM4476486@AAACCTGCACCTTGTC,T,T,0,0,0,0,0,0,0,0,...,,,,,,,,,,
GSM4476486@AAACGGGAGTCCTCCT,N,N,0,0,0,0,0,0,0,0,...,,,,,,,,,,
GSM4476486@AAACGGGTCCAGAGGA,T,T,0,0,0,0,0,0,0,0,...,,,,,,,,,,
GSM4476486@AAAGATGCAGTTTACG,T,T,0,0,0,0,0,0,0,0,...,,,,,,,,,,
GSM4476486@AAAGCAACAGGAATGC,T,T,0,0,0,0,0,0,0,0,...,,,,,,,,,,


In [None]:
#combined_df_10k.to_hdf('../outputs/GSE148673_test.h5', key = 'combined_df_10k', mode = 'w')

In [3]:
combined_df_10k.to_csv('../outputs/GSE148673_test.csv.gz')

In [None]:
# Separate labels and counts into separate dataframes
labels = ['copykat.pred', 'cluster.pred']
df_labels = combined_df_10k[labels]
df_counts = combined_df_10k[combined_df_10k.columns[~combined_df_10k.columns.isin(labels)]]
display(df_counts.head())

# Create new label column - orig_cancer_labels. Tumor if copykat and cluster agree, Normal otherwise
df_labels.index.name = 'Cell'
df_labels['orig_cancer_label'] = np.where((df_labels['copykat.pred'] == 'T') & \
                                          (df_labels['cluster.pred'] == 'T'),
                                          'Cancer', 'Normal')
df_labels.head()
df_labels.to_csv('../GSE148673_labels.csv')

In [None]:
# Generate anndata object with raw and normalized counts

norm_counts_path = ''

# Read in normalized count matrix as AnnData object
adata_norm = sc.read_10x_h5(norm_counts_path, gex_only = False)

# Load in original labels/metadata
orig_meta = pd.read_csv(orig_labels_path)
orig_meta = orig_meta.set_index('Cell')
orig_meta['orig_cancer_label'] = np.where(orig_meta['CellType'] == 'Cancer', 1, 0)

# Merge original metadata with normalized AnnData from TISCH
adata_norm.obs = pd.merge(adata_norm.obs, orig_meta, left_index = True, right_index = True, how = 'inner')

# Load in raw counts into AnnData object - subset using TISCH cells and genes
# Based on extension, choose appropriate function
# If folder - use sc.read_10x_mtx
# If .txt, use read_text

raw_counts_ann = sc.read_10x_mtx(raw_counts_path, gex_only = False)
raw_counts_ann.obs['in_tisch'] = raw_counts_ann.obs.index.isin(adata_norm.obs_names)
raw_counts_ann.var['in_tisch'] = raw_counts_ann.var.index.isin(adata_norm.var_names)
raw_subset = raw_counts_ann[raw_counts_ann.obs['in_tisch'], raw_counts_ann.var['in_tisch']].copy()

# Add in labels to raw_subset
raw_subset.obs = pd.merge(raw_subset.obs, orig_meta, left_index = True, right_index = True, how = 'inner')

# Ensure shape of both normalized and raw matrices are the same
adata_norm = adata_norm[raw_subset.obs_names, raw_subset.var_names].copy()

# Print number of cells and dataset size
print(f'Dataset has {raw_subset.n_obs} cells and {raw_subset.n_vars} features')
print(f'Size of raw dataset: ')
print_size_in_MB(raw_subset)
print(f'Size of normalized dataset:')
print_size_in_MB(adata_norm)

# Create a single anndata object that contains both raw and normalized layers
adata_all = adata_norm.copy()
adata_all.layers['norm'] = adata_norm.X.copy()

In [None]:
# Dataset 3 - Chi 2020