Notebook to create datasets

In [None]:
# Installing packages
#!pip install scanpy[skmisc]

In [None]:
# For Colab
from google.colab import drive
drive.mount('/content/drive')

# Set up paths to files
main_dir = '/content/drive/MyDrive/_Course materials/S6 M2-4 699 - Capstone/'
data_path = '/content/drive/MyDrive/_Course materials/S6 M2-4 699 - Capstone/Capstone - Shared folder/Data/'
scripts_path = main_dir + 'scripts/'

# Change into correct directory for importing custom preprocessing function
%cd {main_dir}scripts/Cancer_Prediction_10x
# %pwd

In [1]:
from main_functions import *

import os
from pathlib import Path
import pandas as pd
import numpy as np
import scanpy as sc

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Training dataset - Qian 2020, EMTAB8107
#adata_train = create_adata_train('EMTAB8107_2102-Breastcancer_counts','BRCA_EMTAB8107_expression.h5','EMTAB8107_2103-Breastcancer_metadata.csv.gz')

# Save adata to h5ad
#adata_train.write_h5ad('adata_train.h5ad')

In [None]:
# Test dataset - Gao 2021, GSE148673_RAW

# For each .txt.gz file in the directory, read in the table (values are read as
# strings), transpose it, and tag every row with the sample it came from.
# The paths are sorted because glob does not guarantee an order; without the
# sort the row order of the combined table could differ from run to run.
raw_dir = Path('../GSE148673_RAW')
#raw_dir = Path(data_path + 'GSE148673_RAW')
dir_list = sorted(raw_dir.glob('**/*.txt.gz'))
if not dir_list:
  # Fail early with a clear message instead of pandas' "No objects to concatenate"
  raise FileNotFoundError(f'No .txt.gz files found under {raw_dir}')
obj_dict = {}

for i, curr_file_path in enumerate(dir_list):
  # 'NAME_xxx.txt.gz' -> stem 'NAME_xxx.txt' -> base_name 'NAME'
  curr_filename = curr_file_path.stem
  base_name = curr_filename.split('_')[0]
  print(curr_filename)
  curr_df = sc.read_text(curr_file_path, dtype = 'str').to_df().transpose()
  # Prefix the row labels with the sample name so they stay unique after concatenation
  curr_df = curr_df.add_prefix(f'{base_name}@', axis=0)
  # Add column for patient number
  curr_df['PatientNumber'] = base_name
  obj_dict[i] = curr_df
#  display(obj_dict[i].head())
#  print()

# Concatenate the per-file dataframes (counts and labels) into one table,
# keeping the prefixed row labels as the index
combined_df_10k = pd.concat(obj_dict.values(), ignore_index=False)
print(combined_df_10k.shape)
#display(combined_df_10k['PatientNumber'].head())

In [None]:
#combined_df_10k.to_hdf('../outputs/GSE148673_test.h5', key = 'combined_df_10k', mode = 'w')

In [None]:
# Reload the combined test table from the compressed CSV, using its first
# column as the row index.
# Alternative location: data_path + 'GSE148673_test.csv.gz'
combined_df_10k = pd.read_csv(
    '../outputs/GSE148673_test.csv.gz',
    index_col=0,
)
print(combined_df_10k.shape)
display(combined_df_10k.head())

In [None]:
# Persist the combined test table as a gzip-compressed CSV (row index included)
combined_df_10k.to_csv(
    '../outputs/GSE148673_test.csv.gz',
    compression='gzip',
)

In [None]:
# Separate labels and counts into separate dataframes
labels = ['copykat.pred', 'cluster.pred', 'PatientNumber']
df_counts = combined_df_10k.drop(columns=labels)
display(df_counts.head())
#df_counts.to_csv(data_path + 'GSE148673_test_v2.csv.gz')
#df_counts.to_csv('../outputs/GSE148673_test_v2.csv.gz')

# Take an explicit copy of the label columns and name its index 'Cell'.
# Working on a copy avoids SettingWithCopyWarning when the new column is added,
# and rename_axis avoids renaming an Index object that may be shared with
# combined_df_10k.
df_labels = combined_df_10k[labels].copy().rename_axis('Cell')

# Create new label column - CellType. 'Cancer' if copykat and cluster both
# predict 'T', 'Normal' otherwise
is_tumor = (df_labels['copykat.pred'] == 'T') & (df_labels['cluster.pred'] == 'T')
df_labels['CellType'] = np.where(is_tumor, 'Cancer', 'Normal')
display(df_labels.head())

# Need to add PatientNumber to label

#df_labels.to_csv('../GSE148673_labels.csv')
#df_labels.to_csv(data_path + 'GSE148673_labels.csv') #, index = False

In [None]:
# Load the saved label table from the shared data folder and preview it
labels_csv_path = data_path + 'GSE148673_labels.csv'
df_labels = pd.read_csv(labels_csv_path)
display(df_labels.head())
#df_labels.to_csv(data_path + 'GSE148673_labels.csv') #, index = False

In [None]:
# Generate anndata object with raw and normalized counts
adata_10k_test = create_adata_train(data_path + 'GSE148673_test.csv.gz',
                               data_path + 'BRCA_GSE148673_expression.h5',
                               data_path + 'GSE148673_labels.csv')
adata_10k_test.write_h5ad(scripts_path + 'Outputs/adata_10k_test_GSE148673.h5ad')

In [2]:
# Great Lakes
adata_10k_test = create_adata_train('../outputs/GSE148673_test.csv.gz',
                               '../BRCA_GSE148673_expression.h5',
                               '../GSE148673_labels.csv')
adata_10k_test.write_h5ad('../outputs/adata_10k_test_GSE148673.h5ad')

raw_counts_df:


Unnamed: 0,RP11-34P13.3,FAM138A,OR4F5,RP11-34P13.7,RP11-34P13.8,RP11-34P13.14,RP11-34P13.9,FO538757.3,FO538757.2,AP006222.2,...,AP001468.1,AP001469.1,AP001469.2,AP001469.3,AC136352.3,AC136352.2,AC171558.3,BX004987.1,AC145212.1,MAFIP
GSM4476486@AAACCTGCACCTTGTC,0.0,0,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,,,,,,,,,,
GSM4476486@AAACGGGAGTCCTCCT,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
GSM4476486@AAACGGGTCCAGAGGA,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
GSM4476486@AAAGATGCAGTTTACG,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
GSM4476486@AAAGCAACAGGAATGC,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,




raw_counts_ann: AnnData object with n_obs × n_vars = 10424 × 45083
Dataset has 10359 cells and 19203 features
Size of raw dataset: 
Size: 1596.7 MB
Size of normalized dataset:
Size: 315.68 MB


In [None]:
# Create AnnData object directly from dataframes.
# NOTE(review): if df_labels was reloaded from CSV without an index column, the
# cell IDs presumably sit in a 'Cell' column and the index is positional. In
# that case move them back to the index so obs lines up with the rows of
# df_counts; otherwise df_labels is used unchanged.
obs_labels = df_labels.set_index('Cell') if 'Cell' in df_labels.columns else df_labels
adata_10k_test = sc.AnnData(df_counts, obs = obs_labels)

# Only the last expression of a cell is rendered automatically, so the object
# summary is displayed explicitly before the preview of .var
display(adata_10k_test)
adata_10k_test.var.head()

In [None]:
# Dataset 3 - Chi 2020