Notebook to create datasets

In [1]:
# Installing packages
!pip install scanpy[skmisc]

Defaulting to user installation because normal site-packages is not writeable


In [1]:
# For Colab
from google.colab import drive
drive.mount('/content/drive')

# Set up paths to files
main_dir = '/content/drive/MyDrive/_Course materials/S6 M2-4 699 - Capstone/'
data_path = '/content/drive/MyDrive/_Course materials/S6 M2-4 699 - Capstone/Capstone - Shared folder/Data/'
scripts_path = main_dir + 'scripts/'

# Change into correct directory for importing custom preprocessing function
%cd {main_dir}scripts/Cancer_Prediction_10x
# %pwd

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/_Course materials/S6 M2-4 699 - Capstone/scripts/Cancer_Prediction_10x


In [2]:
from main_functions import *

import os
from pathlib import Path
import pandas as pd
import numpy as np
import scanpy as sc

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Training dataset - Qian 2020, EMTAB8107
#adata_train = create_adata_train('EMTAB8107_2102-Breastcancer_counts','BRCA_EMTAB8107_expression.h5','EMTAB8107_2103-Breastcancer_metadata.csv.gz')

# Save adata to h5ad
#adata_train.write_h5ad('adata_train.h5ad')

In [6]:
# Test dataset - Gao 2021, GSE148673_RAW

# For each .txt.gz file in directory, read in the count matrix and corresponding label
dir_list = Path('../GSE148673_RAW').glob('**/*.txt.gz')
#dir_list = Path(data_path + 'GSE148673_RAW').glob('**/*.txt.gz')
obj_dict = {}

for i, curr_file_path in enumerate(dir_list):
  curr_filename = Path(curr_file_path.name).stem
  base_name = curr_filename.split('_')[0]
  print(curr_filename)
  curr_df = sc.read_text(curr_file_path, dtype = 'str').to_df().transpose()
  curr_df = curr_df.add_prefix(f'{base_name}@', axis=0)
  # Add column for patient number
  curr_df['PatientNumber'] = base_name
  obj_dict[i] = curr_df
#  display(obj_dict[i].head())
#  print()

# Concatenate counts and labels from all AnnData objects
combined_df_10k = pd.concat(obj_dict.values(), ignore_index=False)
print(combined_df_10k.shape)
display(combined_df_10k['PatientNumber'].head())

GSM4476486_combined_UMIcount_CellTypes_TNBC1.txt
GSM4476485_combined_UMIcount_CellTypes_DCIS1.txt
GSM4476489_combined_UMIcount_CellTypes_TNBC4.txt
GSM4476490_combined_UMIcount_CellTypes_TNBC5.txt
GSM4476488_combined_UMIcount_CellTypes_TNBC3.txt
GSM4476487_combined_UMIcount_CellTypes_TNBC2.txt
(10424, 45086)


GSM4476486@AAACCTGCACCTTGTC    GSM4476486
GSM4476486@AAACGGGAGTCCTCCT    GSM4476486
GSM4476486@AAACGGGTCCAGAGGA    GSM4476486
GSM4476486@AAAGATGCAGTTTACG    GSM4476486
GSM4476486@AAAGCAACAGGAATGC    GSM4476486
Name: PatientNumber, dtype: object

In [None]:
#combined_df_10k.to_hdf('../outputs/GSE148673_test.h5', key = 'combined_df_10k', mode = 'w')

In [6]:
combined_df_10k = pd.read_csv('../outputs/GSE148673_test.csv.gz', index_col = 0) #data_path + 'GSE148673_test.csv.gz'
print(combined_df_10k.shape)
display(combined_df_10k.head())

KeyboardInterrupt: 

In [None]:
combined_df_10k.to_csv('../outputs/GSE148673_test.csv.gz')

In [10]:
# Separate labels and counts into separate dataframes
labels = ['copykat.pred', 'cluster.pred', 'PatientNumber']
df_labels = combined_df_10k[labels]
df_counts = combined_df_10k.drop(labels, axis = 1)
display(df_counts.head())
#df_counts.to_csv(data_path + 'GSE148673_test_v2.csv.gz')
#df_counts.to_csv('../outputs/GSE148673_test_v2.csv.gz')

# Create new label column - orig_cancer_labels. Tumor if copykat and cluster agree, Normal otherwise
df_labels.index.name = 'Cell'
df_labels['CellType'] = np.where((df_labels['copykat.pred'] == 'T') & \
                                          (df_labels['cluster.pred'] == 'T'),
                                          'Cancer', 'Normal')
display(df_labels.head())

# Need to add PatientNumber to label

df_labels.to_csv('../GSE148673_labels.csv')
#df_labels.to_csv(data_path + 'GSE148673_labels.csv') #, index = False

Unnamed: 0_level_0,RP11-34P13.3,FAM138A,OR4F5,RP11-34P13.7,RP11-34P13.8,RP11-34P13.14,RP11-34P13.9,FO538757.3,FO538757.2,AP006222.2,...,AP001468.1,AP001469.1,AP001469.2,AP001469.3,AC136352.3,AC136352.2,AC171558.3,BX004987.1,AC145212.1,MAFIP
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GSM4476486@AAACCTGCACCTTGTC,0,0,0,0,0,0,0,0,1,0,...,,,,,,,,,,
GSM4476486@AAACGGGAGTCCTCCT,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
GSM4476486@AAACGGGTCCAGAGGA,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
GSM4476486@AAAGATGCAGTTTACG,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
GSM4476486@AAAGCAACAGGAATGC,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_labels['CellType'] = np.where((df_labels['copykat.pred'] == 'T') & \


Unnamed: 0_level_0,copykat.pred,cluster.pred,PatientNumber,CellType
Cell,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
GSM4476486@AAACCTGCACCTTGTC,T,T,GSM4476486,Cancer
GSM4476486@AAACGGGAGTCCTCCT,N,N,GSM4476486,Normal
GSM4476486@AAACGGGTCCAGAGGA,T,T,GSM4476486,Cancer
GSM4476486@AAAGATGCAGTTTACG,T,T,GSM4476486,Cancer
GSM4476486@AAAGCAACAGGAATGC,T,T,GSM4476486,Cancer


In [3]:
df_labels = pd.read_csv(data_path + 'GSE148673_labels.csv')
display(df_labels.head())
#df_labels.to_csv(data_path + 'GSE148673_labels.csv') #, index = False

Unnamed: 0,Cell,copykat.pred,cluster.pred,CellType
0,GSM4476485@AAACCTGCAGTGACAG,T,T,Cancer
1,GSM4476485@AAACCTGGTCGAGATG,N,N,Normal
2,GSM4476485@AAACCTGTCACCGGGT,N,N,Normal
3,GSM4476485@AAACGGGGTGCACTTA,T,T,Cancer
4,GSM4476485@AAACGGGTCACGGTTA,T,T,Cancer


In [4]:
# Generate anndata object with raw and normalized counts
adata_10k_test = create_adata_train(data_path + 'GSE148673_test.csv.gz',
                               data_path + 'BRCA_GSE148673_expression.h5',
                               data_path + 'GSE148673_labels.csv')
adata_10k_test.write_h5ad(scripts_path + 'Outputs/adata_10k_test_GSE148673.h5ad')

ValueError: could not convert string to float: 'T'

In [None]:
# Dataset 3 - Chi 2020