Objective: read the input loom file and write stratified train/test split files (X_train, X_test, y_train, y_test) as CSV.

In [1]:
# import libraries
import scanpy as sp
import pandas as pd
import numpy as np
import loompy
from sklearn.model_selection import train_test_split

In [2]:
# Load the loom file with Scanpy. Expect this to take a while: the data set
# contains a large number of cells and the matrix is read densely (sparse=False).
# Open question: should the data set be downsampled first?
# NOTE(review): this is a machine-specific absolute path; adjust it to your local copy.
loom_filepath = "D:/milestone project/SIADS_694_695_Milestone_II/data/cells.loom"
print("Reading loom file . . .")
cells = sp.read_loom(filename=loom_filepath, sparse=False)
print("Complete!")

Reading loom file . . .
Complete!


In [3]:
# Create a DataFrame of the raw counts, with rows labelled by cells.obs_names
# and columns labelled by cells.var_names.
cell_df = pd.DataFrame(cells.X, columns=cells.var_names, index=cells.obs_names)
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
cell_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51770 entries, AAACATACCTGAGT_1 to TTGACTTCATAAAGGT_22
Columns: 22956 entries, FO538757.2 to AC240274.1
dtypes: float32(22956)
memory usage: 4.4+ GB
None


In [4]:
# Extract the `type` annotation from the observation metadata as a one-column
# DataFrame. Bracket indexing is used rather than attribute access so the
# lookup cannot be shadowed by a DataFrame attribute of the same name.
target = pd.DataFrame(cells.obs["type"])
# info() prints its own summary and returns None; calling it bare avoids the
# spurious "None" line that print(target.info()) produced.
target.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51770 entries, AAACATACCTGAGT_1 to TTGACTTCATAAAGGT_22
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    51770 non-null  object
dtypes: object(1)
memory usage: 2.8+ MB
None


In [5]:
# Add the target values to the data frame.
# `cell_df` and `target` are both indexed by the same cell IDs, so assigning the
# column aligns row-for-row. Unlike the previous outer merge, this cell is safe
# to re-run: a second merge would have produced `type_x` / `type_y` columns and
# broken every later `cell_df.type` lookup.
cell_df = cell_df.assign(type=target["type"])

In [6]:
# Take a look at the data; it looks sparse.
# info() prints its summary directly and returns None, so it is not wrapped in print().
cell_df.info()
# A notebook only renders the value of a cell's last expression. The sample was
# previously a bare statement in the middle of the cell, so its result was
# silently discarded; placing it last makes the 20 sampled rows visible.
cell_df.sample(20)

<class 'pandas.core.frame.DataFrame'>
Index: 51770 entries, AAACATACCTGAGT_1 to TTGACTTCATAAAGGT_22
Columns: 22957 entries, FO538757.2 to type
dtypes: float32(22956), object(1)
memory usage: 4.4+ GB
None


In [7]:
# Print every distinct cell-type label, one per line.
for cell_type in cell_df["type"].unique():
    print(cell_type)

CD4 T
CD8 T
B cell
RBC
Tumor
Myeloid
Stromal
?
pDC
Endothelial
NK
Mast
DC
Alveolar
Epithelial


In [8]:
# Discard the cells whose type is the placeholder label '?'.
has_known_type = cell_df["type"] != "?"
cell_df = cell_df[has_known_type]

In [9]:
# Confirm that '?' no longer appears among the labels and check the reduced frame size.
print(cell_df["type"].unique())
# info() prints its own summary and returns None; print(cell_df.info()) would
# add a stray "None" line after it.
cell_df.info()

['CD4 T' 'CD8 T' 'B cell' 'RBC' 'Tumor' 'Myeloid' 'Stromal' 'pDC'
 'Endothelial' 'NK' 'Mast' 'DC' 'Alveolar' 'Epithelial']
<class 'pandas.core.frame.DataFrame'>
Index: 50435 entries, AAACATACCTGAGT_1 to TTGACTTCATAAAGGT_22
Columns: 22957 entries, FO538757.2 to type
dtypes: float32(22956), object(1)
memory usage: 4.3+ GB
None


In [10]:
# Display the label column (one cell type per cell ID).
cell_df.loc[:, "type"]

CellID
AAACATACCTGAGT_1         CD4 T
AAAGAGACATCGTG_1         CD4 T
AACGCATGCTTAGG_1         CD4 T
AACTGTCTTGACAC_1         CD4 T
AAGAATCTCATGAC_1         CD4 T
                        ...   
TGGTTCCCAAACGTGG_22    Stromal
TTAGTTCTCGCGGATC_22    Stromal
TTCGGTCCAGACAAGC_22    Stromal
TTCTCCTCATGTCGAT_22    Stromal
TTGACTTCATAAAGGT_22    Stromal
Name: type, Length: 50435, dtype: object

In [11]:
# Split into train and test sets. train_test_split shuffles by default;
# stratifying on the label makes sure each cell type is represented in both
# partitions, and the fixed random_state makes the split reproducible.
# NOTE(review): the saved output of the following cell (37826 / 12609 rows)
# corresponds to a 25% test split, not the 20% requested here — re-run the
# notebook to refresh the recorded shapes.
labels = cell_df["type"]
X_train, X_test, y_train, y_test = train_test_split(
    cell_df.drop(columns="type"),
    labels,
    test_size=0.20,
    stratify=labels,
    random_state=42,
)

In [12]:
# Report the dimensions of each of the four splits.
for caption, split in (
    ("shape x train", X_train),
    ("shape x test", X_test),
    ("shape y train", y_train),
    ("shape y test", y_test),
):
    print(caption, split.shape)

shape x train (37826, 22956)
shape x test (12609, 22956)
shape y train (37826,)
shape y test (12609,)


In [13]:
# Write the four splits to CSV files for downstream use.
# FYI: the data is NOT normalized at this point. To avoid data leakage,
# normalization should happen after the split — I think either right here or
# in the next step of the pipeline would be an appropriate place.
my_path = "D:/milestone project/SIADS_694_695_Milestone_II/data/"
for csv_name, split_data in (
    ("X_train.csv", X_train),
    ("X_test.csv", X_test),
    ("y_train.csv", y_train),
    ("y_test.csv", y_test),
):
    split_data.to_csv(my_path + csv_name)