In [2]:
import os
import random
import scanpy as sc
import argparse
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Dense, Softmax
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from scipy.stats import pearsonr
from scipy.optimize import nnls
from scipy import sparse
from joblib import dump
from joblib import load
from sklearn.metrics import confusion_matrix
from sklearn.inspection import permutation_importance
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
import scipy.sparse as sp

Using TensorFlow backend.


In [3]:
# Feature Selection:
# Removal of uninformative genes that have either 0 expression or expression variance below 0.1. (10k genes for training left)
# Don't care about dataset issues. Our job is just to generate bulk RNA-seq data

#Metrics for comparison: RMSE, Pearson's correlation coefficient, the slope + intercept of the regression fitted for ground-truth and predicted cell fractions, and Lin's concordance correlation coefficient
#Measure its failure to generalize

#NOTE: making simulated data more similar to bulk data can improve performance

#Preprocessing:
#Corresponding cell-gene matrices, filtered for cells with less than 500 detected genes and genes expressed in less than five cells
#Count matrix was filtered for outliers w/ high or low numbers of counts
#Gene expression was normalized to library size via scanpy.normalize_per_cell
#Data was saved
#Cell type labels were reassigned to make it consistent across datasets
#Important: preprocessed scRNAseq data was used (gene expression matrix and cell type labels)
#How do we plan to keep within-subject relationships?
#Take rand of whole list of cell types
#list must add to 1
#Multiply list by total number of cells per sample to get cells per type
#Sample each number of cells from their appropriate cell type (Nc cells of cell type c)
#Overcount if necessary
#Take all results and add them together to get aggregated cell expressions

#Also create "Sparse Samples" to be samples missing cell types or having a bias as to certain cell types
#First, randomly choose a certain number of cell types and drop them. Then, proceed as above

#Low expression valued genes were removed
#data transformed into log space
# Add 1 and take log, base 2
# scale using MinMaxScaler

In [None]:
#What I need to do:
#1. Have expression vals
#2. Need to filter outlier cells
#3. Normalize to library too
#4. Save data
#5. Load data
#6. Generate lists of random numbers per sample we want
#7. Sample num of cells from appropriate cell type
#8. Aggregate expression values. Store as one input data
#9. Have sparse samples to create sparse ones too
#10. Save data
#11. Data transform into log space then scale

In [11]:
# Load the bulk cfRNA-seq feature counts and turn them into a log-CPM feature
# matrix with one column per gene (alphabetical) and one row per replicate.
# The input file is read as two tab-separated columns without a header:
# gene name, count.
# Make sure to adjust the path to the file containing your data
cfrna_data = pd.read_csv('./Dataset/arp3_protein_coding_feature_counts.txt',
                         sep='\t', header=None, names=['gene_names', 'counts'])

# Convert counts to float
cfrna_data['counts'] = cfrna_data['counts'].astype(float)

# Normalize the cfRNAseq data: counts-per-million, then natural log(1 + x)
cpm = cfrna_data['counts'] / cfrna_data['counts'].sum() * 1e6
log_cpm = np.log1p(cpm)

# Create a DataFrame with gene names and normalized expression values
cfrna_df = pd.DataFrame({'GeneName': cfrna_data['gene_names'], 'Expression': log_cpm})

# Set the GeneName column as the index of cfRNA_df
cfrna_df = cfrna_df.set_index('GeneName')
print("Contents of cfrna_data:\n", cfrna_df)

# Transpose to match X (genes become columns, the single sample becomes a row)
cfrna_df = cfrna_df.transpose()

# Alphabetically ordering the genes
cfrna_df = cfrna_df.sort_index(axis=1)

# Replicate the single sample row so the frame has n_cfrna_replicates identical
# rows. DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# frame (row labels are kept, so every row is still labelled 'Expression').
n_cfrna_replicates = 1000
cfrna_df = pd.concat([cfrna_df] * n_cfrna_replicates)
print("Contents of cfrna_df:\n", cfrna_df)


Contents of cfrna_data:
           Expression
GeneName            
OR4F5       0.000000
OR4F29      0.000000
OR4F16      0.000000
SAMD11      3.119614
NOC2L       4.175462
...              ...
MT-ND4L     5.746842
MT-ND4      8.235097
MT-ND5      8.278623
MT-ND6      7.389071
MT-CYB      8.207368

[20012 rows x 1 columns]
Contents of cfrna_df:
 GeneName        A1BG      A1CF       A2M     A2ML1   A3GALT2    A4GALT  \
Expression  2.273285  3.943163  4.693073  4.905578  2.150275  3.131951   
Expression  2.273285  3.943163  4.693073  4.905578  2.150275  3.131951   
Expression  2.273285  3.943163  4.693073  4.905578  2.150275  3.131951   
Expression  2.273285  3.943163  4.693073  4.905578  2.150275  3.131951   
Expression  2.273285  3.943163  4.693073  4.905578  2.150275  3.131951   
...              ...       ...       ...       ...       ...       ...   
Expression  2.273285  3.943163  4.693073  4.905578  2.150275  3.131951   
Expression  2.273285  3.943163  4.693073  4.905578  2.150275 

In [16]:
# LOAD SCRNA-SEQ DATA
# The .h5ad file is opened in backed read-only mode (backed='r').
print("Loading scRNA-seq data...")
adata = sc.read_h5ad("./Dataset/TabulaSapiens.h5ad", backed='r')
print("Loaded scRNAseq data")

# Show the AnnData summary
print(adata)

Loading scRNA-seq data...


KeyboardInterrupt: 

In [None]:
# PREPROCESS INTERSECTED SCRNASEQ DATASET USING THE PROCESSED EXPRESSION VALUES, NOT RAW COUNTS --> SKIP IF USING RAW COUNTS
# NOT USEFUL FOR SIMULATION
#
# Expects `scrnaseq_data` (an AnnData object) to already exist in the kernel;
# it is not created in this cell.
# NOTE: normalize_total / log1p modify `scrnaseq_data` in place, so this cell
# is not idempotent -- re-running it without reloading normalizes twice.
print("Normalizing and filtering scRNA-seq data...")

# NORMALIZE SC_RNASEQ DATA
print("Before normalization and filtering:", scrnaseq_data.shape)
sc.pp.normalize_total(scrnaseq_data, target_sum=1e6)
print("After normalization:", scrnaseq_data.shape)
sc.pp.log1p(scrnaseq_data)
print("After logarithmization:", scrnaseq_data.shape)

# FILTER SC_RNASEQ DATA
print("\nFiltering scRNAseq data")

scrnaseq_data.raw = scrnaseq_data
min_genes = 500
max_mito = 0.05

# Match the mitochondrial prefix case-insensitively: human gene symbols use
# 'MT-' (e.g. MT-ND4L), so a lowercase-only 'mt-' test selects no genes and
# silently turns the mito filter into a no-op.
mito_genes = scrnaseq_data.var_names.str.upper().str.startswith('MT-')

# Row sums of a scipy sparse matrix come back as an (n, 1) np.matrix; flatten
# them to 1-D arrays so they are valid .obs columns. Works for dense X as well.
# NOTE(review): these sums are taken on the normalized, log-transformed values,
# not on raw counts -- confirm that this is the intended definition.
mito_total = np.asarray(scrnaseq_data[:, mito_genes].X.sum(axis=1)).ravel()
cell_total = np.asarray(scrnaseq_data.X.sum(axis=1)).ravel()
scrnaseq_data.obs['percent_mito'] = mito_total / cell_total
scrnaseq_data.obs['n_genes'] = np.asarray((scrnaseq_data.X > 0).sum(axis=1)).ravel()

# Keep cells passing both thresholds. Copy so later in-place scanpy calls work
# on a real AnnData object instead of a view.
keep_cells = (
    (scrnaseq_data.obs['n_genes'] > min_genes)
    & (scrnaseq_data.obs['percent_mito'] < max_mito)
)
scrnaseq_data = scrnaseq_data[keep_cells.values, :].copy()

print("Filtered scRNAseq data")
print("After filtering:", scrnaseq_data.shape)

print("\nPerforming feature selection...")

# FEATURE SELECTION
print("Before feature selection:", scrnaseq_data.shape)
sc.pp.highly_variable_genes(scrnaseq_data, n_top_genes=6000)
# Copy the subset: sc.pp.scale below modifies the object in place.
scrnaseq_data_fs = scrnaseq_data[:, scrnaseq_data.var['highly_variable']].copy()

# Use scanpy's integrated scaling algorithm (values are clipped at max_value)
sc.pp.scale(scrnaseq_data_fs, max_value=10)
print("Any NaN in adata after scaling:", np.isnan(scrnaseq_data_fs.X).any())
# Store the means and standard deviations for applying to the cfRNA data
gene_means = scrnaseq_data_fs.var['mean']
gene_stds = scrnaseq_data_fs.var['std']

print("After feature selection:", scrnaseq_data_fs.shape)

In [None]:
# PREPARE INTERSECTED PREPROCESSED SCRNASEQ DATA FOR SIMULATION --> USES RAW COUNTS, NOT PROCESSED EXPRESSION DATA
# Builds `expression_data`: one row per cell, a leading 'cell' column holding
# the cell type label, followed by one column per gene.
# Expects `scrnaseq_data` (an AnnData object) to already exist in the kernel.
print("Preparing input data...")

# Per-cell annotations taken from the 'cell_ontology_class' obs column
cell_type_labels = scrnaseq_data.obs['cell_ontology_class']
print("Cell type labels extracted.")
print("\nUnique cell_type_labels labels:\n", cell_type_labels.unique(), sep='\n')

# Densify the expression matrix if it is stored sparse
if sparse.issparse(scrnaseq_data.X):
    dense_X = scrnaseq_data.X.toarray()
else:
    dense_X = scrnaseq_data.X

# Wrap the matrix in a DataFrame whose columns are the gene names
expression_data = pd.DataFrame(dense_X, columns=scrnaseq_data.var_names)
print("Converted scrnaseq_data.X to DataFrame.")
print("\nContents of expression_data before adding cluster labels:\n", expression_data.head())

# Re-index the labels onto expression_data's row index so they align row-for-row
cell_type_labels = pd.Series(
    cell_type_labels.values,
    index=expression_data.index,
    name='cell_type',
)
print("--> Converted cluster labels to a regular series.")
print("\n--> Added cell labels to scRNAseq dataframe")

# Print the full (untruncated) table of cells per type
label_counts = cell_type_labels.value_counts()
with pd.option_context('display.max_rows', None):
    print("\nAll unique cell ontology class counts in the scRNAseq data:\n\n", label_counts)

# Put the labels in front of the gene columns as column 'cell'
expression_data.insert(0, 'cell', cell_type_labels)

print("\nscRNAseq dataframe with cell type labels:\n\n", expression_data.head())