In [1]:
import csv
import os
import scipy.io
import numpy as np
import pandas as pd
from collections import Counter

In [None]:
"""Read, preprocess and save scRNA-seq data."""

#### Main functions

In [2]:
# Read/save data

def read_raw_data(path: str) -> tuple:
    """Read the count matrix, the gene (feature) table and the cell barcodes.

    Three files are expected inside ``path``: ``matrix.mtx`` (Matrix Market
    format), ``features.tsv`` and ``barcodes.tsv`` (tab-separated text).
    The size of every component is printed as it is loaded.

    Args:
        path: Directory that contains the three input files.

    Returns:
        A tuple ``(mat, feature_ids, feature_names, feature_types, barcodes)``
        where ``mat`` is the object returned by ``scipy.io.mmread`` and the
        other four items are lists of strings taken from columns 0, 1 and 2
        of ``features.tsv`` and from column 0 of ``barcodes.tsv``.

    Raises:
        IndexError: If a row of ``features.tsv`` has fewer than three columns
            or a row of ``barcodes.tsv`` is empty.
    """
    mat = scipy.io.mmread(os.path.join(path, "matrix.mtx"))
    print(f"Matrix shape: {mat.shape}")

    # Parse the feature table once and close the handle deterministically;
    # the three columns are then sliced out of the in-memory rows.
    features_path = os.path.join(path, "features.tsv")
    with open(features_path, newline="") as features_file:
        feature_rows = list(csv.reader(features_file, delimiter="\t"))

    feature_ids = [row[0] for row in feature_rows]
    print(f"Number of feature's (genes) ids: {len(feature_ids)}")
    feature_names = [row[1] for row in feature_rows]
    print(f"Number of feature's (genes) names: {len(feature_names)}")
    feature_types = [row[2] for row in feature_rows]
    print(f"Number of feature's (genes) types: {len(feature_types)}")

    barcodes_path = os.path.join(path, "barcodes.tsv")
    with open(barcodes_path, newline="") as barcodes_file:
        barcodes = [row[0] for row in csv.reader(barcodes_file, delimiter="\t")]
    print(f"Number of barcodes (cells): {len(barcodes)}")

    return mat, feature_ids, feature_names, feature_types, barcodes

def save_data_to_txt(mat,feature_ids:list,gene_names:list,feature_types:list,barcodes:list):
    """Save the count matrix, gene data and cell barcodes as gzip text files.

    All files are written to the current working directory:
    ``gene_id.txt.gz``, ``gene_names.txt.gz``, ``feature_types.txt.gz``,
    ``barcodes.txt.gz`` and ``X.txt.gz`` (the transposed count matrix).

    Args:
        mat: Sparse matrix exposing ``transpose()`` and ``toarray()``.
        feature_ids: Gene identifiers, one per line of ``gene_id.txt.gz``.
        gene_names: Gene names, one per line of ``gene_names.txt.gz``.
        feature_types: Feature types, one per line of ``feature_types.txt.gz``.
        barcodes: Cell barcodes, one per line of ``barcodes.txt.gz``.

    Returns:
        The string ``"Saved."``.
    """
    # Genes (name, id, type) and cells (barcodes) data
    np.savetxt("./gene_id.txt.gz", feature_ids, delimiter=",", fmt='%s')
    np.savetxt("./gene_names.txt.gz", gene_names, delimiter=",", fmt='%s')
    np.savetxt("./feature_types.txt.gz", feature_types, delimiter=",", fmt='%s')
    np.savetxt("./barcodes.txt.gz", barcodes, delimiter=",", fmt='%s')

    # Count matrix, transposed relative to `mat` (rows of the file are the
    # columns of `mat`).
    counts = mat.transpose().toarray()

    # uint8 keeps the array small, but a value above 255 would silently wrap
    # around. Only widen the dtype when the data actually needs it, so the
    # output is unchanged for matrices that already fit in uint8.
    count_dtype = np.uint8
    if counts.size and counts.max() > np.iinfo(np.uint8).max:
        count_dtype = np.min_scalar_type(int(counts.max()))
    np.savetxt("./X.txt.gz", counts.astype(count_dtype), delimiter=",")

    return "Saved."

def create_and_save_dataframe(mat, barcodes, feature_ids, gene_names, feature_types):
    """Combine the count matrix and the gene data in one table and save it as CSV.

    The table has one row per row of ``mat``; its columns are
    ``feature_type``, ``gene``, ``feature_id`` followed by one column per
    barcode. It is written to ``mex_matrix.csv`` in the current working
    directory (this file can be very large).

    Returns:
        The string ``"CSV saved."``.
    """
    table = pd.DataFrame.sparse.from_spmatrix(mat)
    table.columns = barcodes

    # Every insert goes to position 0, so the last inserted column ends up
    # leftmost: feature_type, gene, feature_id.
    leading_columns = (
        ("feature_id", feature_ids),
        ("gene", gene_names),
        ("feature_type", feature_types),
    )
    for header, values in leading_columns:
        table.insert(loc=0, column=header, value=values)

    table.to_csv("mex_matrix.csv", index=False)
    return "CSV saved."

# Analyse data

def analyse_features(feature_ids: list, gene_names: list):
    """Print whether the gene ids and the gene names contain repeated values.

    For each of the two lists, the number reported is the list length minus
    the number of distinct values.
    """
    surplus_ids = len(feature_ids) - len(set(feature_ids))
    if surplus_ids:
        print(f"There are {surplus_ids} gene ids repeated.")
    else:
        print("All gene ids are different.")

    surplus_names = len(gene_names) - len(set(gene_names))
    if surplus_names:
        print(f"There are {surplus_names} gene names repeated.")
    else:
        print("All gene names are different.")

def analyse_barcodes(barcodes: list):
    """Print whether the cell barcodes (ids) contain repeated values.

    The number reported is the list length minus the number of distinct
    barcodes.
    """
    surplus = len(barcodes) - len(set(barcodes))
    if surplus != 0:
        print(f"There are {surplus} cell ids repeated (sample repeated too?).")
        return
    print("All samples are unique.")

# Preprocess data

def preprocess_data(mat, barcodes: list, feature_ids: list, synthetic_data: dict):
    """Build a cells x genes DataFrame and append per-sample annotation columns.

    Args:
        mat: Sparse count matrix whose rows correspond to ``feature_ids`` and
            whose columns correspond to ``barcodes``.
        barcodes: Cell identifiers; they become the index (named ``cell_id``).
        feature_ids: Gene identifiers; they become the column labels.
        synthetic_data: Mapping with the keys ``reprogramming_type``,
            ``reprogramming_stage``, ``cell_state``, ``cell_type`` and
            ``cell_line``. Each value is stored in a column of the same name
            (a scalar is repeated for every cell).

    Returns:
        A DataFrame with one row per barcode, one sparse column per feature id
        and the five annotation columns appended at the end.

    Raises:
        KeyError: If ``synthetic_data`` lacks one of the five keys.
        ValueError: If the label lists do not match the matrix dimensions.
    """
    # Transpose the scipy matrix *before* handing it to pandas. Transposing an
    # all-sparse DataFrame goes through a dense genes x cells array, which
    # costs gigabytes of memory for a full sample.
    count_matrix = pd.DataFrame.sparse.from_spmatrix(mat.transpose())
    count_matrix.index = barcodes
    count_matrix.index.name = "cell_id"
    count_matrix.columns = feature_ids

    # Add synthetic data related to samples, in a fixed column order.
    annotation_fields = (
        "reprogramming_type",
        "reprogramming_stage",
        "cell_state",
        "cell_type",
        "cell_line",
    )
    for field in annotation_fields:
        count_matrix[field] = synthetic_data[field]

    return count_matrix

def preprocess_metadata(gene_names: list, feature_ids: list, feature_types: list):
    """Build the gene metadata table.

    Returns:
        A DataFrame indexed by ``feature_ids`` (index name ``gene_id``) with
        the columns ``gene_type`` (from ``feature_types``) and ``gene_name``
        (from ``gene_names``), in that order.
    """
    gene_index = pd.Index(feature_ids, name="gene_id")
    columns_in_order = {"gene_type": feature_types, "gene_name": gene_names}
    return pd.DataFrame(columns_in_order, index=gene_index)



## 1. hADSC-1013

### 1.1 Somatic, without reprogramming

In [3]:
# Source data (GEO sample GSM6592448) -> https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM6592448
# read_raw_data expects matrix.mtx, features.tsv and barcodes.tsv inside `path`.
# Windows-style alternative: path = ".\\data\\hADSC-1013"
path = "./data/hADSC-1013"
# These globals are consumed by the analysis, preprocessing and save cells of section 1.1.
mat,feature_ids,gene_names,feature_types,barcodes = read_raw_data(path)

Matrix shape: (32738, 5404)
Number of feature's (genes) ids: 32738
Number of feature's (genes) names: 32738
Number of feature's (genes) types: 32738
Number of barcodes (cells): 5404


#### Analyse data

In [4]:
### Analyse genes
# Report whether gene ids and gene names contain repeated values
analyse_features(feature_ids, gene_names)
# Count how many features there are of each feature type
print(f"Tipos de genes presentes: {Counter(feature_types)}")

### Analyse cells
# Report whether any cell barcode is repeated
analyse_barcodes(barcodes)

All gene ids are different.
There are 95 gene names repeated.
Tipos de genes presentes: Counter({'Gene Expression': 32738})
All samples are unique.


En relación con los genes (features), hay nombres repetidos, pero no ids repetidos. Asumimos que se trata de genes diferentes y usamos los ids como nombres de las columnas; en el futuro usaremos estos ids para encontrar los nombres reales de los genes.

Por otro lado, todas las células son únicas; por lo tanto, no tenemos que eliminar ninguna muestra. En este punto podríamos analizar la distribución y eliminar, por ejemplo, aquellas en las que la varianza sea menor del 95 %.

#### Data preprocessing

In [5]:
# Organise the data in two tables: a count matrix and a gene metadata table

##### Count matrix: cells x genes -> barcodes x gene ids, plus sample annotation columns
# Each value below is stored in a column of the same name for every cell of this sample.
# NOTE(review): "hADCS" may be a typo of "hADSC" (the data paths use "hADSC") — confirm
# before changing, since the value is written to the saved files.
synthetic_data = {
                    "reprogramming_type": False, # not reprogrammed
                    "reprogramming_stage": False,
                    "cell_state": "somatic",
                    "cell_type": "hADCS",
                    "cell_line": "1013"
                }
count_matrix = preprocess_data(mat,barcodes,feature_ids,synthetic_data)

##### Gene metadata: one row per gene id with its type and name
gene_metadata = preprocess_metadata(gene_names,feature_ids,feature_types)


#### Save data

In [6]:
# Preview the first two cells of the count matrix (gene columns plus annotation columns)
count_matrix.head(2)

Unnamed: 0_level_0,ENSG00000243485,ENSG00000237613,ENSG00000186092,ENSG00000238009,ENSG00000239945,ENSG00000237683,ENSG00000239906,ENSG00000241599,ENSG00000228463,ENSG00000237094,...,ENSG00000215635,ENSG00000268590,ENSG00000251180,ENSG00000215616,ENSG00000215611,reprogramming_type,reprogramming_stage,cell_state,cell_type,cell_line
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGAAGAGCA-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,False,False,somatic,hADCS,1013
AAACCCACAAAGTGTA-1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,False,False,somatic,hADCS,1013


In [7]:
# Preview the first two rows of the gene metadata table
gene_metadata.head(2)

Unnamed: 0_level_0,gene_type,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,Gene Expression,MIR1302-10
ENSG00000237613,Gene Expression,FAM138A


In [11]:
# Alternative (disabled): save the DataFrames as compressed .txt.gz files
# np.savetxt(r'./hADSC1013_matrix.txt.gz', count_matrix, fmt='%s')
# np.savetxt(r'./hADSC1013_genes_metadata.txt.gz', genes_metadata, fmt='%s')

# Alternative (disabled): save the count matrix as a .csv.gz file
# count_matrix.to_csv(f"{path}/hADSC1013_matrix.csv.gz", compression='gzip')
# Gene metadata is saved as plain CSV next to the raw data.
# Windows-style path alternative -> f"{path}\\hADSC1013_genes_metadata.csv"
gene_metadata.to_csv(f"{path}/hADSC1013_genes_metadata.csv")

# The count matrix is saved as a gzip-compressed pickle; it is reloaded in section 2.
count_matrix.to_pickle(f"{path}/hADSC1013_matrix.pkl.gz", compression='gzip')

### 1.2 Intermediate plastic state, Stage II Day 16 

In [12]:
# Source data (GEO sample GSM6592454) -> https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSM6592454
# read_raw_data expects matrix.mtx, features.tsv and barcodes.tsv inside `path`.
# Windows-style alternative: path = ".\\data\\hADSC-1013_SIID16"
path = "./data/hADSC-1013_SIID16"
# NOTE: this rebinds the globals loaded for the somatic sample in section 1.1.
mat,feature_ids,gene_names,feature_types,barcodes = read_raw_data(path)

Matrix shape: (32738, 4924)
Number of feature's (genes) ids: 32738
Number of feature's (genes) names: 32738
Number of feature's (genes) types: 32738
Number of barcodes (cells): 4924


#### Analyse data

In [13]:
### Analyse genes
# Report whether gene ids and gene names contain repeated values
analyse_features(feature_ids, gene_names)
# Count how many features there are of each feature type
print(f"Tipos de genes presentes: {Counter(feature_types)}")

### Analyse cells
# Report whether any cell barcode is repeated
analyse_barcodes(barcodes)

All gene ids are different.
There are 95 gene names repeated.
Tipos de genes presentes: Counter({'Gene Expression': 32738})
All samples are unique.


En relación con los genes (features), hay nombres repetidos, pero no ids repetidos. Asumimos que se trata de genes diferentes y usamos los ids como nombres de las columnas; en el futuro usaremos estos ids para encontrar los nombres reales de los genes.

Por otro lado, todas las células son únicas; por lo tanto, no tenemos que eliminar ninguna muestra. En este punto podríamos analizar la distribución y eliminar, por ejemplo, aquellas en las que la varianza sea menor del 95 %.

#### Data preprocessing

In [14]:
# Organise the data in two tables: a count matrix and a gene metadata table
##### Count matrix: cells x genes -> barcodes x gene ids, plus sample annotation columns
# Each value below is stored in a column of the same name for every cell of this sample.
# NOTE(review): "hADCS" may be a typo of "hADSC" (the data paths use "hADSC") — confirm
# before changing, since the value is written to the saved files.
synthetic_data = {
                    "reprogramming_type": "chemical reprogramming",
                    "reprogramming_stage": "stage II day 16",
                    "cell_state": "intermediate plastic state with a regeneration-like program",
                    "cell_type": "hADCS",
                    "cell_line": "1013"
                }
count_matrix = preprocess_data(mat,barcodes,feature_ids,synthetic_data)

##### Gene metadata: one row per gene id with its type and name
gene_metadata = preprocess_metadata(gene_names,feature_ids,feature_types)

#### Save data

In [15]:
# Preview the first two cells of the count matrix (gene columns plus annotation columns)
count_matrix.head(2)

Unnamed: 0_level_0,ENSG00000243485,ENSG00000237613,ENSG00000186092,ENSG00000238009,ENSG00000239945,ENSG00000237683,ENSG00000239906,ENSG00000241599,ENSG00000228463,ENSG00000237094,...,ENSG00000215635,ENSG00000268590,ENSG00000251180,ENSG00000215616,ENSG00000215611,reprogramming_type,reprogramming_stage,cell_state,cell_type,cell_line
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AAACCCAAGTCACGAG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,chemical reprogramming,stage II day 16,intermediate plastic state with a regeneration...,hADCS,1013
AAACCCATCTAGATCG-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,chemical reprogramming,stage II day 16,intermediate plastic state with a regeneration...,hADCS,1013


In [16]:
# Preview the first two rows of the gene metadata table
gene_metadata.head(2)

Unnamed: 0_level_0,gene_type,gene_name
gene_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000243485,Gene Expression,MIR1302-10
ENSG00000237613,Gene Expression,FAM138A


In [17]:
# Alternative (disabled): save the DataFrames as compressed .txt.gz files
# np.savetxt(r'./hADSC1013_SIID16_matrix.txt.gz', count_matrix, fmt='%s')
# np.savetxt(r'./hADSC1013_SIID16_genes_metadata.txt.gz', genes_metadata, fmt='%s')

# Alternative (disabled): save the count matrix as a .csv.gz file
# count_matrix.to_csv(f"{path}/hADSC1013_SIID16_matrix.csv.gz", compression="gzip")
# Gene metadata is saved as plain CSV next to the raw data.
# Windows-style path alternative -> f"{path}\\hADSC1013_SIID16_genes_metadata.csv"
gene_metadata.to_csv(f"{path}/hADSC1013_SIID16_genes_metadata.csv")

# The count matrix is saved as a gzip-compressed pickle; it is reloaded in section 2.
count_matrix.to_pickle(f"{path}/hADSC1013_SIID16_matrix.pkl.gz", compression='gzip')

## 2. Create AnnData objects

In [None]:
import gzip
import anndata
import pickle
import scanpy as sc

In [None]:
def create_anndata_object(df_data, df_var_names):
    """Build an AnnData object from a cells x genes table with annotation columns.

    Args:
        df_data: DataFrame with one row per cell. It must contain the five
            annotation columns ``reprogramming_type``, ``reprogramming_stage``,
            ``cell_state``, ``cell_type`` and ``cell_line``; every other
            column is treated as an expression column.
        df_var_names: Gene names, one per expression column and in the same
            order (a pandas Series or any other sequence).

    Returns:
        An AnnData object whose ``X`` holds the expression values, whose
        ``obs`` holds the five annotation columns and whose ``var_names`` are
        the gene names made unique. The original expression column labels are
        kept in ``var["gene_id"]``.

    Raises:
        KeyError: If an annotation column is missing from ``df_data``.
        ValueError: If the number of gene names does not match the number of
            expression columns.
    """
    obs_columns = ['reprogramming_type', 'reprogramming_stage',
                   'cell_state', 'cell_type', 'cell_line']
    obs = df_data[obs_columns]

    # Select the expression columns by name instead of a positional
    # `iloc[:, :-5]`, so X and obs cannot drift apart if the annotation
    # columns are reordered or extended.
    expression = df_data.drop(columns=obs_columns)

    var_names = list(df_var_names)
    if len(var_names) != expression.shape[1]:
        raise ValueError(
            f"Got {len(var_names)} gene names for {expression.shape[1]} expression columns."
        )

    # NOTE(review): `.values` yields a dense array; when df_data holds sparse
    # columns this is presumably the most memory-hungry step — confirm before
    # switching X to a sparse matrix, as downstream code may expect ndarray.
    adata = sc.AnnData(X=expression.values, obs=obs)

    # Keep the original column labels (the gene ids): var_names_make_unique()
    # rewrites repeated gene names, so names alone cannot be mapped back.
    adata.var["gene_id"] = expression.columns.to_numpy()
    adata.var_names = var_names
    # We know there are some gene names repeated, so we fix it
    adata.var_names_make_unique()
    adata.obs_names_make_unique()
    return adata

In [None]:
# Somatic cells: reload the count matrix pickled in section 1.1
# NOTE: unpickling can execute arbitrary code; only load files produced by this notebook.
with gzip.open('./data/hADSC-1013/hADSC1013_matrix.pkl.gz', 'rb') as file:
    somatic_data = pickle.load(file)
# Gene names are read once from the somatic sample's metadata and reused for both samples.
# NOTE(review): this assumes both samples list the same genes in the same order — confirm.
# NOTE: this rebinds `gene_names` (a list in section 1) to a pandas Series.
gene_names = pd.read_csv("./data/hADSC-1013/hADSC1013_genes_metadata.csv")["gene_name"]
print(f"Somatic cells data loaded: {somatic_data.shape} cells x genes")

# Intermediate plastic state cells: reload the count matrix pickled in section 1.2
with gzip.open('./data/hADSC-1013_SIID16/hADSC1013_SIID16_matrix.pkl.gz', 'rb') as file:
    plastic_data = pickle.load(file)
print(f"Intermediate plastic state cells data loaded: {plastic_data.shape} cells x genes")

# reprogramming_type / reprogramming_stage are booleans for the somatic sample and text for
# the plastic one; cast both to the string dtype to avoid errors when saving the AnnData objects.
somatic_data['reprogramming_type'] = somatic_data['reprogramming_type'].astype('string')
plastic_data['reprogramming_type'] = plastic_data['reprogramming_type'].astype('string')
somatic_data['reprogramming_stage'] = somatic_data['reprogramming_stage'].astype('string')
plastic_data['reprogramming_stage'] = plastic_data['reprogramming_stage'].astype('string')

# Both tables are cells x genes, so stacking rows appends the plastic cells to the somatic ones
df_data = pd.concat([somatic_data, plastic_data], axis=0)
print(f"Data shape: {df_data.shape}")

# Create one AnnData object for the combined data and one per sample
adata = create_anndata_object(df_data, gene_names)
adata_somatic = create_anndata_object(somatic_data, gene_names)
adata_plastic = create_anndata_object(plastic_data, gene_names)

# Save them
# NOTE(review): compression="gzip" presumably compresses the datasets inside the HDF5 file
# rather than gzipping the file itself, so the ".gz" suffix may be misleading — confirm.
adata.write_h5ad("./data/adata.h5ad.gz", compression="gzip")
adata_somatic.write_h5ad("./data/hADSC-1013/hADSC1013_adata.h5ad.gz", compression="gzip")
adata_plastic.write_h5ad("./data/hADSC-1013_SIID16/hADSC1013_SIID16_adata.h5ad.gz", compression="gzip")