# Import necessary packages

In [31]:
from aeon.transformations.collection.shapelet_based import (
    RandomDilatedShapeletTransform
)
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import rdata
import scanpy as sc
import anndata
import importlib

# Load single-cell data

In [32]:
# Read the filtered HSPC dataset from the .h5ad file in the working directory.
# NOTE(review): the name `anndata` rebinds the `anndata` module imported at the
# top of the notebook; from here on it refers to the loaded object.
anndata = sc.read_h5ad('GSE226824_HSPC-all_filtered.h5ad')

# Keep only the cells whose `time` annotation is not 'control'.
# NOTE(review): presumably this subset is a view of the loaded object —
# confirm whether an explicit .copy() is wanted before modifying it.
anndata = anndata[anndata.obs['time'] != 'control']

In [33]:
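# (disabled) total-count normalisation into a 'norm' layer; this call does not log-transform, and while it stays commented out the cells below work on the unnormalised values in X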
#anndata.layers['norm'] = sc.pp.normalize_total(anndata, target_sum=1, exclude_highly_expressed=True, inplace=False)['X']

In [34]:
# Export the expression matrix as a DataFrame and transpose it in one step,
# so that genes end up as rows and cells as columns.
data = anndata.to_df().T

In [35]:
# Drop every gene (row) whose entries are all exactly zero, i.e. keep a row
# as soon as it has at least one non-zero value.
data = data.loc[~(data == 0).all(axis=1)]

In [36]:
# Build one '<time>_<cluster>' label per cell from the obs annotations,
# with all spaces stripped out of both parts.
time_cluster = (
    anndata.obs['time'].astype(str).str.replace(' ', '', regex=False)
    + '_'
    + anndata.obs['clusters'].astype(str).str.replace(' ', '', regex=False)
)
time_cluster.values

array(['3h_HSCs#1', '3h_HSCs#2', '3h_LMPPs#1', ..., '72h_ery.prog.#1',
       '72h_LMPPs#1', '72h_HSCs#2'], dtype=object)

In [37]:
# Relabel the columns of `data` (one per cell) with the '<time>_<cluster>'
# labels. Cells sharing a time/cluster pair end up with identical column names.
data.columns = time_cluster.to_numpy()

In [38]:
# Preview the gene-by-cell table with its '<time>_<cluster>' column labels
data

Unnamed: 0,3h_HSCs#1,3h_HSCs#2,3h_LMPPs#1,3h_myel.prog.#1,3h_LMPPs#1.1,3h_myel.prog.#2,3h_eosinophilprog.,3h_myel.prog.#1.1,3h_myel.prog.#1.2,3h_LMPPs#2,...,72h_myel.prog.#1,72h_ery.prog.#1,72h_HSCs#2,72h_LMPPs#1,72h_ery.prog.#1.1,72h_myel.prog.#1.1,72h_MKprog.,72h_ery.prog.#1.2,72h_LMPPs#1.1,72h_HSCs#2.1
Mrpl15,1.0,4.0,0.0,2.0,1.0,5.0,0.0,4.0,3.0,6.0,...,1.0,2.0,3.0,1.0,0.0,7.0,2.0,0.0,3.0,1.0
Lypla1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,...,2.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
Gm37988,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tcea1,0.0,2.0,1.0,2.0,2.0,3.0,4.0,7.0,1.0,1.0,...,2.0,2.0,1.0,2.0,0.0,4.0,1.0,1.0,1.0,0.0
Rgs20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC125149.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AC168977.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PISD,1.0,0.0,2.0,0.0,2.0,1.0,4.0,7.0,2.0,0.0,...,12.0,2.0,2.0,1.0,2.0,5.0,1.0,4.0,2.0,4.0
DHRSX,0.0,1.0,1.0,1.0,0.0,2.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,2.0,0.0,0.0


In [39]:
# Move the gene identifiers from the index into a regular 'gene' column.
# Naming the axis first keeps this working whether or not the index already
# carries a name (a plain reset_index() would then emit that name instead of
# 'index', and the melt below would not find a 'gene' column).
data = data.rename_axis('gene').reset_index()

# Wide -> long: one row per (gene, cell) pair. The cell's column label goes
# into 'time_celltype' and its value into 'expression'.
data_melted = pd.melt(data, id_vars=['gene'], var_name='time_celltype', value_name='expression')

# Split '<timepoint>_<celltype>' on the FIRST underscore only, so a cell-type
# label that itself contains '_' stays intact instead of spilling into extra
# columns and breaking the two-column assignment.
data_melted[['timepoint', 'celltype']] = data_melted['time_celltype'].str.split('_', n=1, expand=True)

# Combined '<gene>_<celltype>' key identifying one gene within one cell type
data_melted['gene_celltype'] = data_melted['gene'] + '_' + data_melted['celltype']

In [40]:
# Preview the long-format table (one row per gene/cell pair)
data_melted

Unnamed: 0,gene,time_celltype,expression,timepoint,celltype,gene_celltype
0,Mrpl15,3h_HSCs#1,1.0,3h,HSCs#1,Mrpl15_HSCs#1
1,Lypla1,3h_HSCs#1,0.0,3h,HSCs#1,Lypla1_HSCs#1
2,Gm37988,3h_HSCs#1,0.0,3h,HSCs#1,Gm37988_HSCs#1
3,Tcea1,3h_HSCs#1,0.0,3h,HSCs#1,Tcea1_HSCs#1
4,Rgs20,3h_HSCs#1,0.0,3h,HSCs#1,Rgs20_HSCs#1
...,...,...,...,...,...,...
137102051,AC125149.2,72h_HSCs#2,0.0,72h,HSCs#2,AC125149.2_HSCs#2
137102052,AC168977.1,72h_HSCs#2,0.0,72h,HSCs#2,AC168977.1_HSCs#2
137102053,PISD,72h_HSCs#2,4.0,72h,HSCs#2,PISD_HSCs#2
137102054,DHRSX,72h_HSCs#2,0.0,72h,HSCs#2,DHRSX_HSCs#2


In [41]:
# Collapse the many cells that share a gene_celltype/timepoint pair into a
# single row holding their mean expression.
data_melted_group_mean = (
    data_melted
    .groupby(['gene_celltype', 'timepoint'])['expression']
    .mean()
    .reset_index()
)
data_melted_group_mean

Unnamed: 0,gene_celltype,timepoint,expression
0,0610007P14Rik_HSCs#1,24h,0.744186
1,0610007P14Rik_HSCs#1,3h,0.267857
2,0610007P14Rik_HSCs#1,72h,0.588235
3,0610007P14Rik_HSCs#2,24h,1.468917
4,0610007P14Rik_HSCs#2,3h,0.710106
...,...,...,...
653467,mt-Nd6_myel.prog.#2,3h,0.407143
653468,mt-Nd6_myel.prog.#2,72h,0.377778
653469,mt-Nd6_myel.prog.#3,24h,0.275862
653470,mt-Nd6_myel.prog.#3,3h,0.228571


In [42]:
# Reshape to one row per 'gene_celltype' with one column per timepoint.
# The pivot orders the timepoint labels as strings (24h, 3h, 72h), so the
# columns are selected explicitly to put them in chronological order.
data_pivot = (
    data_melted_group_mean
    .pivot(index='gene_celltype', columns='timepoint', values='expression')
    .loc[:, ['3h', '24h', '72h']]
)
data_pivot

timepoint,3h,24h,72h
gene_celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0610007P14Rik_HSCs#1,0.267857,0.744186,0.588235
0610007P14Rik_HSCs#2,0.710106,1.468917,1.157447
0610007P14Rik_LMPPs#1,1.063492,1.606145,1.319066
0610007P14Rik_LMPPs#2,0.846939,1.282528,0.967742
0610007P14Rik_MKprog.,0.996198,1.826923,1.510740
...,...,...,...
mt-Nd6_ery.prog.#2,0.188034,0.192708,0.188776
mt-Nd6_ery.prog.#3,0.247312,0.271605,0.314286
mt-Nd6_myel.prog.#1,0.298429,0.327586,0.422581
mt-Nd6_myel.prog.#2,0.407143,0.375758,0.377778


In [43]:
# Write the gene_celltype x timepoint table to CSV in the working directory,
# keeping the 'gene_celltype' index as the first column.
data_pivot.to_csv('data_ready_HSPC.csv', index=True)