In [None]:
import scanpy as sc
import anndata as ad
from scipy.sparse import csr_matrix
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

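The data for this study were downloaded from NCBI GEO [GSE190094](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE190094). The full series is a bit of a pain to sift through, but for this analysis we only need the samples marked as 'UMOD-WT/WT-1x' and 'UMOD-KI/KI-1x'. The loading cells below expect the files to be arranged as `<marshall_directory>/WT/<sample>/WT_<sub>_*.csv.gz` and `<marshall_directory>/KI/<sample>/KI_<sub>_*.csv.gz`.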

In [None]:
# Root folder holding the downloaded files (replace with wherever you saved the data).
marshall_directory = ''

# Sample ID -> list of sub-sample IDs to load for the WT and KI groups.
WT_samples = {'01': [f'1{suffix}' for suffix in 'abcde']}
KI_samples = {'01': [f'1{suffix}' for suffix in 'abcde']}

In [None]:
# Load every wild-type (WT) sub-sample listed in WT_samples into its own AnnData
# object, then stack them into a single `adata_wt`.
WT_datasets = []

for sample, sub_samples in WT_samples.items():
    for sub in sub_samples:
        wt_prefix = f"{marshall_directory}/WT/{sample}/WT_{sub}"

        # Bead-level table; its first column becomes the index used to select beads below.
        # NOTE(review): presumably these are the beads that passed upstream QC - confirm.
        wt_beadlocations_qc = pd.read_csv(wt_prefix + '_BeadLocationsForR_qc.csv.gz', index_col=0)
        wt_beadlocations_qc.index.name = None

        # Expression table, transposed so rows become observations and columns variables (genes).
        wt_dge = pd.read_csv(wt_prefix + '_MappedDGEForR.csv.gz', index_col=0).T

        # Keep only the beads present in the location table and zero-fill missing values
        # *before* building the sparse matrix. This avoids modifying an AnnData view and
        # avoids densifying the sparse matrix just to build a NaN mask.
        wt_dge = wt_dge.loc[wt_beadlocations_qc.index].fillna(0.0)

        # Work on a copy so the table read from disk is not mutated by the annotations.
        wt_obs = wt_beadlocations_qc.copy()
        wt_obs['sample'] = 'WT'
        wt_obs['sub_sample'] = f"WT_{sample}_{sub}"

        adata = ad.AnnData(
            X=csr_matrix(wt_dge.to_numpy()),
            obs=wt_obs,
            var=pd.DataFrame(index=wt_dge.columns),
        )

        WT_datasets.append(adata)

# `AnnData.concatenate` is deprecated; `ad.concat` with label='batch' and index_unique='-'
# reproduces its defaults (a 'batch' column and '-<batch>' suffixes on the obs names).
adata_wt = ad.concat(WT_datasets, join='outer', label='batch', index_unique='-')

In [None]:
# Load every knock-in (KI) sub-sample listed in KI_samples into its own AnnData
# object, then stack them into a single `adata_ki`.
KI_datasets = []

for sample, sub_samples in KI_samples.items():
    for sub in sub_samples:
        ki_prefix = f"{marshall_directory}/KI/{sample}/KI_{sub}"

        # Bead-level table; its first column becomes the index used to select beads below.
        # NOTE(review): presumably these are the beads that passed upstream QC - confirm.
        ki_beadlocations_qc = pd.read_csv(ki_prefix + '_BeadLocationsForR_qc.csv.gz', index_col=0)
        ki_beadlocations_qc.index.name = None

        # Expression table, transposed so rows become observations and columns variables (genes).
        ki_dge = pd.read_csv(ki_prefix + '_MappedDGEForR.csv.gz', index_col=0).T

        # Keep only the beads present in the location table and zero-fill missing values
        # *before* building the sparse matrix. This avoids modifying an AnnData view and
        # avoids densifying the sparse matrix just to build a NaN mask.
        ki_dge = ki_dge.loc[ki_beadlocations_qc.index].fillna(0.0)

        # Work on a copy so the table read from disk is not mutated by the annotations.
        ki_obs = ki_beadlocations_qc.copy()
        ki_obs['sample'] = 'KI'
        ki_obs['sub_sample'] = f"KI_{sample}_{sub}"

        adata = ad.AnnData(
            X=csr_matrix(ki_dge.to_numpy()),
            obs=ki_obs,
            var=pd.DataFrame(index=ki_dge.columns),
        )

        KI_datasets.append(adata)

# `AnnData.concatenate` is deprecated; `ad.concat` with label='batch' and index_unique='-'
# reproduces its defaults (a 'batch' column and '-<batch>' suffixes on the obs names).
adata_ki = ad.concat(KI_datasets, join='outer', label='batch', index_unique='-')

In [None]:
# Merge the WT and KI objects into one AnnData, keeping the union of their genes.
# `ad.concat` replaces the deprecated `AnnData.concatenate`; label='batch' and
# index_unique='-' keep the same 'batch' column ('0' = WT, '1' = KI) and obs-name suffixes.
adata_marshall = ad.concat([adata_wt, adata_ki], join='outer', label='batch', index_unique='-')

In [None]:
# Make the cell safe to re-run: if raw counts were already stored by a previous
# execution, start again from them instead of normalising/log-transforming twice
# (which would also overwrite the 'counts' layer with transformed values).
if 'counts' in adata_marshall.layers:
    adata_marshall.X = adata_marshall.layers['counts'].copy()

sc.pp.filter_genes(adata_marshall, min_cells=1) # Drop genes not detected in any bead

adata_marshall.layers['counts'] = adata_marshall.X.copy() # Store raw counts
sc.pp.normalize_total(adata_marshall, inplace=True, target_sum=1e4) # Normalise each bead to 10,000 total counts
sc.pp.log1p(adata_marshall) # Log-transform

In [None]:
# Save the merged, preprocessed AnnData object into the configured data directory.
# (The original referenced `marshall_data_directory`, which is never defined and
# raised a NameError; the directory configured above is `marshall_directory`.)
adata_marshall.write(marshall_directory + '/marshall22_umod_merged.h5ad')