In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
from pathlib import Path
import glob
import warnings
warnings.filterwarnings('ignore')

# NOTE(review): every data path used in this notebook is relative, so all of
# them resolve against this hard-coded absolute working directory. Change it
# here when running the notebook on another machine.
import os
os.chdir('/lustre/scratch/kiviaho/prostate_spatial/')

### Formatting Dong et al. 2020 data

In [9]:
# Per-sample count tables for Dong et al. 2020. glob returns matches in an
# arbitrary, filesystem-dependent order, so sort them to make the order of the
# concatenated cells reproducible between runs.
sc_files = sorted(glob.glob('sc-reference/dong_2020/*txt'))
dong_annot = pd.read_csv('./sc-reference/dong_2020/dong_2020_annot.csv',sep=';',index_col=0)
dong_annot = dong_annot.rename(columns={'cells':'celltype_orig'})

# Read each file into an AnnData object and collect them for concatenation
adata_list = []
for file in sc_files:
    # Sample abbreviation = first two '_'-separated tokens of the file name.
    # Use the base name rather than a fixed path component so this keeps
    # working if the directory depth of the glob pattern ever changes.
    s_abbr = '_'.join(Path(file).name.split('_')[0:2])

    # Count the header fields so that the first column can be skipped on read
    with open(file) as x:
        ncols = len(x.readline().split('\t'))

    df = pd.read_csv(file, usecols=range(1,ncols),delimiter='\t',index_col=0)
    # Transposed after loading (assumes the tables are genes x cells — TODO confirm)
    adata = ad.AnnData(df).T

    #### ADDING METADATA ####
    # Prefix the cell names with the sample abbreviation *before* the join:
    # the annotation table is matched on these prefixed names.
    adata.obs_names = s_abbr + '_' + adata.obs_names
    meta = adata.obs.copy()
    meta['sample'] = s_abbr
    meta['patient'] = s_abbr
    # Left join: cells without a row in dong_annot keep NaN annotation values
    meta = meta.merge(dong_annot,how='left',left_index=True,right_index=True)
    meta['phenotype'] = 'CRPC'
    meta['dataset'] = 'dong_2020'

    adata.obs = meta.copy()
    ##########
    adata.obs_names_make_unique()

    # Since the genes were originally named with ENSEMBL ID, we have to make them unique.
    adata.var_names_make_unique()
    adata_list.append(adata)

# Outer join keeps every gene seen in any sample; missing entries become 0
adata_concat = ad.concat(adata_list, join='outer', fill_value=0)

adata_concat.write('sc-reference/dong_2020/adata_obj.h5ad')

### Formatting Chen et al. 2021 data

In [18]:

# Load the tab-separated raw matrix and transpose it straight away.
adata = sc.read_csv(
    'sc-reference/chen_2021/GSM4203181_data.raw.matrix.txt',
    delimiter='\t',
).T

#### ADDING METADATA ####
# chen_obs refers to adata.obs directly (no copy is taken).
chen_obs = adata.obs

# Cell names are split on '-': the second token identifies the sample.
chen_obs['sample'] = ['chen_' + cell_id.split('-')[1] for cell_id in chen_obs.index]
chen_obs['patient'] = chen_obs['sample']
chen_obs['celltype_orig'] = 'unknown'
chen_obs['phenotype'] = 'PCa'
chen_obs['dataset'] = 'chen_2021'
##########

# Attach the metadata only when its rows still line up with the cells.
rows_aligned = (chen_obs.index == adata.obs_names).all()
if rows_aligned:
    adata.obs = chen_obs

# New cell names: '<sample>_<first "-" token of the old name>.1'
barcodes = [cell_id.split('-')[0] for cell_id in adata.obs_names]
adata.obs_names = adata.obs['sample'] + '_' + barcodes + '.1'

# Name the obs index 'cell'
adata.obs.index = adata.obs.index.set_names('cell')

adata.write('sc-reference/chen_2021/adata_obj.h5ad')

### Formatting Song et al. 2022 data

In [19]:
# This information is from supplementary file 1 of the article Song et al. 2022 Nature Comms
# The cell type annotations are available without cell IDs, so merging isn't possible.
# Number of analysed cells is 21743
song_samples = ['AUG_PB1A', 'AUG_PB1B','MAY_PB1A','MAY_PB1B', 'MAY_PB2A','MAY_PB2B', # BIOPSIES
                'PR5186','PR5196','PR5199','PR5269', # UNPAIRED RPs
                'PR5249_N','PR5249_T', # NORMAL PAIRED RPs
                'PR5251_N','PR5251_T', # NORMAL PAIRED RPs
                'PR5254_N','PR5254_T', # NORMAL PAIRED RPs
                'PR5261_N','PR5261_T'] # NORMAL PAIRED RPs

song_patients = ['P1','P1','P2','P2','P3','P3',
                 'P4','P5','P6','P7',
                 'P8','P8',
                 'P9','P9',
                 'P10','P10',
                 'P11','P11']

# First ten samples are all PCa; the paired RPs alternate normal / PCa
song_phenotype = ['PCa'] * 10 + ['normal', 'PCa'] * 4

# The three lists are parallel (one entry per sample). Fail early if they are
# ever edited inconsistently instead of silently mislabelling samples.
assert len(song_samples) == len(song_patients) == len(song_phenotype), \
    'song_samples, song_patients and song_phenotype must have the same length'

# Sample idents whose files on disk use a different spelling
song_file_aliases = {
    'AUG_PB1A': 'AUG_PB_1A',
    'AUG_PB1B': 'AUG_PB_1B',
    'MAY_PB1A': 'PB1A',
    'MAY_PB1B': 'PB1B',
    'MAY_PB2A': 'PB2A',
    'MAY_PB2B': 'PB2B',
}
song_file_names = [song_file_aliases.get(sample, sample) for sample in song_samples]


adata_samples_list = []
for file_abbr, sample_abbr, patient_abbr, phenot_abbr in zip(
        song_file_names, song_samples, song_patients, song_phenotype):

    # Find all files generated from one sample. Sorted, because glob order is
    # filesystem-dependent and would otherwise change the cell order between runs.
    file_name_list = sorted(glob.glob('sc-reference/song_2022/*'+file_abbr+'*'))
    if not file_name_list:
        # Without this check the failure only surfaces inside ad.concat
        raise FileNotFoundError(
            'No files matching *' + file_abbr + '* in sc-reference/song_2022/ '
            '(sample ' + sample_abbr + ')'
        )

    sample_adata = []

    # Read in each file and append them to a sample-specific list
    for f in file_name_list:
        # NOTE(review): int16 tops out at 32767 — confirm no single count exceeds it
        adata = sc.read_csv(f,dtype=np.int16,delimiter='\t').T
        adata.obs_names = sample_abbr +'_'+ adata.obs_names + '-1'
        sample_adata.append(adata)

    # Concatenate together data from the same sample but from different sequencing runs.
    adata_concat_one_sample = ad.concat(sample_adata, join='outer', fill_value=0)

    adata_concat_one_sample.obs['sample'] = sample_abbr
    adata_concat_one_sample.obs['patient'] = 'song_'+patient_abbr
    adata_concat_one_sample.obs['celltype_orig'] = 'unknown'
    adata_concat_one_sample.obs['phenotype'] = phenot_abbr
    adata_concat_one_sample.obs['dataset'] = 'song_2022'
    adata_samples_list.append(adata_concat_one_sample)

# Outer join keeps every gene seen in any sample; missing entries become 0
adata_concat_all = ad.concat(adata_samples_list, join='outer', fill_value=0)

adata_concat_all.write('sc-reference/song_2022/adata_obj.h5ad')


### Formatting Cheng et al. 2022 data 

In [20]:
# One result directory per sample. Sorted so the concatenation order is
# reproducible (glob order depends on the filesystem).
data_dirs = sorted(glob.glob('sc-reference/cheng_2022/results/*'))
# Lookup table mapping the directory name ('old') to a shorthand ('new')
new_names = pd.read_csv('sc-reference/cheng_2022/sample_shorthands.txt',sep='\t')

adata_list = []
# Loop variable deliberately not called `dir`: that would shadow the builtin
# for every cell executed afterwards in the same kernel.
for sample_dir in data_dirs:
    # Get the shorthand; .item() raises unless exactly one row matches
    sample = Path(sample_dir).name
    shorthand = new_names.loc[new_names['old'] == sample, 'new'].item()
    # Patient = first '_'-separated token of the shorthand
    patient = shorthand.split('_')[0]
    phenot_abbr = 'CRPC' if 'CRPC' in patient else 'PCa'

    adata = sc.read_10x_mtx(sample_dir+'/outs/filtered_feature_bc_matrix')

    adata.obs['sample'] = shorthand
    adata.obs['patient'] = 'cheng_'+patient
    adata.obs['celltype_orig'] = 'unknown'
    adata.obs['phenotype'] = phenot_abbr
    adata.obs['dataset'] = 'cheng_2022'

    # Prefix the cell names with the sample shorthand
    adata.obs_names = shorthand + '_' + adata.obs_names
    adata_list.append(adata)

# Outer join keeps every gene seen in any sample; missing entries become 0
adata_concat = ad.concat(adata_list, join='outer', fill_value=0)
adata_concat.write('sc-reference/cheng_2022/adata_obj.h5ad')

### Formatting Wong et al. 2022 data

In [32]:
dat = sc.read_h5ad('sc-reference/wong_2022/wong_2022_data.h5ad')
annot = dat.obs.copy()

# Cluster annotation table, re-indexed by its 'Unnamed: 0' column so it can be
# joined onto the obs table by index.
cell_annot = pd.read_csv('sc-reference/wong_2022/GSE185344_PH_scRNA.rename_cluster.csv')
cell_annot.index = cell_annot['Unnamed: 0']
merged_obs = annot.merge(cell_annot, how='left', left_index=True, right_index=True)

# Swap in the merged table (and drop .raw) only when the rows still line up
rows_aligned = (merged_obs.index == dat.obs.index).all()
if rows_aligned:
    dat.obs = merged_obs
    del dat.raw

# 'orig.ident' is split on '_': the third token is the phenotype, the first
# two tokens together identify the patient.
ident_tokens = [ident.split('_') for ident in dat.obs['orig.ident']]

# Format the phenotypes
phenot = [
    tokens[2].replace('Benign', 'normal').replace('Tumor', 'PCa')
    for tokens in ident_tokens
]

# Format the patient
patient = ['_'.join(tokens[:2]) for tokens in ident_tokens]

# Build the harmonised obs table from scratch on the existing cell index
new_obs = pd.DataFrame()
new_obs.index = dat.obs.index.copy()
new_obs['sample'] = 'wong2022_' + dat.obs['orig.ident'].copy()
new_obs['patient'] = ['wong2022_' + patient_id for patient_id in patient]
new_obs['celltype_orig'] = dat.obs['cellactivity_clusters'].copy()
new_obs['phenotype'] = phenot
new_obs['dataset'] = 'wong_2022'

dat.obs = new_obs

# Lose the unnecessary column 'features'
dat.var = dat.var.drop(columns=['features'])

dat.write('sc-reference/wong_2022/adata_obj.h5ad')


### Formatting Chen (gonghong) et al. 2022 data

In [21]:
# One result directory per sample. Sorted so the processing and concatenation
# order is reproducible (glob order depends on the filesystem).
data_dirs = sorted(glob.glob('sc-reference/chen_2022/results/*'))

adata_list = []
# Loop variable deliberately not called `dir`: that would shadow the builtin
# for every cell executed afterwards in the same kernel.
for sample_dir in data_dirs:
    print(sample_dir)
    # The directory name is the sample name
    sample = Path(sample_dir).name
    # Any sample without 'PCa' in its name is labelled normal
    phenot_abbr = 'PCa' if 'PCa' in sample else 'normal'

    adata = sc.read_10x_mtx(sample_dir+'/outs/filtered_feature_bc_matrix')

    adata.obs['sample'] = 'chen2022_'+sample
    adata.obs['patient'] = 'chen2022_'+sample
    adata.obs['celltype_orig'] = 'unknown'
    adata.obs['phenotype'] = phenot_abbr
    adata.obs['dataset'] = 'chen_2022'

    # Prefix the cell names with the sample name
    adata.obs_names = sample + '_' + adata.obs_names
    adata_list.append(adata)

# Outer join keeps every gene seen in any sample; missing entries become 0
adata_concat = ad.concat(adata_list, join='outer', fill_value=0)
adata_concat.write('sc-reference/chen_2022/adata_obj.h5ad')


sc-reference/chen_2022/results/P2
sc-reference/chen_2022/results/P1
sc-reference/chen_2022/results/P3
sc-reference/chen_2022/results/PCa1


### Formatting Hirz et al. 2023 data 

In [51]:

# Read the per-sample count files into a list and concatenate them together.
# There are no counts for GSM5494349_SCG-PCA2-T-LG.count.csv
# PCA24 samples have been prepared another way, excluded
# NOTE(review): the glob below does not filter anything out by itself; the
# exclusions above presumably rely on those files being absent — confirm.

sc_files = sorted(glob.glob('sc-reference/hirz_2023/*SCG*'))
print(sc_files)
print()

# Annotation table indexed by its first column; its own 'sample' column is
# dropped and 'cells' becomes 'celltype_orig'.
hirz_annot = (
    pd.read_csv('sc-reference/hirz_2023/GSE181294_scRNAseq.ano.csv', index_col=0)
    .drop(columns=['sample'])
    .rename(columns={'cells': 'celltype_orig'})
)

adata_list = []
for f in sc_files:
    # The part of the file name after the last '_' is split on '-':
    # token 1 is the patient, tokens 1-2 together are the sample.
    tokens = f.split('/')[-1].split('_')[-1].split('-')
    patient = tokens[1]
    sample = '_'.join(tokens[1:3])
    # 'N' marks a normal sample (the original comment describes these as
    # adjacent normal tissue); anything else is treated as PCa.
    phenot_abbr = 'normal' if tokens[2] == 'N' else 'PCa'

    adata = sc.read_csv(f, dtype=np.int16).T

    ####### Add metadata columns
    # Left join: cells without a row in hirz_annot keep NaN annotation values
    meta = (
        adata.obs.copy()
        .assign(sample='hirz_' + sample, patient='hirz_' + patient)
        .merge(hirz_annot, how='left', left_index=True, right_index=True)
        .assign(phenotype=phenot_abbr, dataset='hirz_2023')
    )

    # Attach the metadata only when its rows still line up with the cells
    rows_aligned = (meta.index == adata.obs_names).all()
    if rows_aligned:
        adata.obs = meta.copy()
    print(sample)
    adata_list.append(adata)

# Outer join keeps every gene seen in any sample; missing entries become 0
adata_concat = ad.concat(adata_list, join='outer', fill_value=0)

adata_concat.write('sc-reference/hirz_2023/adata_obj.h5ad')

['sc-reference/hirz_2023/GSM5494347_SCG-PCA21-N-LG.count.csv', 'sc-reference/hirz_2023/GSM5494348_SCG-PCA21-T-LG.count.csv', 'sc-reference/hirz_2023/GSM5494350_SCG-PCA3-T-LG.count.csv', 'sc-reference/hirz_2023/GSM5494351_SCG-PCA3-N-LG.count.csv', 'sc-reference/hirz_2023/GSM5494352_SCG-PCA4-T-HG.count.csv', 'sc-reference/hirz_2023/GSM5494353_SCG-PCA4-N-HG.count.csv', 'sc-reference/hirz_2023/GSM5494354_SCG-PCA5-T-LG.count.csv', 'sc-reference/hirz_2023/GSM5494355_SCG-PCA5-N-LG.count.csv', 'sc-reference/hirz_2023/GSM5494356_SCG-PCA6-T-HG.count.csv', 'sc-reference/hirz_2023/GSM5494357_SCG-PCA6-N-HG.count.csv', 'sc-reference/hirz_2023/GSM5494358_SCG-PCA7-T-HG.count.csv', 'sc-reference/hirz_2023/GSM5494359_SCG-PCA8-T-HG.count.csv', 'sc-reference/hirz_2023/GSM5494360_SCG-PCA9-T-LG.count.csv', 'sc-reference/hirz_2023/GSM5494361_SCG-PCA9-N-LG.count.csv', 'sc-reference/hirz_2023/GSM5494362_SCG-PCA10-T-LG.count.csv', 'sc-reference/hirz_2023/GSM5494363_SCG-PCA11-T-LG.count.csv', 'sc-reference/hirz_