In [23]:
import pandas as pd
import numpy as np
import os
from glob import glob
from pathlib import Path
from tqdm import tqdm

### STNET to SPCS

In [2]:
# Root folder that receives the SPCS-formatted copy of the STNET samples
out_path = '/data/ani/repos/SPCS/test_data/stnet_copy'

# Create both output sub-folders up front (a no-op when they already exist)
Path(out_path, 'counts').mkdir(parents=True, exist_ok=True)
Path(out_path, 'coords').mkdir(parents=True, exist_ok=True)

# Both listings are sorted because the conversion loop pairs them by position
counts_paths = sorted(glob('/data/ani/repos/TRIPLEX/data/stnet/ST-cnts/*'))
coord_paths = sorted(glob('/data/ani/repos/TRIPLEX/data/stnet/ST-spotfiles/*'))

In [3]:
# Convert every STNET sample into the two text files SPCS reads: a counts table
# and a coordinate table, both stored under the stem of the counts file.
#
# The two listings are paired purely by sorted position and zip() silently stops
# at the shorter one, so refuse to run when they do not have the same length.
if len(counts_paths) != len(coord_paths):
    raise ValueError(
        f'Found {len(counts_paths)} count files but {len(coord_paths)} spot files; '
        'they cannot be paired by position'
    )

for counts_path, coord_path in zip(counts_paths, coord_paths):
    # Read the counts (parquet) and transpose, so the stored columns become rows
    counts = pd.read_parquet(counts_path).T

    # Read the tab-separated spot file; its unnamed first column holds the spot names
    coords = pd.read_csv(coord_path, sep='\t')
    names = coords['Unnamed: 0'].tolist()

    # Keep only the count columns listed in the spot file, in the spot file's order
    counts = counts[names]

    # Save the counts as a comma-separated txt file.
    # index_label=False leaves the index column without a header field, so the
    # header row has one field fewer than the data rows (the layout R expects).
    # This replaces writing the file, reading it back and cutting the first
    # character off the header line.
    counts_save_path = os.path.join(out_path, 'counts', Path(counts_path).stem + '.txt')
    counts.to_csv(counts_save_path, sep=',', index_label=False)

    # Drop the pixel columns and rename what is left; the assignment raises if
    # anything other than exactly three columns remains.
    coords = coords.drop(columns=['pixel_x', 'pixel_y'])
    coords.columns = ['', 'coord1', 'coord2']

    # Save the coords under the same stem as the counts file so the two outputs
    # share a name. The spot-name column becomes the index so that the same
    # index_label=False header layout is produced here as well.
    coords_save_path = os.path.join(out_path, 'coords', Path(counts_path).stem + '.txt')
    coords.set_index('').to_csv(coords_save_path, sep=',', index_label=False)
    

### SPCS to STNET

In [19]:
# Folder that receives the smoothed STNET count tables
out_path = '/data/ani/repos/TRIPLEX/data/stnet/ST-cnts-smoothed'
Path(out_path).mkdir(parents=True, exist_ok=True)

# Original counts, spot files and SPCS output; each listing is sorted because
# the write-back loop pairs the three of them by position
counts_paths = sorted(glob('/data/ani/repos/TRIPLEX/data/stnet/ST-cnts/*'))
coord_paths = sorted(glob('/data/ani/repos/TRIPLEX/data/stnet/ST-spotfiles/*'))
smoothed_counts_paths = sorted(glob('/data/ani/repos/SPCS/test_data/stnet/smoothed/*'))

In [20]:
# Write the SPCS-smoothed values back into the original STNET count tables.
#
# The three listings are paired purely by sorted position and zip() silently
# stops at the shortest one. A missing smoothed file would therefore go
# unnoticed, so stop early when the listings differ in length.
if not (len(counts_paths) == len(coord_paths) == len(smoothed_counts_paths)):
    raise ValueError(
        f'Found {len(counts_paths)} count files, {len(coord_paths)} spot files and '
        f'{len(smoothed_counts_paths)} smoothed files; they cannot be paired by position'
    )

for counts_path, coord_path, smoothed_counts_path in zip(counts_paths, coord_paths, smoothed_counts_paths):
    # Original counts, transposed so that the parquet columns become rows
    counts = pd.read_parquet(counts_path).T
    # Tab-separated spot file; its unnamed first column lists the spot names
    coords = pd.read_csv(coord_path, sep='\t')
    # Smoothed table; the first CSV column is used as the row index
    smoothed_counts = pd.read_csv(smoothed_counts_path, sep=',', index_col=0)

    names = coords['Unnamed: 0'].tolist()

    # Overwrite the columns named in the spot file with the smoothed values.
    # NOTE(review): this presumably relies on smoothed_counts having one column
    # per entry of `names`, in the same order - confirm against the SPCS output.
    counts[names] = smoothed_counts

    # Keep the input file name and transpose back before writing the parquet file
    counts_save_path = os.path.join(out_path, Path(counts_path).name)
    counts.T.to_parquet(counts_save_path)


### SPCS to HGGE

In [30]:
# Dataset folder that receives the converted SPCS output
out_path = '/data/ani/datasets/indiana_gene_subset'
Path(out_path).mkdir(parents=True, exist_ok=True)

# Smoothed tables produced by SPCS, processed in sorted order
smoothed_counts_paths = sorted(glob('/data/ani/repos/smoothing_tools/SPCS/test_data/indiana/smoothed/*'))

In [32]:
# Export every SPCS-smoothed table as a .npy matrix plus a one-column csv of its
# column labels (the genes).
counts_dir = os.path.join(out_path, 'counts_spcs')
features_dir = os.path.join(out_path, 'features')

# np.save and DataFrame.to_csv do not create missing parent folders, so make
# sure both targets exist instead of failing on a fresh output tree.
# NOTE(review): 'features' under this out_path is the same folder the
# HGGE -> SPCS section reads its feature lists from, so files with matching
# stems are overwritten here - confirm that this is intended.
Path(counts_dir).mkdir(parents=True, exist_ok=True)
Path(features_dir).mkdir(parents=True, exist_ok=True)

for smoothed_counts_path in tqdm(smoothed_counts_paths):
    # First CSV column is the row index; transpose so the CSV rows become columns
    smoothed_counts = pd.read_csv(smoothed_counts_path, sep=',', index_col=0).T
    sample = Path(smoothed_counts_path).stem

    # Save counts as npy file (values only, no labels)
    np.save(os.path.join(counts_dir, sample + '.npy'), smoothed_counts.values)

    # Save genes as csv file: one label per line, no header and no index
    cols = pd.DataFrame(smoothed_counts.columns)
    cols.to_csv(os.path.join(features_dir, sample + '.csv'), index=False, header=False)

100%|██████████| 4/4 [00:05<00:00,  1.36s/it]


### HGGE to SPCS

In [18]:
# Input dataset folder and the SPCS folder that receives the converted files
data_path = '/data/ani/datasets/indiana_gene_subset'
out_path = '/data/ani/repos/smoothing_tools/SPCS/test_data/indiana'

# Create both output sub-folders up front (a no-op when they already exist)
Path(out_path, 'counts').mkdir(parents=True, exist_ok=True)
Path(out_path, 'coords').mkdir(parents=True, exist_ok=True)

# One file per sample in each input folder; every listing is sorted because the
# following cells pair the three of them by position
counts_paths = sorted(glob(os.path.join(data_path, 'counts', '*')))
features_paths = sorted(glob(os.path.join(data_path, 'features', '*')))
tissue_positions_paths = sorted(glob(os.path.join(data_path, 'tissue_positions', '*')))

In [19]:
# Check that the three listings describe the same samples before they are paired
# by position. zip() stops at the shortest list, so the lengths are compared
# first; otherwise extra files in one folder would pass the stem check unnoticed.
assert len(counts_paths) == len(features_paths) == len(tissue_positions_paths), (
    f'Expected the same number of files, got {len(counts_paths)} counts, '
    f'{len(features_paths)} features and {len(tissue_positions_paths)} tissue_positions'
)

# Check if the file names (without extension) are the same at every position
for counts_path, features_path, tissue_positions_path in zip(counts_paths, features_paths, tissue_positions_paths):
    assert Path(counts_path).stem == Path(features_path).stem == Path(tissue_positions_path).stem, (
        f'File name mismatch: {counts_path}, {features_path}, {tissue_positions_path}'
    )

In [21]:
# Convert every HGGE sample (npy counts + feature list + tissue positions) into
# the two text files SPCS reads: a counts table and a coordinate table.
for counts_path, features_path, tissue_positions_path in zip(counts_paths, features_paths, tissue_positions_paths):
    # Read the data; neither csv has a header row, so columns are numbered from 0
    counts = np.load(counts_path)
    features = pd.read_csv(features_path, header=None)
    tissue_positions = pd.read_csv(tissue_positions_path, header=None)

    # Only keep the spots that are in the tissue (column 1 equal to 1)
    tissue_positions = tissue_positions[tissue_positions[1]==1]

    # Label the matrix: columns from the feature list, rows from column 0 of the
    # kept tissue positions. Both assignments raise if the sizes do not match.
    counts_df = pd.DataFrame(counts, columns=features[0].tolist())
    counts_df.index = tissue_positions[0].tolist()

    # Transpose the counts so that the spots are in the columns
    counts_df = counts_df.T

    # Save the counts as a comma-separated text file.
    # index_label=False leaves the index column without a header field, so the
    # header row has one field fewer than the data rows (the layout R expects).
    # This replaces writing the file, reading it back and cutting the first
    # character off the header line.
    counts_save_path = os.path.join(out_path, 'counts', Path(counts_path).stem + '.txt')
    counts_df.to_csv(counts_save_path, sep=',', index=True, index_label=False)

    # Save the coordinates as a text file under the same stem as the counts file.
    # Columns 1, 4 and 5 are dropped; the assignment below raises unless exactly
    # three columns remain.
    coords_save_path = os.path.join(out_path, 'coords', Path(counts_path).stem + '.txt')
    coords_df = tissue_positions.drop(columns=[1,4,5])
    coords_df.columns = ['', 'coord1', 'coord2']

    # The first remaining column becomes the index so that the same
    # index_label=False header layout is produced here as well.
    coords_df.set_index('').to_csv(coords_save_path, sep=',', index_label=False)