# Producing R-compatible TSV files from .loom

Markdown text

In [1]:
import numpy as np
import pandas
import scanpy as sc
import anndata
import json
import matplotlib.pyplot as plt
import scvelo as scv
import os
import requests
import shutil
import boto3
import scanpy as sc
from collections import defaultdict

In [2]:
# Count matrices to load. Allowed entries: 'filtered' or 'all'.
TYPE = [
    'filtered',
    'all',
]

# Layers to export. Allowed entries: 'matrix' (all counts), 'spliced',
# 'unspliced' or 'ambiguous'.
LAYER = [
    'spliced',
]

# Time series comparisons to export. Each comparison is a tuple built from
# any of 'm5', 'm3' or 'm1'.
TIMESERIES = [
    ('m5', 'm3'),
    ('m3', 'm1'),
    ('m5', 'm1'),
]

## Step 1 -- Read count matrix.

This reads the count matrix for one of the entries in `TYPE`. It also trims the observation names returned by `velocyto` so that they are easier to parse in R.

In [3]:
def read_count_matrix(type):
    """Load ``count_matrices/<type>.loom`` and shorten its observation names.

    Each observation name is reduced to the text between its first and
    second ``:`` separators (or to the end if there is only one), of which
    the first two ``.``-separated pieces are kept and joined with ``_``.
    For illustration, a name of the form ``'x:m1.1.y'`` becomes ``'m1_1'``.

    Args:
        type: Base name (without extension) of the .loom file to read.

    Returns:
        The object returned by ``sc.read_loom`` with ``obs.index`` replaced
        by the shortened names.

    Raises:
        IndexError: If an observation name contains no ``:``.
    """
    loom_path = os.path.join('count_matrices', type + '.loom')
    m = sc.read_loom(loom_path)

    short_names = []
    for full_name in m.obs.index:
        after_colon = full_name.split(":")[1]
        short_names.append('_'.join(after_colon.split('.')[:2]))
    m.obs.index = short_names

    return m

## Step 2 -- Filter by timeseries.

We will now fetch the time series we want to compare.

In [4]:
def filter_by_timeseries(matrix, comparison):
    """Keep only the observations that belong to the compared time series.

    An observation is kept when at least one label in ``comparison`` occurs
    as a substring of its name in ``matrix.obs.index``.

    Args:
        matrix: Object exposing ``obs.index`` (an iterable of observation
            names) and supporting ``matrix[row_mask, :]`` indexing.
        comparison: Iterable of time series labels, e.g. ``('m5', 'm3')``.

    Returns:
        The result of ``matrix[row_mask, :]``, i.e. ``matrix`` restricted to
        the matching observations with every variable kept.
    """
    keep = [
        any(timeseries in index for timeseries in comparison)
        for index in matrix.obs.index
    ]

    # Slice the argument itself rather than a notebook-level global, so the
    # function works on whatever matrix it is given.
    return matrix[keep, :]

## Step 3 -- Create Pandas DataFrame from matrix

This also reorders the columns so that they follow the order of the time series in the comparison, as expected by later stages.

In [5]:
def reorder_and_create_df(matrix, type, layer, comparison):
    """Build an integer DataFrame from one layer, grouped by time series.

    The chosen layer is transposed so that rows are ``matrix.var_names`` and
    columns are ``matrix.obs_names``. Columns are then arranged so that all
    columns of the first time series in ``comparison`` come first, followed
    by those of the next one, and so on. Columns whose time series is not
    listed in ``comparison`` are dropped.

    Args:
        matrix: Object exposing ``layers``, ``obs_names`` and ``var_names``;
            ``matrix.layers[layer]`` must provide ``toarray()``.
        type: Unused here; kept so the signature stays unchanged.
        layer: Key of the layer to export.
        comparison: Iterable of time series labels giving the column order.

    Returns:
        A ``pandas.DataFrame`` of ``int`` values.

    Raises:
        ValueError: If an observation name does not contain exactly one
            ``_`` (names are expected to look like ``<timeseries>_<id>``).
    """
    counts = matrix.layers[layer].toarray().T
    df = pandas.DataFrame(
        data=counts,
        columns=list(matrix.obs_names),
        index=list(matrix.var_names),
    ).astype(int)

    # Bucket the column names by the label in front of the underscore.
    grouped = defaultdict(list)
    for name in df.columns:
        label, _ = name.split("_")
        grouped[label].append(name)

    ordered = [name for label in comparison for name in grouped[label]]

    return df[ordered]

In [6]:
# Export one TSV per (type, layer, comparison) combination.
#
# open() does not create missing directories, so make sure the output
# directory exists before the first file is written.
os.makedirs('tsv_matrices', exist_ok=True)

for type in TYPE:
    for layer in LAYER:
        for comparison in TIMESERIES:
            print("Working on type:", type, ', layer:', layer, 'between', comparison)

            # `m` is rebound at every stage below, so the count matrix is
            # read again for each combination.
            print('Reading count matrix..')
            m = read_count_matrix(type)
            print(m.obs.index)

            print("Filtering count matrix to include only relevant timeseries...")
            m = filter_by_timeseries(m, comparison)
            print(m.obs.index)

            print("Creating dataframe and reordering columns...")
            m = reorder_and_create_df(m, type, layer, comparison)
            print(m)

            # Output name pattern: matrix_<type>_<layer>_<ts1>-<ts2>.tsv
            filename = 'matrix_' + type + '_' + layer + '_' + '-'.join(comparison) + '.tsv'
            path = os.path.join('tsv_matrices', filename)
            print("saving to", os.path.abspath(path))

            # newline='' leaves line-ending handling to to_csv.
            with open(path, 'w', newline='') as f:
                m.to_csv(f, sep="\t")

            print("")

Working on type: filtered , layer: spliced between ('m5', 'm3')
Reading count matrix..
Index(['m1_1', 'm1_2', 'm1_3', 'm1_4', 'm3_1', 'm3_2', 'm3_3', 'm3_4', 'm5_1',
       'm5_2', 'm5_3', 'm5_4'],
      dtype='object')
Filtering count matrix to include only relevant timeseries...
Index(['m3_1', 'm3_2', 'm3_3', 'm3_4', 'm5_1', 'm5_2', 'm5_3', 'm5_4'], dtype='object')
Creating dataframe and reordering columns...
           m5_1  m5_2  m5_3  m5_4  m3_1  m3_2  m3_3  m3_4
Y74C9A.6      0     0     0     0     0     0     0     0
homt-1       28    57    35    47    94    65    61    68
rcor-1      170   149    93   209   107   243   292   269
Y74C9A.9      0     0     0     0     0     0     0     0
sesn-1       21    23    11    11     1    10    20    31
...         ...   ...   ...   ...   ...   ...   ...   ...
T23E7.8       0     0     0     0     0     0     0     0
T23E7.2       8     0     1     0    29     0     0    27
cgt-2         0     0     0     0     0     0     0     0
6R55.