# Format DESeq2 results for visualization

For our Python-based DEG visualization tool, we'll generate .pkl files for DEG data from each comparison within each cell type, and .json files that provide metadata used by the visualization tool.

## Load packages

In [1]:
from datetime import date
import hisepy
import json
import os
import pandas
import pickle
import re
import tarfile

## Output directories

In [2]:
# Create the directory that later receives the bundled .tar output.
# exist_ok=True replaces the check-then-create pattern, so re-running the
# cell is a no-op and there is no gap between the check and the mkdir.
os.makedirs('output', exist_ok=True)

In [3]:
# Create the directory that holds the .pkl files and metadata .json files.
# exist_ok=True replaces the check-then-create pattern, so re-running the
# cell is a no-op and there is no gap between the check and the mkdir.
os.makedirs('data', exist_ok=True)

## Parameters

In [4]:
# Adjusted p-value threshold: rows whose `padj` is below this value are
# counted as significant when tallying up/down DEGs per cell type.
adjp_cutoff = 0.05

## Helper functions

In [5]:
def read_csv_uuid(csv_uuid):
    """Read the cached CSV for a file UUID into a pandas DataFrame.

    The file is looked up under ``/home/jupyter/cache/<csv_uuid>/``. If that
    directory does not exist yet, ``hisepy.reader.cache_files`` is called
    with the UUID before the directory is listed.

    Parameters
    ----------
    csv_uuid : str
        UUID of the file to read.

    Returns
    -------
    pandas.DataFrame
        Contents of the first entry returned by ``os.listdir`` for the
        cache directory, parsed with ``pandas.read_csv``.

    Raises
    ------
    FileNotFoundError
        If the cache directory exists but contains no files.
    """
    csv_path = '/home/jupyter/cache/{u}'.format(u = csv_uuid)
    if not os.path.isdir(csv_path):
        # Return value is not needed; the call is made for its caching effect
        hisepy.reader.cache_files([csv_uuid])

    cached = os.listdir(csv_path)
    if not cached:
        # Fail with a descriptive error rather than a bare IndexError
        raise FileNotFoundError(
            'No cached file found in {p}'.format(p = csv_path)
        )

    # NOTE(review): assumes one file per cache directory; with several,
    # os.listdir order decides which one is read - confirm.
    return pandas.read_csv(os.path.join(csv_path, cached[0]))

In [6]:
def format_cell_type(cell_type):
    """Return a version of a cell type label with symbols spelled out.

    Substitutions are applied in order: every ``+`` becomes ``pos``, every
    ``-`` becomes ``neg``, and every space becomes ``_``.
    """
    substitutions = (
        ('\\+', 'pos'),
        ('-', 'neg'),
        (' ', '_'),
    )
    for pattern, replacement in substitutions:
        cell_type = re.sub(pattern, replacement, cell_type)
    return cell_type

## DESeq2 results

### Retrieve DESeq2 results from HISE

In [7]:
# File UUIDs of the DESeq2 result tables, one per contrast.
# Insertion order matters: other cells iterate over this dict.
deg_uuids = dict([
    ('cohort', '4ed05e81-d2e4-4bce-a9d0-105844937a7a'),
    ('cmv', '654e8879-0ff5-40b1-b681-f7bf6ff56d6b'),
    ('sex', 'a6ebf838-a20c-418d-a90d-7d75255f6499'),
    ('cmvneg_cohort', '6a9efa94-bad1-42dc-a783-6fb13784111c'),
    ('cmvpos_cohort', '600fc816-7f37-445b-9231-daf569a701a8'),
    ('female_cohort', '7c7f5510-b4f1-48f7-9949-60c02c444daa'),
    ('male_cohort', '0242e417-cd3e-4def-b1c3-029296d6588c'),
])

In [8]:
# Columns retained from every DESeq2 results table
deg_columns = [
    'cell_type', 'fg', 'n_fg', 'bg', 'n_bg',
    'gene', 'log2FoldChange',
    'padj', 'pvalue', 'stat',
]

# Load each contrast's table and trim it to those columns.
# `df` is kept as a notebook-level name because the preview cell reads it.
deg_dfs = {}
for contrast, uuid in deg_uuids.items():
    df = read_csv_uuid(uuid).loc[:, deg_columns]
    deg_dfs[contrast] = df

In [9]:
# Display name ("aim") for each contrast key
aim_names = dict([
    ('cohort', 'Young Adult vs Older Adult'),
    ('cmv', 'CMV-Positive vs CMV-Negative'),
    ('sex', 'Female vs Male'),
    ('cmvneg_cohort', 'CMV-Negative Young Adult vs Older Adult'),
    ('cmvpos_cohort', 'CMV-Positive Young Adult vs Older Adult'),
    ('female_cohort', 'Female Young Adult vs Older Adult'),
    ('male_cohort', 'Male Young Adult vs Older Adult'),
])

### Split DataFrames by cell type and save them to .pkl files

We'll nest these under the `./data/deg/` output directory, in a separate subdirectory for each contrast group.

As we iterate through the data, we'll also build a dictionary of metadata and a dictionary containing DEG counts based on the `adjp_cutoff` selected above.

In [10]:
# Preview the first rows of `df`, i.e. whichever table was most recently
# assigned to that name (depends on the order the cells were run in).
df.head()

Unnamed: 0,cell_type,fg,n_fg,bg,n_bg,gene,log2FoldChange,padj,pvalue,stat
0,BaEoMaP cell,BR1,0,BR1,0,AL669831.5,0.176099,0.999989,0.904798,0.119602
1,BaEoMaP cell,BR1,0,BR1,0,NOC2L,-0.642299,0.999989,0.730353,-0.344656
2,BaEoMaP cell,BR1,0,BR1,0,ISG15,-0.854694,0.999989,0.373794,-0.889389
3,BaEoMaP cell,BR1,0,BR1,0,SDF4,0.614844,0.999989,0.509396,0.659778
4,BaEoMaP cell,BR1,0,BR1,0,B3GALT6,0.466746,0.999989,0.787854,0.269099


In [12]:
# Paths of every .pkl written, collected for tar bundling
pkl_files = []

# Metadata entries, keyed by a running index stored as a string
json_dict = {}
json_idx = 0

# DEG counts at adjp_cutoff, nested as aim -> cell type
n_deg_dict = {}

# Maps formatted cell type names back to the original labels
cell_type_names = {}

for contrast, df in deg_dfs.items():
    aim = aim_names[contrast]
    n_deg_dict[aim] = {}

    # One subdirectory per contrast
    out_path = './data/deg/{c}'.format(c = contrast)
    if not os.path.isdir(out_path):
        os.makedirs(out_path)

    for group_name, group_df in df.groupby('cell_type'):
        cell_type = format_cell_type(group_name)
        cell_type_names[cell_type] = group_name

        # Fold changes of the genes passing the adjusted p-value cutoff
        is_sig = group_df['padj'] < adjp_cutoff
        sig_lfc = group_df.loc[is_sig, 'log2FoldChange']

        # Builtin sum() is kept on purpose: iterating the boolean Series
        # should yield plain Python values, so the counts stay
        # JSON-serializable ints.
        n_deg_dict[aim][group_name] = {
            'fg': group_df['fg'].to_list()[0],
            'bg': group_df['bg'].to_list()[0],
            'n_up': sum(sig_lfc > 0),
            'n_dn': sum(sig_lfc < 0),
            'n_tested': group_df.shape[0]
        }

        # Write this cell type's results to its own .pkl file
        out_file = '{p}/{c}.pkl'.format(p = out_path, c = cell_type)
        with open(out_file, 'wb') as pkl_handle:
            pickle.dump(group_df, pkl_handle)
        pkl_files.append(out_file)

        # Describe the file just written for the metadata JSON
        entry = {
            'file': out_file,
            'analysis': {
                'aim': aim,
                'modality': 'scRNA_DE',
                'celltype': group_name
            },
            'type_differential': {
                'feat_header': 'gene',
                'es_header': 'log2FoldChange',
                'p_header': 'padj',
                'meta_header': ['fg', 'bg', 'pvalue', 'stat']
            }
        }
        json_dict[str(json_idx)] = entry
        json_idx += 1

### Write metadata dictionary to JSON

In [14]:
# Serialize the per-file metadata entries with 4-space indentation
out_deg_meta = 'data/deg_meta.json'
deg_meta_text = json.dumps(json_dict, indent = 4)
with open(out_deg_meta, 'w') as meta_handle:
    meta_handle.write(deg_meta_text)

### Write DEG counts dictionary to JSON

In [15]:
# Serialize the per-aim, per-cell-type DEG counts with 4-space indentation
out_deg_counts = 'data/deg_counts.json'
deg_counts_text = json.dumps(n_deg_dict, indent = 4)
with open(out_deg_counts, 'w') as counts_handle:
    counts_handle.write(deg_counts_text)

## Pseudobulk expression

We also need to structure the pseudobulk expression data for display. We can retrieve this from a .tar archive of .csv files that store the pseudobulk expression data.

### Retrieve and extract csv files

In [16]:
# File UUID of the archive containing the pseudobulk expression .csv files
# (the cached file is opened with tarfile further down).
expr_uuid = '0cbe7468-62df-45a4-be72-e7182904cbad'

In [17]:
# Cache directory for the expression archive; fetch it if it is not there
expr_path = os.path.join('/home/jupyter/cache', expr_uuid)
expr_is_cached = os.path.isdir(expr_path)
if not expr_is_cached:
    hise_res = hisepy.reader.cache_files([expr_uuid])

# Use the first directory entry as the archive file
expr_filename = os.listdir(expr_path)[0]
expr_file = os.path.join(expr_path, expr_filename)

In [18]:
# Open the archive for reading and unpack every member into the working
# directory. The handle is left open on purpose: the next cell still reads
# the member names from `tar`.
# NOTE(review): the archive is never closed afterwards, and members are
# extracted without any path filtering - fine only for a trusted archive.
tar = tarfile.open(name = expr_file, mode = 'r')
tar.extractall(path = '.')

In [19]:
# Names of all archive members; each one is later read as a .csv file
expr_csvs = [member.name for member in tar.getmembers()]

### Read csvs into DataFrames

In [20]:
# One DataFrame per extracted .csv, keyed by a cell type parsed from the
# file name
expr_dfs = {}
for expr_csv in expr_csvs:
    # Drop everything from '_seurat' onward, then everything up to and
    # including 'pbmc_'; what remains is used as the cell type key
    cell_type = re.sub('.+pbmc_', '', re.sub('_seurat.+', '', expr_csv))
    df = pandas.read_csv(expr_csv)
    expr_dfs[cell_type] = df

### Convert to pkl for visualization

#### All samples: for top-level comparisons

In [21]:
# Write the unfiltered expression table of every cell type to
# ./data/expr/all/ and record each path for tar bundling.
out_path = './data/expr/all/'
os.makedirs(out_path, exist_ok=True)
for cell_type, df in expr_dfs.items():
    # out_path already ends in '/', so joining with '{p}/{t}' produced a
    # doubled slash in the recorded path; os.path.join avoids that.
    out_file = os.path.join(out_path, '{t}_expr.pkl'.format(t = cell_type))
    with open(out_file, 'wb') as f:
        pickle.dump(df, f)
    pkl_files.append(out_file)

In [29]:
# Preview the first rows of `df`, i.e. whichever table was most recently
# assigned to that name (depends on the order the cells were run in).
df.head()

Unnamed: 0,barcodes,gene,normalized_counts,subject.subjectGuid,subject.biologicalSex,subject.cmv,cohort.cohortGuid,sample.visitName,sample.subjectAge
1,SOX4+-naive-CD8-T-cell.BR1002.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1002,Male,Negative,BR1,Flu Year 1 Day 0,28
3,SOX4+-naive-CD8-T-cell.BR1004.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1004,Male,Negative,BR1,Flu Year 1 Day 0,30
5,SOX4+-naive-CD8-T-cell.BR1006.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1006,Male,Negative,BR1,Flu Year 1 Day 0,31
6,SOX4+-naive-CD8-T-cell.BR1007.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1007,Male,Positive,BR1,Flu Year 1 Day 0,30
10,SOX4+-naive-CD8-T-cell.BR1011.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1011,Male,Negative,BR1,Flu Year 1 Day 0,31


#### Subset samples

In [22]:
# Sample subsets: subset name -> (metadata column, value a row must have)
subset_filters = dict([
    ('cmvpos', ('subject.cmv', 'Positive')),
    ('cmvneg', ('subject.cmv', 'Negative')),
    ('female', ('subject.biologicalSex', 'Female')),
    ('male', ('subject.biologicalSex', 'Male')),
])

In [23]:
# For every subset, write a filtered copy of each cell type's expression
# table to ./data/expr/<subset>/ and record the path for tar bundling.
# The (column, value) pair is unpacked in the loop header so the builtin
# `filter` is no longer shadowed for the rest of the session.
for subset_name, (filter_col, filter_value) in subset_filters.items():
    out_path = './data/expr/{s}'.format(s = subset_name)
    os.makedirs(out_path, exist_ok=True)
    for cell_type, df in expr_dfs.items():
        # Keep only the rows whose metadata column matches the subset value
        subset_df = df[df[filter_col] == filter_value]

        out_file = '{p}/{t}_expr.pkl'.format(p = out_path, t = cell_type)
        with open(out_file, 'wb') as f:
            pickle.dump(subset_df, f)
        pkl_files.append(out_file)

KeyboardInterrupt: 

#### Build metadata dictionary

In [None]:
# Which expression sample set (subdirectory of ./data/expr/) each contrast uses
aim_sets = dict([
    ('cohort', 'all'),
    ('cmv', 'all'),
    ('sex', 'all'),
    ('cmvneg_cohort', 'cmvneg'),
    ('cmvpos_cohort', 'cmvpos'),
    ('female_cohort', 'female'),
    ('male_cohort', 'male'),
])

In [None]:
# Metadata column used as the grouping header for each contrast.
# Every contrast groups by cohort except the two overridden below.
aim_grouping = dict.fromkeys(
    [
        'cohort', 'cmv', 'sex',
        'cmvneg_cohort', 'cmvpos_cohort',
        'female_cohort', 'male_cohort',
    ],
    'cohort.cohortGuid',
)
aim_grouping['cmv'] = 'subject.cmv'
aim_grouping['sex'] = 'subject.biologicalSex'

In [None]:
# Build one metadata entry per (contrast, cell type) expression .pkl file.
#
# Fixes relative to the earlier version of this cell:
#   * json_idx is incremented, so entries no longer overwrite each other
#   * the directory listing is kept in its own name, so the `pkl_files`
#     list collected for tar bundling is not overwritten
#   * 'file' is built from the directory actually listed (pkl_path), not
#     from `out_path` left over from a previous cell
#   * keys are strings, matching the DEG metadata dictionary
pb_json = {}
json_idx = 0

expr_suffix = '_expr.pkl'

for contrast, aim_set in aim_sets.items():
    pkl_path = './data/expr/{s}/'.format(s = aim_set)

    # Only expression .pkl files, in a stable order; anything else in the
    # directory (e.g. editor or checkpoint files) is ignored
    expr_pkls = sorted(
        name for name in os.listdir(pkl_path)
        if name.endswith(expr_suffix)
    )

    for pkl_file in expr_pkls:
        # Strip the suffix to recover the cell type part of the file name
        cell_type = pkl_file[:-len(expr_suffix)]

        # NOTE(review): cell_type_names is keyed by format_cell_type()
        # output from the DEG tables; confirm the expression file names
        # use the same spelling, otherwise this lookup raises KeyError.
        pb_json[str(json_idx)] = {
            'file': os.path.join(pkl_path, pkl_file),
            'analysis': {
                'aim': aim_names[contrast],
                'modality': 'scRNA_DE',
                'celltype': cell_type_names[cell_type]
            },
            'type_expression_xsec': {
                'feat_header': 'gene',
                'exp_header': 'normalized_counts',
                'obs_header': 'subject.subjectGuid',
                # NOTE(review): 'n_cells' does not appear among the columns
                # shown in the expression table preview - confirm it exists.
                'meta_header': [
                    'subject.subjectGuid',
                    'sample.subjectAge',
                    'subject.biologicalSex',
                    'subject.cmv',
                    'cohort.cohortGuid',
                    'sample.visitName',
                    'n_cells'
                ],
                'group_header': aim_grouping[contrast],
                'paired_data_header': {},
                'paired_meta_header': {}
            }
        }
        json_idx += 1

# NOTE(review): pb_json is built here but not written to disk in this cell.

### Assemble files as a .tar for storage

In [11]:
# Bundle every collected .pkl file into a dated .tar in ./output/.
# The context manager closes the archive even if adding a file fails, so a
# failed run does not leave an open handle behind.
out_tar = 'output/ref_pbmc_deseq_results_pkl_{d}.tar'.format(d = date.today())
with tarfile.open(out_tar, 'w') as tar:
    for pkl_file in pkl_files:
        tar.add(pkl_file)

## Upload results to HISE

In [13]:
# Presumably the study space that receives the upload (the upload call
# itself is not part of this cell)
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'

# Title stamped with today's date
title_template = 'PBMC Ref. DESeq2 for visualization {d}'
title = title_template.format(d = date.today())

In [14]:
# Input file UUIDs: the DESeq2 result tables, in dict order.
# NOTE(review): expr_uuid (the pseudobulk expression archive) is not
# included here - confirm whether it should be listed as an input too.
in_files = [*deg_uuids.values()]

In [15]:
# Display the input UUID list for a visual check
in_files

['19220f75-6958-401c-99d7-55c60cfe0423',
 '2cf77dfc-c431-4d22-859f-90eda495b499',
 'f1d2aa19-0455-445b-8a9d-729ae739be42',
 'c0469a5f-fa10-43dd-8dc5-5aed3b7ce179',
 '5363fc80-7ad0-480b-851d-aaea5f45d332',
 '2d208f29-50e4-4a88-9323-8e9b0931fc6f',
 '8536f675-aeab-45c3-8929-b7e85bec71e7']

In [16]:
# Files to upload: the .pkl bundle plus a JSON file.
# NOTE(review): `out_json` is not assigned anywhere in the visible notebook
# (the JSON files written above are `out_deg_meta` and `out_deg_counts`),
# so this line raises NameError on a fresh run - confirm the intended file.
out_files = [out_tar, out_json]

In [17]:
# Display the output file list for a visual check
out_files

['output/ref_pbmc_deseq_results_pkl_2024-04-05.tar',
 'output/ref_pbmc_deseq_results_2024-04-05.json']

In [55]:
# Show session information via the session_info package at the end of the
# notebook
import session_info
session_info.show()