# Format DESeq2 results for visualization

For our Python-based DEG visualization tool, we'll generate .pkl files for DEG data from each comparison within each cell type, and .json files that provide metadata used by the visualization tool.

## Load packages

In [1]:
from datetime import date
import hisepy
import json
import os
import pandas
import pickle
import re
import tarfile

## Output directories

In [2]:
if not os.path.isdir('output'):
    os.mkdir('output')

In [3]:
if not os.path.isdir('data'):
    os.mkdir('data')

## Parameters

In [18]:
adjp_cutoff = 0.05

## Helper functions

In [4]:
def read_csv_uuid(csv_uuid):
    csv_path = '/home/jupyter/cache/{u}'.format(u = csv_uuid)
    if not os.path.isdir(csv_path):
        hise_res = hisepy.reader.cache_files([csv_uuid])
    csv_filename = os.listdir(csv_path)[0]
    csv_file = '{p}/{f}'.format(p = csv_path, f = csv_filename)
    df = pandas.read_csv(csv_file)
    return df

In [5]:
def format_cell_type(cell_type):
    cell_type = re.sub('\\+', 'pos', cell_type)
    cell_type = re.sub('-', 'neg', cell_type)
    cell_type = re.sub(' ', '_', cell_type)
    return cell_type

## Retrieve DESeq2 results from HISE

In [6]:
deg_uuids = {
    'cohort': '',
    'cmv': '',
    'sex': '',
    'cmvneg_cohort': '',
    'cmvpos_cohort': '',
    'female_cohort': '',
    'male_cohort': ''
}

In [22]:
deg_dfs = {}
for contrast, uuid in deg_uuids.items():
    df = read_csv_uuid(uuid)
    df = df[['cell_type', 'fg', 'n_fg', 'bg', 'n_bg', 
             'gene', 'log2FoldChange', 
             'padj', 'pvalue', 'stat']]
    deg_dfs[contrast] = df

In [23]:
aim_names = {
    'cohort': 'Young Adult vs Older Adult',
    'cmv': 'CMV-Positive vs CMV-Negative',
    'sex': 'Female vs Male',
    'cmvneg_cohort': 'CMV-Negative Young Adult vs Older Adult',
    'cmvpos_cohort': 'CMV-Positive Young Adult vs Older Adult',
    'female_cohort': 'Female Young Adult vs Older Adult',
    'male_cohort': 'Male Young Adult vs Older Adult'
}

## Split DataFrames by cell type and save them to .pkl files

We'll nest these in the `./data/` output directory in a separate subdirectory for each contrast group.

As we iterate through the data, we'll also build a dictionary of metadata and a dictionary containing DEG counts based on the `adjp_cutoff` selected above.

In [24]:
df.head()

Unnamed: 0,cell_type,fg,n_fg,bg,n_bg,gene,log2FoldChange,padj,pvalue,stat
0,BaEoMaP cell,BR1,0,BR1,0,AL669831.5,0.176099,0.999989,0.904798,0.119602
1,BaEoMaP cell,BR1,0,BR1,0,NOC2L,-0.642299,0.999989,0.730353,-0.344656
2,BaEoMaP cell,BR1,0,BR1,0,ISG15,-0.854694,0.999989,0.373794,-0.889389
3,BaEoMaP cell,BR1,0,BR1,0,SDF4,0.614844,0.999989,0.509396,0.659778
4,BaEoMaP cell,BR1,0,BR1,0,B3GALT6,0.466746,0.999989,0.787854,0.269099


In [35]:
pkl_files = []

json_dict = {}
json_idx = 0

n_deg_dict = {}

for contrast, df in deg_dfs.items():
    aim = aim_names[contrast]
    
    out_path = './data/{c}'.format(c = contrast)
    if not os.path.isdir(out_path):
        os.mkdir(out_path)

    n_deg_dict[aim] = {}
    
    df = df.groupby('cell_type')
    for group_name, group_df in df:
        cell_type = format_cell_type(group_name)

        # Store significant gene counts in n_deg_dict
        sig_df = group_df[group_df['padj'] < adjp_cutoff]
        n_up = sum(sig_df['log2FoldChange'] > 0)
        n_dn = sum(sig_df['log2FoldChange'] < 0)
        
        n_deg_dict[aim][group_name] = {
            'fg': group_df['fg'].to_list()[0],
            'bg': group_df['bg'].to_list()[0],
            'n_up': n_up,
            'n_dn': n_dn,
            'n_tested': group_df.shape[0]
        }

        # Save results to .pkl file
        out_file = '{p}/{c}.pkl'.format(p = out_path, c = cell_type)
        with open(out_file, 'wb') as f:
            pickle.dump(group_df, f)
        pkl_files.append(out_file)

        # Store metadata in json_dict
        json_dict[str(json_idx)] = {
            'file': out_file,
            'analysis': {
                'aim': aim,
                'modality': 'scRNA_DE',
                'celltype': group_name
            },
            'type_differential': {
               'feat_header': 'gene',
               'es_header': 'log2FoldChange',
               'p_header': 'padj',
               'meta_header': ['fg', 'bg', 'pvalue', 'stat']
            }
        }
        
        json_idx += 1

## Assemble files as a .tar for storage

In [11]:
out_tar = 'output/ref_pbmc_deseq_results_pkl_{d}.tar'.format(d = date.today())
tar = tarfile.open(out_tar, 'w')
for pkl_file in pkl_files:
    tar.add(pkl_file)
tar.close()

## Write metadata dictionary to JSON

In [12]:
out_meta = 'output/ref_pbmc_deseq_results_meta_{d}.json'.format(d = date.today())
with open(out_meta, 'w') as f:
    json.dump(json_dict, f, indent = 4)

## Write DEG counts dictionary to JSON

In [None]:
out_deg = 'output/ref_pbmc_deseq_results_deg_counts_{d}.json'.format(d = date.today())
with open(out_deg, 'w') as f:
    json.dump(n_deg_dict, f, indent = 4)

## Upload results to HISE

In [13]:
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'PBMC Ref. DESeq2 for visualization {d}'.format(d = date.today())

In [14]:
in_files = list(deg_uuids.values())

In [15]:
in_files

['19220f75-6958-401c-99d7-55c60cfe0423',
 '2cf77dfc-c431-4d22-859f-90eda495b499',
 'f1d2aa19-0455-445b-8a9d-729ae739be42',
 'c0469a5f-fa10-43dd-8dc5-5aed3b7ce179',
 '5363fc80-7ad0-480b-851d-aaea5f45d332',
 '2d208f29-50e4-4a88-9323-8e9b0931fc6f',
 '8536f675-aeab-45c3-8929-b7e85bec71e7']

In [16]:
out_files = [out_tar, out_json]

In [17]:
out_files

['output/ref_pbmc_deseq_results_pkl_2024-04-05.tar',
 'output/ref_pbmc_deseq_results_2024-04-05.json']

In [55]:
import session_info
session_info.show()