# Format DESeq2 results for visualization

For our Python-based DEG visualization tool, we'll generate .pkl files for DEG data from each comparison within each cell type, and .json files that provide metadata used by the visualization tool.

## Change Pandas version

For compatibility with our visualization tooling, we'll set pandas to run in v1.3.5

In [None]:
!pip install -q pandas==1.3.5

## Load packages

In [1]:
from datetime import date
import hisepy
import json
import os
import pandas
import pickle
import re
import tarfile

## Output directories

In [2]:
if not os.path.isdir('output'):
    os.mkdir('output')

In [3]:
if not os.path.isdir('data'):
    os.mkdir('data')

## Parameters

In [4]:
adjp_cutoff = 0.05

## Helper functions

In [5]:
def read_csv_uuid(csv_uuid):
    csv_path = '/home/jupyter/cache/{u}'.format(u = csv_uuid)
    if not os.path.isdir(csv_path):
        hise_res = hisepy.reader.cache_files([csv_uuid])
    csv_filename = os.listdir(csv_path)[0]
    csv_file = '{p}/{f}'.format(p = csv_path, f = csv_filename)
    df = pandas.read_csv(csv_file)
    return df

In [6]:
def format_cell_type(cell_type):
    cell_type = re.sub('\\+', 'pos', cell_type)
    cell_type = re.sub('-', 'neg', cell_type)
    cell_type = re.sub(' ', '_', cell_type)
    return cell_type

## DESeq2 results

### Retrieve DESeq2 results from HISE

In [7]:
deg_uuids = {
    'cohort': '4ed05e81-d2e4-4bce-a9d0-105844937a7a',
    'cmv': '654e8879-0ff5-40b1-b681-f7bf6ff56d6b',
    'sex': 'a6ebf838-a20c-418d-a90d-7d75255f6499',
    'cmvneg_cohort': '6a9efa94-bad1-42dc-a783-6fb13784111c',
    'cmvpos_cohort': '600fc816-7f37-445b-9231-daf569a701a8',
    'female_cohort': '7c7f5510-b4f1-48f7-9949-60c02c444daa',
    'male_cohort': '0242e417-cd3e-4def-b1c3-029296d6588c'
}

In [8]:
deg_dfs = {}
for contrast, uuid in deg_uuids.items():
    df = read_csv_uuid(uuid)
    df = df[['cell_type', 'fg', 'n_fg', 'bg', 'n_bg', 
             'gene', 'log2FoldChange', 
             'padj', 'pvalue', 'stat']]
    deg_dfs[contrast] = df

In [9]:
aim_names = {
    'cohort': 'Young Adult vs Older Adult',
    'cmv': 'CMV-Positive vs CMV-Negative',
    'sex': 'Female vs Male',
    'cmvneg_cohort': 'CMV-Negative Young Adult vs Older Adult',
    'cmvpos_cohort': 'CMV-Positive Young Adult vs Older Adult',
    'female_cohort': 'Female Young Adult vs Older Adult',
    'male_cohort': 'Male Young Adult vs Older Adult'
}

### Split DataFrames by cell type and save them to .pkl files

We'll nest these in the `./data/` output directory in a separate subdirectory for each contrast group.

As we iterate through the data, we'll also build a dictionary of metadata and a dictionary containing DEG counts based on the `adjp_cutoff` selected above.

In [10]:
df.head()

Unnamed: 0,cell_type,fg,n_fg,bg,n_bg,gene,log2FoldChange,padj,pvalue,stat
0,BaEoMaP cell,BR1,0,BR2,0,AL669831.5,0.176099,0.999989,0.904798,0.119602
1,BaEoMaP cell,BR1,0,BR2,0,NOC2L,-0.642299,0.999989,0.730353,-0.344656
2,BaEoMaP cell,BR1,0,BR2,0,ISG15,-0.854694,0.999989,0.373794,-0.889389
3,BaEoMaP cell,BR1,0,BR2,0,SDF4,0.614844,0.999989,0.509396,0.659778
4,BaEoMaP cell,BR1,0,BR2,0,B3GALT6,0.466746,0.999989,0.787854,0.269099


In [11]:
# Used to collect files for tar bundling
out_pkl_files = []

# Used to generate metadata
json_dict = {}
json_idx = 0

# Used to store DEG counts based on adjp_cutoff
n_deg_dict = {}

# Used to convert names for pseudobulk metadata
cell_type_names = {}

# Used to filter pseudobulk data for genes that were tested
test_genes = {}

for contrast, df in deg_dfs.items():
    aim = aim_names[contrast]
    test_genes[contrast] = {}
    
    out_path = './data/deg/{c}'.format(c = contrast)
    if not os.path.isdir(out_path):
        os.makedirs(out_path)
    
    n_deg_dict[aim] = {}
    
    df = df.groupby('cell_type')
    for group_name, group_df in df:
        cell_type = format_cell_type(group_name)
        cell_type_names[cell_type] = group_name

        # Store tested genes for pseudobulk expression
        test_genes[contrast][cell_type] = group_df['gene'].tolist()
        
        # Store significant gene counts in n_deg_dict
        sig_df = group_df[group_df['padj'] < adjp_cutoff]
        n_up = sum(sig_df['log2FoldChange'] > 0)
        n_dn = sum(sig_df['log2FoldChange'] < 0)
        
        n_deg_dict[aim][group_name] = {
            'fg': group_df['fg'].to_list()[0],
            'bg': group_df['bg'].to_list()[0],
            'n_up': n_up,
            'n_dn': n_dn,
            'n_tested': group_df.shape[0]
        }

        # Save results to .pkl file
        out_file = '{p}/{c}_deg.pkl'.format(p = out_path, c = cell_type)
        with open(out_file, 'wb') as f:
            pickle.dump(group_df, f, protocol = 4)
        out_pkl_files.append(out_file)

        # Store metadata in json_dict
        json_dict[str(json_idx)] = {
            'file': out_file,
            'analysis': {
                'aim': aim,
                'modality': 'scRNA_DE',
                'celltype': group_name
            },
            'type_differential': {
               'feat_header': 'gene',
               'es_header': 'log2FoldChange',
               'p_header': 'padj',
               'meta_header': ['fg', 'bg', 'pvalue', 'stat']
            }
        }
        
        json_idx += 1

### Write metadata dictionary to JSON

In [12]:
out_deg_meta = 'data/deg_meta.json'
with open(out_deg_meta, 'w') as f:
    json.dump(json_dict, f, indent = 4)

### Write DEG counts dictionary to JSON

In [13]:
out_deg_counts = 'data/deg_counts.json'
with open(out_deg_counts, 'w') as f:
    json.dump(n_deg_dict, f, indent = 4)

## Pseudobulk expression

We also need to structure the pseudobulk expression data for display. We can retrieve this from the .csv files that store pseudobulk expression data.

### Retrieve and extract csv files

In [14]:
expr_uuid = 'f27070c0-8da8-445b-8887-796d3271e463'

In [15]:
expr_path = '/home/jupyter/cache/{u}'.format(u = expr_uuid)
if not os.path.isdir(expr_path):
    hise_res = hisepy.reader.cache_files([expr_uuid])
expr_filename = os.listdir(expr_path)[0]
expr_file = '{p}/{f}'.format(p = expr_path, f = expr_filename)

In [16]:
tar = tarfile.open(expr_file)
tar.extractall()

In [17]:
expr_csvs = tar.getnames()

### Read csvs into DataFrames

In [18]:
friendly_names = {
    'subject.subjectGuid': 'Subject',
    'subject.biologicalSex': 'Biological Sex',
    'subject.cmv': 'CMV Status',
    'cohort.cohortGuid': 'Cohort',
    'sample.visitName': 'Visit',
    'sample.subjectAge': 'Age (Years)',
    'n_cells': 'N Cells'
}

In [19]:
expr_dfs = {}
for expr_csv in expr_csvs:
    df = pandas.read_csv(expr_csv)
    cell_type = re.sub('_seurat.+', '', expr_csv)
    cell_type = re.sub('.+pbmc_', '', cell_type)
    df = df.rename(friendly_names, axis = 1)
    expr_dfs[cell_type] = df

In [20]:
df.head()

Unnamed: 0,barcodes,gene,normalized_counts,Subject,Biological Sex,CMV Status,Cohort,Visit,Age (Years),N Cells
0,pDC.BR1001.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1001,Female,Negative,BR1,Flu Year 1 Day 0,32,98
1,pDC.BR1002.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1002,Male,Negative,BR1,Flu Year 1 Day 0,28,145
2,pDC.BR1003.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1003,Female,Negative,BR1,Flu Year 1 Day 0,30,79
3,pDC.BR1004.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1004,Male,Negative,BR1,Flu Year 1 Day 0,30,82
4,pDC.BR1005.Flu-Year-1-Day-0,MIR1302-2HG,0,BR1005,Female,Negative,BR1,Flu Year 1 Day 0,27,96


### Convert to pkl for visualization

#### All samples: for top-level comparisons

In [21]:
out_path = './data/expr/all/'
if not os.path.isdir(out_path):
    os.makedirs(out_path)
for cell_type, df in expr_dfs.items():
    keep_genes = test_genes['cohort'][cell_type]
    df = df[df['gene'].isin(keep_genes)]
    
    out_file = '{p}/{t}_expr.pkl'.format(p = out_path, t = cell_type)
    with open(out_file, 'wb') as f:
        pickle.dump(df, f, protocol = 4)
    out_pkl_files.append(out_file)

#### Subset samples

In [22]:
subset_filters = {
    'cmvpos_cohort': ('CMV Status', 'Positive'),
    'cmvneg_cohort': ('CMV Status', 'Negative'),
    'female_cohort': ('Biological Sex', 'Female'),
    'male_cohort': ('Biological Sex', 'Male')
}

In [23]:
for subset_name, filter in subset_filters.items():
    out_path = './data/expr/{s}'.format(s = subset_name)
    if not os.path.isdir(out_path):
        os.makedirs(out_path)
    for cell_type, df in expr_dfs.items():
        # Filter data for sample subsets
        filter_col = filter[0]
        filter_value = filter[1]
        df = df[df[filter_col] == filter_value]
        
        # Filter data for tested genes
        keep_genes = test_genes[subset_name][cell_type]
        df = df[df['gene'].isin(keep_genes)]
        
        out_file = '{p}/{t}_expr.pkl'.format(p = out_path, t = cell_type)
        with open(out_file, 'wb') as f:
            pickle.dump(df, f, protocol = 4)
        out_pkl_files.append(out_file)

#### Build metadata dictionary

In [24]:
aim_sets = {
    'cohort': 'all',
    'cmv': 'all',
    'sex': 'all',
    'cmvneg_cohort': 'cmvneg_cohort',
    'cmvpos_cohort': 'cmvpos_cohort',
    'female_cohort': 'female_cohort',
    'male_cohort': 'male_cohort'
}

In [25]:
aim_grouping = {
    'cohort': 'Cohort',
    'cmv': 'CMV Status',
    'sex': 'Biological Sex',
    'cmvneg_cohort': 'Cohort',
    'cmvpos_cohort': 'Cohort',
    'female_cohort': 'Cohort',
    'male_cohort': 'Cohort'
}

In [26]:
expr_json = {}
json_idx = 0

for contrast, aim_set in aim_sets.items():
    pkl_path = './data/expr/{s}'.format(s = aim_set)
    pkl_list = os.listdir(pkl_path)
    for pkl_file in pkl_list:
        cell_type = re.sub('_expr.pkl','',pkl_file)
        expr_json[json_idx] = {
            'file': '{p}/{f}'.format(p = pkl_path, f = pkl_file),
            'analysis': {
                'aim': aim_names[contrast],
                'modality': 'scRNA_DE',
                'celltype': cell_type_names[cell_type]
            },
            'type_expression_xsec': {
                'feat_header': 'gene',
                'exp_header': 'normalized_counts',
                'obs_header': 'Subject',
                'meta_header': [
                    'Subject',
                    'Age (Years)',
                    'Biological Sex',
                    'CMV Status',
                    'Cohort',
                    'Visit',
                    'N Cells'
                ],
                'group_header': aim_grouping[contrast],
                'paired_data_header': {},
                'paired_meta_header': {}
            }
        }
        json_idx += 1

In [27]:
out_expr_meta = 'data/expr_meta.json'
with open(out_expr_meta, 'w') as f:
    json.dump(expr_json, f, indent = 4)

### Assemble files as a .tar for storage

In [28]:
all_out_files = out_pkl_files + [out_deg_meta, out_deg_counts, out_expr_meta]

out_tar = 'output/ref_pbmc_deseq_results_vis_{d}.tar'.format(d = date.today())
tar = tarfile.open(out_tar, 'w')
for out_file in all_out_files:
    tar.add(out_file)
tar.close()

## Upload results to HISE

In [29]:
study_space_uuid = '64097865-486d-43b3-8f94-74994e0a72e0'
title = 'PBMC Ref. DESeq2 for Vis Tools {d}'.format(d = date.today())

In [30]:
in_files = list(deg_uuids.values())

In [31]:
in_files

['4ed05e81-d2e4-4bce-a9d0-105844937a7a',
 '654e8879-0ff5-40b1-b681-f7bf6ff56d6b',
 'a6ebf838-a20c-418d-a90d-7d75255f6499',
 '6a9efa94-bad1-42dc-a783-6fb13784111c',
 '600fc816-7f37-445b-9231-daf569a701a8',
 '7c7f5510-b4f1-48f7-9949-60c02c444daa',
 '0242e417-cd3e-4def-b1c3-029296d6588c']

In [32]:
out_files = [out_tar]
out_files

['output/ref_pbmc_deseq_results_vis_2024-04-16.tar']

In [33]:
hisepy.upload.upload_files(
    files = out_files,
    study_space_id = study_space_uuid,
    title = title,
    input_file_ids = in_files,
    destination = 'diha_deg_pseudobulk_vis'
)

Cannot determine the current notebook.
1) /home/jupyter/scRNA-Reference-IH-A/08-Differential_expression/36a-Python_pseudobulk_deseq2_vis_format.ipynb
2) /home/jupyter/Untitled.ipynb
3) /home/jupyter/scRNA-Reference-IH-A/visualizations/04-Python_other_markers.ipynb
Please select (1-3) 


 1


you are trying to upload file_ids... ['output/ref_pbmc_deseq_results_vis_2024-04-16.tar']. Do you truly want to proceed?


(y/n) y


{'trace_id': 'e9e5838c-da9f-48d2-9ace-a636f7827da2',
 'files': ['output/ref_pbmc_deseq_results_vis_2024-04-16.tar']}

In [34]:
import session_info
session_info.show()