# prepare-gsea
Generate input files for GSEA in the correct formats (.gct, .cls).

## Requirements
- py3 environment: python, pandas
- `oscutils/gsea_converters.py`: download from https://github.com/auberginekenobi/oscutils
- A **library-size-normalized** gene expression matrix (genes in rows, samples in columns). This can be generated by `deseq2_shL1CAM-shSHTN1.ipynb`; see the documentation for `DESeq2::counts` (with `normalized=TRUE`) or `edgeR::cpm`.
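- A sample metadata table (tab-separated) whose row labels match the column names of the expression matrix and which has a `sample` column; that column is what gets written to the .cls file.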

In [None]:
# Load dependencies
import pandas as pd
import sys
from pathlib import Path

# Make the local oscutils checkout importable. It is expected to live at
# ~/projects/oscutils; adjust the path if it was cloned elsewhere.
oscutils_path = f'{Path.home()}/projects/oscutils'
sys.path.append(oscutils_path)
import gsea_converters

In [None]:
# File I/O
# Inputs: normalized expression matrix and sample metadata table.
counts_file = 'results/deseq2/shL1CAM-shSHTN1_batch4.deseq2norm.tsv'
annotations_file = 'anno/sample_metadata.tsv'

# Outputs: the expression (.gct) and phenotype (.cls) files for GSEA.
counts_outfile = 'results/gsea/shL1CAM-shSHTN1_batch4.gct'
annotations_outfile = 'results/gsea/shL1CAM-shSHTN1_batch4.cls'

# Create the output directory (results/gsea) up front; a no-op if it exists.
Path(counts_outfile).parent.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the tab-separated expression matrix, using its first column as the row
# index, and hand it to the oscutils converter to write the .gct output file.
cts = pd.read_csv(
    counts_file,
    sep='\t',
    index_col=0,
)
gsea_converters.exp2gct(cts, counts_outfile)

# Preview the first rows of the matrix as the cell output.
cts.head()

In [None]:
# Read the sample metadata and keep only the `sample` column, restricted to and
# ordered like the columns of the expression matrix, then write it out via the
# oscutils converter to the .cls output file.
# NOTE(review): .loc selects rows by label and no index_col is given here, so
# this relies on the metadata file's layout yielding an index that holds the
# same IDs as cts.columns - confirm against the actual file.
annots = pd.read_csv(annotations_file, sep='\t').loc[cts.columns, 'sample']
gsea_converters.labels2cls(annots, annotations_outfile)

# Show the selected labels as the cell output.
annots