# ERV differentiation

Two main research questions:
1. Are ERVs (endogenous retroviruses) differentially expressed in SCC (squamous cell carcinoma) versus AC (adenocarcinoma)?
2. Are ERVs involved in immune surveillance in lung cancer?

We have to:
* identify ERVs in Adeno and SCC
* identify granzyme and perforin expression per subtype
* identify IFN gene expression per subtype
* assess silencing of ERVs by methylation in normal tissue



In [117]:
import vaex as vx
import pandas as pd
import pyarrow.parquet as pq
import numpy as np
import scipy as sc

import os
import sys
import re
from collections import Counter
from collections import namedtuple

In [118]:
# Switch to the directory holding the lung data files, so that the relative
# file names opened further down resolve against it.
data_dir = '/media/koekiemonster/DATA-FAST/genetic_expression/hackathon_2/Lung/'
os.chdir(data_dir)

In [119]:
# Open each lung data set with vaex and keep the handles in one dict, keyed by
# a short modality name.
dd = {
    'RNAex': vx.open('Lung_GeneExpression.hdf5'),
    'mutation': vx.open('Lung_Mutation.hdf5'),
    'methylation': vx.open('Lung_Methylation.hdf5'),
    'methylation_meta': vx.open('HumanMethylation450_meta.hdf5'),
    'CNV': vx.open('Lung_CNV.hdf5'),
}

# Phenotype metadata: drop rows without a diagnosis, keep only the last row
# for each SampleID, then index the table by SampleID.
meta = pd.read_csv('Lung_Phenotype_Metadata.txt', sep='\t')
meta = meta[meta['Diagnosis'].notna()]
meta = meta.drop_duplicates(subset='SampleID', keep='last').set_index('SampleID')

In [120]:
# Gather every gene name containing the substring 'ERV' from the expression
# and the methylation tables, without duplicates.
#
# The result is sorted on purpose: the iteration order of a set of strings
# changes between interpreter runs (hash randomisation), and this list drives
# the row/column order of every table built from it, so an unsorted
# list(set(...)) makes the downstream frames differ from run to run.
#
# NOTE(review): this is a plain substring match, so any gene name that merely
# contains 'ERV' is selected as well -- confirm that is the intended filter.
_gene_candidates = list(dd['RNAex'].Gene.unique()) + list(dd['methylation'].Gene.unique())
erv_genes = sorted({_gene for _gene in _gene_candidates if 'ERV' in _gene})

In [121]:
# For every data set that has a 'Gene' column, filter the rows of each ERV gene
# (one pandas frame per gene, in erv_genes order) and stack them row-wise.
# Data sets without a 'Gene' column are left out of df_erv.
df_erv = {
    _name: pd.concat(
        [_table[_table.Gene == _erv].to_pandas_df() for _erv in erv_genes],
        axis=0,
    )
    for _name, _table in dd.items()
    if 'Gene' in _table.columns
}

In [122]:
# Split the ERV methylation table in two:
#   * probe_map      -- probe annotation (one row per probe, fresh 0..n-1 index)
#   * df_erv[...]    -- the remaining columns, transposed so that probes become
#                       columns and the former column names become the index.
# The index labels then get '_' replaced by '-'.
# NOTE(review): the remaining columns are presumably one per sample and the
# replacement presumably aligns them with the SampleID format of the metadata
# -- confirm against the data files.
_meth = df_erv['methylation']
_probe_annotation = ['Chr', 'Start', 'Stop', 'Strand', 'Gene', 'Relation_CpG_Island']

probe_map = _meth[['probeID'] + _probe_annotation].reset_index(drop=True)

_meth = _meth.drop(columns=_probe_annotation).set_index('probeID').T
_meth.index = [_label.replace('_', '-') for _label in _meth.index]
df_erv['methylation'] = _meth

In [123]:
# Split the ERV expression table in two:
#   * exp_map        -- gene annotation (gene name plus position columns,
#                       fresh 0..n-1 index)
#   * df_erv[...]    -- the remaining columns, transposed so that genes become
#                       columns and the former column names become the index.
# The index labels then get '_' replaced by '-'.
# NOTE(review): the remaining columns are presumably one per sample and the
# replacement presumably aligns them with the SampleID format of the metadata
# -- confirm against the data files.
_rna = df_erv['RNAex']
_gene_position = ['Chr', 'Start', 'Stop', 'Strand']

exp_map = _rna[['Gene'] + _gene_position].reset_index(drop=True)

_rna = _rna.drop(columns=_gene_position).set_index('Gene').T
_rna.index = [_label.replace('_', '-') for _label in _rna.index]
df_erv['RNAex'] = _rna

In [124]:
# Discard mutation records whose Effect is 'upstream_gene_variant'.
_mut = df_erv['mutation']
_mut = _mut.loc[_mut['Effect'] != 'upstream_gene_variant']

# mut_map keeps the variant description (amino-acid change, gene, position,
# alleles) with a fresh 0..n-1 index.
mut_map = _mut[['Amino_Acid_Change', 'Gene', 'Chr', 'Start', 'Stop', 'Ref', 'Alt']].reset_index(drop=True)

# The working table drops the gene/position/allele columns (Amino_Acid_Change
# stays), renames Sample_ID to SampleID and is indexed by it.
# NOTE(review): unlike the expression and methylation tables, no '_' -> '-'
# replacement is applied to the sample identifiers here -- confirm they
# already match the metadata's SampleID format.
df_erv['mutation'] = (
    _mut.drop(columns=['Gene', 'Chr', 'Start', 'Stop', 'Ref', 'Alt'])
    .reset_index(drop=True)
    .rename(index=str, columns={'Sample_ID': 'SampleID'})
    .set_index('SampleID')
)

In [125]:
# Split the ERV copy-number table in two, mirroring the expression table:
#   * cnv_map        -- gene annotation (gene name plus position columns),
#                       re-indexed 0..n-1 like the other *_map tables
#   * df_erv['CNV']  -- the remaining columns, transposed so that genes become
#                       columns and the former column names become the index.
#
# Fix: the previous version called set_index('Gene').reset_index(), which
# cancels itself out. After the transpose the gene names then sat in a data
# row labelled 'Gene', the columns were integer positions and every value was
# forced to object dtype. Keeping 'Gene' as the index before transposing gives
# gene-named columns, the same layout as the expression table.
_cnv = df_erv['CNV']
_cnv_position = ['Chr', 'Start', 'Stop', 'Strand']

cnv_map = _cnv[['Gene'] + _cnv_position].reset_index(drop=True)

_cnv = _cnv.drop(columns=_cnv_position).set_index('Gene').transpose()
# Same '_' -> '-' replacement as applied to the expression and methylation
# tables, so the index is comparable with the metadata index on join.
# NOTE(review): assumes the remaining CNV columns are sample identifiers in
# the same format as in those tables -- confirm against Lung_CNV.hdf5.
_cnv.index = [_label.replace('_', '-') for _label in _cnv.index]
df_erv['CNV'] = _cnv

In [126]:
# Append the clinical covariates to every ERV table. DataFrame.join matches
# each table's index against the index of `meta` and keeps all rows of the
# table, so rows without matching metadata get NaN in these columns.
# The column names must match the metadata header exactly (including the
# spelling 'Reponse To Therapy').
_clinical_columns = [
    'Gender',
    'Diagnosis',
    'Age At Diagnosis (Years)',
    'Overall Survival Status',
    'Reponse To Therapy',
    'Pack Years',
    'Smoking Status',
]
for _name, _frame in list(df_erv.items()):
    df_erv[_name] = _frame.join(meta[_clinical_columns])

## Check class/survival differentiation per feature

## Check class/survival differentiation per feature combination 

## Check differentiation class/survival with/without ERV mutations