In [1]:
import os
from os.path import join as pj
import pandas as pd
from tqdm import tqdm

In [2]:
%%capture
# Show full cell contents and every row when displaying DataFrames.
# `None` means "no limit" for both options; the legacy `-1` sentinel for
# `display.max_colwidth` is deprecated since pandas 1.0 and rejected by
# later releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [3]:
# Base locations: the user's home directory and the project root.
dir_home = os.path.expanduser('~')
dir_project = '/mnt/4TB/TCGA_Liver'

# Input: directory holding the per-file `<file_id>.qc` variant tables.
dir_vars = os.path.join(dir_project, 'variants')

# Input: manifest CSV listing the files to process.
path_manifest = os.path.join(dir_project, 'scripts', '3-processing', '1-manifest.csv')

# Output: directory the gene-impact matrix is written to.
dir_analysis = os.path.join(dir_project, 'scripts', '4-analysis')

In [4]:
# Make the project root the working directory for the rest of the notebook.
os.chdir(dir_project)

In [5]:
# Display the current working directory. Uses the explicit call instead of
# the bare `pwd` automagic, which only resolves inside IPython when the cell
# consists of that single line.
os.getcwd()

'/mnt/4TB/TCGA_Liver'

---

### Manifest

In [6]:
# Load the manifest CSV; later cells read its `file_id` and `vital_status` columns.
df_manifest = pd.read_csv(path_manifest)

In [7]:
# Fail early if the manifest lacks the label column the matrix is built with.
# The message names the values this notebook actually works with
# (`Dead` / `Alive`).
if 'vital_status' not in df_manifest.columns:
    raise ValueError(
        "Error - Prior to running this script, you'll need to add a `vital_status` column "
        "to your manifest file that denotes each file as either `Dead` or `Alive`"
    )

In [8]:
# df_manifest = df_manifest[df_manifest['in_cohort']=='yes']

In [9]:
# files_dead = df_manifest[df_manifest['vital_status']=='Dead']['file_id'].tolist()

In [10]:
# files_alive = df_manifest[df_manifest['vital_status']=='Alive']['file_id'].tolist()

In [11]:
# num_dead, num_alive = len(files_dead), len(files_alive)

In [12]:
# if num_dead != num_alive:
#     raise Exception (f"Error - number of recurred ({num_dead}) and disease_free ({num_alive}) patients not balanced")

---

# Matrices

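We'll create a matrix that aggregates high- and moderate-impact mutations at the gene level for each patient: one row per file and one column per impact/gene pair (e.g. `HIGH_ABCB5`), holding the summed `QC_Weight`.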

We process all files (dead and alive) together so that every file ends up with the same gene columns.

In [13]:
# File identifiers, in manifest row order.
file_ids = df_manifest['file_id'].tolist()

In [14]:
# Vital-status labels, taken from the same manifest rows as `file_ids`.
vitals = df_manifest['vital_status'].tolist()

## Gene Matrix

In [15]:
# Columns of the per-file variant table that the matrix is built from.
_VARIANT_COLS = ['VEP_IMPACT', 'VEP_SYMBOL', 'QC_Weight']


def _gene_impact_row(variants, file_id, vital_status):
    """Collapse one file's variant table into a single-row DataFrame.

    Parameters
    ----------
    variants : pandas.DataFrame
        Variant table with `VEP_IMPACT`, `VEP_SYMBOL` and `QC_Weight` columns.
    file_id
        Identifier stored in the row's first column.
    vital_status
        Label stored in the row's second column.

    Returns
    -------
    pandas.DataFrame
        One row: `file_id`, `vital_status`, then one column per
        `<VEP_IMPACT>_<VEP_SYMBOL>` key holding the summed `QC_Weight`.
        Rows whose key is missing (NaN) are dropped by the groupby.
    """
    keyed = variants.assign(
        Gene_Impact=variants['VEP_IMPACT'] + '_' + variants['VEP_SYMBOL']
    )
    weights = keyed.groupby('Gene_Impact')['QC_Weight'].sum()

    # Transpose only the numeric weights: transposing a frame that also holds
    # the string keys would upcast every column to object dtype.
    row = weights.to_frame().T
    row.columns = weights.index.tolist()
    row = row.reset_index(drop=True)

    # Inserting at position 0 twice leaves `file_id` first, `vital_status` second.
    row.insert(0, 'vital_status', [vital_status])
    row.insert(0, 'file_id', [file_id])
    return row


# One single-row frame per manifest file; they are stacked into the matrix later.
dfs = []
for file_id, vital_status in zip(file_ids, vitals):
    file_path = pj(dir_vars, f"{file_id}.qc")

    # Only the three columns used below are read from the Parquet file.
    variants = pd.read_parquet(file_path, columns=_VARIANT_COLS)

    dfs.append(_gene_impact_row(variants, file_id, vital_status))

In [16]:
# Stack the per-file rows into one matrix with a fresh 0..n-1 index.
# A gene column that a file does not have is NaN after the concat, so fill with 0.
id_cols = ['file_id', 'vital_status']
df_genes = pd.concat(dfs, ignore_index=True).fillna(0)

# Cast the weight columns to float explicitly. Per-file rows may carry
# object-dtype columns (e.g. when built by transposing a mixed string/number
# frame), and relying on `fillna` to silently downcast them is deprecated.
# The id columns are inserted first in every row, so putting them first here
# keeps the column order unchanged.
gene_weights = df_genes.drop(columns=id_cols).astype(float)
df_genes = pd.concat([df_genes[id_cols], gene_weights], axis=1)

In [17]:
# Preview the first three rows of the matrix.
df_genes.head(3)

Unnamed: 0,file_id,vital_status,HIGH_AADACL3,HIGH_ABCB5,HIGH_ABCF1,HIGH_ABO,HIGH_ACOT8,HIGH_ACSM2A,HIGH_ADAM33,HIGH_AFF3,...,MODERATE_PRDX1,MODERATE_RAPGEF2,MODERATE_SHISA6,MODERATE_SLITRK1,MODERATE_TM4SF1,MODERATE_TOR4A,MODERATE_TRHR,MODERATE_TRMT10A,MODERATE_WNT7B,MODERATE_ZNF672
0,83f5edc7-c91c-453c-8c21-3519d63c2049,Alive,0.00357,0.28914,0.0,0.0,0.79009,0.61475,0.26348,0.33776,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,36e340e3-b6c2-416c-ba4e-72fb08101902,Alive,0.00357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4ac8b0a1-7189-4f66-b81e-2bbec2177da2,Alive,0.00357,0.0,0.993481,0.0,0.0,0.61475,0.13174,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# (rows, columns) of the matrix.
df_genes.shape

(109, 19190)

In [19]:
# Destination of the gene-impact matrix inside the analysis directory.
path_matrixGene = pj(dir_analysis,'1-matrix_trainGeneImpact.pq')

In [20]:
# Write the matrix to disk as a Parquet file.
df_genes.to_parquet(path_matrixGene)