In [1]:
import os
from os.path import join as pj
import pandas as pd
from tqdm import tqdm

In [2]:
%%capture
# Display settings for inspecting long strings and long frames.
# `None` means "no limit" for both options; the former `-1` sentinel for
# `display.max_colwidth` has been deprecated since pandas 1.0 in favour of `None`.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [3]:
# Filesystem layout; every project path below is derived from dir_project.
dir_home = os.path.expanduser('~')
dir_project = '/mnt/4TB/TCGA_Lung'

# Inputs: the manifest CSV and the directory holding the per-file variant tables.
dir_vars = pj(dir_project, 'variants')
path_manifest = pj(dir_project, 'scripts', '3-processing', '1-manifest.csv')

# Output directory for the matrix written at the end of this notebook.
dir_analysis = pj(dir_project, 'scripts', '4-analysis')

In [4]:
# Make the project root the working directory for the rest of the notebook.
os.chdir(dir_project)

In [5]:
# Show the working directory as a sanity check after the chdir above.
# Plain-Python equivalent of the `pwd` automagic: it does not depend on
# IPython's automagic setting or on no variable being named `pwd`.
os.getcwd()

'/mnt/4TB/TCGA_Colorectal'

---

### Manifest

In [6]:
# Load the manifest CSV; later cells read its `file_id` and `vital_status` columns.
df_manifest = pd.read_csv(path_manifest)

In [7]:
# Fail fast if the manifest lacks a column that later cells index directly.
# `file_id` names each per-file variant table and `vital_status` is the label
# carried into the matrix (the values used in this notebook are `Alive` / `Dead`).
required_cols = ['file_id', 'vital_status']
missing_cols = [col for col in required_cols if col not in df_manifest.columns]
if missing_cols:
    raise Exception(f"Error - Prior to running this script, you'll need to add the following column(s) to your manifest file: {missing_cols}. The `vital_status` column must denote each file as either `Dead` or `Alive`")

In [None]:
# df_manifest = df_manifest[df_manifest['in_cohort']=='yes']

In [None]:
# files_dead = df_manifest[df_manifest['vital_status']=='Dead']['file_id'].tolist()

In [None]:
# files_alive = df_manifest[df_manifest['vital_status']=='Alive']['file_id'].tolist()

In [None]:
# num_dead, num_alive = len(files_dead), len(files_alive)

In [None]:
# if num_dead != num_alive:
#     raise Exception (f"Error - number of recurred ({num_dead}) and disease_free ({num_alive}) patients not balanced")

---

# Matrices

We'll create a matrix that aggregates high and moderate impact mutations at the gene level for each patient.

We process all files (dead and alive) together to ensure that every file ends up with the same gene columns.

In [8]:
# File identifiers in manifest row order; each one names a `<file_id>.qc` table under dir_vars.
file_ids = df_manifest['file_id'].tolist()

In [9]:
# Vital-status labels in the same row order as file_ids (the two lists are paired by position).
vitals = df_manifest['vital_status'].tolist()

## Gene Matrix

In [10]:
def _gene_impact_row(variants, file_id, vital_status):
    """Collapse one file's variant table into a single-row gene matrix.

    Parameters
    ----------
    variants : pd.DataFrame
        Variant table with the columns `VEP_IMPACT`, `VEP_SYMBOL` and `QC_Weight`.
    file_id, vital_status
        Identifier and label stored in the first two columns of the row.

    Returns
    -------
    pd.DataFrame
        One row: `file_id`, `vital_status`, then one `<VEP_IMPACT>_<VEP_SYMBOL>`
        column per impact/gene pair holding the summed `QC_Weight`.
    """
    # `assign` returns a new frame, so the caller's table is left untouched.
    weights = (
        variants
        .assign(Gene_Impact=variants['VEP_IMPACT'] + '_' + variants['VEP_SYMBOL'])
        .groupby('Gene_Impact')['QC_Weight']
        .sum()
    )
    # Building the row from a record keeps the weights numeric; transposing a
    # frame that mixes the string keys with the float sums would turn every
    # column into object dtype.
    record = {'file_id': file_id, 'vital_status': vital_status, **weights.to_dict()}
    return pd.DataFrame([record])


# Only these columns are needed, so only these are read from each parquet file.
variant_cols = ['VEP_IMPACT', 'VEP_SYMBOL', 'QC_Weight']

# One single-row frame per manifest file; they are concatenated in the next cell.
dfs = []
for file_id, vital_status in zip(file_ids, vitals):
    file_path = pj(dir_vars, f"{file_id}.qc")
    variants = pd.read_parquet(file_path, columns=variant_cols)
    dfs.append(_gene_impact_row(variants, file_id, vital_status))

In [11]:
# Stack the per-file rows into one matrix with a fresh 0..n-1 index. Columns
# are aligned by name, so a gene/impact pair missing from a file comes out as
# NaN and is replaced with 0 here.
df_genes = pd.concat(dfs, ignore_index=True).fillna(0)

In [12]:
# Preview the first three rows of the gene matrix.
df_genes.head(3)

Unnamed: 0,file_id,vital_status,HIGH_A2M,HIGH_AASDH,HIGH_ABCA5,HIGH_ABCF1,HIGH_ABO,HIGH_ACTN3,HIGH_ADGRD2,HIGH_AGAP6,...,MODERATE_CDCA8,MODERATE_CELF2,MODERATE_GPX2,MODERATE_PDE6H,MODERATE_PHKG2,MODERATE_RAB8A,MODERATE_RASGRP2,MODERATE_SLC25A1,MODERATE_STYX,MODERATE_VWC2L
0,43cbb4c9-4dc8-4e7a-ac69-779f5a76dced,Alive,0.0009,0.99812,0.99815,0.99553,0.0,0.0,0.98304,0.00023,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,89b770d2-f4ac-4a53-b1aa-4bbefccc6dff,Alive,0.0,0.99812,0.0,0.99553,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,7f9e0853-64b8-4ab9-b5d0-ae2afeaacffd,Alive,0.0009,0.99812,0.0,0.99553,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# (rows, columns): one row per manifest file; `file_id`, `vital_status` and one column per gene/impact pair.
df_genes.shape

(119, 21143)

In [14]:
# Destination parquet file for the gene-impact matrix, inside dir_analysis.
path_matrixGene = pj(dir_analysis,'1-matrix_trainGeneImpact.pq')

In [15]:
# Write the gene-impact matrix to path_matrixGene in parquet format.
df_genes.to_parquet(path_matrixGene)