In [1]:
pwd

'/mnt/4TB/TCGA_Colorectal/scripts/3-processing'

In [2]:
from guppy import hpy

In [3]:
print("start")
import sys, os
from os.path import join as pj
import pandas as pd

# ====== PROJECT SCAFFOLD ======
# Arguments passed into script by Docker command
project = sys.argv[1]
file_id = sys.argv[2]

# Mounted volume
wd  = "/mnt/4TB/TCGA_Colorectal/scripts/3-processing"
boop = pj(wd, '43cbb4c9-4dc8-4e7a-ac69-779f5a76dced.vep')

# ====== START PROCESSING ======
print('starting parse')
# Load our vep-annotated vcf file
with open(boop) as f:
    lines = f.readlines()

# Extract VEP the schema above from file header
prefix_schema = "Ensembl VEP. Format: "
for line in lines:
    if (prefix_schema in line):              
        # We want everything after format and before closing syntax – broken into list
        vep_cols = line.split(prefix_schema)[1].split('">')[0].split('|')
        vep_cols = [f"VEP_{c}" for c in vep_cols]
        break

# Where file header ends and real data begins
prefix_cols = "#CHROM"
header_lineNum = None
for i, line in enumerate(lines):
    if (line.startswith(prefix_cols)): 
        # Pandas wants line number - 1 so don't worry about zero-based index
        header_lineNum = i
        break
if (header_lineNum==None):
    msg = 'Error - VEP did not write data to VCF\nPlease verify you downloaded the indexed version of VEP cache.\nAnd that the VCF file name you are trying to create does not exist already.'
    raise Exception(msg)
        
# Read vcf into a dataframe
df_vcf = pd.read_csv(boop, sep='\t', header=header_lineNum)
df_vcf.rename(columns={'#CHROM':'CHROM'}, inplace=True)

# Everything before `;CSQ=` becomes INFO col / after becomes VEP col
df_vcf[['INFO', 'VEP']] = df_vcf['INFO'].str.split(';CSQ=', n=1, expand=True)

# Explode the new VEP col
df_vcf[vep_cols] = df_vcf['VEP'].str.split('|',expand=True)
df_vcf = df_vcf.drop(columns=['VEP'])

"""
Tabular VCF FORMAT Fields
All of my variants were using the same format fields.
"""
# Check that format is the same for all variants
if (len(set(df_vcf['FORMAT'])) == 1):
    # Break the format keys into a list
    format_cols = df_vcf['FORMAT'][0].split(':')
    format_cols = [f"FORMAT_{c}" for c in format_cols]

    # Need to find the case-specific column that is paired with FORMAT 
    # e.g. column named TCGA-F4-6459-10A-01D-1771-10
    vcf_cols   = df_vcf.columns.tolist()
    format_idx = [i for i, col in enumerate(vcf_cols) if 'FORMAT'==col][0]
    case_idx   = format_idx + 1
    case_col   = vcf_cols[case_idx] 

    # Explode the formatted sample-specific data 
    df_vcf[format_cols] = df_vcf[case_col].str.split(':',expand=True)
    df_vcf = df_vcf.drop(columns=[case_col, 'FORMAT'])
else:
    print("Warning – FORMAT could not be exploded as format not the same for all variants")

"""
Tabular VCF INFO Fields
Remember, at this point, the VEP fields are no longer in the VEP INFO column.
This handles sparsity; some of my variants didn't have all of the INFO fields
"""
# Sparsity means parsing rows individually
info_entries = []
for row in df_vcf['INFO']: 
    # Break the info string into key-value pairs
    row_pairs = dict(item.split("=") for item in row.split(";"))
    info_entries.append(row_pairs)

# Easy to make df from dict, but doing so makes separate df
df_info = pd.DataFrame.from_dict(info_entries)

# Explode the info column
info_cols = df_info.columns.tolist()
info_colsRename = [f"INFO_{c}" for c in info_cols]
renames = dict(zip(info_cols, info_colsRename))
df_info = df_info.rename(columns=renames)

# Merge back with the main df
df_vcf = pd.concat([df_vcf, df_info], axis=1)
df_vcf = df_vcf.drop(columns=['INFO'])

h = hpy()
print(h.heap())
del df_info
print('finished parse')

start
starting parse
Partition of a set of 10051814 objects. Total size = 6799517450 bytes.
 Index  Count   %     Size   % Cumulative  % Kind (class / dict of class)
     0      2   0 4653608366  68 4653608366  68 pandas.core.frame.DataFrame
     1 8759854  87 832345323  12 5485953689  81 str
     2     78   0 630723278   9 6116676967  90 numpy.ndarray
     3 963213  10 627823800   9 6744500767  99 dict (no owner)
     4   4709   0 16610632   0 6761111399  99 list
     5 130839   1 11049224   0 6772160623 100 tuple
     6  57343   1  4414002   0 6776574625 100 bytes
     7  29041   0  4202992   0 6780777617 100 types.CodeType
     8  26580   0  3827520   0 6784605137 100 function
     9   3553   0  3336248   0 6787941385 100 type
<1228 more rows. Type e.g. '_.more' to view.>
finished parse


In [6]:
# Save it
beep = pj(wd, '43cbb4c9-4dc8-4e7a-ac69-779f5a76dced.pq')
df_vcf.to_parquet(beep)
print("end")

end
