# Prepare input file (yaml) for Hartwig Medical Foundation (HMF) pipeline at gcloud (platinum)

This notebook creates the YAML input file needed to run samples with the platinum pipeline (the Hartwig Medical Foundation pipeline on gcloud): https://github.com/hartwigmedical/platinum \
The starting point is the FASTQ files, which need to be uploaded to a bucket on gcloud.\
This notebook uses the original FASTQ files, which are annotated in Excel sheets (LOPEBIG_13, LOPEBIG_31, LOPEBIG_14, LOPEBIG_30, LOPEBIG_42, LOPEBIG_37).\
If you start from the CRAM files deposited in EGA, first extract the FASTQ files, upload them to a bucket on gcloud, and then build the YAML file.\
Alternatively, the HMF pipeline can be run from BAM files (only the CRAM > BAM conversion is needed) using the Oncoanalyser pipeline: https://github.com/scwatts/oncoanalyser

In [18]:
import pandas as pd
import json
import numpy as np

In [13]:
# Load the per-case sample annotation: one entry per case, mapping roles such as
# 'normal', 'tumor1', 'tumor2' (plus 'sex') to sample barcodes.
with open('../../../../cases_ids.json', mode='r') as annotation_file:
    samples_dict = json.load(annotation_file)
samples_dict

{'case1': {'normal': 'AQ5175',
  'tumor1': 'AQ5181',
  'tumor2': 'AQ5187',
  'sex': 'female'},
 'case2': {'normal': 'AQ5176',
  'tumor1': 'AQ5182',
  'tumor2': 'AQ5188',
  'sex': 'male'},
 'case3': {'normal': 'AQ5174',
  'tumor1': 'AQ5180',
  'tumor2': 'AQ5186',
  'sex': 'female',
  'kidney': 'AX4954',
  'liver': 'AX4955',
  'pancreas': 'AX4956',
  'heart': 'AX4957',
  'clone1': 'AX4958',
  'clone2': 'AX4961',
  'mother': 'AW8063',
  'father': 'AW8064',
  'lung': 'AX4962',
  'medulla': 'AX4963',
  'spleen': 'AX4964',
  'brain': 'AX4965',
  'bma': 'AX4966'},
 'case4': {'normal': 'AW8061',
  'tumor1': 'AW8050',
  'tumor2': 'AW8051',
  'sex': 'female'}}

In [14]:
# Read the sequencing sheets that describe the FASTQ files of the tumor samples.
# The first two rows of each sheet are skipped; the first sheet labels its flowcell
# column 'FLOWCELL (Read Length indicated)', so it is renamed to 'FLOWCELL' before stacking.
tumor_sheets = [
    pd.read_excel(
        '/workspace/datasets/sjd_seq/20200805/LOPEBIG_13.xls', skiprows=[0,1]
    ).rename(columns={'FLOWCELL (Read Length indicated)':'FLOWCELL'}),
    pd.read_excel('/workspace/datasets/sjd_seq/20220321/LOPEBIG_31.xls', skiprows=[0,1]),
]
tum_df = pd.concat(tumor_sheets, ignore_index=True)
# Sanity check: number of sheet rows recorded per sample barcode.
tum_df.groupby('SAMPLE BARCODE').count()

Unnamed: 0_level_0,FLOWCELL,LANE,MULTIPLEX INDEX,APPLICATION,LIBRARY BARCODE (internal id),SAMPLE NAME,SPECIES,MACHINE,Unnamed: 9
SAMPLE BARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AQ5180,12,12,12,12,12,12,12,12,12
AQ5181,12,12,12,12,12,12,12,12,12
AQ5182,12,12,12,12,12,12,12,12,12
AQ5183,12,12,12,12,12,12,12,12,12
AQ5184,16,16,16,16,16,16,16,16,16
AQ5185,12,12,12,12,12,12,12,12,12
AQ5186,12,12,12,12,12,12,12,12,12
AQ5187,12,12,12,12,12,12,12,12,12
AQ5188,8,8,8,8,8,8,8,8,8
AQ5189,12,12,12,12,12,12,12,12,12


In [15]:
# Read the sequencing sheets that describe the FASTQ files of the normal samples.
# As for the tumors, the first two rows of every sheet are skipped.
sheet_14 = pd.read_excel(
    '/workspace/datasets/sjd_seq/20200811/LOPEBIG_14.xls', skiprows=[0,1]
).rename(columns={'FLOWCELL (Read Length indicated)':'FLOWCELL'})
# Keep only the text before the first '(' in the flowcell column.
sheet_14['FLOWCELL'] = sheet_14['FLOWCELL'].str.split('(',expand=True)[0]
sheet_30 = pd.read_excel('/workspace/datasets/sjd_seq/20220502/LOPEBIG_30.xls',skiprows=[0,1])
sheet_42 = pd.read_excel('/workspace/datasets/sjd_seq/20220629/LOPEBIG_42.xls',skiprows=[0,1])
sheet_37 = pd.read_excel('/workspace/datasets/sjd_seq/20220804/LOPEBIG_37.xls',skiprows=[0,1])

stacked_sheets = pd.concat([sheet_14, sheet_30, sheet_37], ignore_index=True)
# Outer merge on every shared column: rows of LOPEBIG_42 that already appear in the
# stacked sheets are matched rather than added again; '_merge' records each row's origin.
# NOTE(review): presumably LOPEBIG_42 overlaps with the other sheets - confirm.
ref_df = stacked_sheets.merge(sheet_42, how='outer', indicator=True)
# Sanity check: number of sheet rows recorded per sample barcode.
ref_df.groupby('SAMPLE BARCODE').count()

Unnamed: 0_level_0,FLOWCELL,LANE,MULTIPLEX INDEX,APPLICATION,LIBRARY BARCODE (internal id),SAMPLE NAME,SPECIES,MACHINE,Unnamed: 9,_merge
SAMPLE BARCODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AQ5174,16,16,16,16,16,16,16,16,16,16
AQ5175,16,16,16,16,16,16,16,16,16,16
AQ5176,16,16,16,16,16,16,16,16,16,16
AQ5177,16,16,16,16,16,16,16,16,16,16
AQ5178,16,16,16,16,16,16,16,16,16,16
AQ5179,16,16,16,16,16,16,16,16,16,16
AW8058,20,20,20,20,20,20,20,20,20,20
AW8059,20,20,20,20,20,20,20,20,20,20
AW8060,16,16,16,16,16,16,16,16,16,16
AW8061,12,12,12,12,12,12,12,12,12,12


In [16]:
# Show the column labels of the combined tumor sheet.
tum_df.columns.tolist()

['FLOWCELL',
 'LANE',
 'MULTIPLEX INDEX',
 'APPLICATION',
 'LIBRARY BARCODE (internal id)',
 'SAMPLE BARCODE',
 'SAMPLE NAME',
 'SPECIES',
 'MACHINE',
 'Unnamed: 9']

In [19]:
# Dictionary mapping each tumor sample barcode to the sorted list of its FASTQ file prefixes.
# Only the barcode and the header-less 'Unnamed: 9' column (the FASTQ file prefix used to
# build the gs:// paths further down) are needed from here on.
tum_df = tum_df[['SAMPLE BARCODE','Unnamed: 9']]
# Group the prefix column explicitly by barcode. The previous version ran np.unique over the
# whole two-column group and took element 0 as the key, which is only correct while the
# barcode sorts before every prefix, and which depends on groupby.apply passing the grouping
# column to the function (behaviour that changes between pandas versions).
tum_dict = (
    tum_df.groupby('SAMPLE BARCODE')['Unnamed: 9']
    .apply(lambda prefixes: sorted(prefixes.unique()))
    .to_dict()
)
tum_dict

{'AQ5180': ['H2TKWDSXY_1_31UDI-idt-UMI',
  'H2TKWDSXY_2_31UDI-idt-UMI',
  'H2TKWDSXY_3_31UDI-idt-UMI',
  'H2TKWDSXY_4_31UDI-idt-UMI',
  'H2VF5DSXY_1_31UDI-idt-UMI',
  'H2VF5DSXY_2_31UDI-idt-UMI',
  'H2VF5DSXY_3_31UDI-idt-UMI',
  'H2VF5DSXY_4_31UDI-idt-UMI',
  'H2VY7DSXY_1_31UDI-idt-UMI',
  'H2VY7DSXY_2_31UDI-idt-UMI',
  'H2VY7DSXY_3_31UDI-idt-UMI',
  'H2VY7DSXY_4_31UDI-idt-UMI'],
 'AQ5181': ['H2TKWDSXY_1_43UDI-idt-UMI',
  'H2TKWDSXY_2_43UDI-idt-UMI',
  'H2TKWDSXY_3_43UDI-idt-UMI',
  'H2TKWDSXY_4_43UDI-idt-UMI',
  'H2VF5DSXY_1_43UDI-idt-UMI',
  'H2VF5DSXY_2_43UDI-idt-UMI',
  'H2VF5DSXY_3_43UDI-idt-UMI',
  'H2VF5DSXY_4_43UDI-idt-UMI',
  'H2VY7DSXY_1_43UDI-idt-UMI',
  'H2VY7DSXY_2_43UDI-idt-UMI',
  'H2VY7DSXY_3_43UDI-idt-UMI',
  'H2VY7DSXY_4_43UDI-idt-UMI'],
 'AQ5182': ['H2TKWDSXY_1_55UDI-idt-UMI',
  'H2TKWDSXY_2_55UDI-idt-UMI',
  'H2TKWDSXY_3_55UDI-idt-UMI',
  'H2TKWDSXY_4_55UDI-idt-UMI',
  'H2VF5DSXY_1_55UDI-idt-UMI',
  'H2VF5DSXY_2_55UDI-idt-UMI',
  'H2VF5DSXY_3_55UDI-idt-UMI',
  'H2VF

In [20]:
# Dictionary mapping each normal (reference) sample barcode to the sorted list of its
# FASTQ file prefixes.
# Only the barcode and the header-less 'Unnamed: 9' column (the FASTQ file prefix used to
# build the gs:// paths further down) are needed from here on; this also drops '_merge'.
ref_df = ref_df[['SAMPLE BARCODE','Unnamed: 9']]
# Group the prefix column explicitly by barcode. The previous version ran np.unique over the
# whole two-column group and took element 0 as the key, which is only correct while the
# barcode sorts before every prefix, and which depends on groupby.apply passing the grouping
# column to the function (behaviour that changes between pandas versions).
ref_dict = (
    ref_df.groupby('SAMPLE BARCODE')['Unnamed: 9']
    .apply(lambda prefixes: sorted(prefixes.unique()))
    .to_dict()
)
ref_dict

{'AQ5174': ['H2TKWDSXY_1_54UDI-idt-UMI',
  'H2TKWDSXY_2_54UDI-idt-UMI',
  'H2TKWDSXY_3_54UDI-idt-UMI',
  'H2TKWDSXY_4_54UDI-idt-UMI',
  'H2VY7DSXY_1_54UDI-idt-UMI',
  'H2VY7DSXY_2_54UDI-idt-UMI',
  'H2VY7DSXY_3_54UDI-idt-UMI',
  'H2VY7DSXY_4_54UDI-idt-UMI',
  'HK2CCDSX3_1_54UDI-idt-UMI',
  'HK2CCDSX3_2_54UDI-idt-UMI',
  'HK2CCDSX3_3_54UDI-idt-UMI',
  'HK2CCDSX3_4_54UDI-idt-UMI',
  'HK5YWDSX3_1_54UDI-idt-UMI',
  'HK5YWDSX3_2_54UDI-idt-UMI',
  'HK5YWDSX3_3_54UDI-idt-UMI',
  'HK5YWDSX3_4_54UDI-idt-UMI'],
 'AQ5175': ['H2TKWDSXY_1_66UDI-idt-UMI',
  'H2TKWDSXY_2_66UDI-idt-UMI',
  'H2TKWDSXY_3_66UDI-idt-UMI',
  'H2TKWDSXY_4_66UDI-idt-UMI',
  'H2VY7DSXY_1_66UDI-idt-UMI',
  'H2VY7DSXY_2_66UDI-idt-UMI',
  'H2VY7DSXY_3_66UDI-idt-UMI',
  'H2VY7DSXY_4_66UDI-idt-UMI',
  'HK2CCDSX3_1_66UDI-idt-UMI',
  'HK2CCDSX3_2_66UDI-idt-UMI',
  'HK2CCDSX3_3_66UDI-idt-UMI',
  'HK2CCDSX3_4_66UDI-idt-UMI',
  'HK5YWDSX3_1_66UDI-idt-UMI',
  'HK5YWDSX3_2_66UDI-idt-UMI',
  'HK5YWDSX3_3_66UDI-idt-UMI',
  'HK5YWDSX3_4_66U

## Build the yaml file

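The cell below uses placeholder values; replace them with your own before running:

```gs://input_bucket/``` : path of the gcloud bucket that holds the uploaded FASTQ files\
```project-name```: name of the project on gcloud\
```output-bucket```: name of the output bucket on gcloud\
```'batch': {'size':2, 'delay':1440}```: optional arguments for running on gcloud (see the platinum documentation)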


In [23]:
import yaml

In [39]:
#Build the input file
INPUT_BUCKET = "gs://input_bucket/"   # placeholder: gcloud bucket holding the uploaded FASTQ files
TUMOR_KEYS = ['tumor1','tumor2']      # entries of each case that are submitted as tumors


def _fastq_pairs(prefixes, bucket=INPUT_BUCKET):
    """Return one {'read1', 'read2'} dict of gs:// paths for every FASTQ file prefix."""
    return [{"read1": bucket+prefix+"_1.fastq.gz",
             "read2": bucket+prefix+"_2.fastq.gz"}
            for prefix in prefixes]


def _sample_entry(barcode, prefixes_by_barcode):
    """Return the {'name', 'fastq'} entry of one sample.

    Raises KeyError when `barcode` has no entry in `prefixes_by_barcode`.
    """
    return {"name": barcode,
            "fastq": _fastq_pairs(prefixes_by_barcode[barcode])}


all_samples_list = []
for pt, annotation in samples_dict.items():
    # The normal is looked up in ref_dict, the tumors in tum_dict.
    reference_dict = _sample_entry(annotation['normal'], ref_dict)
    all_tumors_list = [_sample_entry(annotation[t], tum_dict) for t in TUMOR_KEYS]

    # Key order is kept as name / tumors / normal because the YAML is dumped unsorted.
    all_samples_list.append({"name":pt,
                             "tumors":all_tumors_list,
                             "normal":reference_dict})

# Top-level configuration; 'project-name' and 'output-bucket' are placeholders.
input_dict = {'gcp': {'project': 'project-name', 'region': 'europe-west3'},
                'image': 'eu.gcr.io/hmf-images/pipeline5:platinum-5.28',
                'outputBucket': 'output-bucket',
                'argumentOverrides': {'image_project': 'hmf-images',
                'image_name': 'hmf-public-pipeline-v5-28',
                'ref_genome_version': 38},
                'batch': {'size':2, 'delay':1440},
                "samples":all_samples_list}
input_dict

{'gcp': {'project': 'project-name', 'region': 'europe-west3'},
 'image': 'eu.gcr.io/hmf-images/pipeline5:platinum-5.28',
 'outputBucket': 'output-bucket',
 'argumentOverrides': {'image_project': 'hmf-images',
  'image_name': 'hmf-public-pipeline-v5-28',
  'ref_genome_version': 38},
 'batch': {'size': 2, 'delay': 1440},
 'samples': [{'name': 'case1',
   'tumors': [{'name': 'AQ5181',
     'fastq': [{'read1': 'gs://input_bucket/H2TKWDSXY_1_43UDI-idt-UMI_1.fastq.gz',
       'read2': 'gs://input_bucket/H2TKWDSXY_1_43UDI-idt-UMI_2.fastq.gz'},
      {'read1': 'gs://input_bucket/H2TKWDSXY_2_43UDI-idt-UMI_1.fastq.gz',
       'read2': 'gs://input_bucket/H2TKWDSXY_2_43UDI-idt-UMI_2.fastq.gz'},
      {'read1': 'gs://input_bucket/H2TKWDSXY_3_43UDI-idt-UMI_1.fastq.gz',
       'read2': 'gs://input_bucket/H2TKWDSXY_3_43UDI-idt-UMI_2.fastq.gz'},
      {'read1': 'gs://input_bucket/H2TKWDSXY_4_43UDI-idt-UMI_1.fastq.gz',
       'read2': 'gs://input_bucket/H2TKWDSXY_4_43UDI-idt-UMI_2.fastq.gz'},
      {'re

In [40]:
# Write the configuration built above to the YAML file.
file = 'input.yaml'
with open(file, mode="w") as yaml_out:
    # sort_keys=False keeps the insertion order of the dictionary keys;
    # width=1000 avoids wrapping the long gs:// paths onto several lines.
    yaml.dump(input_dict, stream=yaml_out, sort_keys=False, width=1000, indent=4)