# Handling TCGA data
### Overall goals
- Work with TCGA data (e.g. of a specific study or primary site)
- Download data (or additional data at a later point in time)
- Only download data matching specific cases

### Filtering and selecting TCGA data
- Filter and select TCGA data from the TCGA GDC data portal as explained in "TCGA_steps_explained.ipynb"
- Repeat these steps for every new analysis, and also whenever you later have new aspects or file types to consider
- Create a folder called "sample_sheets" in your analysis path, create the folders "manifests" and "prior_sample_sheets" in the "sample_sheets" folder

### Combine manifest with sample sheet, filter for relevant files
- Adapt the data/config.yaml
- manifest files

In [26]:
import pandas as pd
import yaml
import os

# Read the pipeline settings from the YAML config
with open('data/config.yaml', 'r') as streamfile:
    config_file = yaml.load(streamfile, Loader=yaml.FullLoader)

# Root folder of the analysis; it is used as a plain string prefix for all paths below
analysis_path = config_file['analysis_path']

# Read every prior manifest listed in the config and stack them into one table
manifests = []
for manifest_name in config_file['manifests_prior']:
    manifests.append(pd.read_table(analysis_path+'sample_sheets/manifests/'+manifest_name))
manifests_merge = pd.concat(manifests)

# Do the same for the prior sample sheets
sample_sheets = []
for sheet_name in config_file['sample_sheets_prior']:
    sample_sheets.append(pd.read_table(analysis_path+'sample_sheets/prior_sample_sheets/'+sheet_name))
sample_sheets_merge = pd.concat(sample_sheets)

# A 'Case ID' value can hold several ', '-separated entries; only the first one is kept
first_case_id = sample_sheets_merge['Case ID'].str.split(', ', expand=True)[0]
sample_sheets_merge['Case ID'] = first_case_id


- merge manifest and sample sheet
- if previous selection of case IDs -> filter for specific case IDs of previous analysis
- create adapted filtered manifest file for gdc-client download

In [33]:
# Names of the filtered manifest files. The "Manifest for download" cell joins this list into the
# 'manifest_for_download' entry of data/config.yaml; the list starts out empty here.
manifests_pipeline_files = []

Manifest for download

In [34]:
import shlex
import subprocess
import yaml

import re

with open('data/config.yaml', 'r') as streamfile:
    config_file = yaml.load(streamfile, Loader=yaml.FullLoader)

# Register the filtered manifest(s) of the previous step in the config, but only if the user has not
# entered manifests manually. Only the file names are stored; the download step prefixes them with
# analysis_path + 'sample_sheets/manifests/'.
if config_file['manifest_for_download'] == False:
    if not manifests_pipeline_files:
        # An empty list would be written as "[]", which no longer compares equal to False and would
        # make the download step loop over nothing without any message.
        print('No filtered manifest files available, "manifest_for_download" in data/config.yaml is left unchanged')
    else:
        manifests_pipeline = ', '.join(manifests_pipeline_files) # only file names
        manifest_config_line = f'manifest_for_download: [{manifests_pipeline}]'

        # Rewrite the entry in Python instead of calling "sed -i": that call is GNU-specific in this
        # form, and file names containing "/" or "&" would corrupt the sed expression.
        with open('data/config.yaml', 'r') as streamfile:
            config_text = streamfile.read()

        # A function as replacement keeps backslashes in file names from being read as escapes
        config_text = re.sub(r'^manifest_for_download: .*', lambda _match: manifest_config_line,
                             config_text, flags=re.MULTILINE)

        with open('data/config.yaml', 'w') as streamfile:
            streamfile.write(config_text)

### Download TCGA data via a manifest document and the GDC-client tool
- For restricted access files:
    - Login at NIH for restricted access files
    - Download access token, save as secured file
- Download the gdc-client tool, e.g. into its own conda environment -> this is also possible with the code below, which creates a conda environment called "gdc_client"



- the files from the manifest are downloaded into the following folder: analysis_path + '00_raw_data'

```
gdc-client download -m manifest.txt -t user-token.txt
```


In [106]:
import os
import subprocess
import yaml
import shlex

# Read in manifest download files (either from manual input in config.yaml or from previous pipeline steps)
def Create_Manifest_Download_List():
    """Return the paths of the manifest files that should be downloaded.

    Reads 'manifest_for_download' and 'analysis_path' from data/config.yaml and prefixes every
    manifest name with analysis_path + 'sample_sheets/manifests/'.

    Returns:
        A list with the manifest paths, or False (after printing a hint) if the config entry is
        False, has no value or is an empty list.
    """
    with open('data/config.yaml', 'r') as streamfile:
        config_file = yaml.load(streamfile, Loader=yaml.FullLoader)

    manifest_download_list = config_file['manifest_for_download']
    analysis_path = config_file['analysis_path']

    # No usable entry (False, no value, empty list): there is nothing that could be downloaded
    if not manifest_download_list:
        print('Please execute the previous part of the pipeline or input your files manually in the data/config.yaml file')
        return False

    # A single file name entered as plain string would otherwise be iterated character by character
    if isinstance(manifest_download_list, str):
        manifest_download_list = [manifest_download_list]

    return [analysis_path+'sample_sheets/manifests/'+manifest_name for manifest_name in manifest_download_list]


# Download the gdc-client in a new conda environment and run the gdc-client, if accepted
def Download_gdc_client():
    """Determine the conda environment that provides the gdc-client and create it if requested.

    Reads 'conda_gdc' from data/config.yaml:
        - False: a hint is printed, no environment is used
        - True: the environment 'gdc_client' is used
        - 'First_install': the environment 'gdc_client' is created from envs/gdc_client.txt and,
          if that succeeded, 'conda_gdc' is set to True in data/config.yaml
        - anything else: the value itself is used as the environment name

    Returns:
        The name of the conda environment, or False if no environment is available.
    """
    import re

    with open('data/config.yaml', 'r') as streamfile:
        config_file = yaml.load(streamfile, Loader=yaml.FullLoader)

    conda_gdc = config_file['conda_gdc']

    # "==" instead of "is" on purpose: 0 and 1 in the config keep being treated like False and True
    if conda_gdc == False:
        print('Please execute the TCGA data download in your own environment or '+
            'set "conda_gdc" in the data/config.yaml file to "True" to create a conda environment with the gdc-client')
        return False
    if conda_gdc == True:
        return 'gdc_client'
    if conda_gdc != 'First_install':
        # Name of an environment the user has set up on their own
        return conda_gdc

    environment_creation = subprocess.run(['conda', 'create', '--name', 'gdc_client', '--file', 'envs/gdc_client.txt'])
    if environment_creation.returncode != 0:
        # Leave the config at 'First_install' so that the installation is tried again on the next run
        print(f'The conda environment "gdc_client" could not be created (exit code {environment_creation.returncode})')
        return False

    # Mark the installation as done. The entry is rewritten in Python because "sed -i" without a
    # backup suffix only works with GNU sed.
    with open('data/config.yaml', 'r') as streamfile:
        config_text = streamfile.read()
    config_text = re.sub(r'^conda_gdc: .*', 'conda_gdc: True', config_text, flags=re.MULTILINE)
    with open('data/config.yaml', 'w') as streamfile:
        streamfile.write(config_text)

    return 'gdc_client'


# Prepare commands to download TCGA data, dependent on available user token file
def TCGA_Data_Download():
    """Download the files of all configured manifests with the gdc-client.

    Reads 'tcga_user_token_file' and 'analysis_path' from data/config.yaml, creates the folder
    analysis_path + '00_raw_data' if necessary and runs one "gdc-client download" per manifest via
    "conda run" with that folder as working directory. Relative paths in the command are therefore
    resolved against the raw data folder.

    Nothing is downloaded (a message is printed instead) if no manifest files or no conda
    environment are available.
    """
    with open('data/config.yaml', 'r') as streamfile:
        config_file = yaml.load(streamfile, Loader=yaml.FullLoader)

    tcga_user_token_file = config_file['tcga_user_token_file']
    analysis_path = config_file['analysis_path']
    raw_data_path = analysis_path + '00_raw_data'

    os.makedirs(raw_data_path, exist_ok=True)

    manifest_for_download = Create_Manifest_Download_List()

    name_conda_gdc_env = Download_gdc_client()

    if not manifest_for_download:
        print('No TCGA data are download due to no manifest files.')
        return
    if not name_conda_gdc_env:
        print('No TCGA data are downloaded due to the specifications in the gdc client conda environment.')
        return

    for manifest_file in manifest_for_download:
        manifest_name = manifest_file.split("/")[-1]
        command_download_tcga_data = f'conda run -n {name_conda_gdc_env} gdc-client download -m {manifest_file}'

        # Covers False as well as an entry without value, which has no .split()
        if not tcga_user_token_file:
            print(f'Download TCGA data with TCGA manifest {manifest_name} without TCGA user token')
        else:
            print(f'Download TCGA data with TCGA manifest {manifest_name} with TCGA user token file {tcga_user_token_file.split("/")[-1]}')
            # The token file has to be handed over with -t, a bare path is not read as token
            command_download_tcga_data += f' -t {tcga_user_token_file}'

        # Still executed by the shell, so paths from the config are expanded as before
        download = subprocess.run(command_download_tcga_data, cwd=raw_data_path, shell=True)
        if download.returncode != 0:
            print(f'Download with TCGA manifest {manifest_name} failed (exit code {download.returncode})')

# Start the download as soon as this cell is executed
TCGA_Data_Download()

### Rename the downloaded files as case_id.file_suffix
- in manifest only id, filename with 36 different characters
- take merged manifest and sample sheet
- rename downloaded files and put them in new folders for each analysis

In [7]:
# Map a tag that occurs in the file name to the folder of the corresponding analysis method
method_dict = {
    'BRASS': 'BRASS',
    'CaVEMan': 'CaVEMan',
    'ASCAT': 'CNV_segment',
    'pindel': 'Pindel',
    'star_splice': 'Splicing',
    'star_gene_counts': 'STAR_counts',
}

# Files whose name contains none of the tags keep an empty folder name
sample_sheets_merge['Folder'] = ''
for file_tag, folder_name in method_dict.items():
    has_tag = sample_sheets_merge['File Name'].str.contains(file_tag)
    sample_sheets_merge.loc[has_tag, 'Folder'] = folder_name

# The suffix is the second '.'-separated part of the file name
file_name_parts = sample_sheets_merge['File Name'].str.split('.', expand=True)
sample_sheets_merge['File Suffix'] = file_name_parts[1]

# Downloaded file: 00_raw_data/<Folder>/<File ID>/<File Name>
raw_folder = analysis_path+'00_raw_data/'+sample_sheets_merge['Folder']+'/'
sample_sheets_merge['Path_raw'] = raw_folder+sample_sheets_merge['File ID']+'/'+sample_sheets_merge['File Name']

# Renamed file: 01_sample_data/<Folder>/<Case ID>.<File Suffix>
sample_folder = analysis_path+'01_sample_data/'+sample_sheets_merge['Folder']+'/'
sample_sheets_merge['Path_sample'] = sample_folder+sample_sheets_merge['Case ID']+'.'+sample_sheets_merge['File Suffix']



### Analyze files
- with Snakemake pipeline
