#### The purpose of this script is to use the metadata to create a dictionary that has patient IDs as the keys and lists of all the file names associated with each patient as the values. We also create a list of the IDs of patients that have only cancerous samples.

In [1]:
# Necessary imports

import json
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv

In [2]:
'''
We need to be able to determine which files are associated with
a given patient. The JSON metadata file contains this information,
so here we parse the whole file and store the result in `metadata`.
'''

# json.load parses directly from the open file handle, so there is no
# need to read the lines, join them back together, and keep a second
# full copy of the file text alive in the kernel.
with open('../Data/GDC_Metadata.json') as file:
    metadata = json.load(file)

In [3]:
'''
Group the metadata by patient: build a dictionary in which each key
is a Case ID and each value is the list of file names associated
with that case.
'''

# Maps Case ID -> list of associated file names
case_id_dict = {}

for record in metadata:

    # The Case ID is read from the first associated entity only
    patient_id = record['associated_entities'][0]['case_id']
    associated_file = record['file_name']

    # Both lookups above happen before the dictionary is touched.
    # setdefault inserts an empty list the first time a Case ID is
    # seen; the file name is then appended in either case.
    case_id_dict.setdefault(patient_id, []).append(associated_file)

In [4]:
'''
Not every patient has both the microRNA and the mRNA file, and those
patients have to be left out of the analysis. Others also have
samples from healthy tissue, which will be used later in the project
but are set aside for now. We therefore keep only the patients that
have exactly 2 associated files.
'''

# Case IDs whose file list contains exactly 2 entries, in the same
# order in which they appear in case_id_dict
tumor_case_ids = [
    patient_id
    for patient_id, file_names in case_id_dict.items()
    if len(file_names) == 2
]

In [5]:
'''
Writing the dictionary of case IDs and associated files to a csv
file for permanent access throughout project.
'''

# newline='' is what the csv module asks for when a file is opened for
# a writer. Without it, platforms that translate line endings on write
# add an extra carriage return to every row terminator.
with open('../Organized_Data/case_id_dict.csv', 'w', newline='') as file:

    w = csv.writer(file, dialect='excel')

    # Writing the file headers
    w.writerow(['Case IDs', 'Associated Files'])

    # One row per case. The list of file names goes into a single cell
    # as the string form of the Python list; this on-disk format is
    # kept unchanged so existing readers of the file still work.
    for key, val in case_id_dict.items():
        w.writerow([key, val])

# No explicit file.close() is needed: the with block closes the file.
    
'''
Creating a csv file for all the case IDs that only have cancerous
data.
'''

# newline='' is what the csv module asks for when a file is opened for
# a writer. Without it, platforms that translate line endings on write
# add an extra carriage return to every row terminator.
with open('../Organized_Data/tumor_case_ids.csv', 'w', newline='') as file:

    wr = csv.writer(file, dialect='excel')

    # Writing header
    wr.writerow(['Tumor Case IDs'])

    # Writing each case ID to its own single-column row. The loop
    # variable is not called `id`, so the builtin id() stays usable.
    for tumor_case_id in tumor_case_ids:
        wr.writerow([tumor_case_id])

# No explicit file.close() is needed: the with block closes the file.