#### The goal of this script is to generate 2 matrices, one for miRNA and one for mRNA, combining the data for each cancerous sample in our dataset. Each row will be a patient, and each column will be a miRNA/mRNA. The data is normalized counts of the RNA. The matrices are written to csv files in the folder "Organized_Data".

In [1]:
# Necessary Imports

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import csv
from scipy import stats

In [2]:
'''
Load the case IDs of the patients that only have cancerous data
'''

# Take the first field of every row in the file, then drop the first
# entry (presumably a header row -- confirm against the csv)
with open('../Organized_Data/tumor_case_ids.csv', newline='') as id_file:
    id_rows = csv.reader(id_file, delimiter=' ')
    tumor_case_ids = [fields[0] for fields in id_rows][1:]

In [3]:
'''
Load the dictionary mapping each case ID (key) to the list of files
associated with that case (value)
'''

with open('../Organized_Data/case_id_dict.csv', mode='r') as dict_file:
    # Column 0 is the case ID. Column 1 holds the file list as a single
    # string: its first and last characters are dropped and the rest is
    # split on commas (the pieces are cleaned up further where they are
    # used).
    case_id_dict = {
        record[0]: record[1][1:-1].split(',')
        for record in csv.reader(dict_file)
    }

In [4]:
'''
Making 2 matrices, one for miRNA and one for mRNA, compiling the
data from all cancerous breast tissue.

Produces:
    miRNA_data, mRNA_data     -- 2-D arrays with one row per file read
    miRNA_titles, mRNA_titles -- names of the columns, in order

Raises ValueError if no miRNA file or no mRNA file was read, or if two
files of the same kind contain a different number of entries.
'''

# Folder holding the raw per-sample files
data_dir = '../Data/GDC_Data/'

# One 1-D array of counts per file. The rows are stacked into a matrix
# once, after the loop: stacking inside the loop re-copies the whole
# matrix for every file and needs a hard-coded, uninitialized seed row.
miRNA_rows = []
mRNA_rows = []

# For all cancerous cases
for tumor_id in tumor_case_ids:

    # For each file associated with the case
    for raw_name in case_id_dict[tumor_id]:

        # Entries come from splitting a stringified list on commas, so
        # they look like "'name'" or " 'name'"; strip the surrounding
        # whitespace and quotes to get the core file name
        file_name = raw_name.strip().strip("'")

        # A case with an empty file list yields one empty entry
        if not file_name:
            continue

        file_path = data_dir + file_name

        # miRNA files are recognized by a name ending in 't'
        # (presumably '.txt' -- confirm against the data folder)
        if file_name.endswith('t'):

            # Load the microRNA file as an array of strings, skipping
            # the row of column titles; ndmin=2 keeps the array 2-D even
            # if the file holds a single data row
            mirnas = np.loadtxt(file_path, dtype='str', skiprows=1,
                                ndmin=2)

            # The third column holds the counts of the miRNAs
            miRNA_rows.append(mirnas[:, 2].astype(float))

        # Otherwise the file is an mRNA file
        else:

            # These files are zipped, need to open using pandas
            mRNAs_dF = pd.read_csv(file_path, sep='\t', header=None)
            mRNAs_dF.columns = ["mRNA_Name", "Normalized_Count"]

            # Getting the counts of the mRNAs
            mRNA_rows.append(mRNAs_dF["Normalized_Count"].to_numpy())

# Fail with a clear message instead of a NameError further down
if not miRNA_rows:
    raise ValueError('No miRNA files were read for the tumor case IDs')
if not mRNA_rows:
    raise ValueError('No mRNA files were read for the tumor case IDs')

# Names of the miRNA and mRNA in matrices, in order. These come from the
# last file of each kind that was read, which assumes every file lists
# its entries in the same order.
miRNA_titles = mirnas[:, 0]
mRNA_titles = mRNAs_dF["mRNA_Name"].to_numpy()

# Final matrices; np.vstack raises ValueError if the rows differ in length
miRNA_data = np.vstack(miRNA_rows)
mRNA_data = np.vstack(mRNA_rows)

In [5]:
# Write each matrix to its own comma-separated csv file

for out_path, matrix in (
    ('../Organized_Data/miRNA_Cancer_Data.csv', miRNA_data),
    ('../Organized_Data/mRNA_Cancer_Data.csv', mRNA_data),
):
    np.savetxt(out_path, matrix, delimiter=",")