# RNA Seq Analysis - Template

In [3]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import matplotlib.style as style
# Apply the 'ggplot' style sheet to every figure drawn after this point.
style.use('ggplot')
#style.use('Solarize_Light2')
from matplotlib import rcParams
# Embed fonts in exported PDFs as Type 42 (TrueType) instead of Type 3.
rcParams['pdf.fonttype'] = 42 ## Output Type 3 (Type3) or Type 42 (TrueType)
rcParams['font.sans-serif'] = 'Arial'
# NOTE(review): star import -- every public pylab name lands in the notebook
# namespace, and names imported by later star imports can shadow them.
from pylab import *

# Silence warnings that may pop up when plotting. No category is passed to
# filterwarnings, so this suppresses ALL warnings, not only FutureWarning.
import warnings
warnings.filterwarnings("ignore")
# Also switch off the warnings emitted by urllib3 itself.
import urllib3
urllib3.disable_warnings()

# NOTE(review): second star import; later cells presumably rely on names
# provided by dfply -- confirm before replacing with explicit imports.
from dfply import *

# For PCA
from sklearn.decomposition import PCA

In [5]:
# Load the preprocessed, harmonized RNA-seq table from a local CSV and tidy it.
# The whole pipeline starts from the file on disk, so re-running this cell
# always yields the same frame regardless of earlier kernel state.
genes_with_meta = (
    pd.read_csv("small_Harmonized_rna_seq_data.csv")
    # Normalize the capitalization of the sex labels. Only cells whose entire
    # value equals "male"/"female" are rewritten (any column, no substrings).
    .replace({"male": "Male", "female": "Female"})
    # Expose the 'used' column under the name 'id'.
    .rename(columns={"used": "id"})
    # reset_index() materializes the row index as an 'index' column, which is
    # then discarded together with the 'path' column.
    .reset_index()
    .drop(columns=["index", "path"])
)

# Quality control: discard rows whose totalCounts cell holds the string "Male"
# (presumably misaligned records -- TODO confirm against the raw file).
# Only rows are filtered here; no columns are removed and the index is not
# renumbered afterwards.
genes_with_meta = genes_with_meta[genes_with_meta["totalCounts"] != "Male"]

# Data file

In [6]:
# Preview the first rows of the cleaned table (rendered as the cell output).
genes_with_meta.head()

Unnamed: 0.1,Unnamed: 0,ROW_ID,ROW_VERSION,totalCounts,Symbol,zScore,specimenID,parent,individualID,assay,...,resourceType,nf1Genotype,nf2Genotype,studyName,id,age,isCellLine,experimentalCondition,transplantationType,modelOf
0,0,1,9324,136.993,A1BG,-0.226861,patient10tumor1,syn18407530,patient10,rnaSeq,...,experimentalData,-/-,unknown,Cutaneous Neurofibroma Data Resource,syn20430744,,,,,Cutaneous Neurofibroma
1,1,2,9324,168.454,A1BG,-0.248889,patient10tumor2,syn18407530,patient10,rnaSeq,...,experimentalData,-/-,unknown,Cutaneous Neurofibroma Data Resource,syn20430745,,,,,Cutaneous Neurofibroma
2,2,3,9324,115.456,A1BG,-0.115625,patient10tumor3,syn18407530,patient10,rnaSeq,...,experimentalData,-/-,unknown,Cutaneous Neurofibroma Data Resource,syn20430746,,,,,Cutaneous Neurofibroma
3,3,4,9324,500.304,A1BG,-0.184728,patient11tumor1,syn18407530,patient11,rnaSeq,...,experimentalData,-/-,unknown,Cutaneous Neurofibroma Data Resource,syn20430747,,,,,Cutaneous Neurofibroma
4,4,5,9324,380.698,A1BG,-0.229165,patient11tumor14,syn18407530,patient11,rnaSeq,...,experimentalData,-/-,unknown,Cutaneous Neurofibroma Data Resource,syn20430748,,,,,Cutaneous Neurofibroma


The following table contains detailed descriptions of the metadata variables included in the data file:

Var | Description
--- | --- 
 _id_  |  the synapseIDs of the individual files with raw data
 _specimenID_  |  individual samples
 _species_  |  the source of the specimen
 _age_  |  the age of the patient 
 _Sex_  |  the sex of the patient
 _tumorType_  |  the type of tumor; can be one of 7 different diagnoses
 _isCellLine_  |  indicates whether the origin tissue was a cell line or a patient
 _study_  |  the specific initiative/consortia that the study was a part of
 _Symbol_  |  the common names of genes 
 _cellCulture_  |  whether the data was obtained from a cell culture experiment or an _in-vivo_ experiment. Many inherent differences exist between _cell culture_ experiments and _in-vivo_ experiments. These differences may need to be taken into account during analysis and interpretation of the data
 _totalCounts_  |  the main measure of expression levels of the gene. _(the values documented here are not corrected for batch/study)_ 
 _zScore_  |  normalized counts (counts for a gene can vary significantly due to technical differences in different studies, so we normalized the counts into a zScore for a comparable measurement between samples for the same gene)

# Metadata