In [31]:
# includes
# %matplotlib inline
% autosave 15

import matplotlib.pyplot as plt
#import seaborn as sns
import numpy as np
import pandas as pd
from Bio import SeqIO
from matplotlib import cm

Autosaving every 15 seconds


In [32]:
def process_input_file(filename):
    """Read a comma-separated text file from ``../data/clean/``.

    Parameters
    ----------
    filename : str
        Name of the file, appended verbatim to ``'../data/clean/'``.

    Returns
    -------
    list of list of str
        One entry per line of the file; each entry holds the line's
        comma-separated tokens after surrounding whitespace has been
        stripped from the line. A blank line yields ``['']``.
    """
    path = '../data/clean/' + filename
    with open(path) as handle:
        return [row.strip().split(',') for row in handle]

def flatten(data_list):
    """Concatenate the items of every sub-iterable into a single flat list.

    Only one level of nesting is removed; the order of items is preserved.
    """
    flat = []
    for sublist in data_list:
        flat.extend(sublist)
    return flat

In [33]:
# Names of the per-feature input files; each one is read from ../data/clean/
# by process_input_file and also serves as the key into `features_raw`.
feature_names = ['CAs', 'CGs', 'SNPs', 'COVs', 'MAFs', 'meth', 'TATAs', 'TFBSinf', 'TFBSs']

# All comma-separated tokens of the 'names' file, flattened into one list.
# Presumably one gene name per row, aligned with the rows of every feature
# file -- TODO confirm against the data.
gene_names = flatten(process_input_file('names'))
# Maps feature name -> list of rows, each row being a list of string tokens.
features_raw = {}
# NOTE: the loop variable `name` stays bound at module level after this loop
# (its final value is the last entry of `feature_names`), so any later code
# that reads a global called `name` will silently pick that value up.
for name in feature_names:
    features_raw[name] = process_input_file(name)

In [70]:
def plot_feature_vec(fname, destdir, feature_vec, imgname, name=None):
    """Plot one feature vector and save it as a PNG.

    The image is written to ``'../images/' + destdir + '/' + imgname + '.png'``;
    the destination directory must already exist.

    Parameters
    ----------
    fname : str
        Feature name, used as the y-axis label.
    destdir : str
        Sub-directory of ``../images`` that receives the image.
    feature_vec : sequence of float
        Values to plot. The x axis is centred on zero, so a vector of
        length 2000 spans ``-1000 .. 999``.
    imgname : str
        File name of the image, without the ``.png`` extension.
    name : str, optional
        Label for the x axis (typically the gene name). When omitted, no
        x-axis label is drawn. Previously the label was read from a global
        variable called ``name``, which produced unrelated labels.

    Raises
    ------
    ValueError
        If ``feature_vec`` is empty or not one-dimensional.
    """
    values = np.asarray(feature_vec, dtype=float)
    if values.ndim != 1 or values.size == 0:
        raise ValueError('feature_vec must be a non-empty 1-D sequence')

    half = values.size // 2
    positions = np.arange(-half, values.size - half)
    minval, maxval = values.min(), values.max()

    fig, ax = plt.subplots(figsize=(50, 30))
    try:
        if minval != maxval:
            # Head-room is a fraction of the data range, not of maxval, so
            # the top limit stays above the data even when maxval <= 0.
            ax.set_ylim(minval, maxval + 0.1 * (maxval - minval))
        if name is not None:
            ax.set_xlabel(str(name), fontsize=60)
        ax.set_ylabel(fname, fontsize=60)
        ax.tick_params(axis='both', which='major', labelsize=30)
        ax.tick_params(axis='both', which='minor', labelsize=20)
        ax.plot(positions, values, marker='o', linewidth=3, alpha=0.5, color='grey')
        ax.scatter(positions, values, s=100, c=values, marker='o', cmap=cm.jet)
        fig.savefig('../images/' + destdir + '/' + imgname + '.png')
    finally:
        # Always release the (very large) figure, even if saving fails.
        plt.close(fig)


def fasta_plot(fname, destdir, sample_size=10):
    """Plot the first ``sample_size`` rows of one feature, one image per row.

    Rows are taken from ``features_raw[fname]``; image ``k`` is saved as
    ``<k>.png`` inside ``destdir`` and labelled with ``gene_names[k]``.

    Parameters
    ----------
    fname : str
        Key into ``features_raw``; also printed as a progress message.
    destdir : str
        Sub-directory of ``../images`` that receives the images.
    sample_size : int, optional
        Maximum number of rows to plot (default 10).
    """
    print(fname)
    rows = features_raw[fname]
    for idx in range(len(rows)):
        if idx == sample_size:
            break
        # An explicit list keeps this working on Python 3, where map() is lazy.
        feature_vec = np.array([float(tok) for tok in rows[idx]])
        plot_feature_vec(fname, destdir, feature_vec, str(idx), name=gene_names[idx])


def averaged_fasta_plot(fname, destdir, sample_size=10):
    """Plot the element-wise mean of the first ``sample_size`` rows of a feature.

    The image is saved as ``averaged.png`` inside ``destdir``.

    Parameters
    ----------
    fname : str
        Key into ``features_raw``; also printed as a progress message.
    destdir : str
        Sub-directory of ``../images`` that receives the image.
    sample_size : int, optional
        Maximum number of rows to average (default 10).

    Raises
    ------
    ValueError
        If no rows are available to average.
    """
    print(fname)
    rows = features_raw[fname]
    total = np.zeros(len(rows[0]) if rows else 0, dtype=float)
    genes_processed = 0
    for data_vec in rows:
        if genes_processed == sample_size:
            break
        total += np.array([float(tok) for tok in data_vec])
        genes_processed += 1

    if genes_processed == 0:
        raise ValueError('no rows available to average for feature %r' % (fname,))

    # Divide by the number of rows actually summed: the file may hold fewer
    # rows than sample_size, in which case dividing by sample_size is wrong.
    avg_vec = total / float(genes_processed)
    plot_feature_vec(fname, destdir, avg_vec, 'averaged',
                     name='mean of %d genes' % genes_processed)

In [63]:
def fasta_integrated_plot(gene_pos, imgpath='test.png'):
    """Overlay all features of one gene, each min-max scaled to ``[0, 1]``.

    Every feature listed in ``feature_names`` is read from ``features_raw``
    for the row ``gene_pos``, rescaled independently and drawn in its own
    colour on a shared axis with a legend.

    Parameters
    ----------
    gene_pos : int
        Row index of the gene; ``gene_names[gene_pos]`` labels the x axis.
    imgpath : str, optional
        Where to save the figure (default ``'test.png'``, as before).
    """
    name = gene_names[gene_pos]
    # One distinct colour per feature; the former palette repeated 'red' and
    # 'yellow', which made two pairs of curves indistinguishable.
    colors = ['red', 'orange', 'yellow', 'green', 'cyan', 'blue', 'purple', 'brown', 'magenta']

    fig, ax = plt.subplots(figsize=(50, 30))
    try:
        ax.set_xlabel(name, fontsize=60)
        ax.set_ylabel('Value', fontsize=60)
        ax.tick_params(axis='both', which='major', labelsize=30)
        ax.tick_params(axis='both', which='minor', labelsize=20)
        for feature, color in zip(feature_names, colors):
            # An explicit list keeps this working on Python 3, where map() is lazy.
            values = np.array([float(tok) for tok in features_raw[feature][gene_pos]])
            low, high = values.min(), values.max()
            span = high - low
            if span == 0:
                # A constant vector has no range; draw it flat at 0 rather
                # than dividing by zero and plotting NaNs.
                normalized = np.zeros_like(values)
            else:
                normalized = (values - low) / span
            half = values.size // 2
            positions = np.arange(-half, values.size - half)
            ax.plot(positions, normalized, marker='o', linewidth=3, alpha=0.5,
                    color=color, label=feature)
        ax.legend(fontsize=30)
        fig.savefig(imgpath)
    finally:
        # Always release the figure, even if plotting or saving fails.
        plt.close(fig)

In [72]:
# Output sub-directory of ../images for each feature, in the same order as
# `feature_names`.
folder_names = ['1_CA', '2_CG', '3_SNP_CNT', '4_COV', '5_MAF', '6_METHYL', '7_TATA', '8_TFBS_INF', '9_TFBS']

# zip() would silently drop unmatched entries, so check the pairing first.
assert len(folder_names) == len(feature_names), 'each feature needs exactly one output folder'

# Write the per-gene plots of every feature into its own folder.
for feature, folder in zip(feature_names, folder_names):
    fasta_plot(feature, folder)

CAs
CGs
SNPs
COVs
MAFs
meth
TATAs
TFBSinf
TFBSs


In [24]:
# Draw the combined, per-feature-normalised plot for the gene at index 0 of
# `gene_names`; the figure is saved to 'test.png'.
fasta_integrated_plot(0)

In [68]:
# Write one averaged plot per feature into that feature's output folder.
# Iterating over the paired lists avoids the Python 2-only xrange index loop.
for feature, folder in zip(feature_names, folder_names):
    averaged_fasta_plot(feature, folder)

CAs
CGs
SNPs
COVs
MAFs
meth
TATAs
TFBSinf
TFBSs
