In [1]:
# Ipython Notebook
# Carolina Monzo - 10-10-2018
# Tests to create plots for VLT-Tesis

In [1]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and
# later dropped the old names; use whichever name this installation provides.
plt.style.use(
    'seaborn-v0_8-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available
    else 'seaborn-whitegrid'
)
sns.set_style('whitegrid')

In [7]:
# Read the per-base coverage table and attach column names. `names=` is passed to
# read_csv without a header argument, so the first line of the file is read as data.
#
# Dataset selection -- change both values together:
#   exome:     489 sample columns, "./complete_pasted.tsv"
#   amplicons: 2990 sample columns, presumably paired with
#              "./complete_paste-amplicon-val-SREBF2.tsv" (TODO confirm)
N_SAMPLES = 489
COVERAGE_TSV = "./complete_pasted.tsv"

# Six annotation columns, followed by one coverage column per sample named "0", "1", ...
annotation_cols = ["chr", "start", "end", "feature", "gene", "base"]
samples = [str(i) for i in range(N_SAMPLES)]
header = annotation_cols + samples

df = pd.read_csv(COVERAGE_TSV, sep="\t", names=header)
df.head()

Unnamed: 0,chr,start,end,feature,gene,base,0,1,2,3,...,479,480,481,482,483,484,485,486,487,488
0,22,42229109,42229362,1,SREBF2,1,0,0,0,0,...,5,22,19,13,10,20,0,20,11,12
1,22,42229109,42229362,1,SREBF2,2,0,0,0,0,...,5,22,19,13,10,20,0,19,11,12
2,22,42229109,42229362,1,SREBF2,3,0,0,0,0,...,5,22,19,13,10,20,0,19,10,12
3,22,42229109,42229362,1,SREBF2,4,0,0,0,0,...,5,22,19,13,10,20,0,19,10,12
4,22,42229109,42229362,1,SREBF2,5,0,0,0,0,...,5,22,19,13,10,20,0,19,10,12


In [8]:
# Row-wise average over all sample columns: one mean coverage value per base
per_base_mean = df[samples].mean(axis="columns")
df["mean"] = per_base_mean
df.head()

Unnamed: 0,chr,start,end,feature,gene,base,0,1,2,3,...,480,481,482,483,484,485,486,487,488,mean
0,22,42229109,42229362,1,SREBF2,1,0,0,0,0,...,22,19,13,10,20,0,20,11,12,1.90184
1,22,42229109,42229362,1,SREBF2,2,0,0,0,0,...,22,19,13,10,20,0,19,11,12,1.844581
2,22,42229109,42229362,1,SREBF2,3,0,0,0,0,...,22,19,13,10,20,0,19,10,12,1.846626
3,22,42229109,42229362,1,SREBF2,4,0,0,0,0,...,22,19,13,10,20,0,19,10,12,1.832311
4,22,42229109,42229362,1,SREBF2,5,0,0,0,0,...,22,19,13,10,20,0,19,10,12,1.832311


In [5]:
# Create a copy of the dataframe so the original dataframe is not modified
ab = df.copy()

# NOTE(review): the output files are named "amplicon-val", but this cell plots
# whatever `df` currently holds. Its saved execution count is earlier than the
# load cell's, so the existing figures were presumably made from a different
# input file -- confirm which dataset is loaded before re-running.
ylim_headroom = 700   # added to the highest mean coverage to set the top of the y axis
coverage_mark = 20    # y position of the red horizontal reference line

# One figure per gene. groupby(sort=False) yields genes in order of first
# appearance and matches on the raw column values, so it also works when the
# gene labels are not strings.
for gene, gene_rows in ab.groupby("gene", sort=False):
    dfg = gene_rows.reset_index(drop=True)
    # Position of each base within the gene, used as the x coordinate
    dfg["index"] = dfg.index

    fig = plt.figure(figsize=(40, 10))
    ax1 = fig.add_subplot(111)
    # Every sample in gray (matplotlib draws one line per column of a 2D y array),
    # then the mean on top in blue. All samples are drawn, including the first six.
    ax1.plot(dfg["index"].values, dfg[samples].values, alpha=0.5, color='gray')
    ax1.plot(dfg['index'], dfg['mean'], alpha=0.5, color='blue')
    # Plot limits
    ax1.set_ylim(0, dfg['mean'].max() + ylim_headroom)
    ax1.set_xlim(0, dfg['index'].max() + 1)

    # First row of each exon ("feature"): its position marks where the exon
    # starts, and its chr/start values form the tick label.
    exon_starts = dfg.drop_duplicates(subset="feature")
    index_positions = exon_starts["index"].tolist()
    names = (exon_starts["chr"].astype(str) + "-"
             + exon_starts["start"].astype(str)).tolist()

    # Horizontal reference line. axhline spans the full axes width by default;
    # its xmin/xmax are axes fractions in [0, 1], not data coordinates.
    ax1.axhline(y=coverage_mark, color='r', linewidth=2)

    # Twin axes on top of the first one, used for the exon boundaries and labels
    ax2 = ax1.twiny()
    ax2.set_ylim(ax1.get_ylim())
    ax2.set_xlim(ax1.get_xlim())

    # Dashed vertical line at the first base of every exon
    ax2.vlines(x=index_positions, ymin=0, ymax=ax2.get_ylim()[1], linestyle='--',
               alpha=0.15)
    # Positional argument: the `b=` keyword was renamed `visible=` in matplotlib 3.5
    ax2.grid(False)
    ax2.set_xticks(index_positions)
    ax2.set_xticklabels(names, rotation=50, minor=False, fontsize=20)

    ax1.set_xlabel('cDNA position (bp)', fontsize=24)
    ax1.set_ylabel('Coverage', fontsize=24)

    fig.tight_layout()
    figname = "{}_pbcov-amplicon-val.png".format(gene)
    fig.savefig(os.path.join("./", figname))
    # Close each figure after saving so figures do not accumulate in memory
    plt.close(fig)

In [12]:
# Create a copy of the dataframe so the original dataframe is not modified
ab = df.copy()

ylim_headroom = 200   # added to the highest mean coverage to set the top of the y axis
coverage_mark = 20    # y position of the red horizontal reference line

# One figure per gene. groupby(sort=False) yields genes in order of first
# appearance and matches on the raw column values, so it also works when the
# gene labels are not strings.
for gene, gene_rows in ab.groupby("gene", sort=False):
    dfg = gene_rows.reset_index(drop=True)
    # Position of each base within the gene, used as the x coordinate
    dfg["index"] = dfg.index

    fig = plt.figure(figsize=(40, 10))
    ax1 = fig.add_subplot(111)
    # Every sample in gray (matplotlib draws one line per column of a 2D y array),
    # then the mean on top in blue. All samples are drawn, including the first six.
    ax1.plot(dfg["index"].values, dfg[samples].values, alpha=0.5, color='gray')
    ax1.plot(dfg['index'], dfg['mean'], alpha=0.5, color='blue')
    # Plot limits
    ax1.set_ylim(0, dfg['mean'].max() + ylim_headroom)
    ax1.set_xlim(0, dfg['index'].max() + 1)

    # First row of each exon ("feature"): its position marks where the exon
    # starts, and its chr/start values form the tick label.
    exon_starts = dfg.drop_duplicates(subset="feature")
    index_positions = exon_starts["index"].tolist()
    names = (exon_starts["chr"].astype(str) + "-"
             + exon_starts["start"].astype(str)).tolist()

    # Horizontal reference line. axhline spans the full axes width by default;
    # its xmin/xmax are axes fractions in [0, 1], not data coordinates.
    ax1.axhline(y=coverage_mark, color='r', linewidth=2)

    # Twin axes on top of the first one, used for the exon boundaries and labels
    ax2 = ax1.twiny()
    ax2.set_ylim(ax1.get_ylim())
    ax2.set_xlim(ax1.get_xlim())

    # Dashed vertical line at the first base of every exon
    ax2.vlines(x=index_positions, ymin=0, ymax=ax2.get_ylim()[1], linestyle='--',
               alpha=0.15)
    # Positional argument: the `b=` keyword was renamed `visible=` in matplotlib 3.5
    ax2.grid(False)
    ax2.set_xticks(index_positions)
    ax2.set_xticklabels(names, rotation=50, minor=False, fontsize=20)

    ax1.set_xlabel('cDNA position (bp)', fontsize=24)
    ax1.set_ylabel('Coverage', fontsize=24)

    fig.tight_layout()
    figname = "{}_pbcov-exome.png".format(gene)
    fig.savefig(os.path.join("./", figname))
    # Close each figure after saving so figures do not accumulate in memory
    plt.close(fig)