In [1]:
from Bio import SeqIO
import collections
import itertools
import csv
import gzip
import pysam
import pandas as pd
import pymysql
import time
import gffutils
import datetime
from multiprocessing import cpu_count, Manager
from concurrent.futures import ProcessPoolExecutor
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import chart_studio.plotly as py
import plotly.figure_factory as ff
import plotly.offline
import plotly.io as pio
import plotly
import plotly.graph_objects as go

# TETRANSCRIPTS / SALMON TE -> Counts Table

In [2]:
# Load the SalmonTE count table: comma-separated, first row is the header,
# first column becomes the row index.
rep_df = pd.read_csv(
    'SALMONTE_RES/SalmonTE_CountTable_NORM_SCALED_TXIMPORT.txt',
    sep=',',
    header=0,
    index_col=0,
)

In [3]:
# Preview the first rows of the loaded count table (one row per TE name, one column per sample).
rep_df.head()

Unnamed: 0,RNA01,RNA02,RNA03,RNA04,RNA05,RNA06,RNA07,RNA08,RNA09,RNA10,RNA11,RNA12,RNA13,RNA14,RNA15,RNA16
ALU,2559.131743,3740.700488,2508.289496,2596.31358,3141.466282,2867.429824,2980.923011,3928.83386,2894.187651,2871.173486,3671.499003,3134.58265,2360.392495,2615.879033,2849.225578,3174.428286
AluJb,9206.05204,10192.644382,9495.976815,10381.851551,11548.720297,11262.126857,11342.649402,10506.465376,10963.242747,11625.341133,10846.896484,11946.779528,10898.821487,10239.953789,10631.323965,11021.958754
AluJo,1299.397953,125.369526,1243.314483,1398.538508,1380.487401,1347.928995,1534.625553,59.814776,1287.653121,1289.116582,178.075558,1628.816283,1328.090514,1222.606568,1277.42843,1058.576783
AluJr,484.696062,563.653234,488.444975,457.672577,575.560353,527.038341,446.44745,325.990528,429.455501,512.421602,547.320465,495.845584,576.78764,476.068027,422.880435,575.511609
AluJr4,2360.017649,2497.197874,2479.047779,2447.442389,3290.447454,2788.753237,3239.898584,2740.513646,2547.484373,3030.633355,2728.746049,2868.430241,2613.784672,2591.925925,2774.535007,2626.259989


In [38]:
# Relabel the 16 sample columns with their BRNO identifiers.
# The assignment is positional, so this order must match the column order
# of the table as it was loaded.
rep_df.columns = [
    # columns 0-7
    'BRNO0286', 'BRNO1567', 'BRNO0842', 'BRNO1208',
    'BRNO0271', 'BRNO1215', 'BRNO1152', 'BRNO0462',
    # columns 8-15
    'BRNO0384', 'BRNO0627', 'BRNO0871', 'BRNO0414',
    'BRNO0820', 'BRNO0208', 'BRNO1335', 'BRNO1727',
]

In [95]:
# Row-wise totals over the first eight sample columns ('C') and the last
# eight ('T'); the split is purely positional.
rep_data = {
    'C': rep_df.iloc[:, 0:8].sum(axis=1),
    'T': rep_df.iloc[:, 8:16].sum(axis=1),
}

In [96]:
# Two-column table ('C', 'T') of the per-group totals, indexed like rep_df.
rep_cond_df = pd.DataFrame(data=rep_data)

In [97]:
# Preview the per-group ('C' / 'T') totals.
rep_cond_df.head()

Unnamed: 0,C,T
ALU,24323.088286,23571.368183
AluJb,83936.486721,88174.317888
AluJo,8389.477194,9270.363839
AluJr,3869.503521,4036.290861
AluJr4,21843.318613,21781.799613


In [39]:
# Subset to Alu: keep only the rows whose name starts with 'Alu'.
# The match is case-sensitive, so the all-caps 'ALU' row is excluded, exactly
# as the previous row-by-row drop loop did. A single boolean mask avoids
# rebuilding the frame once per dropped row; .copy() keeps alu_df independent
# of rep_df.
alu_df = rep_df[rep_df.index.str.startswith('Alu', na=False)].copy()

In [40]:
# Preview the Alu-only rows.
alu_df.head()

Unnamed: 0,BRNO0286,BRNO1567,BRNO0842,BRNO1208,BRNO0271,BRNO1215,BRNO1152,BRNO0462,BRNO0384,BRNO0627,BRNO0871,BRNO0414,BRNO0820,BRNO0208,BRNO1335,BRNO1727
AluJb,9206.05204,10192.644382,9495.976815,10381.851551,11548.720297,11262.126857,11342.649402,10506.465376,10963.242747,11625.341133,10846.896484,11946.779528,10898.821487,10239.953789,10631.323965,11021.958754
AluJo,1299.397953,125.369526,1243.314483,1398.538508,1380.487401,1347.928995,1534.625553,59.814776,1287.653121,1289.116582,178.075558,1628.816283,1328.090514,1222.606568,1277.42843,1058.576783
AluJr,484.696062,563.653234,488.444975,457.672577,575.560353,527.038341,446.44745,325.990528,429.455501,512.421602,547.320465,495.845584,576.78764,476.068027,422.880435,575.511609
AluJr4,2360.017649,2497.197874,2479.047779,2447.442389,3290.447454,2788.753237,3239.898584,2740.513646,2547.484373,3030.633355,2728.746049,2868.430241,2613.784672,2591.925925,2774.535007,2626.259989
AluSc,25607.183099,31640.006691,29318.611811,29848.248556,33615.082568,31467.791034,32834.016737,28188.710027,30887.980509,34856.314877,37408.960954,34124.02014,34225.691178,29902.461546,31649.030761,34821.056463


In [41]:
# Subset to L1: keep only the rows whose name starts with 'L1'.
# A single boolean mask replaces the row-by-row in-place drop loop, which
# rebuilt the frame once per dropped row; .copy() keeps l1_df independent
# of rep_df.
l1_df = rep_df[rep_df.index.str.startswith('L1', na=False)].copy()

In [42]:
# Preview the L1-only rows.
l1_df.head()

Unnamed: 0,BRNO0286,BRNO1567,BRNO0842,BRNO1208,BRNO0271,BRNO1215,BRNO1152,BRNO0462,BRNO0384,BRNO0627,BRNO0871,BRNO0414,BRNO0820,BRNO0208,BRNO1335,BRNO1727
L1,77862.336912,62718.398677,62755.973603,54305.658593,60180.890576,58363.808832,59363.690753,60002.195469,68837.008436,56918.21486,54428.270527,59121.382508,76724.58763,56589.218306,59603.075075,63405.233726
L1HS,113507.726173,138070.580066,132854.867234,120450.405029,120148.491561,85298.690967,90043.583523,81072.947171,120643.466761,190157.685623,139493.393291,84021.216425,154975.444091,159991.792602,93374.196744,83169.239861
L1M1B_5,168.969331,150.851137,102.887523,157.378117,129.688646,124.1763,140.603907,151.530765,132.688909,142.438872,115.225361,120.315472,193.248508,124.755772,149.381141,153.643371
L1M1_5,4257.551168,3325.859862,3158.105428,3108.430479,3126.460984,3137.58437,3104.70252,3584.898899,3623.263269,3030.633355,3524.848544,3183.802616,3512.981813,3439.26713,3217.186475,3919.208015
L1M2A1_5,138.824568,95.810857,116.966868,125.051801,180.063574,86.259872,144.810007,90.719077,82.752223,143.334714,68.087713,120.315472,98.596178,159.687389,104.34712,171.872245


In [102]:
# Inspect the L1HS row.
# NOTE(review): the output saved with this cell lists 'C' and 'T' values, so it
# was produced while l1_df held per-group totals rather than the per-sample
# columns it gets when subset from rep_df - re-run on a fresh kernel to confirm.
l1_df.loc['L1HS', :]

C    8.814473e+05
T    1.025826e+06
Name: L1HS, dtype: float64

In [43]:
def plot_set(norm_df, title):
    """Build a marker-only scatter figure with one trace per column of ``norm_df``.

    Columns are split by position: the first ``len(norm_df.columns) // 2``
    columns use the first marker colour, the remaining columns use the second.
    NOTE(review): this presumably assumes the two sample groups are laid out
    as two equal, contiguous column blocks - confirm against the caller.

    Parameters
    ----------
    norm_df : table-like
        Its ``index`` supplies the x values; every column supplies the
        y values of one trace and the trace name.
    title : str
        Figure title.

    Returns
    -------
    The populated ``go.Figure``.
    """
    first_half_marker = dict(
        color='rgba(156, 165, 196, 0.95)',
        line_color='rgba(156, 165, 196, 1.0)',
    )
    second_half_marker = dict(
        color='rgba(204, 204, 204, 0.95)',
        line_color='rgba(217, 217, 217, 1.0)',
    )
    half = len(norm_df.columns) // 2

    fig = go.Figure()
    for position, col_name in enumerate(norm_df.columns):
        marker_style = first_half_marker if position < half else second_half_marker
        fig.add_trace(go.Scatter(
            x=norm_df.index,
            y=norm_df[col_name],
            name=col_name,
            marker=dict(marker_style),  # fresh dict per trace
            opacity=0.9))

    # Shared marker styling is applied once, after all traces exist.
    fig.update_traces(mode='markers', marker=dict(line_width=1, symbol='circle', size=5))
    fig.update_layout(
        title=title,
        xaxis=dict(
            showgrid=False,
            showline=True,
            linecolor='rgb(102, 102, 102)',
            tickfont_color='rgb(102, 102, 102)',
            showticklabels=True,
            dtick=1,
            ticks='outside',
            tickcolor='rgb(102, 102, 102)',
        ),
        margin=dict(l=140, r=40, b=50, t=80),
        legend=dict(
            font_size=10,
            yanchor='middle',
            xanchor='right',
        ),
        width=800,
        height=600,
        paper_bgcolor='white',
        plot_bgcolor='white',
        hovermode='closest',
    )
    return fig

In [48]:
# Per-sample figure over every row of the full count table.
title = "Normalized Reads Count: All TE's"
fig_all = plot_set(norm_df=rep_df, title=title)

In [49]:
# Export the per-sample figure to an HTML file, then show it inline.
all_samples_html = 'SALMONTE_RES/SalmonTE_All_readCounts_AllSamples.html'
plotly.offline.plot(fig_all, filename=all_samples_html)
plotly.offline.iplot(fig_all, filename=all_samples_html)

In [103]:
def plot_subset(norm_df, title):
    """Build a marker-only scatter figure with one trace each for columns 'C' and 'T'.

    Parameters
    ----------
    norm_df : table-like
        Must expose ``columns`` containing both 'C' and 'T'; its ``index``
        supplies the x values.
        NOTE(review): 'C' / 'T' are presumably the two experimental groups -
        confirm the meaning with the author.
    title : str
        Figure title.

    Returns
    -------
    The populated ``go.Figure``.

    Raises
    ------
    KeyError
        If 'C' or 'T' is missing, e.g. when a per-sample table is passed
        instead of the per-group totals.
    """
    group_markers = (
        ('C', dict(color='rgba(156, 165, 196, 0.95)',
                   line_color='rgba(156, 165, 196, 1.0)')),
        ('T', dict(color='rgba(204, 204, 204, 0.95)',
                   line_color='rgba(217, 217, 217, 1.0)')),
    )

    # Fail early with an actionable message instead of a bare KeyError('C').
    missing = [label for label, _ in group_markers if label not in norm_df.columns]
    if missing:
        raise KeyError(
            "plot_subset needs columns 'C' and 'T'; missing: %s" % ", ".join(missing)
        )

    fig = go.Figure()
    for label, marker_style in group_markers:
        fig.add_trace(go.Scatter(
            x=norm_df.index,
            y=norm_df[label],
            name=label,
            marker=dict(marker_style),  # fresh dict per trace
            opacity=0.8))

    # Shared marker styling is applied once, after both traces exist.
    fig.update_traces(mode='markers', marker=dict(line_width=1, symbol='circle', size=5))
    fig.update_layout(
        title=title,
        xaxis=dict(
            showgrid=False,
            showline=True,
            linecolor='rgb(102, 102, 102)',
            tickfont_color='rgb(102, 102, 102)',
            showticklabels=True,
            dtick=1,
            ticks='outside',
            tickcolor='rgb(102, 102, 102)',
        ),
        margin=dict(l=140, r=40, b=50, t=80),
        legend=dict(
            font_size=10,
            yanchor='middle',
            xanchor='right',
        ),
        width=800,
        height=600,
        paper_bgcolor='white',
        plot_bgcolor='white',
        hovermode='closest',
    )
    return fig

In [104]:
# Per-group ('C' vs 'T') figure over every row of the totals table.
title = "Normalized Reads Count: All TE's"
fig_all = plot_subset(norm_df=rep_cond_df, title=title)

In [105]:
# Export the per-group figure to an HTML file, then show it inline.
all_conditions_html = 'SALMONTE_RES/SalmonTE_All_readCounts_NORM_SCALED_TXIMPORT.html'
plotly.offline.plot(fig_all, filename=all_conditions_html)
plotly.offline.iplot(fig_all, filename=all_conditions_html)

In [106]:
# plot_subset reads the 'C' and 'T' columns, which exist only on rep_cond_df.
# alu_df is subset from the per-sample table (rep_df) and has neither column,
# so on a fresh kernel plot_subset(alu_df, ...) raises KeyError. Take the Alu
# rows from the per-group totals instead (same case-sensitive 'Alu' prefix).
title = 'Normalized Reads Count: Alu'
alu_cond_df = rep_cond_df[rep_cond_df.index.str.startswith('Alu', na=False)]
fig_alu = plot_subset(alu_cond_df, title)

In [107]:
# Export the Alu figure to an HTML file, then show it inline.
alu_html = 'SALMONTE_RES/SalmonTE_Alu_readCounts_NORM_SCALED_TXIMPORT.html'
plotly.offline.plot(fig_alu, filename=alu_html)
plotly.offline.iplot(fig_alu, filename=alu_html)

In [108]:
# plot_subset reads the 'C' and 'T' columns, which exist only on rep_cond_df.
# l1_df is subset from the per-sample table (rep_df) and has neither column,
# so on a fresh kernel plot_subset(l1_df, ...) raises KeyError. Take the L1
# rows from the per-group totals instead (same 'L1' prefix).
title = 'Normalized Reads Count: L1'
l1_cond_df = rep_cond_df[rep_cond_df.index.str.startswith('L1', na=False)]
fig_l1 = plot_subset(l1_cond_df, title)

In [109]:
# Export the L1 figure to an HTML file, then show it inline.
l1_html = 'SALMONTE_RES/SalmonTE_L1_readCounts_NORM_SCALED_TXIMPORT.html'
plotly.offline.plot(fig_l1, filename=l1_html)
plotly.offline.iplot(fig_l1, filename=l1_html)