# Some code for generating/parsing/manipulating repeat element pipeline outputs
- actually i moved the code into a script now. This notebook contains the appropriate commandline arguments to pass to: plot_repetitive_elements_sunburst

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tnrange, tqdm_notebook

In [2]:
ip_parsed_file = '/projects/ps-yeolab/software/ecliprepmap/0.0.3/examples/repeat_element_mapping_374_01_APOBEC3C/results/374_01_APOBEC3C.parsed'
input_parsed_file = '/projects/ps-yeolab/software/ecliprepmap/0.0.3/examples/repeat_element_mapping_374_01_APOBEC3C/results/374_01_APOBEC3C.input.parsed'
color_file = '/projects/ps-yeolab3/bay001/reference_data/ENCODE/color_list_269.lines'
output_file = '/oasis/tscc/scratch/bay001/sunburst_example.svg'

### You can run the following command in the terminal (after module load eclipanalysis/0.0.2) to generate sunburst plots of the top enriched repeat elements.

In [3]:
cmd = 'plot_repetitive_elements_sunburst '
cmd += '--ip_parsed {} '.format(ip_parsed_file)
cmd += '--input_parsed {} '.format(ip_parsed_file)
cmd += '--out_file {} '.format(output_file)
cmd += '--color_file {}'.format(color_file)
cmd

'plot_repetitive_elements_sunburst --ip_parsed /projects/ps-yeolab/software/ecliprepmap/0.0.3/examples/repeat_element_mapping_374_01_APOBEC3C/results/374_01_APOBEC3C.parsed --input_parsed /projects/ps-yeolab/software/ecliprepmap/0.0.3/examples/repeat_element_mapping_374_01_APOBEC3C/results/374_01_APOBEC3C.parsed --out_file /oasis/tscc/scratch/bay001/sunburst_example.svg --color_file /projects/ps-yeolab3/bay001/reference_data/ENCODE/color_list_269.lines'

# You may also use the parsing functions to read in and parse repeat-mapping parsed files
- the most updated script is here:  ```https://github.com/byee4/clip_analysis/blob/master/clip_analysis/parsers.py```

But I have copied the relevant code below:

In [4]:
def read_parsed(fn):
    """
    Reads Eric's parsed file from the repetitive element pipeline.
    Parameters
    ----------
    fn : basestring
        the *.parsed file
    Returns
    -------
    total_df : pandas.DataFrame
        dataframe of total reads per unique/repetitive element family.
    element_df : pandas.DataFrame
        dataframe of unique repetitive/unique elements that each unique read
        mapped to.
    total_reads : int
    total_genomic_reads : int
    total_usable_reads : int
    total_repfamily_reads : int
    """
    df = pd.read_table(fn, names=[
        'total_or_element', 'element', 'read_num',
        'clip_rpr', 'annotation', 'gene'
    ])
    total_reads = df[
        (df['total_or_element'] == '#READINFO') & (df['element'] == 'AllReads')
        ]['read_num'].values[0]
    total_genomic_reads = df[
        (df['total_or_element'] == '#READINFO') & (
        df['element'] == 'GenomicReads')
        ]['read_num'].values[0]
    total_usable_reads = df[
        (df['total_or_element'] == '#READINFO') & (
        df['element'] == 'UsableReads')
        ]['read_num'].values[0]
    total_repfamily_reads = df[
        (df['total_or_element'] == '#READINFO') & (
        df['element'] == 'RepFamilyReads')
        ]['read_num'].values[0]

    total_df = df[df['total_or_element'] == 'TOTAL'][
        ['element', 'read_num', 'clip_rpr']
    ]
    element_df = df[df['total_or_element'] == 'ELEMENT'][
        ['element', 'read_num', 'clip_rpr']
    ]
    return total_df, element_df, \
           total_reads, total_genomic_reads, \
           total_usable_reads, total_repfamily_reads


def return_l2fc_entropy_from_parsed(ip_parsed, input_parsed, nopipes=True):
    """
    From 2 parsed rep element pipeline outputs (ip and input),
    compute fold change and information content. Usually fold changes of > 3+
    and information content of 0.1? can be considered enriched.
    Parameters
    ----------
    ip_parsed : str
        filename of the ip parsed string
    input_parsed : str
        filename of the input parsed string
    nopipes : bool
        if True, return just the uniquely mapped rep family mappings
        if False, return all unique and nonunique
    Returns
    -------
    merged : Pandas.DataFrame
        table consisting of fold enrichment and information content params
    """
    total_ip, _, _, _, _, _ = read_parsed(ip_parsed)
    total_input, _, _, _, total_input_usable_reads, _ = read_parsed(
        input_parsed)
    # a pipe indicates read totals mapping to more than one element/rep family.
    if nopipes:
        total_ip = total_ip[total_ip['element'].str.contains('\|') == False]
        total_input = total_input[
            total_input['element'].str.contains('\|') == False]
    # index columns by their element
    total_ip.set_index('element', inplace=True)
    total_input.set_index('element', inplace=True)
    # rename the IP and input columns separately
    total_ip.columns = ["IP_{}".format(c) for c in total_ip.columns]
    total_input.columns = ["Input_{}".format(c) for c in total_input.columns]
    # merge the two on element id
    merged = pd.merge(total_ip, total_input, how='left', left_index=True,
                      right_index=True)
    # deal with missing values
    merged['Input_read_num'].fillna(
        1, inplace=True
    )  # Pseudocount all missing values
    merged['Input_clip_rpr'].fillna(
        merged['Input_read_num'] / (total_input_usable_reads), inplace=True)
    # calculate fold enrichment and information content
    merged['Fold_enrichment'] = merged['IP_clip_rpr'].div(
        merged['Input_clip_rpr'])
    merged['Information_content'] = merged['IP_clip_rpr'] * np.log2(
        merged['IP_clip_rpr'].div(merged['Input_clip_rpr']))

    return merged

In [5]:
df = return_l2fc_entropy_from_parsed(
        ip_parsed_file, input_parsed_file
)

In [6]:
df

Unnamed: 0_level_0,IP_read_num,IP_clip_rpr,Input_read_num,Input_clip_rpr,Fold_enrichment,Information_content
element,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
RNA18S,2835929,4.774671e-01,3417663.0,2.318053e-01,2.059776,4.977536e-01
RNA28S,1158119,1.949851e-01,6581758.0,4.464122e-01,0.436782,-2.330097e-01
unique_CDS,296866,4.998142e-02,235425.0,1.596786e-02,3.130126,8.228045e-02
RNA5S,287825,4.845925e-02,1257957.0,8.532180e-02,0.567959,-3.954964e-02
unique_3utr,195612,3.293394e-02,120237.0,8.155157e-03,4.038418,6.632204e-02
unique_distintron,192168,3.235409e-02,396676.0,2.690482e-02,1.202539,8.608887e-03
unique_proxintron,86299,1.452961e-02,161287.0,1.093940e-02,1.328190,5.949323e-03
chrM,79029,1.330560e-02,147786.0,1.002369e-02,1.327416,5.436948e-03
antisense_L1,69547,1.170918e-02,137225.0,9.307380e-03,1.258053,3.878002e-03
antisense_Alu,59908,1.008633e-02,160089.0,1.085815e-02,0.928918,-1.072955e-03
