# ACGT content in modalities

In [1]:
%load_ext autoreload
%autoreload 2

import anchor
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# Global seaborn/matplotlib styling applied to every figure drawn in this notebook.
# 'pdf.fonttype': 42 asks matplotlib to embed TrueType (Type 42) fonts in saved PDFs.
plot_rc = {'font.sans-serif': 'Arial', 'pdf.fonttype': 42}
sns.set(style='ticks', context='talk', rc=plot_rc)

%matplotlib inline



import flotilla

# Directory that holds the flotilla datapackages.
flotilla_dir = '/projects/ps-yeolab/obotvinnik/flotilla_projects'

# Load the Study object for this figure from its datapackage.
study = flotilla.embark(
    'singlecell_pnm_figure2_modalities_bayesian',
    flotilla_dir=flotilla_dir)

# Row labels of the splicing `singles` table that are not in the splicing `outliers` table.
single_ids = study.splicing.singles.index
outlier_ids = study.splicing.outliers.index
not_outliers = single_ids.difference(outlier_ids)

  _get_xdg_config_dir())
  load_species_data=load_species_data)


2016-05-25 11:22:02	Reading datapackage from /projects/ps-yeolab/obotvinnik/flotilla_projects/singlecell_pnm_figure2_modalities_bayesian/datapackage.json
2016-05-25 11:22:02	Parsing datapackage to create a Study object
2016-05-25 11:22:52 	Initializing Study
2016-05-25 11:22:52 	Initializing Predictor configuration manager for Study
2016-05-25 11:22:52	Predictor ExtraTreesClassifier is of type <class 'sklearn.ensemble.forest.ExtraTreesClassifier'>
2016-05-25 11:22:52	Added ExtraTreesClassifier to default predictors
2016-05-25 11:22:52	Predictor ExtraTreesRegressor is of type <class 'sklearn.ensemble.forest.ExtraTreesRegressor'>
2016-05-25 11:22:52	Added ExtraTreesRegressor to default predictors
2016-05-25 11:22:52	Predictor GradientBoostingClassifier is of type <class 'sklearn.ensemble.gradient_boosting.GradientBoostingClassifier'>
2016-05-25 11:22:52	Added GradientBoostingClassifier to default predictors
2016-05-25 11:22:52	Predictor GradientBoostingRegressor is of type <class 'sklear

In [2]:
folder = '/home/obotvinnik/Dropbox/figures2/singlecell_pnm/figure2_modalities/bayesian'
figure_folder = '{}/nucleotide_content'.format(folder)
! mkdir $figure_folder

mkdir: cannot create directory `/home/obotvinnik/Dropbox/figures2/singlecell_pnm/figure2_modalities/bayesian/nucleotide_content': File exists


In [3]:
# Base directory of the CSV tables (named "csvs_for_paper" on disk).
csv_folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper'

In [7]:
# Directory that the nucleotide-content CSVs are read from; the loader cell appends
# '<exon_type>/nucleotide_content/...' to this path.
splicing_feature_folder = '/projects/ps-yeolab/obotvinnik/singlecell_pnms/csvs_for_paper/splicing_feature_data'

In [9]:
# Show the contents of the splicing feature folder.
ls $splicing_feature_folder

[0m[01;34malternative[0m/                  exonbody_conservation.err-1
basewise_conservation.py      exonbody_conservation.err-2
basewise_conservation.sh      exonbody_conservation.out-1
basewise_conservation.sh.err  exonbody_conservation.out-2
basewise_conservation.sh.out  exonbody_conservation.sh
[01;34mconstitutive[0m/
[m

In [10]:
# Build tidy (long-format) nucleotide-content tables: one row per (window, event)
# pair for every nucleotide / intron side / exon type combination.
nucleotides = 'ACGT'
contexts = 'upstream', 'downstream'
exon_types = 'alternative', 'constitutive'

alternative_dfs = []
constitutive_dfs = []

for nucleotide in nucleotides:
    for context in contexts:
        for exon_type in exon_types:
            csv = '{}/{}/nucleotide_content/{}200_nucleotide_content_{}.csv'.format(
                splicing_feature_folder, exon_type, context, nucleotide)
            df = pd.read_csv(csv, index_col=0)
            # Scale to percent; assumes the CSVs store fractions -- TODO confirm.
            df = df * 100

            # Wide -> long. With unnamed levels, reset_index() calls the former
            # column labels 'level_0' and the former row labels 'level_1'.
            tidy = df.unstack().reset_index()
            tidy = tidy.rename(
                columns={'level_0': 'window_relative_to_exon',
                         'level_1': 'event_name',
                         0: 'Nucleotide Percentage'})

            # NOTE(review): the last recorded run of this cell failed here with
            # "invalid literal for long() with base 10: 'index'", i.e. at least one
            # CSV has a column labelled 'index' next to the window columns. What
            # that column holds is not visible from this notebook, so it is not
            # dropped or promoted silently; the error is re-raised with the file
            # path so the offending CSV can be inspected and fixed.
            try:
                tidy['window_relative_to_exon'] = tidy['window_relative_to_exon'].astype(int)
            except ValueError as error:
                raise ValueError(
                    'Non-integer window label in {}: {}'.format(csv, error))

            tidy['Region'] = context.title()
            tidy['Nucleotide'] = nucleotide
            tidy['exon_type'] = exon_type

            # Fixed: this used to compare against 'exon2', which is not one of
            # `exon_types`, so every frame was filed as constitutive and
            # `alternative_dfs` stayed empty.
            if exon_type == 'alternative':
                alternative_dfs.append(tidy)
            else:
                constitutive_dfs.append(tidy)

nucleotide_content = pd.concat(alternative_dfs)
constitutive_nucleotide_content = pd.concat(constitutive_dfs)

print(nucleotide_content.shape)
nucleotide_content.head()

ValueError: invalid literal for long() with base 10: 'index'

In [None]:
# Preview the tidy modality table that the nucleotide content is merged with.
study.supplemental.modalities_tidy.head()

In [None]:
# Preview the tidy nucleotide-content table for the alternative exons.
nucleotide_content.head()

In [None]:
# Attach modality assignments to each event: `event_name` on the nucleotide side
# is matched against `event_id` on the modality side. The outer join keeps
# unmatched rows from both tables.
nucleotide_content_modalities = pd.merge(
    nucleotide_content,
    study.supplemental.modalities_tidy,
    how='outer',
    left_on='event_name',
    right_on='event_id')
nucleotide_content_modalities.head()

In [None]:
# Drop every row that has a missing value in any column, reporting the table
# shape before and after.
shape_before = nucleotide_content_modalities.shape
print(shape_before)
nucleotide_content_modalities = nucleotide_content_modalities.dropna()
shape_after = nucleotide_content_modalities.shape
print(shape_after)

In [None]:
# Preview the merged table after rows with missing values were dropped.
nucleotide_content_modalities.head()

In [None]:
# Label every constitutive row with a pseudo-modality so it can share the
# 'modality' column with the alternative-exon table.
constitutive_nucleotide_content['modality'] = 'constitutive'

# One copy of the constitutive table per phenotype in `study.phenotype_order`.
constitutives = []

for phenotype in study.phenotype_order:
    # The 'phenotype' column is overwritten in place on each pass, so the copy
    # must be taken before the next iteration changes it again.
    constitutive_nucleotide_content['phenotype'] = phenotype
    constitutives.append(constitutive_nucleotide_content.copy())

In [None]:
# Stack the modality-annotated table on top of the per-phenotype constitutive
# copies to get a single table.
frames_to_stack = [nucleotide_content_modalities]
frames_to_stack.extend(constitutives)
nucleotide_content_all = pd.concat(frames_to_stack)
nucleotide_content_all.head()

In [None]:
# Preview the end of the combined table.
nucleotide_content_all.tail()

In [None]:
from anchor import MODALITY_ORDER, MODALITY_PALETTE, MODALITY_TO_COLOR

In [None]:
# Add a color for the extra 'constitutive' category ('k' is matplotlib shorthand
# for black). This mutates the mapping object imported from `anchor`.
MODALITY_TO_COLOR['constitutive'] = 'k'

In [None]:
# One figure per nucleotide: rows are phenotypes, columns are the intron side,
# and hue is the modality.
modality_order = ['constitutive', '~1', '~0', 'bimodal']
modality_palette = [MODALITY_TO_COLOR[x] for x in modality_order]

for nucleotide, nucleotide_df in nucleotide_content_all.groupby('Nucleotide'):

    g = sns.FacetGrid(nucleotide_df, hue='modality', row='phenotype', col='Region',
                      palette=modality_palette, hue_order=modality_order,
                      row_order=study.phenotype_order,
                      col_order=['Upstream', 'Downstream'], sharex=False)

    g.map(sns.pointplot, 'window_relative_to_exon', 'Nucleotide Percentage',
          scale=0.5, dodge=True, alpha=0.5, ci=None)
    g.set_titles('{row_name} ' + nucleotide)

    # Flip the 'Upstream' panels (the first column, per `col_order`) so window 0
    # sits at the right-hand edge. Column slicing replaces `ax.is_first_col()`,
    # which newer matplotlib releases no longer provide.
    for ax in g.axes[:, 0]:
        ax.invert_xaxis()
    g.set(xlabel='')
    # Window indices 19 and 39 are labelled 200 and 400, i.e. (index + 1) * 10.
    g.set(xticks=[0, 19, 39], xticklabels=[0, 200, 400])
    g.fig.tight_layout()
    g.savefig('{}/intron_window_nucleotide_content_{}.pdf'.format(figure_folder, nucleotide))

    # Zoom in on first 100nt.
    # Fixed: the '50' tick was placed at index 5; index 4 follows the same
    # (index + 1) * 10 labelling as above and matches the per-modality figures.
    g.set(xlim=(0, 9), xticks=[0, 4, 9], xticklabels=[0, 50, 100])
    # Setting xlim resets the axis direction, so the upstream panels are flipped again.
    for ax in g.axes[:, 0]:
        ax.invert_xaxis()
    g.savefig('{}/intron_window_nucleotide_content_{}_zoom100nt.pdf'.format(figure_folder, nucleotide))


In [None]:
# One figure per modality: rows are phenotypes, columns are the intron side,
# and hue is the nucleotide.
nucleotide_to_color = {'A': 'Green',
                       'C': 'DarkBlue',
                       'G': 'Gold',
                       'T': 'Crimson'}
nucleotide_order = list('ACGT')
nucleotide_palette = [nucleotide_to_color[x] for x in nucleotide_order]

for modality, modality_df in nucleotide_content_all.groupby('modality'):

    g = sns.FacetGrid(modality_df, hue='Nucleotide', row='phenotype', col='Region',
                      palette=nucleotide_palette, hue_order=nucleotide_order,
                      row_order=study.phenotype_order,
                      col_order=['Upstream', 'Downstream'], sharex=False)

    g.map(sns.pointplot, 'window_relative_to_exon', 'Nucleotide Percentage',
          scale=0.5, dodge=True, ci=None)
    g.set_titles('{row_name} ' + modality)

    # Flip the 'Upstream' panels (the first column, per `col_order`) so window 0
    # sits at the right-hand edge. Column slicing replaces `ax.is_first_col()`,
    # which newer matplotlib releases no longer provide.
    for ax in g.axes[:, 0]:
        ax.invert_xaxis()
    g.set(xlabel='')
    # Window indices 19 and 39 are labelled 200 and 400, i.e. (index + 1) * 10.
    g.set(xticks=[0, 19, 39], xticklabels=[0, 200, 400])
    g.fig.tight_layout()
    g.savefig('{}/intron_window_nucleotide_content_{}.pdf'.format(figure_folder, modality))

    # Zoom in on first 100nt.
    g.set(xlim=(0, 9), xticks=[0, 4, 9], xticklabels=[0, 50, 100])
    # Setting xlim resets the axis direction, so the upstream panels are flipped again.
    for ax in g.axes[:, 0]:
        ax.invert_xaxis()
    g.savefig('{}/intron_window_nucleotide_content_{}_zoom100nt.pdf'.format(figure_folder, modality))
