In [None]:
import modish
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

from modish.visualize import MODALITY_TO_COLOR, MODALITY_ORDER, MODALITY_PALETTE
# Lower-case alias for the modality ordering imported from modish.visualize.
modality_order = MODALITY_ORDER

# Global plot styling. 'pdf.fonttype': 42 makes matplotlib embed TrueType fonts in
# saved PDFs so the text stays editable.
sns.set(style='ticks', context='talk', rc={'font.sans-serif':'Arial', 'pdf.fonttype': 42})

# Render figures inside the notebook.
%matplotlib inline

# Re-import edited modules automatically before each cell runs.
%load_ext autoreload
%autoreload 2

# Directory that every fig.savefig / g.savefig call in this notebook writes into.
# NOTE(review): absolute, user-specific path -- has to be edited to run elsewhere.
folder = '/home/obotvinnik/Dropbox/figures2/singlecell_pnm/figure4_voyages'

# Load the flotilla study for this figure.
# NOTE(review): `study` is not referenced again in the cells visible here.
import flotilla
study = flotilla.embark('singlecell_pnm_figure4_voyages')

In [None]:
# Root directory of the HOMER results; a later cell globs it for
# '<homer_dir>/*/homerMotifs.all.motifs'.
# NOTE(review): absolute, cluster-specific path -- has to be edited to run elsewhere.
homer_dir = '/projects/ps-yeolab/obotvinnik/singlecell_pnms/figure4_voyages/homer'

In [None]:
import six
import pandas as pd

# Nucleotide alphabets. The characters double as the column names of the
# position-weight matrices read by homer_motif_reader.
RNA = 'ACGU'
DNA = 'ACGT'

import itertools

# Lengths of the k-mers that are enumerated and scored against every motif.
kmer_lengths = 4, 5, 6

# Expected number of k-mers: 4**4 + 4**5 + 4**6 = 5376.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(sum(4**k for k in kmer_lengths))

# Every RNA k-mer of each requested length: shorter lengths first, and within one
# length in itertools.product order over the alphabet.
kmers = [''.join(letters)
         for k in kmer_lengths
         for letters in itertools.product(RNA, repeat=k)]
print(len(kmers))

# The same k-mers as lists of single characters, the form score_kmers documents.
# Built as a real list (not map(), which is a one-shot iterator on Python 3)
# because it is iterated again for every motif that gets scored.
kmers_list = [list(kmer) for kmer in kmers]
kmers[:10]


def homer_motif_reader(handle, residues=DNA):
    """Iterate over the motifs stored in a HOMER ``*.motifs`` file.

    A motif starts with a header line beginning with ``>``. The lines that
    follow, up to the next header or the end of the input, are the rows of that
    motif's matrix.

    Parameters
    ----------
    handle : iterable of str
        Open file (or any other iterable of lines) in HOMER motif format.
    residues : str or sequence of str
        Column names for the parsed matrix, one per residue, in the column
        order of the file.

    Yields
    ------
    record_id : str
        The header line without the leading ``>`` and surrounding whitespace.
    pwm : pandas.DataFrame
        The matrix rows parsed with ``pandas.read_table`` (one row per motif
        position, one column per residue).

    Notes
    -----
    Every motif is yielded, including the last one in the file, which has no
    following header to terminate it. Headers that are not followed by any
    matrix rows are skipped, as are blank lines and lines that appear before
    the first header.
    """
    names = list(residues)

    def parse(matrix_lines):
        # Let pandas do the tab-delimited parsing of the collected rows.
        return pd.read_table(six.StringIO(''.join(matrix_lines)),
                             header=None, names=names)

    record_id, matrix_lines = None, []
    for line in handle:
        if line.startswith('>'):
            # A new header closes the motif that was being collected.
            if record_id is not None and matrix_lines:
                yield record_id, parse(matrix_lines)
            record_id = line.lstrip('>').strip()
            matrix_lines = []
        elif record_id is not None and line.strip():
            matrix_lines.append(line)

    # The final motif is terminated by end-of-input rather than by a header,
    # so it has to be flushed explicitly.
    if record_id is not None and matrix_lines:
        yield record_id, parse(matrix_lines)
            
def score_kmers(pwm, kmers):
    """Generator to score kmers given a position-weight matrix

    The shorter of (k-mer, motif) is slid along the longer one. At each offset
    the weights of the overlapping residues are summed and divided by the
    overlap width, and the per-offset values are averaged. When the k-mer and
    the motif have the same length there is exactly one offset.

    Parameters
    ----------
    pwm : pandas.DataFrame
        A (length, 4) dataframe of the weight of each position's probability
        of each nucleotide. Columns are the residue characters. Rows are taken
        in positional order (row 0 is motif position 0).
    kmers : list of list
        A list of kmers strings as lists, e.g. [['G', 'G', 'G', 'G', 'G', 'G']].
        Plain strings are accepted as well.

    Yields
    ------
    score : float
        One score per k-mer, in input order.

    Raises
    ------
    KeyError
        If a k-mer contains a residue that is not a column of ``pwm``.
    """
    # Index a plain array instead of calling DataFrame.lookup, which was
    # deprecated in pandas 1.2 and removed in pandas 2.0.
    weights = pwm.values
    columns = pwm.columns
    motif_length = weights.shape[0]

    for kmer in kmers:
        residues = list(kmer)
        k = len(residues)

        column_index = columns.get_indexer(residues)
        if (column_index < 0).any():
            # get_indexer marks unknown labels with -1, which would otherwise
            # silently select the last column.
            raise KeyError('k-mer {!r} has residues that are not PWM columns'.format(kmer))

        # float() keeps the divisions true divisions on Python 2 even for an
        # integer-valued matrix.
        window = float(min(k, motif_length))
        n_positions = abs(k - motif_length) + 1

        total = 0.0
        for start in range(n_positions):
            if k > motif_length:
                # Motif is shorter: slide the whole motif along the k-mer.
                rows = np.arange(motif_length)
                cols = column_index[start:start + motif_length]
            else:
                # k-mer is shorter (or equal): slide the k-mer along the motif.
                rows = np.arange(start, start + k)
                cols = column_index
            total += weights[rows, cols].sum() / window
        yield total / n_positions

In [None]:
ls $homer_dir

In [None]:
# %%time
import os

import glob

# One homerMotifs.all.motifs file is looked for in every sub-directory of homer_dir.
globber = '{}/*/homerMotifs.all.motifs'.format(homer_dir)


# Per-file results, concatenated in later cells.
metadata_dfs = []
score_dfs = []

# sorted() so that the row/column order of the combined tables does not depend on
# the order in which the filesystem happens to list the folders.
for filename in sorted(glob.glob(globber)):
    with open(filename) as f:
        # Maps each full motif header line to its position-weight matrix.
        records = pd.Series(dict(homer_motif_reader(f, residues=RNA)))

    # Split every header line on whitespace into one metadata column per field.
    metadata = pd.DataFrame.from_records(records.index.map(lambda x: pd.Series(x.split())))
    # Raw string: '\l' is not a valid escape sequence, the backslash is literal.
    metadata.columns = ['Consensus Sequence', 'Motif ID', 'Log Odds Threshold',
                        r'$\log(p)$', 'empty_placeholder', 'Occurence Information',
                        'Motif Statistics']

    # The parent folder name is '_'-separated: field 1 is used as the transition,
    # field 2 as the direction and the last field as the location.
    motif_folder = filename.split('/')[-2]
    split = motif_folder.split('_')
    transition = split[1]
    direction = split[2]
    location = split[-1]
    # Folders whose last field does not mention 'stream' are labelled 'exonbody'.
    if 'stream' not in location:
        location = 'exonbody'

    # Suffix the motif ids with the folder annotation before files are combined.
    motif_annotation = '_{}_{}_{}'.format(transition, direction, location)
    print(motif_annotation)
    metadata['Motif ID'] += motif_annotation
    metadata['Transition'] = transition
    metadata['Direction'] = direction
    metadata['Location'] = location
    metadata_dfs.append(metadata)

    # Score every k-mer against every motif: rows are k-mers, columns are motif ids.
    scores = records.map(lambda x: pd.Series(score_kmers(x, kmers_list), index=kmers))
    scores = pd.DataFrame.from_records(scores).T
    scores.columns = metadata['Motif ID']
    score_dfs.append(scores)

In [None]:
# Stack the per-file metadata tables into one table with a fresh 0..n-1 index.
motif_metadata = pd.concat(metadata_dfs, ignore_index=True)
# Turn the direction name into a LaTeX math command, e.g. 'searrow' -> '$\searrow$',
# matching the keys of the direction palette defined further down.
motif_metadata['Direction'] = motif_metadata['Direction'].map(lambda x: r'$\{}$'.format(x))
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(motif_metadata.shape)
motif_metadata.head()

In [None]:
# Index the metadata by 'Motif ID' so that later cells can select metadata rows
# with the column labels of the score table.
# NOTE(review): not idempotent -- running this cell a second time raises KeyError
# because 'Motif ID' is then the index and no longer a column.
motif_metadata = motif_metadata.set_index('Motif ID')
motif_metadata

In [None]:
# The header fields were parsed as strings; convert the columns that hold numbers.
# DataFrame.convert_objects was deprecated in pandas 0.17 and removed in 1.0, so the
# same soft conversion is spelled out with pd.to_numeric: values that cannot be parsed
# become NaN, and a column in which nothing can be parsed is left untouched.
motif_metadata = motif_metadata.copy()
for column in motif_metadata.columns[motif_metadata.dtypes == object]:
    converted = pd.to_numeric(motif_metadata[column], errors='coerce')
    if not converted.isnull().all():
        motif_metadata[column] = converted
motif_metadata.dtypes

In [None]:
# Scratch check: the value of e**-5.
# NOTE(review): presumably the p-value that a natural-log cutoff of -5 corresponds to -- confirm.
np.exp(-5)

In [None]:
# Scratch check: the natural log of 1e-5.
# NOTE(review): presumably for comparison with the log(p) cutoff of -5 used below -- confirm.
np.log(1e-5)

In [None]:
# Violin plots of the motif log p-values: one panel per transition, one x position
# per location, one hue per direction.
# The column name is written as a raw string because '\l' is not a valid escape
# sequence; the resulting string is unchanged.
# NOTE(review): sns.factorplot was renamed to sns.catplot in seaborn 0.9.
sns.factorplot(data=motif_metadata, hue='Direction', col='Transition', x='Location',
               y=r'$\log(p)$', kind='violin', aspect=1.25)

In [None]:
# Join the per-file score tables side by side: rows are k-mers, columns are motif ids.
motif_scores = pd.concat(score_dfs, axis=1)
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(motif_scores.shape)
motif_scores.head()

In [None]:
# Keep only the score columns of motifs whose log p-value is below the cutoff.
# The column name is written as a raw string because '\l' is not a valid escape
# sequence; the resulting string is unchanged.
LOG_P_THRESHOLD = -5
is_significant = motif_metadata[r'$\log(p)$'] < LOG_P_THRESHOLD
motif_scores_significant = motif_scores.loc[:, motif_metadata.index[is_significant]]
motif_scores_significant.shape

In [None]:
# Pairwise correlation between all motif score columns (pandas' default method, Pearson).
# NOTE(review): the name `corr` is reassigned inside the clustermap loops further down.
corr = motif_scores.corr()

In [None]:
def _save_palette_key(palette, labels, filename):
    """Draw ``palette`` as a strip of labelled swatches and save it in ``folder``.

    Parameters
    ----------
    palette : list of colors
        Colors to show, left to right.
    labels : list of str
        One x tick label per color, in the same order as ``palette``.
    filename : str
        File name (not a path) of the figure to write inside ``folder``.

    Returns
    -------
    fig, ax
        The matplotlib figure and axes that were drawn on.
    """
    sns.palplot(palette)
    ax = plt.gca()
    ax.set(xticklabels=labels, xticks=np.arange(len(labels)))
    fig = plt.gcf()
    fig.tight_layout()
    fig.savefig('{}/{}'.format(folder, filename))
    return fig, ax


# Directions are LaTeX arrow commands; each one borrows the color of a modality.
direction_order = [ r'$\nwarrow$', r'$\swarrow$',  r'$\nearrow$', r'$\searrow$']
direction_palette = [MODALITY_TO_COLOR['~1'], MODALITY_TO_COLOR['middle'],
                     MODALITY_TO_COLOR['bimodal'], MODALITY_TO_COLOR['~0']]
fig, ax = _save_palette_key(direction_palette, direction_order, 'direction_to_color.pdf')

direction_to_color = dict(zip(direction_order, direction_palette))


# Transitions between cell types get the first three colors of the 'Set2' palette.
transition_order = ['iPSC-NPC', 'NPC-MN', 'iPSC-MN']
transition_palette = sns.color_palette('Set2', n_colors=3)
fig, ax = _save_palette_key(transition_palette, transition_order, 'transition_to_color.pdf')

transition_to_color = dict(zip(transition_order, transition_palette))

In [None]:
# Another look at the first rows of the score table (k-mers as rows, motif ids as columns).
motif_scores.head()

In [None]:

from scipy.stats import spearmanr

# Compare the k-mer scores of two motifs from the same folder, annotating the
# joint plot with their Spearman correlation.
x_motif = '2-AAGGTG_iPSC-NPC_searrow_upstream200'
y_motif = '5-AGGT_iPSC-NPC_searrow_upstream200'
sns.jointplot(x_motif, y_motif, motif_scores, stat_func=spearmanr)

In [None]:
# For each of the first ten motif columns, the row label (k-mer) with the highest score.
motif_scores.iloc[:, :10].idxmax()

In [None]:

# One clustered heatmap of motif-motif Spearman correlations per motif location.
# The columns of motif_scores are grouped by the 'Location' of their motif.
for location, location_scores in motif_scores.groupby(motif_metadata['Location'], axis=1):
    # Use the group produced by this iteration. The original body read an unrelated
    # name `df`, which is undefined on a fresh kernel.
    corr = location_scores.corr(method='spearman')

    # Two color strips per motif: its direction and its transition.
    side_colors = [[direction_to_color[x] for x in motif_metadata.loc[location_scores.columns, 'Direction']],
                   [transition_to_color[x] for x in motif_metadata.loc[location_scores.columns, 'Transition']]]
    g = sns.clustermap(corr, metric='cityblock', row_colors=side_colors, col_colors=side_colors, figsize=(60, 60))
    g.ax_col_dendrogram.set(title=location)
    g.savefig('{}/motif_clustermap_{}.png'.format(folder, location), dpi=150)

In [None]:

# Same per-location clustered heatmaps, restricted to the motifs that passed the
# log(p) cutoff.
for location, location_scores in motif_scores_significant.groupby(motif_metadata['Location'], axis=1):
    # Use the group produced by this iteration. The original body read an unrelated
    # name `df`, which is undefined on a fresh kernel.
    corr = location_scores.corr(method='spearman')

    # Two color strips per motif: its direction and its transition.
    side_colors = [[direction_to_color[x] for x in motif_metadata.loc[location_scores.columns, 'Direction']],
                   [transition_to_color[x] for x in motif_metadata.loc[location_scores.columns, 'Transition']]]
    g = sns.clustermap(corr, metric='cityblock', row_colors=side_colors, col_colors=side_colors, figsize=(60, 60))
    g.ax_col_dendrogram.set(title=location)
    g.savefig('{}/motif_logp-5_clustermap_{}.png'.format(folder, location), dpi=150)

In [None]:

# One clustered heatmap of motif-motif Spearman correlations per (location, direction).
for location, location_scores in motif_scores.groupby(motif_metadata['Location'], axis=1):
    for direction, direction_scores in location_scores.groupby(motif_metadata['Direction'], axis=1):
        corr = direction_scores.corr(method='spearman')

        # A quarter of an inch per motif, and never a zero-sized figure. The original
        # read an undefined name `corr_subset` here.
        figwidth = max(int(corr.shape[0]/4.), 1)
        figsize = (figwidth, figwidth)

        # Two color strips per motif: its direction and its transition.
        side_colors = [[direction_to_color[x] for x in motif_metadata.loc[direction_scores.columns, 'Direction']],
                       [transition_to_color[x] for x in motif_metadata.loc[direction_scores.columns, 'Transition']]]
        g = sns.clustermap(corr, metric='cityblock', row_colors=side_colors, col_colors=side_colors, figsize=figsize)
        g.ax_col_dendrogram.set(title=location)
        # Drop the LaTeX '$' and backslash so the direction can be used in a file name.
        direction_str = direction.strip('$\\')

        g.savefig('{}/motif_clustermap_{}_{}.png'.format(folder, location, direction_str), dpi=150)

In [None]:

# One clustered heatmap of motif-motif Spearman correlations per (location, transition).
for location, location_scores in motif_scores.groupby(motif_metadata['Location'], axis=1):
    for transition, transition_scores in location_scores.groupby(motif_metadata['Transition'], axis=1):
        corr = transition_scores.corr(method='spearman')

        # A quarter of an inch per motif, and never a zero-sized figure. The original
        # read an undefined name `corr_subset` here.
        figwidth = max(int(corr.shape[0]/4.), 1)
        figsize = (figwidth, figwidth)

        # Two color strips per motif: its direction and its transition.
        side_colors = [[direction_to_color[x] for x in motif_metadata.loc[transition_scores.columns, 'Direction']],
                       [transition_to_color[x] for x in motif_metadata.loc[transition_scores.columns, 'Transition']]]
        g = sns.clustermap(corr, metric='cityblock', row_colors=side_colors, col_colors=side_colors, figsize=figsize)
        g.ax_col_dendrogram.set(title=location)
        g.savefig('{}/motif_clustermap_{}_{}.png'.format(folder, location, transition), dpi=150)