In [None]:
import pickle
import sys
from itertools import combinations
from statistics import mean, stdev

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats  # makes sure the `sp.stats` submodule is actually loaded
import seaborn as sns
from MulticoreTSNE import MulticoreTSNE as TSNE

In [None]:
def update_progress_bar(perc, option_info=None):
    """Redraw a one-line text progress bar on stdout.

    Args:
        perc: Completion percentage; 100 fills the whole 60-character bar.
        option_info: Extra value printed after the percentage. It is
            formatted as-is, so the default shows up as the text ``None``.
    """
    filled = '=' * int(60 * perc // 100)
    line = '[{:60}] {:.2f}%, {}\r'.format(filled, perc, option_info)
    # The trailing carriage return moves the cursor back to the start of the
    # line, so the next call overwrites this bar instead of adding a new line.
    sys.stdout.write(line)
    sys.stdout.flush()

In [None]:
def plot_tsne(x_embedded, y_data):
    """Scatter-plot a 2-D embedding, coloring each point by its ``y_data`` value.

    Args:
        x_embedded: Array indexable as ``[:, 0]`` and ``[:, 1]``; the two
            columns are used as the horizontal and vertical coordinates.
        y_data: Per-point values mapped to colors through the 'seismic'
            colormap.
    """
    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    xs = x_embedded[:, 0]
    ys = x_embedded[:, 1]
    axes.scatter(xs, ys, s=100, marker='o', c=y_data, cmap='seismic')

In [None]:
def do_fisher_exact(x_gene_data, y_gene_data, tf_list, sd_threshold=1, pvalue_threshold=0.05):
    """Return the TFs whose binding differs between high- and low-PSI samples.

    Samples are split into a "high" group (value above mean + k * stdev) and a
    "low" group (value below mean - k * stdev), where k is ``sd_threshold``.
    Samples between the two bounds are ignored. For each TF a 2x2 contingency
    table (high/low x bound/unbound) is built and tested with Fisher's exact
    test.

    Args:
        x_gene_data: 2-D array indexed as ``[:, i]``; column ``i`` is the
            binding mask of ``tf_list[i]``. Assumed to be boolean, since it
            is combined with ``&`` and negated with ``~`` -- TODO confirm
            against the caller.
        y_gene_data: 1-D array of per-sample values, aligned with the rows of
            ``x_gene_data``. Needs at least two samples for ``stdev``.
        tf_list: TF names, in the same order as the columns of
            ``x_gene_data``.
        sd_threshold: Number of sample standard deviations away from the mean
            a sample must lie to count as high or low.
        pvalue_threshold: A TF passes when its p-value is strictly below this.

    Returns:
        List of names from ``tf_list`` that passed, in ``tf_list`` order.
    """
    tf_names = list(tf_list)
    if not tf_names:
        # Nothing to test; return early so an empty TF list never touches
        # mean/stdev (matches the behavior of looping over an empty list).
        return []

    # The bounds and the high/low masks depend only on y_gene_data, so they
    # are computed once instead of once per TF.
    gene_psi_mean = mean(y_gene_data)
    gene_psi_stdev = stdev(y_gene_data)
    upper_bound = gene_psi_mean + sd_threshold * gene_psi_stdev
    # Fix: the lower bound lies *below* the mean. It was previously computed
    # with '+', which made it equal to the upper bound.
    lower_bound = gene_psi_mean - sd_threshold * gene_psi_stdev
    psi_high_mask = y_gene_data > upper_bound
    psi_low_mask = y_gene_data < lower_bound

    passed_tf = []
    for i, tf_name in enumerate(tf_names):
        tf_bind_mask = x_gene_data[:, i]
        contingency = [
            [(psi_high_mask & tf_bind_mask).sum(), (psi_high_mask & ~tf_bind_mask).sum()],
            [(psi_low_mask & tf_bind_mask).sum(), (psi_low_mask & ~tf_bind_mask).sum()],
        ]
        _, pvalue = sp.stats.fisher_exact(contingency)
        if pvalue < pvalue_threshold:
            passed_tf.append(tf_name)
    return passed_tf

In [None]:
# Gene filter (Fisher's exact test): for every gene, store the list of TFs
# returned by do_fisher_exact under that gene's key in `result`.
result = {}
n_genes = len(gene_list)
for i, gene in enumerate(gene_list):
    update_progress_bar(i / n_genes * 100, '{}/{}'.format(i, n_genes))
    # Genes with two or fewer entries in psi_list are skipped.
    if len(psi_list[gene]) > 2:
        gene_mask = data_order['Gene'] == gene
        x_temp, y_temp = x_data[gene_mask], y_data[gene_mask]
        result[gene] = do_fisher_exact(x_temp, y_temp, tf_list)

In [None]:
# Write the `result` dict to disk as a pickle file.
with open('./input/ML/gene_fisher_exact.pickle', 'wb') as pickle_file:
    pickle.dump(result, pickle_file)

In [None]:
# Show every (gene, passed TFs) pair whose passed-TF list contains PLAG1.
[(gene_id, passed) for gene_id, passed in result.items() if 'PLAG1' in passed]

In [None]:
# Display the y values of one gene. The same mask can be used to rerun the
# filter on just this gene:
#   do_fisher_exact(x_data[single_gene_mask], y_data[single_gene_mask], tf_list)
single_gene_mask = data_order['Gene'] == 'ENSG00000108639'
y_data[single_gene_mask]

In [None]:
# For every pair among the first 15 samples of this gene, record
#   [abs difference of the two y values, XOR of the two PLAG1 flags].
# The gene/TF slices do not depend on the pair, so they are taken once here
# instead of being recomputed on every loop iteration.
target_gene_mask = data_order['Gene'] == 'ENSG00000108639'
x_temp = x_data[target_gene_mask][:, tf_list.index('PLAG1')]
y_temp = y_data[target_gene_mask].tolist()

temp = []
# NOTE(review): 15 is presumably the number of samples for this gene --
# confirm, or derive it from len(y_temp).
for a, b in combinations(range(15), 2):
    temp.append([abs(y_temp[a] - y_temp[b]), x_temp[a] ^ x_temp[b]])

In [None]:
# Overlay the dZPSI distributions of the pairs where C is True and the pairs
# where C is False.
d = pd.DataFrame(temp, columns=['dZPSI', 'C'])
c_is_true = d['C']
sns.distplot(d.loc[c_is_true, 'dZPSI'], bins=20)
sns.distplot(d.loc[~c_is_true, 'dZPSI'], bins=20)

In [None]:
# Take one gene's rows, keep only the samples whose y value lies more than
# sd_threshold standard deviations above or below the gene's mean, embed
# their features in 2-D with t-SNE and plot the embedding colored by y.
gene = 'ENSG00000139496'
gene_rows = data_order['Gene'] == gene
x_temp = x_data[gene_rows]
y_temp = y_data[gene_rows]

sd_threshold = 1
gene_psi_mean = mean(y_temp)
gene_psi_stdev = stdev(y_temp)
psi_margin = sd_threshold * gene_psi_stdev
is_high = y_temp > (gene_psi_mean + psi_margin)
is_low = y_temp < (gene_psi_mean - psi_margin)
psi_filter_mask = is_high | is_low
x_temp, y_temp = x_temp[psi_filter_mask], y_temp[psi_filter_mask]

# NOTE(review): `seed` is not defined in the cells shown here; it has to be
# set by an earlier cell before this one runs.
tsne_model = TSNE(n_components=2,
                  perplexity=100.0,
                  learning_rate=50,
                  n_iter=10000,
                  n_jobs=16,
                  random_state=seed)
# x_temp is overwritten with the 2-D embedding of the filtered samples.
x_temp = tsne_model.fit_transform(x_temp)
plot_tsne(x_temp, y_temp)