In [1]:
import os
import numpy as np
import pandas as pd
import plotly.offline as py
import plotly.graph_objs as go
import plotly.express as px
import plotly.io as pio

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from plotly.offline import  init_notebook_mode
from umap import UMAP
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import as_completed

py.init_notebook_mode(connected=True)
pio.templates.default = "plotly_white"

In [256]:
def run_tsne(dataset, perplexity, metric):
    tsne_transformer = TSNE(n_components=2, n_iter=30000, perplexity=perplexity, metric=metric)
    tsne_transformer.fit(dataset)
    return tsne_transformer

In [257]:
BENCHMARK_RESULTS_DIR = "/home/penzard/benchmarking/"

DATASETS = {
    'hocomoco_jolma_yang' : os.path.join(BENCHMARK_RESULTS_DIR, "jolma_yang_hocomoco_roc10.txt"),
    'hocomoco_jolma_yang_shuffled': os.path.join(BENCHMARK_RESULTS_DIR, "jolma_yang_shuf_hocomoco_roc10.txt"),
    'motifs_info': os.path.join(BENCHMARK_RESULTS_DIR, "motifs_prefinal.tsv"),
    "hocomoco_jolma": os.path.join(BENCHMARK_RESULTS_DIR, 'hocomoco_jolma.txt'),
    'jolma_yang_jaspar': os.path.join(BENCHMARK_RESULTS_DIR, 'jolma_yang_jaspar_roc10.txt'),
    'jolma_yang_jaspar_shuffled': os.path.join(BENCHMARK_RESULTS_DIR, 'jolma_yang_shuf_jaspar_roc10.txt'),
    'hocomoco_jolma_yang_shuffled_new': "/home/penzard/phylip_new/jolma_yang_shuf_hocomoco_new_roc10.txt",
    'motifs_info_final': os.path.join(BENCHMARK_RESULTS_DIR, "motif_annotation_final.tsv"),
    'jolma_yang_cisbp': os.path.join(BENCHMARK_RESULTS_DIR, 'jolma_yang_cisbp_roc10.txt'),
     'jolma_yang_cisbp_shuffled': os.path.join(BENCHMARK_RESULTS_DIR, 'jolma_yang_shuf_cisbp_roc10.txt')
}

MOTIF_INFO_NAME = "motifs_info_final"

import shutil

dataset_name = 'full_roc10'
HTML_OUT_DIR = f"html_{dataset_name}"

try:
    os.mkdir(HTML_OUT_DIR)
except:
    pass

In [258]:
hocomoco = pd.read_table(DATASETS['hocomoco_jolma_yang']).transpose()
jaspar = pd.read_table(DATASETS['jolma_yang_jaspar']).transpose()
cisbp = pd.read_table(DATASETS['jolma_yang_cisbp']).transpose()

In [259]:

cisbp.index =  [x[:x.find("_", x.find("_") + 1)] for x in cisbp.index]


In [260]:
dataset = pd.concat([hocomoco, jaspar, cisbp], axis=0)

In [262]:
umap_transformer = UMAP(n_neighbors=50,metric='cosine', n_components=2)
umap_transformer.fit(dataset)
umap_transform = umap_transformer.transform(dataset)





The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../miniconda3/lib/python3.7/site-packages/umap/nndescent.py", line 47:
    @numba.njit(parallel=True)
    def nn_descent(
    ^




In [308]:
dataset_coords = pd.DataFrame(umap_transform)
dataset_coords.columns = ["umap_x", "umap_y"]
dataset_coords['motif'] = dataset.index

In [264]:
perp_range = [25]
len(perp_range)

1

In [265]:
metric = 'cosine'

In [266]:
executor = ProcessPoolExecutor(max_workers=min(len(perp_range), 30))

tasks = {}

for perplexity in perp_range:
    t = executor.submit(run_tsne, dataset, perplexity, metric)
    tasks[t] = perplexity
    
results = {}
for t in as_completed(tasks):
    perplexity = tasks[t]
    try:
        results[perplexity] = t.result()
        print (f"Finished for perplexity {perplexity}")
    except Exception as exc:
        print(exc)
        


Finished for perplexity 25


In [309]:
for perp, tsne_object in results.items():
    dataset_coords[f'tsne_x_{perp}'] = tsne_object.embedding_[:, 0]
    dataset_coords[f'tsne_y_{perp}'] = tsne_object.embedding_[:, 1]

In [310]:
motifs_info = pd.read_table(DATASETS[MOTIF_INFO_NAME])



In [311]:
motifs_info['motif'] = motifs_info.loc[:, ['collection', 'motif']].apply(lambda x : x[1] if x[0] != 'jaspar' else x[1].split("_")[0],axis=1)

In [312]:
dataset_coords = dataset_coords.merge(motifs_info, on='motif', how='left')

In [2]:
source_type_table = pd.read_table("/home/penzard/benchmarking/source_types_curation.tsv")

In [47]:
dataset_coords = pd.read_csv('temp.csv')

In [48]:

dataset_coords = dataset_coords.rename({"source_type": 'annotated_source'}, axis=1)

In [49]:
a = dataset_coords['motif'][pd.isnull(dataset_coords['annotated_source'])]

In [50]:
dataset_coords = dataset_coords.merge(source_type_table, on='annotated_source', how='left')

In [51]:
grouping_columns = ['cisbp_families', 
                    'TF_Class_level_2',
                    'TF_Class_level_3',
                    'unified_source',
                    'source_type',
                   'collection']

In [55]:
params = {}
params['tsne'] =  [25]

In [56]:
MINIMUM_NUMBER_OF_MOTIFS_PER_GROUP = 3 

In [57]:
dataset_coords[grouping_columns] = dataset_coords[grouping_columns].fillna('Other')

In [58]:
dataset_coords[grouping_columns] = dataset_coords[grouping_columns].fillna('Other')
dataset_coords[grouping_columns] = dataset_coords[grouping_columns].\
    applymap(lambda x: 'Other' if (","  in x) or (";" in x) or (x.lower() == 'other') or (x.lower() == 'unknown')  else x)

In [59]:
NAMES_MAPPING = {'cisbp_families': "CIS-BP Families",
 'TF_Class_level_2': "TFClass level 2 (class)",
 'TF_Class_level_3': "TFClass level 3 (family)",
 'unified_source': "Motif experimental data",
 'source_type': "Motif experimental data type",
 'collection': "Motif collection"}

COLLECTION_MAPPING = {
    'hocomoco': "HOCOMOCO", 
    'Other': "Other", 
    'other': "Other",
    'jaspar': "JASPAR",  
    'cisbp_direct': "CIS-BP",
    'cisbp_inferred': 'CIS-INF'
}

In [60]:
dataset_coords['collection'] = dataset_coords['collection'].apply(lambda x : COLLECTION_MAPPING[x])

In [61]:
dataset_name='all_latest'

In [67]:
HTML_OUT_DIR='htmls_final'
os.mkdir(HTML_OUT_DIR)

FileExistsError: [Errno 17] File exists: 'htmls_final'

In [69]:
for method in ('tsne', ):
    for par in params[method]:
        outfilename = f"{dataset_name}_{method}_{par}.html"
        outfilepath = os.path.join(HTML_OUT_DIR, outfilename)
        print(outfilename)
        with open(outfilepath, "w") as outfile:
            
            x_coord_name = f'{method}_x_{par}'
            y_coord_name = f'{method}_y_{par}'
            for column in grouping_columns:
                data = []
                trace = go.Scatter(
                    x = dataset_coords[x_coord_name],
                    y = dataset_coords[y_coord_name],
                    mode = 'markers',
                    name = f"all",
                    hoverinfo='text',
                    text=dataset_coords[column], 
                    visible=True, 
                    marker = dict(
                      color = 'lightgrey',
                    )
                )

                data.append(trace)


                groups = dataset_coords.groupby(column)
                groups = [g for g in groups if g[1].shape[0] >= MINIMUM_NUMBER_OF_MOTIFS_PER_GROUP]
                groups = sorted(groups, key = lambda x : x[0].upper() if x[0].lower() != "other" else 'Z' * 100)
                




                for group_name, group_dt in groups:
                    size = group_dt.shape[0]
                    trace = go.Scatter(
                        x = group_dt[x_coord_name],
                        y = group_dt[y_coord_name],
                        mode = 'markers',
                        name = f"{group_name}",
                        text=[f"{group_name}|{x}|{y}"for _, (x, y) in group_dt[['collection', 'motif']].iterrows() ],
                        hoverinfo='text', 
                        visible="legendonly",
                        #legendgroup='whole'
                    )
                    data.append(trace)



                layout= go.Layout(
                    title = f'PBM Benchmark, correlation, tSNE, {NAMES_MAPPING[column]}',
                    #title = "Additional file. Dimensionality reduction with t-SNE applied to PWMs performance at PBM data. Each point corresponds to a PWM. Coloring schemes correspond to TFClass classes (level 2), TFClass families (level 3), CIS-BP families, Motif experimental source data, and Motif collections.",
                    hovermode= 'closest',
                    xaxis= dict(
                        title= f'{method}_x',
                        ticklen= 5,
                        zeroline= False,
                        gridwidth= 2,

                    ),
                    yaxis=dict(
                        title= f'{method}_y',
                        ticklen= 5,
                        gridwidth= 2,
                    ),
                    showlegend= True,

                )


                fig= go.Figure(data=data, layout=layout)

                outfile.write(fig.to_html())
                outfile.write("\n\n")




all_latest_tsne_25.html


In [283]:
dataset_coords = dataset_coords.drop(['umap_x', 'umap_y'], axis=1)

In [65]:
!pwd

/home/penzard/tfbs_answer


In [286]:
dataset_coords.to_csv('../final_all_roc10.tsv', sep="\t", index=None)

@kehrlaehda Заголовки для html:

HT-SELEX Benchmark 10 (или 50), AUC ROC, tSNE, CIS-BP Families

тожесамое TFClass level 2 (class)

тожесамое TFClass level 3 (family)

тожесамое Motif experimental data

тожесамое Motif experimental data type

тожесамое Motif collection

* на последней фигуре легенда: CIS-BP, HOCOMOCO, JASPAR, Other

** еще ты подписи точек сломал - хорошо чтобы в них была коллекция и ID мотива, а не дублирование легенды.

['cisbp_families',
 'TF_Class_level_2',
 'TF_Class_level_3',
 'unified_source',
 'source_type',
 'collection']

In [302]:
y

collection          other
motif         M09559_2.00
Name: 4988, dtype: object