# Accuracy Summaries for DeepSEA

This notebook summarizes DeepSEA accuracies (AUC) for each prediction type (DNase, TFs, and histones), cell-type-specific accuracies for TF predictions, and accuracies for the features shared by the GM12878 and H1-hESC cell types.

In [None]:
import pandas as pd

In [None]:
# NOTE: reading an .xlsx file requires an Excel engine to be installed.
# Recent pandas versions use `openpyxl` for .xlsx; older versions relied on `xlrd`.
# `skiprows=1` drops the first sheet row, so the column headers come from the second row.
results = pd.read_excel('deepsea_accuracies.xlsx', skiprows=1)

# Convert non-numeric entries in the AUC column to NaN so that later means skip them.
# `pd.to_numeric` is vectorized, so it is applied to the whole column at once
# instead of once per element through `.apply`.
results['AUC'] = pd.to_numeric(results['AUC'], errors='coerce')
results.head()

In [None]:
# Number of rows per prediction type: 125 dnase, 690 tfs, 104 histones.
# The slices below assume the rows appear in exactly that order — TODO confirm
# against the spreadsheet if it is ever regenerated.
N_DNASE, N_TFS, N_HISTONES = 125, 690, 104

# sanity check: the three groups must tile the table exactly. Checking only the
# length of each slice would still pass if the table had extra rows, which would
# leave a gap or overlap between the groups.
assert len(results) == N_DNASE + N_TFS + N_HISTONES, (
    "expected %d rows, found %d" % (N_DNASE + N_TFS + N_HISTONES, len(results))
)

# Explicit positional slicing; contiguous boundaries guarantee no row is
# dropped or counted twice.
dnase = results.iloc[:N_DNASE]
tfs = results.iloc[N_DNASE:N_DNASE + N_TFS]
histones = results.iloc[N_DNASE + N_TFS:]

assert len(dnase) == N_DNASE
assert len(tfs) == N_TFS
assert len(histones) == N_HISTONES

# Accuracies for DNase, TFs, Histones

In [None]:
# One row per prediction type (plus "All"), one column per numeric field.
# `numeric_only=True` is required because the table also holds string columns
# (e.g. "Cell Type"); without it, pandas >= 2.0 raises a TypeError instead of
# silently dropping them as older versions did.
pd.concat([dnase.mean(numeric_only=True).to_frame("DNase"),
           tfs.mean(numeric_only=True).to_frame("TFs"),
           histones.mean(numeric_only=True).to_frame("Histones"),
           results.mean(numeric_only=True).to_frame("All")],
           axis=1).transpose()

# Cell-type Accuracies for TFs

In [None]:
# Average the numeric columns of the TF rows within each cell type.
# `numeric_only=True` keeps the string columns out of the aggregation;
# pandas >= 2.0 raises a TypeError on them otherwise.
mean_auc_by_celltype = tfs.groupby('Cell Type').mean(numeric_only=True)

In [None]:
# Raise the display limit to the number of cell types so that the full
# per-cell-type table is rendered without truncation.
n_celltype_rows = len(mean_auc_by_celltype)
pd.set_option('display.max_rows', n_celltype_rows)

mean_auc_by_celltype

# Accuracies for the intersection of GM12878 and H1-hESC

In [None]:
# Rows of the full table (all prediction types) belonging to each cell type.
is_gm12878 = results["Cell Type"] == "GM12878"
is_h1hesc = results["Cell Type"] == "H1-hESC"
gm12878 = results.loc[is_gm12878]
h1hesc = results.loc[is_h1hesc]

# Feature names measured in each cell type, wrapped as Index objects so the
# shared names can be found with an Index intersection.
idx1 = pd.Index(gm12878["TF/DNase/HistoneMark"])
idx2 = pd.Index(h1hesc["TF/DNase/HistoneMark"])

# Features present in both cell types.
shared_features = idx1.intersection(idx2)
overlap = set(shared_features)


In [None]:
# Keep only the GM12878 rows whose feature is also measured in H1-hESC.
gm12878_overlap = gm12878[gm12878["TF/DNase/HistoneMark"].isin(overlap)]

# A feature can appear more than once within a cell type; keep one entry per
# feature (the first non-null value of each column) so every feature
# contributes equally to the mean.
gm12878_overlap_remove_first = gm12878_overlap.groupby("TF/DNase/HistoneMark").first()

# `numeric_only=True` skips the remaining string columns (e.g. "Cell Type"),
# which pandas >= 2.0 would otherwise reject with a TypeError.
gm12878_overlap_remove_first.mean(numeric_only=True)


In [None]:
# Keep only the H1-hESC rows whose feature is also measured in GM12878.
h1hesc_overlap = h1hesc[h1hesc["TF/DNase/HistoneMark"].isin(overlap)]

# Keep one entry per feature using `.first()`, the same rule the GM12878 cell
# applies. This cell previously used `.max()`, which picked the best AUC among
# duplicates for H1-hESC only and so biased the comparison between the two
# cell types (and took the max of each string column independently).
# NOTE(review): if the best replicate is what is wanted, switch BOTH cells to
# `.max()` rather than just this one.
h1hesc_overlap_remove_first = h1hesc_overlap.groupby("TF/DNase/HistoneMark").first()

# `numeric_only=True` skips the remaining string columns (e.g. "Cell Type"),
# which pandas >= 2.0 would otherwise reject with a TypeError.
h1hesc_overlap_remove_first.mean(numeric_only=True)
