<a href="https://colab.research.google.com/github/anihab/dnaTokenization/blob/main/results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import json
import pandas as pd
import plotly.express as px

In [5]:
# Mount Google Drive into the Colab runtime; the CSV files read by later cells
# live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **Load Data**

In [26]:
# Read the per-replicate results from Drive and discard the 'Unnamed: 0'
# column in one chained expression (no in-place mutation).
results_csv = '/content/drive/MyDrive/tokenization/data/results.csv'
df = pd.read_csv(results_csv).drop(columns='Unnamed: 0')
df.head(5)

Unnamed: 0,model,task,task_category,task_benchmark,replicate_number,accuracy,f1,matthews_correlation,epoch
0,dnabert1_6,H3,epigenetic mark prediction (yeast),GUE,1,0.863059,0.862989,0.726318,3.0
1,dnabert1_6,H3K14ac,epigenetic mark prediction (yeast),GUE,1,0.717398,0.710967,0.421938,3.0
2,dnabert1_6,H3K36me3,epigenetic mark prediction (yeast),GUE,1,0.743979,0.736517,0.47768,3.0
3,dnabert1_6,H3K4me1,epigenetic mark prediction (yeast),GUE,1,0.714962,0.710911,0.422638,3.0
4,dnabert1_6,H3K4me2,epigenetic mark prediction (yeast),GUE,1,0.669599,0.648195,0.304856,3.0


In [27]:
# Collapse the replicates: mean of every metric within each
# (model, task, task_category, task_benchmark) group.
group_keys = ['model', 'task', 'task_category', 'task_benchmark']
metric_cols = ['accuracy', 'f1', 'matthews_correlation']
average_metrics = df.groupby(group_keys, as_index=False)[metric_cols].mean()
average_metrics.head(5)

Unnamed: 0,model,task,task_category,task_benchmark,accuracy,f1,matthews_correlation
0,dnabert1_6,H3,epigenetic mark prediction (yeast),GUE,0.863727,0.863633,0.727935
1,dnabert1_6,H3K14ac,epigenetic mark prediction (yeast),GUE,0.707806,0.700716,0.40211
2,dnabert1_6,H3K36me3,epigenetic mark prediction (yeast),GUE,0.743377,0.736783,0.476836
3,dnabert1_6,H3K4me1,epigenetic mark prediction (yeast),GUE,0.71411,0.709217,0.420698
4,dnabert1_6,H3K4me2,epigenetic mark prediction (yeast),GUE,0.672792,0.640793,0.307338


In [29]:
# Results reported by the model authors and by other teams. MCC is divided by
# 100 so that it is on the same scale as the values in `average_metrics`.
reported = pd.read_csv('/content/drive/MyDrive/tokenization/data/reported.csv')
reported['matthews_correlation'] = reported['matthews_correlation'] / 100

# Tag our averaged results with their source. `.assign` returns a new frame;
# writing the column through a plain alias (`ours = average_metrics`) would
# silently add 'reported_by' to `average_metrics` as well.
ours = average_metrics.assign(reported_by='us')

# No `on=` is given, so pandas joins on every column the two frames share.
# The result stays in long format: one row per (model, task, reported_by) with
# a single 'matthews_correlation' column.
merged_df = pd.merge(reported, ours, how='outer')
merged_df.head(5)

Unnamed: 0,model,reported_by,task,task_category,task_benchmark,accuracy,f1,matthews_correlation
0,dnabert1_6,dnabert,H3,epigenetic mark prediction (yeast),GUE,,,0.731
1,dnabert1_6,dnabert,H3K14ac,epigenetic mark prediction (yeast),GUE,,,0.4006
2,dnabert1_6,dnabert,H3K36me3,epigenetic mark prediction (yeast),GUE,,,0.4725
3,dnabert1_6,dnabert,H3K4me1,epigenetic mark prediction (yeast),GUE,,,0.4144
4,dnabert1_6,dnabert,H3K4me2,epigenetic mark prediction (yeast),GUE,,,0.3237


### **Bar Charts by Model and Task Category**


In [30]:
# Metric column -> chart title, in display order.
category_chart_titles = {
    'accuracy': 'Average Accuracy by Task Category and Model',
    'f1': 'Average F1 by Task Category and Model',
    'matthews_correlation': 'Average MCC by Task Category and Model',
}

# `average_metrics` has one row per (model, task), and px.bar draws one bar per
# row without aggregating. Average over the tasks in each category first so
# that every (task_category, model) pair yields exactly one bar whose height is
# the mean the title promises. Each task is weighted equally.
category_means = (
    average_metrics
    .groupby(['task_category', 'model'], as_index=False)[list(category_chart_titles)]
    .mean()
)

for metric, chart_title in category_chart_titles.items():
    fig = px.bar(category_means, x='task_category', y=metric, color='model',
                 barmode='group', title=chart_title)
    fig.show()

### **Bar Charts by Benchmark and Model**

In [31]:
# Metric column -> chart title, in display order.
benchmark_chart_titles = {
    'accuracy': 'Average Accuracy by Benchmark and Model',
    'f1': 'Average F1 by Benchmark and Model',
    'matthews_correlation': 'Average MCC by Benchmark and Model',
}

# `average_metrics` has one row per (model, task), and px.bar draws one bar per
# row without aggregating. Average over the tasks in each benchmark first so
# that every (task_benchmark, model) pair yields exactly one bar whose height
# is the mean the title promises. Each task is weighted equally.
benchmark_means = (
    average_metrics
    .groupby(['task_benchmark', 'model'], as_index=False)[list(benchmark_chart_titles)]
    .mean()
)

for metric, chart_title in benchmark_chart_titles.items():
    fig = px.bar(benchmark_means, x='task_benchmark', y=metric, color='model',
                 barmode='group', title=chart_title)
    fig.show()

### **Scatter Plots on Full Results**

In [48]:
# DNABERT2: MCC of every replicate, drawn twice - coloured by task, then by
# task category (with the task shown on hover).
dnabert2_runs = df[df['model'] == 'dnabert2']
dnabert2_views = [
    # (colour column, extra hover columns, title)
    ('task', None,
     'DNABERT2 MCC Results by Replication Number and Task'),
    ('task_category', ['task'],
     'DNABERT2 MCC Results by Replication Number and Task Category'),
]

for color_col, hover_cols, chart_title in dnabert2_views:
    fig = px.scatter(dnabert2_runs,
                     x='replicate_number',
                     y='matthews_correlation',
                     color=color_col,
                     hover_data=hover_cols,
                     title=chart_title)
    fig.update_traces(marker={'size': 15})
    fig.show()

In [46]:
# DNABERT1 (kmer=6): MCC of every replicate, drawn twice - coloured by task,
# then by task category (with the task shown on hover).
dnabert1_runs = df[df['model'] == 'dnabert1_6']
dnabert1_views = [
    # (colour column, extra hover columns, title)
    ('task', None,
     'DNABERT1 MCC Results by Replication Number and Task (kmer=6)'),
    ('task_category', ['task'],
     'DNABERT1 MCC Results by Replication Number and Task Category (kmer=6)'),
]

for color_col, hover_cols, chart_title in dnabert1_views:
    fig = px.scatter(dnabert1_runs,
                     x='replicate_number',
                     y='matthews_correlation',
                     color=color_col,
                     hover_data=hover_cols,
                     title=chart_title)
    fig.update_traces(marker={'size': 15})
    fig.show()

In [47]:
# Nucleotide Transformer (NT-1000G, 500M): MCC of every replicate, drawn twice -
# coloured by task, then by task category (with the task shown on hover).
nt_runs = df[df['model'] == 'nt_NT_500_1000g']
nt_views = [
    # (colour column, extra hover columns, title)
    ('task', None,
     'Nucleotide Transformer MCC Results by Replication Number and Task (NT-1000G, 500M)'),
    ('task_category', ['task'],
     'Nucleotide Transformer MCC Results by Replication Number and Task Category (NT-1000G, 500M)'),
]

for color_col, hover_cols, chart_title in nt_views:
    fig = px.scatter(nt_runs,
                     x='replicate_number',
                     y='matthews_correlation',
                     color=color_col,
                     hover_data=hover_cols,
                     title=chart_title)
    fig.update_traces(marker={'size': 15})
    fig.show()

### **Scatter Plot: Matthews Correlation Self Reported vs. Experimental Results**

*   x-axis: MCC self-reported by the model's authors
*   y-axis: MCC measured by us or reported by other publications
*   points are colored by task category

In [160]:
# DNABERT1: self-reported MCC (x) against MCC from every other source (y).
#
# `merged_df` is in long format: a single 'matthews_correlation' column plus
# 'reported_by'. The '*_reported' / '*_results' columns do not exist in it, so
# they are derived here by pairing, per task, the rows reported by the DNABERT
# authors with the rows from all other sources.
dnabert1_rows = merged_df[merged_df['model'] == 'dnabert1_6']
is_self_reported = dnabert1_rows['reported_by'] == 'dnabert'

dnabert1_comparison = pd.merge(
    dnabert1_rows.loc[is_self_reported, ['task', 'task_category', 'matthews_correlation']],
    dnabert1_rows.loc[~is_self_reported, ['task', 'reported_by', 'matthews_correlation']],
    on='task',
    # The shared 'matthews_correlation' column is split into the two axes.
    suffixes=('_reported', '_results'),
)

fig = px.scatter(dnabert1_comparison,
                 x='matthews_correlation_reported',
                 y='matthews_correlation_results',
                 color='task_category',
                 hover_data=['task', 'reported_by'],
                 title='DNABERT1 MCC Comparison')
fig.update_traces(marker={'size': 15})
fig.show()

In [162]:
# DNABERT2: self-reported MCC (x) against MCC from every other source (y).
#
# `merged_df` is in long format: a single 'matthews_correlation' column plus
# 'reported_by'. The '*_reported' / '*_results' columns do not exist in it, so
# they are derived here by pairing, per task, the self-reported rows with the
# rows from all other sources.
dnabert2_rows = merged_df[merged_df['model'] == 'dnabert2']

# NOTE(review): assumed 'reported_by' label for the DNABERT2 authors' own
# numbers - confirm against reported.csv. The check below fails loudly and
# lists the labels that are present if the assumption is wrong.
dnabert2_self_reporter = 'dnabert2'
is_self_reported = dnabert2_rows['reported_by'] == dnabert2_self_reporter
if not is_self_reported.any():
    raise ValueError(
        f"no 'dnabert2' rows with reported_by={dnabert2_self_reporter!r}; "
        f"available: {sorted(dnabert2_rows['reported_by'].dropna().unique())}"
    )

dnabert2_comparison = pd.merge(
    dnabert2_rows.loc[is_self_reported, ['task', 'task_category', 'matthews_correlation']],
    dnabert2_rows.loc[~is_self_reported, ['task', 'reported_by', 'matthews_correlation']],
    on='task',
    # The shared 'matthews_correlation' column is split into the two axes.
    suffixes=('_reported', '_results'),
)

fig = px.scatter(dnabert2_comparison,
                 x='matthews_correlation_reported',
                 y='matthews_correlation_results',
                 color='task_category',
                 hover_data=['task', 'reported_by'],
                 title='DNABERT2 MCC Comparison')
fig.update_traces(marker={'size': 15})
fig.show()

In [163]:
# Nucleotide Transformer: self-reported MCC (x) against MCC from every other
# source (y).
#
# `merged_df` is in long format: a single 'matthews_correlation' column plus
# 'reported_by'. The '*_reported' / '*_results' columns do not exist in it, so
# they are derived here by pairing, per task, the self-reported rows with the
# rows from all other sources.
nt_rows = merged_df[merged_df['model'] == 'nt_NT_500_1000g']

# NOTE(review): assumed 'reported_by' label for the Nucleotide Transformer
# authors' own numbers - confirm against reported.csv. The check below fails
# loudly and lists the labels that are present if the assumption is wrong.
nt_self_reporter = 'nt'
is_self_reported = nt_rows['reported_by'] == nt_self_reporter
if not is_self_reported.any():
    raise ValueError(
        f"no 'nt_NT_500_1000g' rows with reported_by={nt_self_reporter!r}; "
        f"available: {sorted(nt_rows['reported_by'].dropna().unique())}"
    )

nt_comparison = pd.merge(
    nt_rows.loc[is_self_reported, ['task', 'task_category', 'matthews_correlation']],
    nt_rows.loc[~is_self_reported, ['task', 'reported_by', 'matthews_correlation']],
    on='task',
    # The shared 'matthews_correlation' column is split into the two axes.
    suffixes=('_reported', '_results'),
)

fig = px.scatter(nt_comparison,
                 x='matthews_correlation_reported',
                 y='matthews_correlation_results',
                 color='task_category',
                 hover_data=['task', 'reported_by'],
                 title='NT MCC Comparison')
fig.update_traces(marker={'size': 15})
fig.show()