<a href="https://colab.research.google.com/github/anihab/dnaTokenization/blob/main/results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import json
import pandas as pd
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go

In [3]:
# Colab-only: attach Google Drive so the CSVs under /content/drive can be read.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [49]:
# shared palette: `colors` is the generic sequence, `model_color_map` pins one
# fixed colour per model so a model looks the same in every figure
colors = [
    '#ff6961',
    '#ffb480',
    '#f8f38d',
    '#42d6a4',
    '#08cad1',
    '#59adf6',
    '#9d94ff',
    '#c780e8',
]
model_color_map = {
    'dnabert1_6': '#59adf6',
    'dnabert2': '#08cad1',
    'nt_NT_500_1000g': '#ff6961',
    'hyenadna_1k': '#42d6a4',
}

## **Load and Process Data**

In [33]:
# load our fine-tuning results from Drive and preview the first rows
results_csv = '/content/drive/MyDrive/tokenization/data/finetune/results.csv'
df = pd.read_csv(results_csv)
df.head()

Unnamed: 0,model,task,task_category,task_benchmark,replicate_number,accuracy,f1,matthews_correlation,epoch
0,dnabert1_6,H3,epigenetic mark prediction (yeast),GUE,1,0.863059,0.862989,0.726318,3.0
1,dnabert1_6,H3K14ac,epigenetic mark prediction (yeast),GUE,1,0.717398,0.710967,0.421938,3.0
2,dnabert1_6,H3K36me3,epigenetic mark prediction (yeast),GUE,1,0.743979,0.736517,0.47768,3.0
3,dnabert1_6,H3K4me1,epigenetic mark prediction (yeast),GUE,1,0.714962,0.710911,0.422638,3.0
4,dnabert1_6,H3K4me2,epigenetic mark prediction (yeast),GUE,1,0.669599,0.648195,0.304856,3.0


In [34]:
# per (model, task, category, benchmark): mean and std of MCC across replicates
mcc_by_group = df.groupby(['model', 'task', 'task_category', 'task_benchmark'])['matthews_correlation']
mean_mcc = mcc_by_group.agg(['mean', 'std']).reset_index()
# only the mean is rounded (it is what the bar labels display); std keeps full precision
mean_mcc['mean'] = mean_mcc['mean'].round(2)
mean_mcc.head()

Unnamed: 0,model,task,task_category,task_benchmark,mean,std
0,dnabert1_6,H3,epigenetic mark prediction (yeast),GUE,0.73,0.009385
1,dnabert1_6,H3,epigenetic mark prediction (yeast),Nucleotide Transformer,0.73,0.009385
2,dnabert1_6,H3K14ac,epigenetic mark prediction (yeast),GUE,0.4,0.017481
3,dnabert1_6,H3K14ac,epigenetic mark prediction (yeast),Nucleotide Transformer,0.4,0.017481
4,dnabert1_6,H3K36me3,epigenetic mark prediction (yeast),GUE,0.48,0.010941


In [58]:
# per (model, task, category, benchmark): mean and std of accuracy across replicates
acc_by_group = df.groupby(['model', 'task', 'task_category', 'task_benchmark'])['accuracy']
mean_acc = acc_by_group.agg(['mean', 'std']).reset_index()
# only the mean is rounded (it is what the bar labels display); std keeps full precision
mean_acc['mean'] = mean_acc['mean'].round(2)
mean_acc.head()

Unnamed: 0,model,task,task_category,task_benchmark,mean,std
0,dnabert1_6,H3,epigenetic mark prediction (yeast),GUE,0.86,0.004585
1,dnabert1_6,H3,epigenetic mark prediction (yeast),Nucleotide Transformer,0.86,0.004585
2,dnabert1_6,H3K14ac,epigenetic mark prediction (yeast),GUE,0.71,0.007009
3,dnabert1_6,H3K14ac,epigenetic mark prediction (yeast),Nucleotide Transformer,0.71,0.007009
4,dnabert1_6,H3K36me3,epigenetic mark prediction (yeast),GUE,0.74,0.00543


In [57]:
# per (model, task, category, benchmark): mean and std of F1 across replicates
f1_by_group = df.groupby(['model', 'task', 'task_category', 'task_benchmark'])['f1']
mean_f1 = f1_by_group.agg(['mean', 'std']).reset_index()
# only the mean is rounded (it is what the bar labels display); std keeps full precision
mean_f1['mean'] = mean_f1['mean'].round(2)
mean_f1.head()

Unnamed: 0,model,task,task_category,task_benchmark,mean,std
0,dnabert1_6,H3,epigenetic mark prediction (yeast),GUE,0.86,0.004547
1,dnabert1_6,H3,epigenetic mark prediction (yeast),Nucleotide Transformer,0.86,0.004547
2,dnabert1_6,H3K14ac,epigenetic mark prediction (yeast),GUE,0.7,0.008626
3,dnabert1_6,H3K14ac,epigenetic mark prediction (yeast),Nucleotide Transformer,0.7,0.008626
4,dnabert1_6,H3K36me3,epigenetic mark prediction (yeast),GUE,0.74,0.004988


In [35]:
# get all average metrics across number of replications
average_metrics = df.groupby(['model', 'task', 'task_category', 'task_benchmark']).agg({
    'accuracy': 'mean',
    'f1': 'mean',
    'matthews_correlation': 'mean'
}).reset_index()

# get reported results
reported = pd.read_csv('/content/drive/MyDrive/tokenization/data/finetune/reported.csv')
# rescale the reported MCC by 1/100 so it is comparable with our values
# (presumably the papers list MCC as a percentage -- confirm against reported.csv)
reported['matthews_correlation'] = reported['matthews_correlation'] / 100

# merge self-reported results with our averaged results and results reported by other teams
# .assign returns a NEW frame; the previous `ours = average_metrics` was only an
# alias, so tagging it also added `reported_by` to `average_metrics` itself
ours = average_metrics.assign(reported_by='us')
# no `on=` is given, so pandas joins on every column the two frames share
merged_df = pd.merge(reported, ours, how='outer')
merged_df.head()

Unnamed: 0,model,reported_by,task,task_category,task_benchmark,accuracy,f1,matthews_correlation
0,dnabert1_6,dnabert,H3,epigenetic mark prediction (yeast),GUE,,,0.731
1,dnabert1_6,dnabert,H3K14ac,epigenetic mark prediction (yeast),GUE,,,0.4006
2,dnabert1_6,dnabert,H3K36me3,epigenetic mark prediction (yeast),GUE,,,0.4725
3,dnabert1_6,dnabert,H3K4me1,epigenetic mark prediction (yeast),GUE,,,0.4144
4,dnabert1_6,dnabert,H3K4me2,epigenetic mark prediction (yeast),GUE,,,0.3237


## **Bar Charts**

1.   Mean MCC results by task for each model, with SD bars
2.   Mean Accuracy results by task for each model, with SD bars
3.   Mean F1 results by task for each model, with SD bars

In [50]:
# Mean MCC per model for one task/benchmark, drawn as horizontal bars with SD whiskers.
task = 'H3'
benchmark = 'GUE'

# rows of the summary table for the selected task and benchmark
task_rows = mean_mcc[(mean_mcc['task'] == task) & (mean_mcc['task_benchmark'] == benchmark)]

# The metric runs along x and the models along y, so the SD belongs on error_x
# only; an error_y on the categorical model axis has no meaning.
bar = px.bar(task_rows,
             x='mean', y='model', color='model',
             color_discrete_map=model_color_map,
             text_auto=True,
             error_x='std',
             title=task)

bar.update_layout(title_x=0.5,
                  showlegend=False,
                  bargap=0.1,
                  xaxis=dict(showline=True, linecolor='black', title=''),
                  yaxis=dict(showline=True, linecolor='black', title=''),
                  )
bar.update_traces(insidetextanchor='start', textfont_color='white', textfont_size=14)

# fixed 0-1 axis so charts for different tasks are directly comparable
bar.update_xaxes(range=[0, 1])
bar.show()

In [61]:
# Mean accuracy per model for one task/benchmark, drawn as horizontal bars with SD whiskers.
task = 'H3'
benchmark = 'GUE'

# rows of the summary table for the selected task and benchmark
task_rows = mean_acc[(mean_acc['task'] == task) & (mean_acc['task_benchmark'] == benchmark)]

# The metric runs along x and the models along y, so the SD belongs on error_x
# only; an error_y on the categorical model axis has no meaning.
bar = px.bar(task_rows,
             x='mean', y='model', color='model',
             color_discrete_map=model_color_map,
             text_auto=True,
             error_x='std',
             title=task)

bar.update_layout(title_x=0.5,
                  showlegend=False,
                  bargap=0.1,
                  xaxis=dict(showline=True, linecolor='black', title=''),
                  yaxis=dict(showline=True, linecolor='black', title=''),
                  )
bar.update_traces(insidetextanchor='start', textfont_color='white', textfont_size=14)

# fixed 0-1 axis so charts for different tasks are directly comparable
bar.update_xaxes(range=[0, 1])
bar.show()

In [62]:
# Mean F1 per model for one task/benchmark, drawn as horizontal bars with SD whiskers.
task = 'H3'
benchmark = 'GUE'

# rows of the summary table for the selected task and benchmark
task_rows = mean_f1[(mean_f1['task'] == task) & (mean_f1['task_benchmark'] == benchmark)]

# The metric runs along x and the models along y, so the SD belongs on error_x
# only; an error_y on the categorical model axis has no meaning.
bar = px.bar(task_rows,
             x='mean', y='model', color='model',
             color_discrete_map=model_color_map,
             text_auto=True,
             error_x='std',
             title=task)

bar.update_layout(title_x=0.5,
                  showlegend=False,
                  bargap=0.1,
                  xaxis=dict(showline=True, linecolor='black', title=''),
                  yaxis=dict(showline=True, linecolor='black', title=''),
                  )
bar.update_traces(insidetextanchor='start', textfont_color='white', textfont_size=14)

# fixed 0-1 axis so charts for different tasks are directly comparable
bar.update_xaxes(range=[0, 1])
bar.show()

## **Full Bar Chart**

*  Mean results by task category for each model, with SD bars

In [79]:
# One grouped bar chart per metric: the per-task means averaged within each
# task category, with one bar per model.
#
# px.histogram aggregates `y` with histfunc='sum' by default, which would plot
# the SUM of the task means (growing with the number of tasks in a category)
# under an "Average ..." title. histfunc='avg' makes the bars true averages.
for summary, metric_label in ((mean_acc, 'Accuracy'), (mean_f1, 'F1'), (mean_mcc, 'MCC')):
    fig = px.histogram(summary, x='task_category', y='mean',
                       color='model', color_discrete_map=model_color_map,
                       barmode='group',
                       histfunc='avg',
                       title=f'Average {metric_label} by Task Category and Model')
    fig.show()

## **Scatter Plots on Full Results**

#### Single Model

In [37]:
# Per-replicate MCC for DNABERT-1 (k-mer 6), shown twice: coloured by task,
# then coloured by task category (with the task name available on hover).
dnabert1_runs = df[df['model'] == 'dnabert1_6']
large_marker = {'size': 15}

fig = px.scatter(
    dnabert1_runs,
    x='replicate_number',
    y='matthews_correlation',
    color='task',
    title='DNABERT1 MCC Results by Replication Number and Task (kmer=6)',
)
fig.update_traces(marker=large_marker)
fig.show()

fig = px.scatter(
    dnabert1_runs,
    x='replicate_number',
    y='matthews_correlation',
    color='task_category',
    hover_data=['task'],
    title='DNABERT1 MCC Results by Replication Number and Task Category (kmer=6)',
)
fig.update_traces(marker=large_marker)
fig.show()

#### All Models

Show all model summaries side-by-side

* x-axis - matthews correlation coefficient
* y-axis - replication number
* color the dots by task category


In [38]:
# Side-by-side per-replicate MCC scatter for every model, coloured by task category.
color_mapping = {
    'epigenetic mark prediction (yeast)': '#636efa',
    'transcription factor prediction (human)': '#EF553B',
    'transcription factor prediction (mouse)': '#00cc96',
    'promoter detection (human)': '#ab63fa',
    'core promoter detection (human)': '#FFA15A',
    'covid variant classification (virus)': '#19d3f3',
    'splice site detection (human)': '#FF6692',
    'enhancers (human)': 'pink'
}

fig = sp.make_subplots(rows=1, cols=3, subplot_titles=('DNABERT-1', 'DNABERT-2', 'NT-1000G (500M)'))

models = ['dnabert1_6', 'dnabert2', 'nt_NT_500_1000g']
# categories that already own a legend entry; without this every subplot would
# add its own copy of each category to the shared legend
categories_in_legend = set()
for i, model in enumerate(models, start=1):
    filtered_df = df[df['model'] == model]
    for task_category in filtered_df['task_category'].unique():
        # px is used only to build a ready-styled trace (colour, hover text);
        # the temporary figure is discarded, so no title is set on it
        scatter = px.scatter(filtered_df[filtered_df['task_category'] == task_category],
                             x='matthews_correlation',
                             y='replicate_number',
                             color='task_category',
                             color_discrete_map=color_mapping,
                             hover_data=['task'])
        trace = scatter.data[0]
        trace.showlegend = task_category not in categories_in_legend
        categories_in_legend.add(task_category)
        fig.add_trace(trace, row=1, col=i)

# layout settings do not depend on the loop variables, so apply them once
fig.update_layout(legend=dict(traceorder='normal'))
fig.update_xaxes(title_text='matthews_correlation')
fig.update_yaxes(title_text='replicate_number', row=1, col=1)
fig.show()

## **Scatter Plot: Matthews Correlation Self Reported vs. Experimental Results**

*   x-axis - self reported
*   y-axis - measured by us or other publications
*   color the dots by category

In [39]:
# DNABERT-1: MCC reported by the model's authors (x) against MCC measured by
# us or reported by other teams (y), one point per task and reporter.

# filter data to single model
filtered_df = merged_df[merged_df['model'] == 'dnabert1_6']
self_reported = filtered_df[filtered_df['reported_by'] == 'dnabert']
other_reported = filtered_df[filtered_df['reported_by'] != 'dnabert']

# merge 'self_reported' and 'other_reported' to align the rows
comparison_cols = ['reported_by', 'task', 'task_category', 'matthews_correlation']
aligned_df = pd.merge(self_reported[comparison_cols],
                      other_reported[comparison_cols],
                      on=['task', 'task_category'],
                      suffixes=('_self', '_other'))

# map reported category to opacity values; a reporter missing from the mapping
# would otherwise become NaN, so it falls back to fully opaque
opacity_mapping = {'NT': 0.5, 'us': 1.0}
aligned_df['opacity'] = aligned_df['reported_by_other'].map(opacity_mapping).fillna(1.0)

# create a scatter plot
fig = px.scatter(aligned_df,
                 x='matthews_correlation_self',
                 y='matthews_correlation_other',
                 color='task_category',
                 color_discrete_sequence=colors,
                 hover_data=['task', 'reported_by_other'],
                 title='DNABERT1 MCC Comparison')
fig.update_traces(marker={'size': 15})

# px.scatter's `opacity` is one setting shared by all traces, not a per-row
# column: handing it the whole Series gives every colour trace the same
# full-length array, misaligned with that trace's points. Set it per trace
# from the rows that belong to the trace instead (px names each trace after
# its colour value).
for trace in fig.data:
    trace_rows = aligned_df['task_category'] == trace.name
    if trace_rows.any():
        trace.marker.opacity = aligned_df.loc[trace_rows, 'opacity'].tolist()
fig.show()

In [40]:
# DNABERT-2: MCC reported by the model's authors (x) against MCC measured by
# us or reported by other teams (y), one point per task and reporter.

# filter data to single model
filtered_df = merged_df[merged_df['model'] == 'dnabert2']
self_reported = filtered_df[filtered_df['reported_by'] == 'dnabert']
other_reported = filtered_df[filtered_df['reported_by'] != 'dnabert']

# merge 'self_reported' and 'other_reported' to align the rows
comparison_cols = ['reported_by', 'task', 'task_category', 'matthews_correlation']
aligned_df = pd.merge(self_reported[comparison_cols],
                      other_reported[comparison_cols],
                      on=['task', 'task_category'],
                      suffixes=('_self', '_other'))

# map reported category to opacity values; a reporter missing from the mapping
# would otherwise become NaN, so it falls back to fully opaque
opacity_mapping = {'NT': 0.5, 'us': 1.0}
aligned_df['opacity'] = aligned_df['reported_by_other'].map(opacity_mapping).fillna(1.0)

# create a scatter plot
fig = px.scatter(aligned_df,
                 x='matthews_correlation_self',
                 y='matthews_correlation_other',
                 color='task_category',
                 color_discrete_sequence=colors,
                 hover_data=['task', 'reported_by_other'],
                 title='DNABERT2 MCC Comparison')
fig.update_traces(marker={'size': 15})

# px.scatter's `opacity` is one setting shared by all traces, not a per-row
# column: handing it the whole Series gives every colour trace the same
# full-length array, misaligned with that trace's points. Set it per trace
# from the rows that belong to the trace instead (px names each trace after
# its colour value).
for trace in fig.data:
    trace_rows = aligned_df['task_category'] == trace.name
    if trace_rows.any():
        trace.marker.opacity = aligned_df.loc[trace_rows, 'opacity'].tolist()
fig.show()

In [41]:
# Nucleotide Transformer: MCC reported by the model's authors (x) against MCC
# measured by us or reported by other teams (y), one point per task and reporter.

# filter data to single model
filtered_df = merged_df[merged_df['model'] == 'nt_NT_500_1000g']
self_reported = filtered_df[filtered_df['reported_by'] == 'NT']
other_reported = filtered_df[filtered_df['reported_by'] != 'NT']

# merge 'self_reported' and 'other_reported' to align the rows
comparison_cols = ['reported_by', 'task', 'task_category', 'matthews_correlation']
aligned_df = pd.merge(self_reported[comparison_cols],
                      other_reported[comparison_cols],
                      on=['task', 'task_category'],
                      suffixes=('_self', '_other'))

# map reported category to opacity values; a reporter missing from the mapping
# would otherwise become NaN, so it falls back to fully opaque
opacity_mapping = {'dnabert': 0.5, 'us': 1.0}
aligned_df['opacity'] = aligned_df['reported_by_other'].map(opacity_mapping).fillna(1.0)

# create a scatter plot
fig = px.scatter(aligned_df,
                 x='matthews_correlation_self',
                 y='matthews_correlation_other',
                 color='task_category',
                 color_discrete_sequence=colors,
                 hover_data=['task', 'reported_by_other'],
                 title='Nucleotide Transformer MCC Comparison')
fig.update_traces(marker={'size': 15})

# px.scatter's `opacity` is one setting shared by all traces, not a per-row
# column: handing it the whole Series gives every colour trace the same
# full-length array, misaligned with that trace's points. Set it per trace
# from the rows that belong to the trace instead (px names each trace after
# its colour value).
for trace in fig.data:
    trace_rows = aligned_df['task_category'] == trace.name
    if trace_rows.any():
        trace.marker.opacity = aligned_df.loc[trace_rows, 'opacity'].tolist()
fig.show()

## **Violin Plots**

*  Show results for tasks that had a large standard deviation (MCC std ≥ 0.03), i.e., more variability than we expected.

In [44]:
# find tasks that have large std
has_large_std = mean_mcc['std'].ge(0.03)
is_gue_task = mean_mcc['task_benchmark'].eq('GUE')  # they seem to all be GUE tasks
mean_mcc[has_large_std & is_gue_task]

Unnamed: 0,model,task,task_category,task_benchmark,mean,std
25,dnabert1_6,mouse_3,transcription factor prediction (mouse),GUE,0.57,0.100708
31,dnabert1_6,prom_300_tata,promoter detection (human),GUE,0.66,0.058395
47,dnabert2,H3K14ac,epigenetic mark prediction (yeast),GUE,0.5,0.036057
49,dnabert2,H3K36me3,epigenetic mark prediction (yeast),GUE,0.55,0.034612
53,dnabert2,H3K4me2,epigenetic mark prediction (yeast),GUE,0.31,0.031763
55,dnabert2,H3K4me3,epigenetic mark prediction (yeast),GUE,0.34,0.061783
63,dnabert2,H4ac,epigenetic mark prediction (yeast),GUE,0.44,0.03992
65,dnabert2,covid,covid variant classification (virus),GUE,0.03,0.035098
71,dnabert2,mouse_3,transcription factor prediction (mouse),GUE,0.75,0.05142
77,dnabert2,prom_300_tata,promoter detection (human),GUE,0.63,0.039657


In [45]:
# distribution of per-replicate MCC for DNABERT-1 on the GUE mouse_3 task
mouse3_runs = df[
    (df['model'] == 'dnabert1_6')
    & (df['task_benchmark'] == 'GUE')
    & (df['task'] == 'mouse_3')
]
fig = px.violin(
    mouse3_runs,
    x='matthews_correlation',
    points='all',
    box=True,
    color='model',
    color_discrete_map=model_color_map,
    title='DNABERT-1 mouse_3 Task Results',
)
# pointpos=0 draws the individual points on the violin's centre line
fig.update_traces(pointpos=0)
fig.show()

In [46]:
# distribution of per-replicate MCC for DNABERT-2 on the GUE H3K4me3 task
h3k4me3_runs = df[
    (df['model'] == 'dnabert2')
    & (df['task_benchmark'] == 'GUE')
    & (df['task'] == 'H3K4me3')
]
fig = px.violin(
    h3k4me3_runs,
    x='matthews_correlation',
    points='all',
    box=True,
    color='model',
    color_discrete_map=model_color_map,
    title='DNABERT2 H3K4me3 Task Results',
)
# pointpos=0 draws the individual points on the violin's centre line
fig.update_traces(pointpos=0)
fig.show()