<a href="https://colab.research.google.com/github/anihab/dnaTokenization/blob/main/results.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import json
import pandas as pd
import plotly.express as px
import plotly.subplots as sp
import plotly.graph_objects as go

In [2]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files
# read by the cells below (all under /content/drive/MyDrive/...) are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Shared palette for the figures in this notebook, listed darkest first.
colors = [
    '#AD000E',
    '#D6382B',
    '#FF5E49',
    '#FF8269',
    '#FFA789',
]

# Fixed model -> colour assignment so a model keeps its colour across charts.
# Each model points at a palette slot; slot 2 ('#FF5E49') is not assigned here.
model_color_map = {
    model_name: colors[palette_slot]
    for model_name, palette_slot in (
        ('DNABERT-1 (6-mer)', 0),
        ('DNABERT-2', 1),
        ('NT 500M 1000G', 3),
        ('HyenaDNA (1k)', 4),
    )
}

## **Load and Process Data**

In addition to full results (`df`), create dataframes for:
*   Mean metrics across replications, with standard deviation (`mean_acc`, `mean_f1`, `mean_mcc`)
*   Self-reported results merged with our averaged results and with results reported by other teams (`merged_df`)



In [5]:
# Read our fine-tuning results from Drive; the preview below shows the
# model / task / replicate_number columns alongside accuracy, f1 and mcc.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/tokenization/data/finetune/results.csv',
)
df.head(n=5)

Unnamed: 0,model,task,task_category,task_benchmark,replicate_number,accuracy,f1,mcc,epoch
0,DNABERT-1 (6-mer),H3,EMP(yeast),GUE,1,0.863059,0.862989,0.726318,3.0
1,DNABERT-1 (6-mer),H3K14ac,EMP(yeast),GUE,1,0.717398,0.710967,0.421938,3.0
2,DNABERT-1 (6-mer),H3K36me3,EMP(yeast),GUE,1,0.743979,0.736517,0.47768,3.0
3,DNABERT-1 (6-mer),H3K4me1,EMP(yeast),GUE,1,0.714962,0.710911,0.422638,3.0
4,DNABERT-1 (6-mer),H3K4me2,EMP(yeast),GUE,1,0.669599,0.648195,0.304856,3.0


In [6]:
# Read the phage-identification results from Drive (same column layout as `df`,
# judging by the preview below).
phage_df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/tokenization/data/finetune/phage_results.csv',
)
phage_df.head(n=5)

Unnamed: 0,model,task,task_category,task_benchmark,replicate_number,accuracy,f1,mcc,epoch
0,HyenaDNA (4k),phage,Phage Identification,Phage Identification,1,0.883957,0.883958,0.767902,100.0
1,HyenaDNA (4k),phage,Phage Identification,Phage Identification,2,0.89848,0.898482,0.796958,100.0
2,HyenaDNA (4k),phage,Phage Identification,Phage Identification,3,0.896461,0.896462,0.79292,100.0
3,HyenaDNA (4k),phage,Phage Identification,Phage Identification,4,0.889872,0.889869,0.779729,100.0
4,HyenaDNA (4k),phage,Phage Identification,Phage Identification,5,0.895499,0.895499,0.790987,100.0


In [7]:
# Average every metric over the replicates of each (model, task) combination.
average_metrics = (
    df.groupby(['model', 'task', 'task_category', 'task_benchmark'])
      .agg({'accuracy': 'mean', 'f1': 'mean', 'mcc': 'mean'})
      .reset_index()
)

# Results reported by the model authors / other teams; their MCC is divided
# by 100 here.
reported = pd.read_csv('/content/drive/MyDrive/tokenization/data/finetune/reported.csv')
reported['matthews_correlation'] = reported['matthews_correlation'] / 100

# Tag our averaged results. `.assign` returns a NEW frame, so `average_metrics`
# is left untouched (the former `ours = average_metrics` was only an alias, and
# adding the column to `ours` silently added it to `average_metrics` as well).
ours = average_metrics.assign(reported_by='us')

# Without `on=`, pandas joins on every column name the two frames share.
# NOTE(review): the preview shows separate `matthews_correlation` (reported) and
# `mcc` (ours) columns, so our MCC values are not in `matthews_correlation` --
# confirm that downstream cells read the column they intend to.
merged_df = pd.merge(reported, ours, how='outer')
merged_df.head()

Unnamed: 0,model,reported_by,task,task_category,task_benchmark,accuracy,f1,matthews_correlation,mcc
0,dnabert1_6,dnabert,H3,epigenetic mark prediction (yeast),GUE,,,0.731,
1,dnabert1_6,dnabert,H3K14ac,epigenetic mark prediction (yeast),GUE,,,0.4006,
2,dnabert1_6,dnabert,H3K36me3,epigenetic mark prediction (yeast),GUE,,,0.4725,
3,dnabert1_6,dnabert,H3K4me1,epigenetic mark prediction (yeast),GUE,,,0.4144,
4,dnabert1_6,dnabert,H3K4me2,epigenetic mark prediction (yeast),GUE,,,0.3237,


## **Full Bar Chart**

*  Mean results by task category for each model, with SD bars
*  Mean results by task for each model, with SD bars

In [8]:
# Mean and standard deviation of MCC / accuracy per (model, task category),
# expressed as whole-number percentages.
#
# The values are scaled to percent BEFORE rounding. Rounding to two decimals
# and multiplying by 100 afterwards leaves floating-point noise in the table
# (e.g. 0.57 * 100 == 56.99999999999999).
mean_mcc = (
    df.groupby(['model', 'task_category'])['mcc']
      .agg(['mean', 'std'])
      .mul(100)
      .round()
      .reset_index()
)

mean_acc = (
    df.groupby(['model', 'task_category'])['accuracy']
      .agg(['mean', 'std'])
      .mul(100)
      .round()
      .reset_index()
)

In [9]:
# Grouped bar chart of mean MCC per task category and model; the
# 'Covid Variants (virus)' category is excluded from this figure.
mcc_without_covid = mean_mcc.loc[mean_mcc['task_category'].ne('Covid Variants (virus)')]
axis_labels = {"mean": "Percentage", "task_category": "Task Category"}
# Horizontal legend anchored just above the top-right corner of the plot area.
legend_above_plot = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)

fig = px.bar(
    mcc_without_covid,
    x='task_category',
    y='mean',
    error_x='std',
    error_y='std',
    color='model',
    color_discrete_map=model_color_map,
    barmode='group',
    text_auto=True,
    labels=axis_labels,
    title='Mean MCC for Benchmark Tasks by Model',
)
fig.update_traces(insidetextanchor='start')
fig.update_layout(legend=legend_above_plot)
fig.show()
# Also export the interactive figure next to the notebook.
fig.write_html('fig.html')

In [None]:
# Grouped bar chart of mean accuracy per task category and model; the
# 'Covid Variants (virus)' category is excluded from this figure.
acc_without_covid = mean_acc.loc[mean_acc['task_category'].ne('Covid Variants (virus)')]
axis_labels = {"mean": "Percentage", "task_category": "Task Category"}
# Horizontal legend anchored just above the top-right corner of the plot area.
legend_above_plot = dict(orientation="h", yanchor="bottom", y=1.02, xanchor="right", x=1)

fig = px.bar(
    acc_without_covid,
    x='task_category',
    y='mean',
    error_x='std',
    error_y='std',
    color='model',
    color_discrete_map=model_color_map,
    barmode='group',
    text_auto=True,
    labels=axis_labels,
    title='Mean Accuracy for Benchmark Tasks by Model',
)
fig.update_traces(insidetextanchor='start')
fig.update_layout(legend=legend_above_plot)
fig.show()

## **Bar Charts by Task**

*   Mean results by task for each model, with SD bars

In [10]:
# Chart parameters: which task / benchmark to show and which metric to plot.
task = 'H3'
benchmark = 'GUE'
statistic = 'mcc'

# Mean and std of the chosen metric per (model, task); only the mean is
# rounded, and values stay on their original scale (the x-axis is fixed to 0..1).
filtered = (
    df.groupby(['model', 'task', 'task_category', 'task_benchmark'])[statistic]
      .agg(['mean', 'std'])
)
filtered['mean'] = filtered['mean'].round(2)
filtered = filtered.reset_index()

# Keep only the rows of the requested task within the requested benchmark.
selected_rows = filtered[(filtered['task'] == task) & (filtered['task_benchmark'] == benchmark)]

# Horizontal bars: one bar per model, bar length = mean of the metric.
bar = px.bar(
    selected_rows,
    x='mean',
    y='model',
    color='model',
    color_discrete_map=model_color_map,
    error_x='std',
    error_y='std',
    text_auto=True,
    title=task,
)

black_axis_line = dict(showline=True, linecolor='black', title='')
bar.update_layout(
    title_x=0.5,
    showlegend=False,
    xaxis=black_axis_line,
    yaxis=black_axis_line,
    bargap=0.1,
)
bar.update_traces(insidetextanchor='start', textfont_color='white', textfont_size=14)
bar.update_xaxes(range=[0, 1])
bar.show()

## **Bar Charts by Benchmark**

In [11]:
# One frame per benchmark suite, selected on the `task_benchmark` label.
gue, ntb, gb = (
    df[df['task_benchmark'] == suite_name]
    for suite_name in ('GUE', 'Nucleotide Transformer', 'Genomic Benchmark')
)

GUE Benchmark

In [12]:
statistic = 'mcc'

# Two grouped bar charts for the GUE benchmark: first per task, then per task
# category. Both are built by the same steps, so they share one loop instead
# of two copy-pasted blocks; `filtered` and `fig` end up holding the
# task-category version, as before.
for group_col, axis_label in (('task', 'Task'), ('task_category', 'Task Category')):
    # Mean and std of the statistic per (model, group), in whole percent.
    # Scale first, then round: rounding to 2 decimals and multiplying by 100
    # afterwards leaves float noise (e.g. 0.57 * 100 == 56.99999999999999).
    filtered = (
        gue.groupby(['model', group_col])[statistic]
           .agg(['mean', 'std'])
           .mul(100)
           .round()
           .reset_index()
    )

    fig = px.bar(filtered, x=group_col, y='mean',
                 color='model', color_discrete_map=model_color_map,
                 labels={
                         "mean": "Percentage",
                         group_col: axis_label,
                     },
                 barmode='group',
                 text_auto=True,
                 # Only vertical error bars: x is a categorical axis, so a
                 # horizontal `error_x='std'` has no meaningful interpretation.
                 error_y='std',
                 title='Mean MCC for GUE Tasks by Model')
    fig.update_traces(insidetextanchor='start')
    # Horizontal legend placed just above the plot, right-aligned.
    fig.update_layout(legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ))
    fig.show()

Nucleotide Transformer Benchmark

In [13]:
statistic = 'mcc'

# Mean and std of the statistic per (model, task) on the Nucleotide
# Transformer benchmark, in whole percent.
# Scale first, then round: rounding to 2 decimals and multiplying by 100
# afterwards leaves float noise (e.g. 0.57 * 100 == 56.99999999999999).
filtered = (
    ntb.groupby(['model', 'task'])[statistic]
       .agg(['mean', 'std'])
       .mul(100)
       .round()
       .reset_index()
)

fig = px.bar(filtered, x='task', y='mean',
             color='model', color_discrete_map=model_color_map,
             labels={
                     "mean": "Percentage",
                     "task": "Task",
                 },
             barmode='group',
             text_auto=True,
             # Only vertical error bars: x is a categorical axis, so a
             # horizontal `error_x='std'` has no meaningful interpretation.
             error_y='std',
             title='Mean MCC for Nucleotide Transformer Tasks by Model')
fig.update_traces(insidetextanchor='start')
# Horizontal legend placed just above the plot, right-aligned.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))
fig.show()

Genomic Benchmark

In [14]:
statistic = 'mcc'

# Mean and std of the statistic per (model, task) on the Genomic Benchmark,
# in whole percent.
# Scale first, then round: rounding to 2 decimals and multiplying by 100
# afterwards leaves float noise (e.g. 0.57 * 100 == 56.99999999999999).
filtered = (
    gb.groupby(['model', 'task'])[statistic]
      .agg(['mean', 'std'])
      .mul(100)
      .round()
      .reset_index()
)

fig = px.bar(filtered, x='task', y='mean',
             color='model', color_discrete_map=model_color_map,
             labels={
                     "mean": "Percentage",
                     "task": "Task",
                 },
             barmode='group',
             text_auto=True,
             # Only vertical error bars: x is a categorical axis, so a
             # horizontal `error_x='std'` has no meaningful interpretation.
             error_y='std',
             # This figure plots the `gb` frame; the title previously said
             # "GUE Tasks", copied over from the GUE cell.
             title='Mean MCC for Genomic Benchmark Tasks by Model')
fig.update_traces(insidetextanchor='start')
# Horizontal legend placed just above the plot, right-aligned.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))
fig.show()

## **Scatter Plots on Full Results**

#### Single Model

In [None]:
# Replicate-level MCC of a single model, coloured first by task and then by
# task category.
#
# `df` keeps the score in the `mcc` column (its preview has no
# `matthews_correlation` column) and the other plotting cells select this
# model as 'DNABERT-1 (6-mer)'. The previous 'dnabert1_6' /
# 'matthews_correlation' names therefore could not be plotted from `df`.
dnabert1_runs = df[df['model'] == 'DNABERT-1 (6-mer)']

fig = px.scatter(dnabert1_runs,
                 x='mcc',
                 y='replicate_number',
                 color='task',
                 color_discrete_sequence=colors,
                 title='DNABERT1 MCC Results by Replication Number and Task (kmer=6)')

fig.update_traces(marker={'size': 15})
fig.show()

fig = px.scatter(dnabert1_runs,
                 x='mcc',
                 y='replicate_number',
                 color='task_category',
                 color_discrete_sequence=colors,
                 hover_data=['task'],
                 title='DNABERT1 MCC Results by Replication Number and Task Category (kmer=6)')
fig.update_traces(marker={'size': 15})
fig.show()

#### All Models

Show all model summaries side-by-side

* x-axis - matthews correlation coefficient
* y-axis - replication number
* color the dots by task category


In [None]:
# Preferred colour per task category.
# NOTE(review): these keys are long-form category names, while the `df`
# preview shows short labels such as 'EMP(yeast)' -- confirm / update the keys.
color_mapping = {
    'epigenetic mark prediction (yeast)': '#636efa',
    'transcription factor prediction (human)': '#EF553B',
    'transcription factor prediction (mouse)': '#00cc96',
    'promoter detection (human)': '#ab63fa',
    'core promoter detection (human)': '#FFA15A',
    'covid variant classification (virus)': '#19d3f3',
    'splice site detection (human)': '#FF6692',
    'enhancers (human)': 'pink'
}

# Every px.scatter call below sees exactly one category, so any category that
# is missing from the map would receive the same first default colour. Give
# each category present in `df` its own fallback colour instead.
fallback_palette = px.colors.qualitative.Plotly
for idx, category in enumerate(sorted(df['task_category'].dropna().unique())):
    color_mapping.setdefault(category, fallback_palette[idx % len(fallback_palette)])

# Model labels as they appear in `df` (the MCC score lives in its `mcc` column).
models = ['DNABERT-1 (6-mer)', 'DNABERT-2', 'NT 500M 1000G']
fig = sp.make_subplots(rows=1, cols=len(models),
                       subplot_titles=('DNABERT-1', 'DNABERT-2', 'NT-1000G (500M)'))

# Categories that already own a legend entry; later subplots reuse it rather
# than adding a duplicate row for the same category.
categories_in_legend = set()
for i, model in enumerate(models, start=1):
    filtered_df = df[df['model'] == model]
    for task_category in filtered_df['task_category'].dropna().unique():
        scatter = px.scatter(filtered_df[filtered_df['task_category'] == task_category],
                             x='mcc',
                             y='replicate_number',
                             color='task_category',
                             color_discrete_map=color_mapping,
                             hover_data=['task'],
                             title=f'{model} MCC Results by Replication Number and Task Category {task_category}')
        trace = scatter.data[0]
        trace.showlegend = task_category not in categories_in_legend
        categories_in_legend.add(task_category)
        fig.add_trace(trace, row=1, col=i)

# Layout is set once, after all traces exist (it used to be re-applied on
# every inner-loop iteration).
fig.update_layout(legend=dict(traceorder='normal'))
# Subplots do not inherit the axis titles of the px figures, so set them here.
fig.update_xaxes(title_text='MCC')
fig.update_yaxes(title_text='Replicate number', row=1, col=1)
fig.show()

## **Violin Plots**

*  Show results for tasks that had a large standard deviation (≥ 2%), i.e., more variability than we expected.

In [None]:
# Disabled exploratory filter: would list the rows of `mean_mcc` whose standard
# deviation is at or above a threshold.
# NOTE(review): `mean_mcc['std']` is multiplied by 100 where it is built, so
# 0.03 here would mean 0.03 percentage points, while the section text talks
# about ">= 2%" -- confirm the intended threshold before re-enabling.
#mean_mcc[(mean_mcc['std'] >= 0.03)]

In [None]:
# Spread of the MCC values recorded for DNABERT-1 (6-mer) on the mouse_3 task:
# violin with an inner box plot and every individual point drawn.
is_dnabert1 = df['model'] == 'DNABERT-1 (6-mer)'
is_mouse_3 = df['task'] == 'mouse_3'

fig = px.violin(
    df[is_dnabert1 & is_mouse_3],
    x='mcc',
    color='model',
    color_discrete_map=model_color_map,
    box=True,
    points='all',
    title='DNABERT-1 mouse_3 Task Results',
)
# pointpos=0 draws the points on the violin's centre line instead of beside it.
fig.update_traces(pointpos=0)
fig.show()

In [None]:
# Spread of the MCC values recorded for DNABERT-2 on the H3K4me3 task:
# violin with an inner box plot and every individual point drawn.
is_dnabert2 = df['model'] == 'DNABERT-2'
is_h3k4me3 = df['task'] == 'H3K4me3'

fig = px.violin(
    df[is_dnabert2 & is_h3k4me3],
    x='mcc',
    labels={'mcc': 'MCC'},
    color='model',
    color_discrete_map=model_color_map,
    box=True,
    points='all',
    title='DNABERT2 H3K4me3 Task MCC Variation by Replication',
)
# pointpos=0 draws the points on the violin's centre line instead of beside it.
fig.update_traces(pointpos=0)
fig.update_layout(showlegend=False)
fig.show()
# Also export the interactive figure next to the notebook.
fig.write_html('fig.html')

## **Scatter Plot: Matthews Correlation Self Reported vs. Experimental Results**

*   x-axis - self reported
*   y-axis - measured by us or other publications
*   color the dots by category

In [None]:
# Compare the MCC the DNABERT authors reported for DNABERT-1 (x-axis) with the
# MCC reported for the same task by anyone else (y-axis).
# NOTE(review): rows tagged 'us' in `merged_df` come from `df`, whose preview
# shows display names such as 'DNABERT-1 (6-mer)' and an `mcc` column, so they
# may not be selected by this model filter -- confirm.
filtered_df = merged_df[merged_df['model'] == 'dnabert1_6']
self_reported = filtered_df[filtered_df['reported_by'] == 'dnabert']
other_reported = filtered_df[filtered_df['reported_by'] != 'dnabert']

# Align self-reported and externally reported values on (task, task_category);
# clashing column names get the '_self' / '_other' suffixes.
compare_cols = ['reported_by', 'task', 'task_category', 'matthews_correlation']
aligned_df = pd.merge(self_reported[compare_cols],
                      other_reported[compare_cols],
                      on=['task', 'task_category'],
                      suffixes=('_self', '_other'))

# Marker opacity encodes who reported the y-value.
opacity_mapping = {'NT': 0.5, 'us': 1.0}
aligned_df['opacity'] = aligned_df['reported_by_other'].map(opacity_mapping)

fig = px.scatter(aligned_df,
                 x='matthews_correlation_self',
                 y='matthews_correlation_other',
                 color='task_category',
                 color_discrete_sequence=colors,
                 hover_data=['task', 'reported_by_other'],
                 title='DNABERT1 MCC Comparison')
fig.update_traces(marker={'size': 15})

# `px.scatter(opacity=...)` is documented as one float for the whole figure.
# Passing the full column handed the same array to every per-category trace,
# so opacities did not line up with the points. Set them per trace instead;
# reporters missing from `opacity_mapping` are drawn fully opaque.
for trace in fig.data:
    in_trace = aligned_df['task_category'] == trace.name
    point_opacity = aligned_df.loc[in_trace, 'opacity'].fillna(1.0).tolist()
    if point_opacity:
        trace.marker.opacity = point_opacity
fig.show()

In [None]:
# Compare the MCC the DNABERT authors reported for DNABERT-2 (x-axis) with the
# MCC reported for the same task by anyone else (y-axis).
# NOTE(review): rows tagged 'us' in `merged_df` come from `df`, whose preview
# shows display names such as 'DNABERT-1 (6-mer)' and an `mcc` column, so they
# may not be selected by this model filter -- confirm.
filtered_df = merged_df[merged_df['model'] == 'dnabert2']
self_reported = filtered_df[filtered_df['reported_by'] == 'dnabert']
other_reported = filtered_df[filtered_df['reported_by'] != 'dnabert']

# Align self-reported and externally reported values on (task, task_category);
# clashing column names get the '_self' / '_other' suffixes.
compare_cols = ['reported_by', 'task', 'task_category', 'matthews_correlation']
aligned_df = pd.merge(self_reported[compare_cols],
                      other_reported[compare_cols],
                      on=['task', 'task_category'],
                      suffixes=('_self', '_other'))

# Marker opacity encodes who reported the y-value.
opacity_mapping = {'NT': 0.5, 'us': 1.0}
aligned_df['opacity'] = aligned_df['reported_by_other'].map(opacity_mapping)

fig = px.scatter(aligned_df,
                 x='matthews_correlation_self',
                 y='matthews_correlation_other',
                 color='task_category',
                 color_discrete_sequence=colors,
                 hover_data=['task', 'reported_by_other'],
                 title='DNABERT2 MCC Comparison')
fig.update_traces(marker={'size': 15})

# `px.scatter(opacity=...)` is documented as one float for the whole figure.
# Passing the full column handed the same array to every per-category trace,
# so opacities did not line up with the points. Set them per trace instead;
# reporters missing from `opacity_mapping` are drawn fully opaque.
for trace in fig.data:
    in_trace = aligned_df['task_category'] == trace.name
    point_opacity = aligned_df.loc[in_trace, 'opacity'].fillna(1.0).tolist()
    if point_opacity:
        trace.marker.opacity = point_opacity
fig.show()

In [None]:
# Compare the MCC the Nucleotide Transformer authors reported for their 500M
# 1000G model (x-axis) with the MCC reported for the same task by anyone else
# (y-axis).
# NOTE(review): rows tagged 'us' in `merged_df` come from `df`, whose preview
# shows display names such as 'DNABERT-1 (6-mer)' and an `mcc` column, so they
# may not be selected by this model filter -- confirm.
filtered_df = merged_df[merged_df['model'] == 'nt_NT_500_1000g']
self_reported = filtered_df[filtered_df['reported_by'] == 'NT']
other_reported = filtered_df[filtered_df['reported_by'] != 'NT']

# Align self-reported and externally reported values on (task, task_category);
# clashing column names get the '_self' / '_other' suffixes.
compare_cols = ['reported_by', 'task', 'task_category', 'matthews_correlation']
aligned_df = pd.merge(self_reported[compare_cols],
                      other_reported[compare_cols],
                      on=['task', 'task_category'],
                      suffixes=('_self', '_other'))

# Marker opacity encodes who reported the y-value.
opacity_mapping = {'dnabert': 0.5, 'us': 1.0}
aligned_df['opacity'] = aligned_df['reported_by_other'].map(opacity_mapping)

fig = px.scatter(aligned_df,
                 x='matthews_correlation_self',
                 y='matthews_correlation_other',
                 color='task_category',
                 color_discrete_sequence=colors,
                 hover_data=['task', 'reported_by_other'],
                 title='Nucleotide Transformer MCC Comparison')
fig.update_traces(marker={'size': 15})

# `px.scatter(opacity=...)` is documented as one float for the whole figure.
# Passing the full column handed the same array to every per-category trace,
# so opacities did not line up with the points. Set them per trace instead;
# reporters missing from `opacity_mapping` are drawn fully opaque.
for trace in fig.data:
    in_trace = aligned_df['task_category'] == trace.name
    point_opacity = aligned_df.loc[in_trace, 'opacity'].fillna(1.0).tolist()
    if point_opacity:
        trace.marker.opacity = point_opacity
fig.show()

## **Phage Results**

In [34]:
statistic = 'mcc'

# Mean and std of the statistic per (model, task) from our phage runs, in
# whole percent.
# Scale first, then round: rounding to 2 decimals and multiplying by 100
# afterwards leaves float noise (e.g. 0.57 * 100 == 56.99999999999999).
filtered = (
    phage_df.groupby(['model', 'task'])[statistic]
            .agg(['mean', 'std'])
            .mul(100)
            .round()
            .reset_index()
)

# Hand-entered single values for the remaining models, with std set to 0.
# NOTE(review): the source of these percentages is not shown in this
# notebook -- confirm and cite it.
dnabert2 = pd.DataFrame({'model': 'DNABERT-2', 'task': 'phage', 'mean': 90.6, 'std': 0}, index=[0])
dnabert1 = pd.DataFrame({'model': 'DNABERT-1 (6-mer)', 'task': 'phage', 'mean': 63, 'std': 0}, index=[0])
nt500 = pd.DataFrame({'model': 'NT 500M 1000G', 'task': 'phage', 'mean': 62.5, 'std': 0}, index=[0])

filtered = pd.concat([filtered, dnabert2, dnabert1, nt500], ignore_index=True)

# NOTE(review): the phage preview lists the model 'HyenaDNA (4k)', which has no
# entry in `model_color_map` as defined at the top of the notebook, so plotly
# picks its colour from the default sequence -- confirm whether it should get
# a palette colour.
fig = px.bar(filtered, x='task', y='mean',
             color='model', color_discrete_map=model_color_map,
             labels={
                     "mean": "Percentage",
                     "task": "Task",
                 },
             barmode='group',
             text_auto=True,
             # Only vertical error bars: x is a categorical axis, so a
             # horizontal `error_x='std'` has no meaningful interpretation.
             error_y='std',
             title='Mean MCC for Phage Identification Task by Model')
fig.update_traces(insidetextanchor='start')
# Horizontal legend placed just above the plot, right-aligned.
fig.update_layout(legend=dict(
    orientation="h",
    yanchor="bottom",
    y=1.02,
    xanchor="right",
    x=1
))
fig.show()