In [1]:
#!pip install bert-score
#!pip install rouge
#!pip install nltk

In [2]:
import argparse
import json
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from rouge import Rouge
import numpy as np
import statistics
import pandas as pd
import bert_score

from sklearn.metrics import accuracy_score

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

from nltk.tokenize import word_tokenize
from nltk.translate.meteor_score import single_meteor_score

import torch
from transformers import BertTokenizer, BertForMaskedLM


In [3]:
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/abhijit/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/abhijit/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# Initialize BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [5]:
def read_result_csv(file_path):
    result_df = pd.read_csv(file_path)
    result_df.drop(result_df.columns[0], axis=1, inplace=True)
    return result_df

BLEU Score: Measures the similarity between the generated language response and the reference response based on n-gram overlap. Higher BLEU scores indicate greater similarity.

In [6]:
# @title
def compute_bleu(reference, candidate):
    reference = [[ref.split()] for ref in reference]
    candidate = [cand.split() for cand in candidate]
    smoothing_function = SmoothingFunction().method4
    bleu_scores = [corpus_bleu([ref], [cand], smoothing_function=smoothing_function) for ref, cand in zip(reference, candidate)]
    # Compute mean BLEU score and standard deviation
    mean_bleu_score = np.mean(bleu_scores)
    bleu_std_dev = np.std(bleu_scores)
    return round(mean_bleu_score, 3), round(bleu_std_dev, 3)


In [7]:
# @title
def compute_bleu_unigram(reference, candidate):
    reference = [[ref.split()] for ref in reference]
    candidate = [cand.split() for cand in candidate]
    smoothing_function = SmoothingFunction().method4
    weights = (1, 0, 0, 0)
    bleu_scores = [corpus_bleu([ref], [cand], smoothing_function=smoothing_function, weights = weights) for ref, cand in zip(reference, candidate)]
    # Compute mean BLEU score and standard deviation
    mean_bleu_score = round(np.mean(bleu_scores), 3)
    bleu_std_dev = round(np.std(bleu_scores), 3)
    return mean_bleu_score, bleu_std_dev

ROUGE Score (unigram and bigram): Calculates the overlap between the generated language response and the reference response at the unigram and bigram levels. Higher ROUGE scores indicate greater similarity.

In [8]:
# @title
def compute_rouge(reference, candidate):
    rouge = Rouge()

    # Compute ROUGE scores for all pairs of reference and candidate sentences
    
    rouge_scores = [rouge.get_scores(cand, ref, avg=True) for ref, cand in zip(reference, candidate)]
    

    # Extract individual ROUGE scores (f, p, r) for each pair
    rouge_1_f_scores = [score['rouge-1']['f'] for score in rouge_scores]
    rouge_2_f_scores = [score['rouge-2']['f'] for score in rouge_scores]
    rouge_l_f_scores = [score['rouge-l']['f'] for score in rouge_scores]

    # Compute mean and standard deviation of ROUGE F1 scores
    mean_rouge_1_f_score = np.mean(rouge_1_f_scores)
    mean_rouge_2_f_score = np.mean(rouge_2_f_scores)
    mean_rouge_l_f_score = np.mean(rouge_l_f_scores)

    rouge_1_f_std_dev = np.std(rouge_1_f_scores)
    rouge_2_f_std_dev = np.std(rouge_2_f_scores)
    rouge_l_f_std_dev = np.std(rouge_l_f_scores)

    return round(mean_rouge_1_f_score, 3), round(mean_rouge_2_f_score, 3), round(mean_rouge_l_f_score, 3), round(rouge_1_f_std_dev, 3), round(rouge_2_f_std_dev, 3), round(rouge_l_f_std_dev, 3)

BERTScore: Computes a similarity score for each token in the candidate sentence with each token in the reference sentence, based on contextual embeddings obtained from a pre-trained BERT model.

In [9]:
# @title
def compute_bert_score(reference, candidate):
    bert_p_scores, bert_r_scores, bert_f1_scores = bert_score.score(candidate, reference, lang="en", verbose=False)
    #print(bert_p_scores.mean().item(), bert_r_scores.mean().item())
    return round(bert_f1_scores.mean().item(), 3), round(bert_f1_scores.std().item(), 3)

METEOR Score: Evaluates the quality of the generated language response by considering both exact word matches and semantic similarity. Higher METEOR scores indicate better performance.

In [10]:
# @title
def compute_meteor_scores(reference, candidate):
  # Compute the METEOR scores for each candidate-reference pair
  # Tokenize each sentence
  tokenized_candidates = [word_tokenize(candidate.replace("<s>", "").replace("</s>", "").strip()) for candidate in candidate]
  tokenized_references = [word_tokenize(sentence) for sentence in reference]
  # Compute the METEOR scores for each candidate-reference pair
  meteor_scores = []
  for ref_sentence, candidate in zip(tokenized_references, tokenized_candidates):
      meteor_scores.append(single_meteor_score(ref_sentence, candidate))
  meteor_scores_mean = sum(meteor_scores) / len(meteor_scores)
  meteor_scores_std = statistics.stdev(meteor_scores)
  return round(meteor_scores_mean, 3), round(meteor_scores_std, 3)

In [11]:

import re

def clean_text(text):
    # Regular expression to match unwanted symbols and numbers
    regex = r"[^a-zA-Z0-9.,!?;:'\"()\[\]{}\-\s]"

    # Remove unwanted symbols and numbers
    cleaned_text = re.sub(regex, '', text)

    # Split text into lines
    lines = re.split(r'(?<=[.!?]) +', cleaned_text)

    # Remove duplicate lines while preserving order
    seen = set()
    unique_lines = []
    for line in lines:
        cleaned_line = re.sub(r'\s+', ' ', line).strip()  # Clean up extra spaces in each line
        if cleaned_line not in seen:
            seen.add(cleaned_line)
            unique_lines.append(cleaned_line)

    # Join lines back into a single string
    # Since these scores are for text similarity, restricting the generated caption to just 2 lines.
    unique_lines = unique_lines[:1]
    result = ' '.join(unique_lines)
    return result

def cleanup_pred_captions(predicted_captions):

    predicted_captions = predicted_captions.fillna('')
    clean_captions = []

    for caption in predicted_captions:
        # clean_caption = f"An image of {category}."
        clean_caption = f"No response."
        if caption.strip():
            clean_caption = clean_text(caption)
            if not clean_caption.strip():
              # clean_caption = f"An image of {category}."
              clean_caption = f"No response."
        clean_captions.append(clean_caption)

    return clean_captions

In [12]:
def run(csv_path):
    results = {}
    result_df = read_result_csv(csv_path)

    image_paths = result_df['Ground Truth Image']
    expected_captions = result_df['Expected Caption']
    predicted_captions = result_df['Generated Caption']
    expected_object_classes = result_df['Expected object']
    predicted_object_classes = result_df['Predicted object']

    predicted_captions = cleanup_pred_captions(predicted_captions)
    references = expected_captions.tolist()
    candidates = predicted_captions

    for i, cand in enumerate(candidates):
        if len(cand)<=1:
            candidates[i] = "No response"

    # BLEU Score

    mean_bleu_score, bleu_std_dev = compute_bleu(references, candidates)
    results["Mean BLEU Score"] =  mean_bleu_score
    results["SD BLEU Score"] =  bleu_std_dev

    mean_bleu_score, bleu_std_dev = compute_bleu_unigram(references, candidates)
    results["Mean BLEU Unigram Score"] =  mean_bleu_score
    results["SD BLEU Unigram Score"] =  bleu_std_dev

    # ROUGE Score

    mean_rouge_1_f_score, mean_rouge_2_f_score, mean_rouge_l_f_score, rouge_1_f_std_dev, rouge_2_f_std_dev, rouge_l_f_std_dev = compute_rouge(references, candidates)
    results["Mean ROUGE-1"] =  mean_rouge_1_f_score
    results["SD ROUGE-1"] = rouge_1_f_std_dev
    results["Mean ROUGE-2"] = mean_rouge_2_f_score
    results["SD ROUGE-2"] = rouge_2_f_std_dev
    results["Mean ROUGE-l"] = mean_rouge_l_f_score
    results["SD ROUGE-l"] =  rouge_l_f_std_dev

    # METEOR Score

    mean_meteor_score, meteor_std_dev = compute_meteor_scores(references, candidates)
    results["Mean Meteor Score"] =  mean_meteor_score
    results["SD Meteor Score"] = meteor_std_dev

    # BERT Score

    bert_score_mean, bert_score_std_dev = compute_bert_score(references, candidates)

    results["Mean BERTScore"] = round(bert_score_mean,3)
    results["SD BERTScore"] = round(bert_score_std_dev, 3)
    return results

In [13]:
# The csv_path below is the csv with generated text
# The below csv_file is for evaluation of text generated for Subject 1 EEG signals using Phi3 model.
import os
import tqdm
results_dir = "../results"
all_res = {}
for file in tqdm.tqdm(os.listdir(results_dir)):
    print (file)
    fullpath = os.path.join(results_dir,file)
    results = run(csv_path = fullpath)
    all_res[file.replace("csv","")] = results

results_df = pd.DataFrame(all_res).transpose()
results_df.to_csv("all_results.csv")

  0%|                                                                                                               | 0/54 [00:00<?, ?it/s]

results_Qwen2.5-7B-Instruct_no_stage2-subject-6.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  2%|█▉                                                                                                     | 1/54 [00:03<02:46,  3.15s/it]

results_Mistral-7B-Instruct-v0.3_chance1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|███▊                                                                                                   | 2/54 [00:06<02:37,  3.03s/it]

results_Mistral-7B-Instruct-v0.3_subject-5.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  6%|█████▋                                                                                                 | 3/54 [00:07<01:52,  2.21s/it]

results_Meta-Llama-3-8B-Instruct_no_stage2-subject-5.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  7%|███████▋                                                                                               | 4/54 [00:08<01:30,  1.81s/it]

results_Meta-Llama-3-8B-Instruct_subject-6.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  9%|█████████▌                                                                                             | 5/54 [00:09<01:18,  1.60s/it]

results_Qwen2.5-7B-Instruct_subject-1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 11%|███████████▍                                                                                           | 6/54 [00:11<01:15,  1.57s/it]

results_Mistral-7B-Instruct-v0.3_no_stage2-subject-2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 13%|█████████████▎                                                                                         | 7/54 [00:12<01:09,  1.47s/it]

results_Meta-Llama-3-8B-Instruct_subject-5.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 15%|███████████████▎                                                                                       | 8/54 [00:13<01:05,  1.42s/it]

results_Meta-Llama-3-8B-Instruct_all.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 17%|█████████████████▏                                                                                     | 9/54 [00:17<01:36,  2.14s/it]

results_Mistral-7B-Instruct-v0.3_subject-2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 19%|██████████████████▉                                                                                   | 10/54 [00:18<01:21,  1.85s/it]

results_Mistral-7B-Instruct-v0.3_no_stage2-subject-3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 20%|████████████████████▊                                                                                 | 11/54 [00:19<01:10,  1.65s/it]

results_Qwen2.5-7B-Instruct_subject-3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 22%|██████████████████████▋                                                                               | 12/54 [00:21<01:05,  1.55s/it]

results_Meta-Llama-3-8B-Instruct_subject-3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 24%|████████████████████████▌                                                                             | 13/54 [00:22<00:59,  1.45s/it]

results_Meta-Llama-3-8B-Instruct_only_eeg.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 26%|██████████████████████████▍                                                                           | 14/54 [00:25<01:21,  2.05s/it]

results_Mistral-7B-Instruct-v0.3_no_stage2_all.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 28%|████████████████████████████▎                                                                         | 15/54 [00:28<01:27,  2.24s/it]

results_Meta-Llama-3-8B-Instruct_no_stage2_only_eeg.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 30%|██████████████████████████████▏                                                                       | 16/54 [00:32<01:44,  2.76s/it]

results_Meta-Llama-3-8B-Instruct_no_stage2_all.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 31%|████████████████████████████████                                                                      | 17/54 [00:36<01:56,  3.14s/it]

results_Mistral-7B-Instruct-v0.3_no_stage2-subject-4.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 33%|██████████████████████████████████                                                                    | 18/54 [00:37<01:32,  2.56s/it]

results_Meta-Llama-3-8B-Instruct_chance1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 35%|███████████████████████████████████▉                                                                  | 19/54 [00:42<01:50,  3.15s/it]

results_Qwen2.5-7B-Instruct_no_stage2-subject-4.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 37%|█████████████████████████████████████▊                                                                | 20/54 [00:43<01:26,  2.55s/it]

results_Meta-Llama-3-8B-Instruct_chance2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 39%|███████████████████████████████████████▋                                                              | 21/54 [00:46<01:26,  2.62s/it]

results_Mistral-7B-Instruct-v0.3_no_stage2_only_eeg.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 41%|█████████████████████████████████████████▌                                                            | 22/54 [00:49<01:27,  2.74s/it]

results_Meta-Llama-3-8B-Instruct_subject-1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 43%|███████████████████████████████████████████▍                                                          | 23/54 [00:50<01:11,  2.31s/it]

results_Meta-Llama-3-8B-Instruct_subject-4.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 44%|█████████████████████████████████████████████▎                                                        | 24/54 [00:51<00:59,  1.99s/it]

results_Mistral-7B-Instruct-v0.3_no_stage2-subject-5.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 46%|███████████████████████████████████████████████▏                                                      | 25/54 [00:53<00:50,  1.76s/it]

results_Mistral-7B-Instruct-v0.3_no_stage2-subject-6.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 48%|█████████████████████████████████████████████████                                                     | 26/54 [00:54<00:44,  1.59s/it]

results_Meta-Llama-3-8B-Instruct_no_stage2-subject-6.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 50%|███████████████████████████████████████████████████                                                   | 27/54 [00:55<00:39,  1.46s/it]

results_Mistral-7B-Instruct-v0.3_no_stage2-subject-1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 52%|████████████████████████████████████████████████████▉                                                 | 28/54 [00:56<00:37,  1.43s/it]

results_Meta-Llama-3-8B-Instruct_no_stage2-subject-2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 54%|██████████████████████████████████████████████████████▊                                               | 29/54 [00:58<00:34,  1.37s/it]

results_Mistral-7B-Instruct-v0.3_subject-4.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 56%|████████████████████████████████████████████████████████▋                                             | 30/54 [00:59<00:31,  1.32s/it]

results_Qwen2.5-7B-Instruct_no_stage2-subject-5.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 57%|██████████████████████████████████████████████████████████▌                                           | 31/54 [01:00<00:28,  1.25s/it]

results_Qwen2.5-7B-Instruct_only_eeg.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 59%|████████████████████████████████████████████████████████████▍                                         | 32/54 [01:04<00:46,  2.11s/it]

results_Meta-Llama-3-8B-Instruct_no_stage2-subject-1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 61%|██████████████████████████████████████████████████████████████▎                                       | 33/54 [01:05<00:38,  1.84s/it]

results_Qwen2.5-7B-Instruct_chance2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 63%|████████████████████████████████████████████████████████████████▏                                     | 34/54 [01:08<00:41,  2.07s/it]

results_Mistral-7B-Instruct-v0.3_only_eeg.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 65%|██████████████████████████████████████████████████████████████████                                    | 35/54 [01:11<00:44,  2.36s/it]

results_Meta-Llama-3-8B-Instruct_subject-2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 67%|████████████████████████████████████████████████████████████████████                                  | 36/54 [01:12<00:36,  2.05s/it]

results_Qwen2.5-7B-Instruct_all.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 69%|█████████████████████████████████████████████████████████████████████▉                                | 37/54 [01:16<00:43,  2.57s/it]

results_Qwen2.5-7B-Instruct_no_stage2-subject-2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 70%|███████████████████████████████████████████████████████████████████████▊                              | 38/54 [01:17<00:34,  2.14s/it]

results_Mistral-7B-Instruct-v0.3_subject-3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 72%|█████████████████████████████████████████████████████████████████████████▋                            | 39/54 [01:18<00:27,  1.86s/it]

results_Mistral-7B-Instruct-v0.3_subject-6.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 74%|███████████████████████████████████████████████████████████████████████████▌                          | 40/54 [01:19<00:23,  1.65s/it]

results_Qwen2.5-7B-Instruct_chance1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 76%|█████████████████████████████████████████████████████████████████████████████▍                        | 41/54 [01:23<00:29,  2.29s/it]

results_Mistral-7B-Instruct-v0.3_subject-1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 78%|███████████████████████████████████████████████████████████████████████████████▎                      | 42/54 [01:24<00:23,  1.99s/it]

results_Mistral-7B-Instruct-v0.3_chance2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 80%|█████████████████████████████████████████████████████████████████████████████████▏                    | 43/54 [01:28<00:26,  2.39s/it]

results_Meta-Llama-3-8B-Instruct_no_stage2-subject-3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 81%|███████████████████████████████████████████████████████████████████████████████████                   | 44/54 [01:29<00:20,  2.04s/it]

results_Qwen2.5-7B-Instruct_subject-4.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 83%|█████████████████████████████████████████████████████████████████████████████████████                 | 45/54 [01:30<00:16,  1.81s/it]

results_Meta-Llama-3-8B-Instruct_no_stage2-subject-4.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 85%|██████████████████████████████████████████████████████████████████████████████████████▉               | 46/54 [01:31<00:12,  1.62s/it]

results_Qwen2.5-7B-Instruct_subject-2.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 87%|████████████████████████████████████████████████████████████████████████████████████████▊             | 47/54 [01:33<00:10,  1.50s/it]

results_Qwen2.5-7B-Instruct_no_stage2-subject-1.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 89%|██████████████████████████████████████████████████████████████████████████████████████████▋           | 48/54 [01:34<00:08,  1.46s/it]

results_Mistral-7B-Instruct-v0.3_all.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 91%|████████████████████████████████████████████████████████████████████████████████████████████▌         | 49/54 [01:37<00:09,  1.83s/it]

results_Qwen2.5-7B-Instruct_no_stage2_all.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 93%|██████████████████████████████████████████████████████████████████████████████████████████████▍       | 50/54 [01:41<00:09,  2.46s/it]

results_Qwen2.5-7B-Instruct_subject-5.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 94%|████████████████████████████████████████████████████████████████████████████████████████████████▎     | 51/54 [01:42<00:06,  2.11s/it]

results_Qwen2.5-7B-Instruct_no_stage2-subject-3.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 96%|██████████████████████████████████████████████████████████████████████████████████████████████████▏   | 52/54 [01:43<00:03,  1.82s/it]

results_Qwen2.5-7B-Instruct_no_stage2_only_eeg.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
 98%|████████████████████████████████████████████████████████████████████████████████████████████████████  | 53/54 [01:47<00:02,  2.51s/it]

results_Qwen2.5-7B-Instruct_subject-6.csv


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 54/54 [01:48<00:00,  2.02s/it]
