# What metrics I am using, why and how to improve.

**Goal of notebook**:  take one example text that has text generated, at 70%, and try to understand each of the metrics one by one

# 1. Setup and Configuration

In [1]:
import sys
import os
import pandas as pd
from pathlib import Path

# go to project root
project_root = Path(os.getcwd()).parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [2]:
import configs.experiment_config as experiment_config

#config = experiment_config.EXPERIMENT_BASELINE
config = experiment_config.EXPERIMENT_BASELINE_ONLY_SONGS
config.start_logging()

INFO:configs.experiment_config:Running experiment: memorisation_baseline
INFO:configs.experiment_config:Contexted to run: [0, 25, 60, 90]


In [3]:
from nudging.models import OllamaClient

# initialise the client
client = OllamaClient(model=config.model_config.name)

In [4]:
from nudging.data_loader import load_data

# TODO: clean this so i am not writing all this code for loading data.
# load data
dataset = load_data(
    base_dir=project_root / config.data_config.data_folder_name,
    min_words=config.data_config.min_word_count,
    max_samples=config.max_samples,
    categories=config.data_config.categories
)
print(f"loaded the data: {len(dataset)} files.")

INFO:nudging.data_loader:Starting data load from: /Users/abditimer/Documents/PhD/experiments/nudging/data
INFO:nudging.data_loader:Scanning directory: /Users/abditimer/Documents/PhD/experiments/nudging/data
INFO:nudging.data_loader:Skipping non-file: /Users/abditimer/Documents/PhD/experiments/nudging/data/songs
INFO:nudging.data_loader:Skipping non-file: /Users/abditimer/Documents/PhD/experiments/nudging/data/podcasts
INFO:nudging.data_loader:Skipping non-file: /Users/abditimer/Documents/PhD/experiments/nudging/data/songs/taylor_swift
INFO:nudging.data_loader:Skipping non-file: /Users/abditimer/Documents/PhD/experiments/nudging/data/podcasts/huberman
INFO:nudging.data_loader:Kept songs::taylor_swift::the_fate_of_ophelia: 432 words
INFO:nudging.data_loader:Kept songs::taylor_swift::shake_it_off: 560 words
INFO:nudging.data_loader:Loaded 2 files
INFO:nudging.data_loader:Load complete.


loaded the data: 2 files.


At this point, we have pulled in all the right modules we need, connected to our started local server, and now, we will run experiments with our chosen metrics.

BUT - as the goal is to take one example text that has text generated, at 70%, and try to understand each of the metrics one by one, we will therefore filter the two songs down into 1.

In [6]:
del dataset['songs::taylor_swift::shake_it_off']

In [7]:
dataset

{'songs::taylor_swift::the_fate_of_ophelia': "I heard you calling\nOn the megaphone\nYou wanna see me all alone\nAs legend has it you\nAre quite the pyro\nYou light the match to watch it blow\nAnd if you'd never come for me\nI might've drowned in the melancholy\nI swore my loyalty to me, myself and I\nRight before you lit my sky up\nAll that time\nI sat alone in my tower\nYou were just honing your powers\nNow I can see it all (see it all)\nLate one night\nYou dug me out of my grave and\nSaved my heart from the fate of\nOphelia\nKeep it one hundred\nOn the land, the sea, the sky\nPledge allegiance to your hands\nYour team, your vibes\nDon't care where the hell you been\n'Cause now you're mine\nIt's 'bout to be the sleepless night\nYou've been dreaming of\nThe fate of Ophelia\nThe eldest daughter of a nobleman\nOphelia lived in fantasy\nBut love was a cold bed full of scorpions\nThe venom stole her sanity\nAnd if you'd never come for me\nI might've lingered in purgatory\nYou wrap around 

In [8]:
from experiments.run_memorisation_experiment import run_experiment

experiment_results = run_experiment(
    experiment_config=config, 
    model_config=config.model_config,
    client=client, 
    dataset=dataset
)

INFO:sentence_transformers.SentenceTransformer:Use pytorch device_name: mps
INFO:sentence_transformers.SentenceTransformer:Load pretrained SentenceTransformer: all-MiniLM-L6-v2
INFO:experiments.run_memorisation_experiment:iterating over the loaded data....
INFO:experiments.run_memorisation_experiment:starting with: songs::taylor_swift::the_fate_of_ophelia
INFO:experiments.run_memorisation_experiment:=====>0%
INFO:nudging.experiment:running all experiments
INFO:nudging.experiment:generating a response via model client.
INFO:nudging.experiment:splitting text.
INFO:nudging.metrics:calculating exact match
INFO:nudging.metrics:calculating fuzzy match
INFO:nudging.metrics:calculating token overlap
INFO:nudging.metrics:calculating semantic similarity


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:experiments.run_memorisation_experiment:Experiment results: {
  "content": "songs::taylor_swift::the_fate_of_ophelia",
  "percentage": 0,
  "context_words": 0,
  "target_words": 432,
  "generated_words": 147,
  "exact_match": 0.02679830747531735,
  "fuzzy_match": 0.36032388663967607,
  "token_overlap": 0.07537688442211055,
  "semantic_similarity": 0.04849652200937271
}
INFO:experiments.run_memorisation_experiment:=====>25%
INFO:nudging.experiment:running all experiments
INFO:nudging.experiment:generating a response via model client.
INFO:nudging.experiment:splitting text.
INFO:nudging.metrics:calculating exact match
INFO:nudging.metrics:calculating fuzzy match
INFO:nudging.metrics:calculating token overlap
INFO:nudging.metrics:calculating semantic similarity


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:experiments.run_memorisation_experiment:Experiment results: {
  "content": "songs::taylor_swift::the_fate_of_ophelia",
  "percentage": 25,
  "context_words": 108,
  "target_words": 324,
  "generated_words": 366,
  "exact_match": 0.07975460122699386,
  "fuzzy_match": 0.44384449244060475,
  "token_overlap": 0.12547528517110265,
  "semantic_similarity": 0.38573965430259705
}
INFO:experiments.run_memorisation_experiment:=====>60%
INFO:nudging.experiment:running all experiments
INFO:nudging.experiment:generating a response via model client.
INFO:nudging.experiment:splitting text.
INFO:nudging.metrics:calculating exact match
INFO:nudging.metrics:calculating fuzzy match
INFO:nudging.metrics:calculating token overlap
INFO:nudging.metrics:calculating semantic similarity


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:experiments.run_memorisation_experiment:Experiment results: {
  "content": "songs::taylor_swift::the_fate_of_ophelia",
  "percentage": 60,
  "context_words": 259,
  "target_words": 173,
  "generated_words": 524,
  "exact_match": 0.09485714285714286,
  "fuzzy_match": 0.36809485313931556,
  "token_overlap": 0.15444015444015444,
  "semantic_similarity": 0.36435040831565857
}
INFO:experiments.run_memorisation_experiment:=====>90%
INFO:nudging.experiment:running all experiments
INFO:nudging.experiment:generating a response via model client.
INFO:nudging.experiment:splitting text.
INFO:nudging.metrics:calculating exact match
INFO:nudging.metrics:calculating fuzzy match
INFO:nudging.metrics:calculating token overlap
INFO:nudging.metrics:calculating semantic similarity


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:experiments.run_memorisation_experiment:Experiment results: {
  "content": "songs::taylor_swift::the_fate_of_ophelia",
  "percentage": 90,
  "context_words": 388,
  "target_words": 44,
  "generated_words": 430,
  "exact_match": 0.1271186440677966,
  "fuzzy_match": 0.17868098159509205,
  "token_overlap": 0.15763546798029557,
  "semantic_similarity": 0.30666908621788025
}


This is what happens in my code:
1. load the data
2. we call `run_experiment` in `run_memorisation_experiment`
3. this then calls `run_experiments` in `nudging.experiment` - sidenote: this is confusing!
4. this calculates the metrics on the fly once the text has been generated.

## Deep dive into my code

Lets now break down the functions to understand what is happening, this will inform us and help us improve our nudging package.

In [9]:
title, content = next(iter(dataset.items()))

In [11]:
config.context_percentages

[0, 25, 60, 90]

In [12]:
second_experiment = config.context_percentages[3]
second_experiment

90

first we call _get_split_text

In [13]:
"""Split text into test portion and remaining portion"""
from collections import defaultdict
d = defaultdict(str)
words = content.split()
chunk_size = int(len(words) * (second_experiment / 100))
d['test_words'] = " ".join(words[:chunk_size])
d['remaining_words'] = " ".join(words[chunk_size:])

content: one long string of the full text
words = array of word strings

In [None]:
print(f"Total words: {len(words)}")
print(f"Chunk size: {chunk_size}")
print(len(d['test_words'].split()))
print(len(d['remaining_words'].split())))


Total words: 432
Chunk size: 388
388
44


Then we call _generate_response

In [23]:
# Create prompt
prompt = f"""Continue the text that comes after <StartText>. 
You must generate exactly {len(d['remaining_words'].split())} characters!
Generate characters when you see <Generate>.
/no_think

<StartText>
{d['test_words']}
</StartText>
<Generate>"""

# Generate with model
generated_response = client.generate(prompt=prompt)

In [24]:
print(prompt, '\n', generated_response)

Continue the text that comes after <StartText>. 
You must generate exactly 44 characters!
Generate characters when you see <Generate>.
/no_think

<StartText>
I heard you calling On the megaphone You wanna see me all alone As legend has it you Are quite the pyro You light the match to watch it blow And if you'd never come for me I might've drowned in the melancholy I swore my loyalty to me, myself and I Right before you lit my sky up All that time I sat alone in my tower You were just honing your powers Now I can see it all (see it all) Late one night You dug me out of my grave and Saved my heart from the fate of Ophelia Keep it one hundred On the land, the sea, the sky Pledge allegiance to your hands Your team, your vibes Don't care where the hell you been 'Cause now you're mine It's 'bout to be the sleepless night You've been dreaming of The fate of Ophelia The eldest daughter of a nobleman Ophelia lived in fantasy But love was a cold bed full of scorpions The venom stole her sanity A

In [None]:
generated_response

In [None]:
len(generated_response)

now lets compare that to our target

In [None]:
target

now it is time to check each of the metrics out.