# AthenAI v0 - Initial Workout Generation Model

This notebook contains the initial implementation of the AthenAI workout generation model using FLAN-T5 and sentence transformers.

In [None]:
# Install required packages
!pip install --upgrade datasets transformers
!pip install -U datasets==3.0.1 transformers==4.45.2
!pip install -q transformers sentence-transformers datasets

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments
from sentence_transformers import SentenceTransformer
import datasets
import torch

# Checkpoint identifiers: FLAN-T5-base for generation/fine-tuning and a
# sentence-transformers encoder for matching
t5_model_name = "google/flan-t5-base"
embed_model_name = "sentence-transformers/all-MiniLM-L6-v2"

# Tokenizer and seq2seq model, both loaded from the FLAN-T5-base checkpoint
tokenizer = AutoTokenizer.from_pretrained(t5_model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(t5_model_name)

# Embedding model used for matching
embed_model = SentenceTransformer(embed_model_name)

print("Models loaded successfully.")

## Data Loading and Processing
Load the exercise dataset and process it for training.

In [None]:
# Load the dataset and show the first record of its train split
dataset = datasets.load_dataset("onurSakar/GYM-Exercise")
first_record = dataset["train"][0]
print("Sample data point:", first_record)

In [None]:
import re

def extract_instruction_response(example):
    """
    Split one raw text record into its instruction and response parts.

    The instruction is the content of the first [INST] ... [/INST] pair with
    any <<SYS>> ... <</SYS>> section removed; the response is everything after
    the first [/INST] marker. Both values are whitespace-trimmed and fall back
    to "" when the corresponding marker is missing.

    Args:
        example: Dictionary containing the text field

    Returns:
        Dictionary with instruction and response fields
    """
    raw = example['text']

    # Instruction: body of the first [INST] ... [/INST] pair, minus the system section
    found = re.search(r'\[INST\](.*?)\[/INST\]', raw, re.DOTALL)
    if found is None:
        prompt = ""
    else:
        prompt = re.sub(r'<<SYS>>.*?<</SYS>>', '', found.group(1), flags=re.DOTALL).strip()

    # Response: whatever follows the first closing marker, if one is present
    _, closing_marker, tail = raw.partition('[/INST]')
    answer = tail.strip() if closing_marker else ""

    return {"instruction": prompt, "response": answer}

# Run the extraction over every record of the train split and preview the first result
train_split = dataset["train"]
processed_dataset = train_split.map(extract_instruction_response)
print("Processed sample:", processed_dataset[0])

## Model Training Configuration
Set up training arguments and initialize the trainer.

In [None]:
# Imported here because this cell is the only place the collator is used
from transformers import DataCollatorForSeq2Seq

# Token limits for encoder inputs and decoder targets; longer text is truncated.
# NOTE(review): 256 is a starting point, not derived from the data - tune as needed.
MAX_SOURCE_LENGTH = 256
MAX_TARGET_LENGTH = 256


def tokenize_pairs(batch):
    """
    Tokenize a batch of instruction/response pairs for seq2seq training.

    Args:
        batch: Batched mapping with "instruction" and "response" lists

    Returns:
        Tokenizer output for the instructions, with the response token ids
        stored under "labels"
    """
    model_inputs = tokenizer(
        batch["instruction"], max_length=MAX_SOURCE_LENGTH, truncation=True
    )
    targets = tokenizer(
        text_target=batch["response"], max_length=MAX_TARGET_LENGTH, truncation=True
    )
    model_inputs["labels"] = targets["input_ids"]
    return model_inputs


# Build the tokenized dataset the trainer consumes (it was previously referenced
# without ever being defined). The raw text columns are dropped so that only
# tokenizer outputs and labels reach the collator.
tokenized_dataset = processed_dataset.map(
    tokenize_pairs,
    batched=True,
    remove_columns=processed_dataset.column_names,
)

# Hold out 10% for evaluation so the eval loss is not measured on training data;
# the seed keeps the split reproducible across runs
dataset_splits = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

# Pads inputs and labels per batch; examples are left unpadded by tokenize_pairs
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Training arguments configuration
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",
    eval_steps=200,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    num_train_epochs=1,
    save_total_limit=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none",
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"],
    data_collator=data_collator,
)

print("Training configuration ready.")

## Model Training
Train the model and upload it to the Hugging Face Hub.

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Resolve the Hub token before training so a missing credential is noticed
# up front rather than after the training run has finished. The token is read
# from the HF_TOKEN environment variable, with an interactive prompt as
# fallback - never paste the token into the notebook itself.
ACCESS_TOKEN = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Start training
trainer.train()

# Login with token
login(ACCESS_TOKEN)

# Upload model and tokenizer to Hub with name 'AthenAI'
model.push_to_hub("AthenAI")
tokenizer.push_to_hub("AthenAI")

print("Model and tokenizer successfully uploaded to Hugging Face Hub as 'AthenAI'.")