In [3]:
import logging
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import torch.nn.functional as F
from transformers import DataCollatorForLanguageModeling

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Load the tokenizer and model
model_name = "EleutherAI/gpt-neo-1.3B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Add a padding token if not present
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
    logger.info("Added padding token to tokenizer and resized model embeddings")

# Load and preprocess the dataset
logger.info("Loading and tokenizing dataset...")
dataset = load_dataset("json", data_files="processed_data.jsonl", split="train")

# Tokenize the dataset with metadata
def tokenize_function(examples):
    text_with_metadata = [
        f"{text} [Author: {meta.get('author', 'N/A')}] [Keywords: {meta.get('keywords', 'N/A')}]"
        for text, meta in zip(examples["text"], examples["metadata"])
    ]
    return tokenizer(text_with_metadata, padding="max_length", truncation=True, max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
logger.info("Dataset tokenization complete.")

# Data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Training arguments with memory-saving and speed optimizations
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="no",
    learning_rate=5e-5,  # Slightly higher learning rate for faster convergence
    per_device_train_batch_size=1,  # Small batch size for memory limitations
    gradient_accumulation_steps=16,  # Higher accumulation for better batch simulation
    num_train_epochs=1,  # Single epoch to minimize training time
    weight_decay=0.01,
    fp16=True if device == "cuda" else False,  # Mixed precision for faster training on GPU
    save_steps=500,
    save_total_limit=1,
    logging_steps=200,  # Less frequent logging
    warmup_steps=50,  # Lower warmup to reach full learning rate quicker
    gradient_checkpointing=True,  # Reduces memory usage
    report_to=[]
)

# Custom Trainer class with detailed logging
class CustomTrainer(Trainer):
    def training_step(self, model, inputs, *args, **kwargs):
        model.train()
        inputs = {k: v.to(self.args.device) for k, v in inputs.items()}

        # Shift input ids and labels to the right
        labels = inputs["input_ids"].clone()
        outputs = model(**inputs)
        logits = outputs.logits

        # Compute the shift in inputs for causal language modeling
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # Compute the loss
        loss = F.cross_entropy(
            shift_logits.view(-1, shift_logits.size(-1)),
            shift_labels.view(-1),
            ignore_index=tokenizer.pad_token_id
        )

        # Log batch loss
        if self.state.global_step % 100 == 0:
            logger.info(f"Step {self.state.global_step} - Batch loss: {loss.item():.4f}")
        return loss

# Initialize the trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# Start training
logger.info("Starting training...")
trainer.train()

# Save the model and tokenizer
model_save_path = "./fine_tuned_model"
logger.info(f"Saving model and tokenizer to '{model_save_path}'")
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)
logger.info("Model and tokenizer saved successfully.")

# Inference function for generating responses
def generate_response(query):
    input_ids = tokenizer.encode(query, return_tensors="pt").to(device)
    output = model.generate(input_ids, max_length=512, num_return_sequences=1, temperature=0.7)
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Example query for testing the model
query = "What is the role of magnesium in pancreatic beta-cell function?"
response = generate_response(query)
logger.info(f"Generated response: {response}")


2024-10-27 01:39:58,277 - INFO - Using device: cpu
2024-10-27 01:40:04,128 - INFO - Added padding token to tokenizer and resized model embeddings
2024-10-27 01:40:04,128 - INFO - Loading and tokenizing dataset...
2024-10-27 01:40:04,459 - INFO - Dataset tokenization complete.
2024-10-27 01:40:04,467 - INFO - Starting training...
2024-10-27 01:40:09,044 - INFO - Step 0 - Batch loss: 2.4602
2024-10-27 01:40:12,982 - INFO - Step 0 - Batch loss: 3.4770
2024-10-27 01:40:16,897 - INFO - Step 0 - Batch loss: 2.0927

KeyboardInterrupt



In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import re

# Load the fine-tuned model and tokenizer
model_path = "./fine_tuned_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

# Set device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Load the dataset for retrieving article information
dataset = load_dataset("json", data_files="processed_data.jsonl", split="train")

# Function to clean and tokenize text for better matching
def preprocess_text(text):
    return re.sub(r'\W+', ' ', text.lower())

# Function to rank and retrieve the top 3 relevant citations based on query
def find_relevant_articles(query, dataset, max_results=3):
    query_tokens = set(preprocess_text(query).split())
    relevant_articles = []
    
    for article in dataset:
        article_text = article["text"]
        sentences = re.split(r'(?<=[.!?]) +', article_text)  # Split text into sentences
        
        # Score each sentence based on keyword overlap
        sentence_scores = []
        for sentence in sentences:
            sentence_tokens = set(preprocess_text(sentence).split())
            overlap = query_tokens.intersection(sentence_tokens)
            score = len(overlap)  # Simple score based on number of overlapping keywords
            if score > 0:
                sentence_scores.append((score, sentence))
        
        # Sort sentences by score in descending order and take top 3
        top_citations = [sent for _, sent in sorted(sentence_scores, key=lambda x: x[0], reverse=True)[:3]]
        
        # If relevant citations are found, add them to the list
        if top_citations:
            relevant_articles.append({
                "title": article["name"],
                "author": article["metadata"].get("author", "N/A"),
                "keywords": article["metadata"].get("keywords", "N/A"),
                "citations": top_citations
            })
            
        if len(relevant_articles) >= max_results:
            break
    
    return relevant_articles

# Function for generating text based on the query
def generate_response(query, max_length=150, temperature=0.7, top_p=0.9):
    # Tokenize the query
    input_ids = tokenizer.encode(query, return_tensors="pt").to(device)
    
    # Generate a response from the model
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            num_return_sequences=1
        )
    
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response

# Combined function to get relevant articles with top citations and generate a response
def query_model_with_articles(query):
    articles = find_relevant_articles(query, dataset)
    
    if articles:
        article_response = "Here are the top relevant articles with citations based on your query:\n\n"
        for article in articles:
            article_response += (
                f"Title: {article['title']}\n"
                f"Author(s): {article['author']}\n"
                f"Keywords: {article['keywords']}\n"
                "Top Citations:\n"
            )
            for citation in article["citations"]:
                article_response += f"- {citation}\n"
            article_response += "\n"
    else:
        article_response = "No relevant articles found in the dataset."

    # Generate additional context from the model
    model_response = generate_response(query)
    return f"{article_response}\nGenerated response:\n{model_response}"

# Example usage
query = "What is magnesium and its role in health?"
response = query_model_with_articles(query)
print(response)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Here are the top relevant articles with citations based on your query:

Title: gymbeam_hackathon_2loow3ua1h.pdf
Author(s): Hamilton Roschel, Bruno Gualano, Sergej M. Ostojic and Eric S. Rawson
Keywords: phosphorylcreatine; dietary supplement; cognition; brain injury; concussion
Top Citations:
- nutrients Review Creatine Supplementation and Brain Health Hamilton Roschel ,* , Bruno Gualano ,, Sergej .
- Creatine Supplementation and Brain Health.
- Ostojic and Eric .

Title: gymbeam_hackathon_b7zwszxghq.pdf
Author(s): Nuraly S. Akimbekov
Keywords: magnesium; diabetes; insulin; glucose; β-cells
Top Citations:
- Most magnesium is present in bone and teeth (∼%), while its concentrations in intracellular compartments and extracellular ﬂuids constitute ∼% and <%, respectively (, –).
- Razzaque mohammed.razzaque@uyrgv.edu; msr.nagasaki@gmail.com  July 2 September 2 September 2 Akimbekov , Coban , AtﬁA and Razzaque  (2) The role of magnesium in pancreatic beta-cell function and homeostasis.
- Th