In [None]:
!pip install torch transformers datasets
!pip install faiss-cpu  # Use CPU version for efficient similarity search
!pip install sentence-transformers
!pip install accelerate peft bitsandbytes
!pip install vllm  # For efficient inference
!pip install wandb  # For experiment tracking (optional)

# !git clone https://github.com/facebookresearch/contriever.git /content/drive/MyDrive/Random/openscholar/contriever
# !git clone https://github.com/FlagOpen/FlagEmbedding.git /content/drive/MyDrive/Random/openscholar/bge-reranker

Collecting bitsandbytes
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl (72.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.46.1
Collecting vllm
  Downloading vllm-0.10.0-cp38-abi3-manylinux1_x86_64.whl.metadata (14 kB)
Collecting blake3 (from vllm)
  Downloading blake3-1.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting openai<=1.90.0,>=1.87.0 (from vllm)
  Downloading openai-1.90.0-py3-none-any.whl.metadata (26 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.1.0-py3-none-any.whl.metadata (13 kB)
Collecting lm-format-enforcer<0.11,>=0.10.11 (from vllm)
  Downloading lm_format_enforcer-0.10.11-py3-none-any.whl.



In [None]:
import torch
import json
import numpy as np
from transformers import (
    AutoTokenizer,
    AutoModel,
    AutoModelForSequenceClassification,
    LlamaForCausalLM,
    LlamaTokenizer,
    TrainingArguments,
    Trainer
)
from datasets import Dataset, DatasetDict
from pathlib import Path
import random
from tqdm import tqdm

In [None]:
import os

# Define the path to your desired default folder
# NOTE(review): absolute Google Drive path; no visible cell mounts Drive, so
# presumably it is mounted before this notebook runs — confirm.
folder_path = '/content/drive/MyDrive/Random/openscholar'

# Change the current working directory so that the relative paths used by
# later cells ("./contriever_fine_tuned", "passages.json", ...) resolve here
os.chdir(folder_path)

In [None]:
!pwd

/content/drive/MyDrive/Random/openscholar


In [None]:
# -- Step 1: Install the necessary library --
!pip install huggingface_hub -q

# -- Step 2: Log in to Hugging Face --
from huggingface_hub import login, HfApi
# You'll be prompted to paste your token here
login()

# -- Step 3: Define what to upload --
# Relative to the current working directory. This folder is produced by
# `trainer.save_model("./openscholar_llama_final")` in the Llama fine-tuning
# cell, so that cell must have been run before this one.
local_model_path = "./openscholar_llama_final"

# The name you want for your model on the Hub
hf_repo_name = "bio-Llama-3.1-8B-Instruct"

# Fail before touching the Hub: otherwise an empty repository would be created
# and only the upload step would notice that there is nothing to upload.
if not os.path.isdir(local_model_path):
    raise FileNotFoundError(
        f"Model folder '{local_model_path}' not found; run the fine-tuning cell first."
    )

# -- Step 4: Create the repository if it doesn't exist --
api = HfApi()
# repo_id = f"{api.whoami()['name']}/{hf_repo_name}"
repo_id = f"bio-protocol/{hf_repo_name}"
# `exist_ok=True` makes creation idempotent without parsing the wording of the
# server's error message; every other failure (auth, permissions, network)
# still propagates.
api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
print(f"Repository '{repo_id}' is ready on Hugging Face Hub.")


# -- Step 5: Upload the folder! --
api.upload_folder(
    folder_path=local_model_path,
    repo_id=repo_id,
    repo_type="model",
    # These files are for resuming training and not needed for sharing
    # ignore_patterns=["optimizer.pt", "scheduler.pt"],
)

print(f"All done! Your model is available at: https://huggingface.co/{repo_id}")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Repository 'bio-protocol/bio-Llama-3.1-8B-Instruct' created on Hugging Face Hub.


Uploading...:   0%|          | 0.00/71.8M [00:00<?, ?B/s]

All done! Your model is available at: https://huggingface.co/bio-protocol/bio-Llama-3.1-8B-Instruct


In [None]:
# Load your text files and prepare the datastore
def load_text_files(directory_path, pattern="*.md"):
    """Read every file in ``directory_path`` whose name matches ``pattern``.

    Only the top level of the directory is scanned (non-recursive glob).
    Files are visited in sorted path order so that the position of a document
    in the returned lists is reproducible between runs; callers that derive a
    document id from that position therefore get stable ids.

    Args:
        directory_path: Directory to scan (``str`` or ``Path``).
        pattern: Glob pattern selecting the files to load. Defaults to
            ``"*.md"``, the pattern that was previously hard-coded.

    Returns:
        A ``(texts, file_paths)`` tuple of two parallel lists: the UTF-8
        decoded content of each file and the file's path as a string.

    Raises:
        UnicodeDecodeError: If a matched file is not valid UTF-8.
    """
    texts = []
    file_paths = []

    for doc_path in sorted(Path(directory_path).glob(pattern)):
        # A directory can match the pattern too (e.g. "notes.md/"); skip it
        # instead of failing when trying to read it.
        if not doc_path.is_file():
            continue
        texts.append(doc_path.read_text(encoding='utf-8'))
        file_paths.append(str(doc_path))

    return texts, file_paths

# Split documents into passages (250-word chunks as per paper)
def split_into_passages(text, chunk_size=250):
    """Cut ``text`` into consecutive passages of at most ``chunk_size`` words.

    Words are the whitespace-separated tokens produced by ``str.split()``;
    each passage is re-joined with single spaces, so original line breaks and
    repeated spaces are not preserved. The final passage may be shorter than
    ``chunk_size``. Text without any words yields an empty list.
    """
    tokens = text.split()
    return [
        ' '.join(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]

texts, file_paths = load_text_files("/content/drive/MyDrive/Random/openscholar/docs")

# Flatten every document into one passage list. `passage_metadata` is kept
# parallel to `all_passages`: entry i records which file / document position
# passage i was cut from.
all_passages, passage_metadata = [], []

for doc_id, source_path in enumerate(file_paths):
    doc_passages = split_into_passages(texts[doc_id])
    all_passages.extend(doc_passages)
    passage_metadata.extend(
        {'file_path': source_path, 'doc_id': doc_id} for _ in doc_passages
    )

print(f"Total passages: {len(all_passages)}")

Total passages: 1388


In [None]:
!pip install sentence-transformers datasets -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m126.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m90.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m62.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m38.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import random
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, InputExample, losses

# Select the training device: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(
    "GPU is available and will be used for training."
    if device.type == "cuda"
    else "GPU not available, using CPU. Training will be slow."
)

# ----------------------------------------------------
# 4.1 Prepare Training Data
# ----------------------------------------------------
def prepare_contriever_training_data(passages, num_samples=10000, min_negative_distance=5):
    """Build (query, positive, negative) triplets from an ordered passage list.

    For each sample a random passage is the query, a passage at most two
    positions away in the list is the positive, and a passage at least
    ``min_negative_distance`` positions away is the negative.

    Proximity is measured purely by list position. If ``passages`` is a
    concatenation of several documents, a positive can come from the
    neighbouring document at a document boundary.

    Args:
        passages: Ordered sequence of passage strings.
        num_samples: Number of triplets to generate. Values <= 0 yield ``[]``.
        min_negative_distance: Minimum index distance between the query and
            its negative. Defaults to 5, the previously hard-coded value.

    Returns:
        A list of ``{'query', 'positive', 'negative'}`` dicts.

    Raises:
        ValueError: If ``min_negative_distance`` is < 1, or if ``passages`` is
            too short for every query to have both a neighbour and a distant
            negative (fewer than ``max(2, 2 * min_negative_distance)`` items).
            Previously such inputs made the negative-sampling loop spin forever.
    """
    if num_samples <= 0:
        return []
    if min_negative_distance < 1:
        raise ValueError("min_negative_distance must be >= 1")

    n = len(passages)
    required = max(2, 2 * min_negative_distance)
    if n < required:
        raise ValueError(
            f"Need at least {required} passages to sample triplets, got {n}."
        )

    training_triplets = []
    for _ in range(num_samples):
        # 1. Random passage as query
        query_idx = random.randrange(n)

        # 2. Positive: a nearby passage. Only in-range neighbours are
        # candidates, so the positive can never collapse onto the query at the
        # edges of the list (clamping to [0, n-1] used to allow that).
        neighbours = [
            query_idx + offset
            for offset in (-2, -1, 1, 2)
            if 0 <= query_idx + offset < n
        ]
        positive_idx = random.choice(neighbours)

        # 3. Negative: uniform over all indices at least `min_negative_distance`
        # away. The eligible indices form two ranges, [0, q - d] and
        # [q + d, n - 1]; draw a position in their concatenation directly
        # rather than rejection-sampling.
        left_count = max(0, query_idx - min_negative_distance + 1)
        right_count = max(0, n - query_idx - min_negative_distance)
        pick = random.randrange(left_count + right_count)
        if pick < left_count:
            negative_idx = pick
        else:
            negative_idx = query_idx + min_negative_distance + (pick - left_count)

        training_triplets.append({
            'query': passages[query_idx],
            'positive': passages[positive_idx],
            'negative': passages[negative_idx],
        })
    return training_triplets

print(f"Using {len(all_passages)} passages loaded from your docs folder.")
training_data_dict = prepare_contriever_training_data(all_passages, num_samples=2000)
print(f"Generated {len(training_data_dict)} training triplets.")

# sentence-transformers consumes InputExample objects. Only the
# (query, positive) pair is passed on; the sampled 'negative' is not used.
train_examples = [
    InputExample(texts=[triplet['query'], triplet['positive']])
    for triplet in training_data_dict
]


# ----------------------------------------------------
# 4.2 Train Contriever
# ----------------------------------------------------

# Pre-trained Contriever checkpoint, wrapped as a SentenceTransformer
model_name = 'facebook/contriever'
model = SentenceTransformer(model_name, device=device)

# Batches of 32 shuffled (query, positive) pairs
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=32)

# Ranking loss over the (query, positive) pairs of each batch
train_loss = losses.MultipleNegativesRankingLoss(model)

# Schedule: one epoch for a quick demo, warm-up over the first 10% of steps
num_epochs = 1
warmup_steps = int(len(train_dataloader) * num_epochs * 0.1)

print("\nStarting the fine-tuning process...")
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=num_epochs,
    warmup_steps=warmup_steps,
    output_path='./contriever_fine_tuned',
    show_progress_bar=True,
)

print("\nTraining complete. Model saved to './contriever_fine_tuned'.")

GPU is available and will be used for training.
Generating dummy passages for demonstration...
Generated 2000 training triplets.


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]


Starting the fine-tuning process...


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mritvikbio[0m ([33mbio-protocol[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss



Training complete. Model saved to './contriever_fine_tuned'.


In [None]:
!pip install faiss-cpu sentence-transformers -q

In [None]:
import numpy as np
import faiss
import json
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

# 5.1 Embed every passage with the retriever currently bound to `model`
print("Generating embeddings for all passages...")
passage_embeddings = model.encode(
    all_passages,
    show_progress_bar=True,
    batch_size=128,  # lower this if GPU memory is tight
)

print(f"Embeddings generated. Shape: {passage_embeddings.shape}")


# 5.2 Build an exact inner-product FAISS index over the embeddings
print("\nBuilding FAISS index...")
embedding_dim = passage_embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)

# Vectors are cast to float32 before being added to the index
index.add(passage_embeddings.astype('float32'))
print(f"Index built. Total vectors in index: {index.ntotal}")


# 5.3 Persist the index and the passage store side by side
print("\nSaving index and passages...")
faiss.write_index(index, "passage_index.faiss")

# Row i of the index corresponds to passages[i] / metadata[i] in this file,
# which is what lets a search hit be mapped back to its text and source.
passage_store = {
    "passages": all_passages,
    "metadata": passage_metadata,
}
with open("passages.json", "w") as store_file:
    json.dump(passage_store, store_file, indent=4)

print("\nProcess complete. 'passage_index.faiss' and 'passages.json' are saved.")



Generating embeddings for all passages...


Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Embeddings generated. Shape: (500, 768)

Building FAISS index...
Index built. Total vectors in index: 500

Saving index and passages...

Process complete. 'passage_index.faiss' and 'passages.json' are saved.


In [None]:
!pip install FlagEmbedding -q

In [None]:
import random
import numpy as np
import torch
from tqdm.auto import tqdm
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding
# Note: The 'FlagReranker' import is removed as it's not needed for training.

# 6.1 Generate synthetic reranking data
def generate_reranking_data(passages, retriever, index, num_samples=1000, k=10):
    """Generate (query, passage, label) rows for training a reranker.

    For each sample a random passage is used as the query. One positive row
    pairs the query with itself (label 1); every retrieved passage whose text
    differs from the query becomes a negative row (label 0).

    NOTE(review): the positive passage is the query text itself and the
    negatives are the retriever's top hits. Presumably a stand-in for real
    relevance labels — confirm this is the intended training signal.

    Args:
        passages: List of passage strings; ``index`` row ``i`` is expected to
            correspond to ``passages[i]``.
        retriever: Object with ``encode(list_of_str, show_progress_bar=...)``
            returning a 2-D array with an ``astype`` method.
        index: FAISS-style index exposing ``ntotal`` and ``search(x, k=...)``.
        num_samples: Number of queries to sample.
        k: Number of neighbours to retrieve per query. Defaults to 10, the
            previously hard-coded value.

    Returns:
        A list of ``{'query', 'passage', 'label'}`` dicts.
    """
    reranking_data = []

    # Never ask for more neighbours than the index holds: FAISS fills missing
    # result slots with the label -1.
    top_k = min(k, index.ntotal)

    for _ in tqdm(range(num_samples), desc="Generating Reranking Data"):
        query_idx = random.randint(0, len(passages) - 1)
        query = passages[query_idx]

        # The actual passage is the positive example
        reranking_data.append({'query': query, 'passage': query, 'label': 1})

        if top_k <= 0:
            continue

        query_embedding = retriever.encode([query], show_progress_bar=False)
        _, indices = index.search(query_embedding.astype('float32'), k=top_k)

        # Others are negative examples
        for hit in indices[0]:
            # -1 means "no result"; indexing with it would silently return
            # the last passage.
            if hit < 0:
                continue
            passage = passages[hit]
            if passage != query:
                reranking_data.append({'query': query, 'passage': passage, 'label': 0})

    return reranking_data

# 6.2 Train BGE reranker
# Define model names
reranker_model_name = "BAAI/bge-reranker-large"

# Generate training data and create a Dataset.
# The retriever is the fine-tuned Contriever SentenceTransformer bound to
# `model` (the same object used to embed the passages for `index`). The name
# `retriever_model` used here before is not defined anywhere in the notebook
# and raised NameError on a fresh kernel.
reranking_data = generate_reranking_data(all_passages, model, index)
reranking_dataset = Dataset.from_list(reranking_data)

# Load tokenizer and model for the reranker.
# The checkpoint ships a single-logit classification head; asking for
# num_labels=2 with ignore_mismatched_sizes=True discards it and trains a
# freshly initialised 2-way head (see the "newly initialized" warning).
reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model_name)
reranker_model = AutoModelForSequenceClassification.from_pretrained(
    reranker_model_name,
    num_labels=2, # Use 2 for binary classification (relevant/not relevant)
    ignore_mismatched_sizes=True
)

# The reranker needs the query and passage to be tokenized together as a pair.
def preprocess_function(examples):
    """Tokenize a batch of (query, passage) pairs, truncating each pair to 512 tokens."""
    return reranker_tokenizer(
        examples['query'],
        examples['passage'],
        truncation=True,
        max_length=512
    )

# The 'label' column is kept; only the raw text columns are dropped.
tokenized_dataset = reranking_dataset.map(preprocess_function, batched=True, remove_columns=['query', 'passage'])

# Define training arguments
reranker_args = TrainingArguments(
    output_dir="./scientific_reranker",
    num_train_epochs=1, # Use 1 for a quick demo
    per_device_train_batch_size=8,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    logging_steps=100,
    save_strategy="epoch",
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is present
)

# Define a data collator to handle padding
data_collator = DataCollatorWithPadding(tokenizer=reranker_tokenizer)

# Initialize the Trainer
reranker_trainer = Trainer(
    model=reranker_model,
    args=reranker_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=reranker_tokenizer,
)

# Train the model
print("\nStarting the reranker fine-tuning process...")
reranker_trainer.train()
print("\nTraining complete.")



Generating Reranking Data:   0%|          | 0/1000 [00:00<?, ?it/s]

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at BAAI/bge-reranker-large and are newly initialized because the shapes did not match:
- classifier.out_proj.bias: found shape torch.Size([1]) in the checkpoint and torch.Size([2]) in the model instantiated
- classifier.out_proj.weight: found shape torch.Size([1, 1024]) in the checkpoint and torch.Size([2, 1024]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/10000 [00:00<?, ? examples/s]


Starting the reranker fine-tuning process...


Step,Training Loss
100,0.2229
200,0.0587
300,0.0501
400,0.0785
500,0.0402
600,0.022
700,0.0003
800,0.0002
900,0.0012
1000,0.0088



Training complete.


In [None]:
# 7.1 Generate synthetic QA data
def generate_qa_data(passages, num_samples=1000):
    """Generate synthetic question/answer pairs from randomly sampled passages.

    For each sample one passage is drawn at random (with replacement). A
    ``google/flan-t5-large`` text2text pipeline first writes a question from
    the first 500 characters of the passage, then answers that question with
    the full passage as context.

    Args:
        passages: List of passage strings to sample from.
        num_samples: Number of QA pairs to generate.

    Returns:
        A list of dicts with keys ``question`` (str), ``context`` (one-element
        list holding the passage), ``answer`` (str) and ``passage_indices``
        (one-element list holding the passage's index in ``passages``).
    """
    # Imported lazily so the pipeline machinery is only pulled in when QA
    # generation is actually requested.
    from transformers import pipeline
    qa_generator = pipeline("text2text-generation",
                           model="google/flan-t5-large",
                           device=0 if torch.cuda.is_available() else -1)

    qa_data = []
    for _ in tqdm(range(num_samples)):
        # Sample one passage as context
        passage_index = random.randint(0, len(passages) - 1)
        context_passage = passages[passage_index]

        # Generate question based on the single passage (first 500 chars only)
        prompt = f"Generate a scientific question based on this text: {context_passage[:500]}"
        # Only `max_new_tokens` is passed: it already took precedence over
        # `max_length`, and supplying both logged a warning on every call.
        question = qa_generator(prompt, max_new_tokens=64)[0]['generated_text']

        # Generate answer using the single context passage
        answer_prompt = f"Question: {question}\nContext: {context_passage}\nAnswer:"
        answer = qa_generator(answer_prompt, max_new_tokens=256)[0]['generated_text']

        qa_data.append({
            "question": question,
            "context": [context_passage], # Stored as a list so multi-passage contexts share the schema
            "answer": answer,
            "passage_indices": [passage_index]
        })

    return qa_data

# Build the synthetic QA set by sampling from `all_passages` with the default
# num_samples=1000. Long-running: the captured log shows about 5 s per sample.
qa_training_data = generate_qa_data(all_passages) # Use the loaded all_passages

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0
  0%|          | 0/1000 [00:00<?, ?it/s]Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Token indices sequence length is longer than the specified maximum sequence length for this model (1203 > 512). Running this sequence through the model will result in indexing errors
Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
  0%|          | 1/1000 [00:05<1:29:36,  5.38s/it]Both `max_new_tokens` (=256) and `max_length`(=64) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/trans

In [None]:
import json

# Write the generated QA examples to disk as pretty-printed JSON.
with open("qa_training_data.json", "w") as qa_file:
    json.dump(qa_training_data, qa_file, indent=4)

In [None]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
The token `langchain_learning` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `langc

In [None]:
from peft import LoraConfig, get_peft_model, TaskType
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling # <-- ADDED DataCollator
import torch
import gc
from transformers import BitsAndBytesConfig, AutoTokenizer, LlamaForCausalLM

# 1. Clear VRAM cache before loading
# (collect Python garbage first, then release PyTorch's cached CUDA blocks)
gc.collect()
torch.cuda.empty_cache()

# 2. Create a 4-bit quantization configuration
# This is the new, recommended way to handle quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16, # Use bfloat16 for A100 performance
    bnb_4bit_use_double_quant=True,
)

# 3. Load the model with the new quantization config
# Note: this rebinds `model_name`, which the Contriever training cell set to
# 'facebook/contriever'.
model_name = "meta-llama/Llama-3.1-8B-Instruct"

llama_model = LlamaForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config, # Pass the config object here
    device_map="auto",
)

# Load tokenizer as before
llama_tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding reuses the EOS token, so pad and EOS share one token id from here on.
llama_tokenizer.pad_token = llama_tokenizer.eos_token

print("Model loaded successfully!")
# Configure LoRA: rank-16 adapters on the four attention projections only
# NOTE(review): PEFT's quantization guide calls prepare_model_for_kbit_training()
# on a quantized model before get_peft_model(); it is not called here — confirm
# whether that is intentional.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)
# Rebinds `llama_model` to the PEFT-wrapped model returned by get_peft_model
llama_model = get_peft_model(llama_model, lora_config)

# Prompt layout for instruction tuning (generate_answer builds the same layout at inference time)
def format_training_examples(qa_data):
    """Render QA records into single-string training examples.

    Each record must provide ``question``, ``context`` (an iterable of passage
    strings) and ``answer``. Context passages are numbered from 1, cut to
    their first 500 characters and always suffixed with ``...``.

    Returns:
        A list of ``{"text": ...}`` dicts, one per input record, in order.
    """
    def _render(item):
        cited_context = "\n".join(
            f"[{number}] {passage[:500]}..."
            for number, passage in enumerate(item['context'], start=1)
        )
        prompt = (
            f"Question: {item['question']}\n\n"
            f"Context:\n{cited_context}\n\n"
            "Answer the question based on the provided context. Include citations in your answer."
        )
        return {"text": f"{prompt}\n\nAnswer: {item['answer']}"}

    return [_render(item) for item in qa_data]

# Format data
# qa_training_data = [...] # Assuming this exists
training_texts = format_training_examples(qa_training_data)
train_dataset = Dataset.from_list(training_texts)


# --- START: TOKENIZATION AND DATA COLLATOR ---

# Imported here because this cell is the only user of this collator.
from transformers import DataCollatorForSeq2Seq

MAX_SEQ_LENGTH = 512

# 1. Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of training texts and build causal-LM labels.

    An end-of-sequence token is appended to every example that was not
    truncated, so the model is trained to stop after the answer. Labels are an
    explicit copy of ``input_ids``; padding positions are masked later by the
    collator.

    Args:
        examples: Batch dict with a ``"text"`` list of strings.

    Returns:
        The tokenizer output with an added ``"labels"`` entry.
    """
    encoded = llama_tokenizer(examples["text"], truncation=True, max_length=MAX_SEQ_LENGTH)
    eos_id = llama_tokenizer.eos_token_id
    for input_ids, attention_mask in zip(encoded["input_ids"], encoded["attention_mask"]):
        # A sequence that already fills MAX_SEQ_LENGTH was cut mid-text;
        # marking that point as "end of answer" would teach the wrong thing.
        has_room = len(input_ids) < MAX_SEQ_LENGTH
        if has_room and (not input_ids or input_ids[-1] != eos_id):
            input_ids.append(eos_id)
            attention_mask.append(1)
    encoded["labels"] = [list(ids) for ids in encoded["input_ids"]]
    return encoded

tokenized_dataset = train_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"] # Remove the original text column
)

# 2. Data collator
# DataCollatorForLanguageModeling sets the label of every pad-token position
# to -100. This notebook sets pad_token = eos_token, so that would also mask
# the EOS labels added above. DataCollatorForSeq2Seq instead pads the existing
# `labels` with -100 and leaves real tokens (including EOS) untouched.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=llama_tokenizer,
    padding=True,
    label_pad_token_id=-100,
)

# --- END: TOKENIZATION AND DATA COLLATOR ---


# 8.3 Train Llama
# Match mixed precision to the bfloat16 compute dtype of the 4-bit
# quantization config when the GPU supports it; otherwise keep fp16 as before.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir="./openscholar_llama",
    num_train_epochs=2,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=16,   # effective batch size of 32 per device
    learning_rate=2e-4,
    # A fixed warmup_steps=100 exceeded the whole run: 1000 examples at an
    # effective batch of 32 for 2 epochs is only about 62 optimizer steps, so
    # the learning rate never reached its target. A ratio scales with the
    # dataset size.
    warmup_ratio=0.1,
    logging_steps=50,
    save_steps=500,
    bf16=use_bf16,
    fp16=not use_bf16,
    optim="paged_adamw_8bit",
    save_total_limit=2,
)

trainer = Trainer(
    model=llama_model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    tokenizer=llama_tokenizer,
)

# Train
trainer.train()

# Save the model (this is the folder the inference and Hub-upload cells read)
trainer.save_model("./openscholar_llama_final")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully!


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
50,2.6351


In [None]:
from transformers import AutoTokenizer, LlamaForCausalLM, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch

base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
finetuned_model_path = "./openscholar_llama_final"

# Tokenizer comes from the base checkpoint; padding reuses EOS as in training
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
tokenizer.pad_token = tokenizer.eos_token  # match training

# Same 4-bit NF4 / double-quant / bfloat16-compute setup used for fine-tuning
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Quantized base model, placed on the available device(s) automatically
base_model = LlamaForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=quant_config,
)

# Attach the saved LoRA adapter and switch to inference mode.
# Note: this rebinds `model`, which earlier cells used for the Contriever retriever.
model = PeftModel.from_pretrained(base_model, finetuned_model_path)
model.eval()


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora

In [None]:
def generate_answer(question, context_passages, return_full_text=True):
    """Answer ``question`` from ``context_passages`` with the fine-tuned model.

    The prompt uses the same layout as the training examples: numbered context
    passages (each cut to 500 characters and suffixed with ``...``) followed
    by the instruction and an ``Answer:`` cue. Uses the module-level
    ``tokenizer`` and ``model``. Sampling is enabled, so repeated calls can
    return different text.

    Args:
        question: The question string.
        context_passages: Iterable of passage strings to cite from.
        return_full_text: When True (default, the original behaviour) the
            decoded prompt is included in front of the answer. When False,
            only the newly generated text is returned, stripped of
            surrounding whitespace.

    Returns:
        The decoded text as a string.
    """
    context = "\n".join([f"[{i+1}] {ctx[:500]}..." for i, ctx in enumerate(context_passages)])
    prompt = f"""Question: {question}\n\nContext:\n{context}\n\nAnswer the question based on the provided context. Include citations in your answer.\n\nAnswer:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Number of prompt tokens, used to separate the prompt from the completion
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            do_sample=True,
            top_p=0.9,
            temperature=0.7,
            repetition_penalty=1.1,
            # Passed explicitly so generate() does not have to pick a pad id
            # itself and log a "Setting `pad_token_id`" warning on every call.
            pad_token_id=tokenizer.pad_token_id,
        )

    if return_full_text:
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()

# Smoke test: ask one question with two hand-written context passages and
# print whatever generate_answer returns.
question = "What is the purpose of the Higgs boson?"
context = [
    "The Higgs boson is a quantum excitation of the Higgs field, which gives mass to elementary particles.",
    "Its discovery at CERN in 2012 confirmed the final missing piece of the Standard Model."
]

print(generate_answer(question, context))


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Question: What is the purpose of the Higgs boson?

Context:
[1] The Higgs boson is a quantum excitation of the Higgs field, which gives mass to elementary particles....
[2] Its discovery at CERN in 2012 confirmed the final missing piece of the Standard Model....

Answer the question based on the provided context. Include citations in your answer.

Answer: The Higgs boson is a quantum excitation of the Higgs field, which gives mass to elementary particles. #### **Hyperspace** # **Hyperspace** - A term coined by Buckminster Fuller to describe any space beyond our three dimensions of length, width and depth. - In cosmology, often used to describe higher-dimensional spaces that might be inhabited by other forms of life. #### **Hyperthermia** - Extremely high temperatures—higher than the normal body temperature of about 98.6°F (37°C). - Some researchers have suggested that hyperthermia may be an effective way to kill cancer cells because these cells are less able than healthy cells to cool 