In [98]:
!pip3 install pydantic openai datasets trl transformers peft torch pandas wandb vllm tqdm fundus arxiv datasets clize xformers flash-attn llama-index

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [99]:



import json
import os
import pprint
import random
from datetime import datetime, timedelta
from typing import List, Optional, Literal

from fundus import PublisherCollection, Crawler
from llama_index.readers.papers import ArxivReader
from lmformatenforcer import JsonSchemaParser
from lmformatenforcer.integrations.transformers import build_transformers_prefix_allowed_tokens_fn
from openai import OpenAI
from pydantic import BaseModel, Field
from pydantic import BaseModel
from transformers import AutoTokenizer


# OpenAI-compatible client used by every generation/evaluation call in this notebook.
# The endpoint and token are read from the environment so they do not have to live in
# the notebook; the previous literals remain as fallbacks so existing setups keep
# working unchanged. Dedicated variable names are used on purpose (not OPENAI_API_KEY):
# a real OpenAI key that happens to be exported must never be sent to this server.
client = OpenAI(
    base_url=os.environ.get("VLLM_BASE_URL", "http://20.81.188.27:8000/v1"),
    api_key=os.environ.get("VLLM_API_KEY", "token-abc123"),
)


def load_arxiv_papers(search_query: str, max_results: int = 1):
    """Fetch papers for ``search_query`` through a fresh ``ArxivReader``.

    Args:
        search_query: Query string handed to the reader as-is.
        max_results: Forwarded unchanged as the reader's ``max_results``.

    Returns:
        Whatever ``ArxivReader.load_data`` returns; the notebook iterates over it
        and reads ``.text`` from each item.
    """
    return ArxivReader().load_data(search_query=search_query, max_results=max_results)

# Hugging Face model id used only to load the tokenizer in this cell.
MODEL_NAME = "mistralai/Mistral-Large-Instruct-2411"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Seed search: fetch a single paper to try the QA pipeline on.
documents = load_arxiv_papers("(multimodality OR multitask learning) AND ((low-resource languages) OR (code-switching)) -SoupLM", max_results=1)

# Print the text of each loaded document.
# NOTE: the loop variable `document` stays bound after the loop and is read by later
# cells (`document`, `generate_qa(document.text)`, `evaluate_qa(document.text, ...)`),
# so those cells operate on the LAST document loaded here.
for document in documents:
    print(document.text)


M3P: Learning Universal Representations via Multitask Multilingual
Multimodal Pre-training
Minheng Ni1*† Haoyang Huang2† Lin Su3† Edward Cui3 Taroon Bharti3 Lijuan Wang4
Jianfeng Gao5 Dongdong Zhang2 Nan Duan2‡
1 Research Center for Social Computing and Information Retrieval
Harbin Institute of Technology, China
2 Natural Language Computing, Microsoft Research Asia, China
3 Bing Multimedia Team, Microsoft, China
4 Cloud+AI, Microsoft, United States
5 Deep Learning, Microsoft Research Redmond, United States
mhni@ir.hit.edu.cn
{haohua, lins, edwac, tbharti, lijuanw, jfgao, Dongdong.Zhang, nanduan}@microsoft.com
Abstract
We present M3P, a Multitask Multilingual Multimodal
Pre-trained model that combines multilingual pre-training
and multimodal pre-training into a uniﬁed framework via
multitask pre-training. Our goal is to learn universal repre-
sentations that can map objects occurred in different modal-
ities or texts expressed in different languages into a com-
mon semantic space. In ad

In [81]:
# Display the repr of `document`, the variable left bound by the loading loop.
# NOTE(review): this cell's saved execution count is lower than the loading cell's,
# so the saved output may come from an earlier run — re-run top to bottom to confirm.
document

Document(id_='a44d6c99-896d-4657-8b84-5d9016aa665f', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='The following is a summary of the paper: M3P: Learning Universal Representations via Multitask Multilingual Multimodal Pre-training\n\nSummary: We present M3P, a Multitask Multilingual Multimodal Pre-trained model that\ncombines multilingual pre-training and multimodal pre-training into a unified\nframework via multitask pre-training. Our goal is to learn universal\nrepresentations that can map objects occurred in different modalities or texts\nexpressed in different languages into a common semantic space. In addition, to\nexplicitly encourage fine-grained alignment between images and non-English\nlanguages, we also propose Multimodal Code-switched Training (MCT) to combine\nmonolingual pre-training and 

In [100]:
# Structured-output schema for one generation call: exactly three question/answer pairs.
# `QA.schema()` is sent to the server as `guided_json`, and responses are parsed back
# with `QA.parse_raw`. Downstream code looks fields up by name as `question{n}` /
# `answer{n}` for n in 1..3, so the numbering is part of the contract.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose —
# pydantic copies a model docstring into the generated schema's description, which
# would change the schema sent to the server.
class QA(BaseModel):
    # Pair 1
    question1: str = Field(description="First general knowledge question about the core topic and concepts")
    answer1: str = Field(description="Clear, factual answer focused on general knowledge")
    # Pair 2
    question2: str = Field(description="Second general knowledge question about the core topic and concepts") 
    answer2: str = Field(description="Clear, factual answer focused on general knowledge")
    # Pair 3
    question3: str = Field(description="Third general knowledge question about the core topic and concepts")
    answer3: str = Field(description="Clear, factual answer focused on general knowledge")

# Default system prompt for `generate_qa`: asks for general, standalone questions and
# answers derived from the paper rather than questions about the paper itself.
# This text is sent to the model verbatim, so any edit changes generation behavior.
DEFAULT_SYSTEM_CONTENT = """You are an expert at creating high-quality training data for language models.
Given academic research content, extract the key concepts and convert them into general knowledge questions and answers.
Questions should:
- Be general and broadly applicable
- Focus on core concepts and ideas
- Be written as standalone questions without referencing any specific research
- Test understanding of the topic area

Answers should:
- Provide clear, factual information
- Be written as standalone knowledge
- Focus on general concepts rather than specific research
- Be useful for general learning about the topic"""

def generate_qa(document_text: str, system_content: str = DEFAULT_SYSTEM_CONTENT):
    """Request three schema-constrained QA pairs about ``document_text``.

    Args:
        document_text: Source text interpolated into the user message.
        system_content: System prompt for the request; defaults to
            ``DEFAULT_SYSTEM_CONTENT``.

    Returns:
        The raw chat-completion response from ``client``. The generated JSON is in
        ``response.choices[0].message.content`` and is constrained by ``QA``'s schema
        through the ``guided_json`` entry of ``extra_body``.
    """
    # Options the OpenAI client does not model natively travel in `extra_body`.
    server_side_options = {
        "min_p": 0.4,
        "min_tokens": 2000,
        "guided_json": QA.schema(),
        "repetition_penalty": 1.1,
    }
    conversation = [
        {"role": "system", "content": system_content},
        {
            "role": "user",
            "content": f"Based on these concepts, generate 3 general knowledge questions and answers about: {document_text}",
        },
    ]

    return client.chat.completions.create(
        model="/home/lain/text-generation-webui/models/mistralai_Mistral-Large-Instruct-2411",
        messages=conversation,
        n=1,
        temperature=1.5,
        max_tokens=4000,
        presence_penalty=0.1,
        frequency_penalty=0.1,
        stop="<|eot_id|>",
        extra_body=server_side_options,
    )

# Generate QA pairs (default system prompt) for `document`, the variable left bound by
# the loading loop; the trailing bare name displays the raw completion object.
qa_response = generate_qa(document.text)
qa_response

ChatCompletion(id='chatcmpl-fc639abe4307403fa9a0e3800450479c', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=' \t\t\t\t\t\t\t\t\t\t\t{\n"question1": "What is the primary goal of multimodal pre-training in language models?",\n"answer1": "The primary goal of multimodal pre-training in language models is to learn universal representations that can map objects occurring in different modalities, such as images and text, into a common semantic space. This enables the model to understand and relate information across different types of inputs."\n\t\t\t\t\t\t\t\t\t\t\t,\n"question2":"What is the purpose of multilingual pre-training in language models?",\n"answer2": "The purpose of multilingual pre-training in language models is to enable the model to understand and generate text in multiple languages. This involves learning representations that can capture the semantics of text expressed in different languages and mapping them into a shared

In [101]:
# Structured-output schema for the evaluator: per QA pair, arguments for/against and a
# "y"/"n" inclusion verdict, plus a suggested query for the next paper search.
# `QAEvaluation.schema()` is sent as `guided_json`; the archiving code reads the result
# by key as `qa_pair{n}_include`, `qa_pair{n}_arguments_for`,
# `qa_pair{n}_arguments_against` (n in 1..3) and `next_search_query`.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose —
# pydantic copies a model docstring into the generated schema's description, which
# would change the schema sent to the server.
class QAEvaluation(BaseModel):
    # Verdict for QA pair 1
    qa_pair1_arguments_for: list[str] = Field(description="Arguments in favor of including first QA pair")
    qa_pair1_arguments_against: list[str] = Field(description="Arguments against including first QA pair") 
    qa_pair1_include: Literal["y", "n"] = Field(description="Whether to include first QA pair")

    # Verdict for QA pair 2
    qa_pair2_arguments_for: list[str] = Field(description="Arguments in favor of including second QA pair")
    qa_pair2_arguments_against: list[str] = Field(description="Arguments against including second QA pair")
    qa_pair2_include: Literal["y", "n"] = Field(description="Whether to include second QA pair")

    # Verdict for QA pair 3
    qa_pair3_arguments_for: list[str] = Field(description="Arguments in favor of including third QA pair")
    qa_pair3_arguments_against: list[str] = Field(description="Arguments against including third QA pair")
    qa_pair3_include: Literal["y", "n"] = Field(description="Whether to include third QA pair")
    
    # Drives the next iteration of the research loop.
    next_search_query: str = Field(description="Recommended next search query for finding related but diverse papers")

def evaluate_qa(document_text: str, qa_output: QA):
    """Ask the model to judge three generated QA pairs against their source text.

    Args:
        document_text: The paper text the QA pairs were generated from.
        qa_output: Object exposing ``question1..3`` and ``answer1..3``.

    Returns:
        The raw chat-completion response from ``client``. The verdict JSON is in
        ``response.choices[0].message.content`` and is constrained by
        ``QAEvaluation``'s schema through ``guided_json``.
    """
    verdict_schema = QAEvaluation.schema()

    # Both prompts are sent verbatim, including the indentation inside the literals.
    reviewer_instructions = """You are an expert evaluator of training data quality for language models.
    For each question-answer pair:
    1. List key arguments for and against including it (considering accuracy, clarity, generalizability)
    2. Make a binary decision (y/n) if it should be included in the final dataset
    
    Additionally, recommend a search query for finding the next paper to analyze. The query should:
    - Be related to but very distinct from the current paper's topic
    - Help build a diverse dataset while maintaining topical coherence
    - Potentially focus on an interesting direction suggested by the current paper"""

    review_request = f"""Evaluate these QA pairs generated from the paper:
    
    Original Paper Text: {document_text}
    
    QA Pair 1:
    Q: {qa_output.question1}
    A: {qa_output.answer1}
    
    QA Pair 2:
    Q: {qa_output.question2}
    A: {qa_output.answer2}
    
    QA Pair 3:
    Q: {qa_output.question3}
    A: {qa_output.answer3}
    
    Provide a structured evaluation following the schema, including a recommended next search query."""

    conversation = [
        {"role": "system", "content": reviewer_instructions},
        {"role": "user", "content": review_request},
    ]
    # Options the OpenAI client does not model natively travel in `extra_body`.
    server_side_options = {
        "min_p": 0.4,
        "min_tokens": 2000,
        "guided_json": verdict_schema,
        "repetition_penalty": 1.1,
    }

    return client.chat.completions.create(
        model="/home/lain/text-generation-webui/models/mistralai_Mistral-Large-Instruct-2411",
        messages=conversation,
        n=1,
        temperature=1.5,
        max_tokens=4000,
        presence_penalty=0.1,
        frequency_penalty=0.1,
        stop="<|eot_id|>",
        extra_body=server_side_options,
    )

# Parse the generated JSON into a QA model, then have the evaluator judge the pairs
# against the same document text; the trailing bare name displays the raw response.
evaluation_response = evaluate_qa(document.text, QA.parse_raw(qa_response.choices[0].message.content))
evaluation_response


ChatCompletion(id='chatcmpl-8ab47a0ccca24c05ac89f3e672e213a8', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='  {\n    "qa_pair1_arguments_for": [\n        "The question directly relates to the main topic of the paper",\n        "The answer accurately reflects the goal of multimodal pre-training as described in the paper",\n        "The pair is clear and concise"\n    ],\n    "qa_pair1_arguments_against": [],\n    "qa_pair1_include": "y",\n\n    "qa_pair2_arguments_for": [\n        "The question is relevant to the paper\'s scope",\n        "The answer accurately explains the purpose of multilingual pre-training",\n        "The pair is well-structured and easy to understand"\n    ],\n    "qa_pair2_arguments_against": [],\n    "qa_pair2_include": "y",\n\n    "qa_pair3_arguments_for": [\n        "The question addresses a specific and novel aspect of the paper",\n        "The answer provides a clear explanation of MCT and its purpose",\

In [94]:
import pandas as pd
from pathlib import Path
import json
from tqdm import tqdm

def archive_enchanted_dialogues(knowledge_exchange_data, assessment_data, archive_path="wand_university_training_grimoire.csv"):
    """Append the QA pairs the evaluator accepted to a pipe-separated CSV archive.

    Args:
        knowledge_exchange_data: Object exposing ``question1..3`` / ``answer1..3``
            attributes (the notebook passes a parsed ``QA`` instance).
        assessment_data: JSON string holding ``qa_pair{n}_include`` ("y"/"n"),
            ``qa_pair{n}_arguments_for``, ``qa_pair{n}_arguments_against`` for
            n in 1..3, and ``next_search_query``.
        archive_path: CSV file to create or append to (``|`` separated).

    Returns:
        The number of QA pairs written by this call (0 when none were accepted).

    Raises:
        json.JSONDecodeError: If ``assessment_data`` is not valid JSON.
        KeyError: If an ``include`` flag is missing, or ``next_search_query`` is
            missing while at least one pair is accepted.
    """
    print("\n🌟 INITIATING ARCHIVAL PROCESS OF ENCHANTED DIALOGUES 🌟")

    # Fixed column order: appended rows must always line up with the header that was
    # written when the file was created.
    archive_columns = ['inquiry', 'wisdom', 'supporting_thesis', 'counterpoints', 'search_query']

    assessment_grimoire = json.loads(assessment_data)
    validated_exchanges = []

    for exchange_id in tqdm(range(1, 4), desc="VALIDATING KNOWLEDGE EXCHANGES"):
        inquiry = getattr(knowledge_exchange_data, f'question{exchange_id}')
        wisdom = getattr(knowledge_exchange_data, f'answer{exchange_id}')
        counterpoints = assessment_grimoire.get(f'qa_pair{exchange_id}_arguments_against', [])

        if assessment_grimoire[f'qa_pair{exchange_id}_include'] == 'y':
            supporting_thesis = assessment_grimoire.get(f'qa_pair{exchange_id}_arguments_for', [])

            print(f"\n✨ ACCEPTED KNOWLEDGE EXCHANGE {exchange_id}:")
            print(f"📝 INQUIRY: {inquiry}")
            print(f"💭 RESPONSE: {wisdom}")
            print("✅ POSITIVE FEEDBACK:")
            for strength in supporting_thesis:
                print(f"  • {strength}")

            validated_exchanges.append({
                'inquiry': inquiry,
                'wisdom': wisdom,
                'supporting_thesis': ', '.join(supporting_thesis),
                'counterpoints': ', '.join(counterpoints),
                'search_query': assessment_grimoire['next_search_query']
            })
        else:
            print(f"\n⚠️ DISCARDED KNOWLEDGE EXCHANGE {exchange_id}:")
            print(f"📝 INQUIRY: {inquiry}")
            print(f"💭 RESPONSE: {wisdom}")
            print("❌ CRITICAL FEEDBACK:")
            for critique in counterpoints:
                print(f"  • {critique}")

    if not validated_exchanges:
        # Writing an empty frame would create a header-less file; every later append
        # (which skips the header because the file exists) would then leave the
        # archive without column names. Leave the archive untouched instead.
        print("\n📭 NO EXCHANGES PASSED VALIDATION - ARCHIVE LEFT UNTOUCHED")
        return 0

    print("\n📚 TRANSFORMING KNOWLEDGE INTO STRUCTURED FORMAT...")
    knowledge_codex = pd.DataFrame(validated_exchanges, columns=archive_columns)

    print("💾 PRESERVING IN THE GRAND ARCHIVES...")
    archive_file = Path(archive_path)
    # Emit the header only when the file is new or still empty.
    write_header = (not archive_file.exists()) or archive_file.stat().st_size == 0
    knowledge_codex.to_csv(archive_file, mode='a', header=write_header, index=False, sep='|')

    print(f"✨ SUCCESSFULLY ARCHIVED {len(validated_exchanges)} EXCHANGES ✨")
    return len(validated_exchanges)

def synthesize_and_evaluate_knowledge(source_manuscript, research_focus):
    """Generate QA pairs for one manuscript, evaluate them, and archive the accepted ones.

    Args:
        source_manuscript: Text of the paper to turn into QA pairs.
        research_focus: The search query that produced this manuscript. It is NOT
            sent to the model; it is only used as the fallback next query when the
            evaluator does not return a usable one.

    Returns:
        A ``(exchanges_preserved, next_research_focus)`` tuple: the count reported
        by ``archive_enchanted_dialogues`` and the query to search next.

    Raises:
        json.JSONDecodeError: If the evaluator's response is not valid JSON.
    """
    print("\n🔮 INITIATING KNOWLEDGE SYNTHESIS AND EVALUATION 🔮")

    print("📖 GENERATING ENLIGHTENED KNOWLEDGE EXCHANGES...")
    # generate_qa's second parameter is the *system prompt*. Passing the search query
    # there replaced the QA-writing instructions with e.g. "(large language models)",
    # so rely on generate_qa's default system prompt instead.
    knowledge_response = generate_qa(source_manuscript)
    knowledge_data = QA.parse_raw(knowledge_response.choices[0].message.content)

    print("⚖️ CONDUCTING RIGOROUS ACADEMIC EVALUATION...")
    assessment_response = evaluate_qa(source_manuscript, knowledge_data)
    assessment_results = assessment_response.choices[0].message.content

    exchanges_preserved = archive_enchanted_dialogues(knowledge_data, assessment_results)

    # Take the evaluator's suggested query; keep the current focus if it is missing
    # or blank so the research loop never continues with an empty search string.
    assessment_grimoire = json.loads(assessment_results)
    suggested_focus = assessment_grimoire.get('next_search_query')
    if isinstance(suggested_focus, str) and suggested_focus.strip():
        next_research_focus = suggested_focus
    else:
        next_research_focus = research_focus

    return exchanges_preserved, next_research_focus

# Seed query for the first arXiv search; each cycle replaces it with the evaluator's
# suggested follow-up query.
current_research_focus = "(large language models)"

# Loop bookkeeping
preserved_exchanges_count = 0
research_cycles = 0
maximum_research_cycles = 10  # Adjustable based on arcane energy levels
# A failed attempt does not advance `research_cycles`, so without a cap a persistent
# failure (server down, query with no results) would retry the same query forever.
maximum_consecutive_failures = 3
consecutive_failures = 0

print("\n🎓 WAND UNIVERSITY KNOWLEDGE ACQUISITION SYSTEM INITIALIZED 🎓")
print("="*80)

with tqdm(total=maximum_research_cycles, desc="RESEARCH CYCLES PROGRESS") as pbar:
    while research_cycles < maximum_research_cycles:
        try:
            print(f"\n🔍 SEARCHING ARCHIVES WITH FOCUS: {current_research_focus}")
            ancient_manuscripts = load_arxiv_papers(current_research_focus, max_results=1)
            if not ancient_manuscripts:
                # Previously this fell through to an unbound `next_focus` (NameError).
                raise LookupError(f"no manuscripts found for focus: {current_research_focus}")

            # Bound before the loop so it is always defined afterwards.
            next_focus = current_research_focus
            for manuscript in tqdm(ancient_manuscripts, desc="PROCESSING MANUSCRIPTS"):
                print("\n" + "="*80)
                print(f"📜 ANALYZING MAGICAL MANUSCRIPT:")
                print("="*80 + "\n")
                print(manuscript.text[:500] + "......")  # Print first 500 characters of the manuscript
                print("\n" + "="*80 + "\n")

                exchanges_preserved, next_focus = synthesize_and_evaluate_knowledge(manuscript.text, current_research_focus)
                preserved_exchanges_count += exchanges_preserved
                print(f"📊 PRESERVED {exchanges_preserved} MAGICAL EXCHANGES. TOTAL IN ARCHIVES: {preserved_exchanges_count}")
                print(f"🎯 NEXT RESEARCH FOCUS: {next_focus}\n")

        except Exception as e:
            # Best-effort: report and retry the same focus, but only a bounded number
            # of times in a row.
            consecutive_failures += 1
            print(f"⚡ MAGICAL ANOMALY DETECTED: {str(e)}")
            print(f"⚠️ FAILED ATTEMPT {consecutive_failures}/{maximum_consecutive_failures} FOR THIS CYCLE")
            if consecutive_failures >= maximum_consecutive_failures:
                print("🛑 TOO MANY CONSECUTIVE ANOMALIES - HALTING RESEARCH CYCLES")
                break
            continue

        # Cycle succeeded: reset the failure streak and advance.
        consecutive_failures = 0
        current_research_focus = next_focus
        research_cycles += 1
        pbar.update(1)

        print(f"🔄 COMPLETED RESEARCH CYCLE {research_cycles}. NEXT FOCUS: {current_research_focus}")

print(f"\n🏆 GRAND ARCHIVE COMPLETE! TOTAL MAGICAL EXCHANGES PRESERVED: {preserved_exchanges_count} 🏆")



🎓 WAND UNIVERSITY KNOWLEDGE ACQUISITION SYSTEM INITIALIZED 🎓


RESEARCH CYCLES PROGRESS:   0%|          | 0/10 [00:00<?, ?it/s]


🔍 SEARCHING ARCHIVES WITH FOCUS: (large language models)





📜 ANALYZING MAGICAL MANUSCRIPT:

Lost in Translation
May 2023
A report from
Gabriel Nicholas 
Aliya Bhatia
Large Language Models in 
Non-English Content Analysis......



🔮 INITIATING KNOWLEDGE SYNTHESIS AND EVALUATION 🔮
📖 GENERATING ENLIGHTENED KNOWLEDGE EXCHANGES...


In [None]:
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments, AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model
import torch
import pandas as pd
import wandb

# LoRA fine-tuning configuration.
# NOTE: model_id must be a plain string. A trailing comma after the literal turns it
# into a 1-tuple, which `from_pretrained` cannot load.
model_id = "/home/lain/text-generation-webui/models/mistralai_Mistral-Large-Instruct-2411"
# Must match the file written by the archiving cell (its default `archive_path`).
training_archive_path = "wand_university_training_grimoire.csv"
output_dir = "./wand_university_lora"
num_train_epochs = 4
per_device_train_batch_size = 1
learning_rate = 1e-5
max_seq_length = 4096
lora_r = 8
lora_alpha = 64
lora_dropout = 0.05

# Track the run in Weights & Biases.
wandb.init(
    project="wand-university-lora",
    name="arcane-knowledge-transfer",
    config={
        "epochs": num_train_epochs,
        "batch_size": per_device_train_batch_size,
        "learning_rate": learning_rate,
        "lora_rank": lora_r,
        "lora_alpha": lora_alpha
    }
)

# Load the archived QA pairs (pipe-separated; columns: inquiry, wisdom,
# supporting_thesis, counterpoints, search_query).
print("🔮 SUMMONING TRAINING DATA FROM THE ANCIENT GRIMOIRE... 📚")
df = pd.read_csv(training_archive_path, sep='|')
# Only the question and answer are required for training. `counterpoints` is empty
# for pairs with no arguments against them and is read back as NaN, so a blanket
# dropna() would discard exactly those pairs.
df = df.dropna(subset=['inquiry', 'wisdom'])

# Load tokenizer and base model.
print("⚡ INVOKING THE SACRED TOKENIZER AND MODEL... 🤖")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto')

# LoRA adapter configuration.
config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=[
        "q_proj",
        "k_proj", 
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    bias="none",
    lora_dropout=lora_dropout,
    task_type="CAUSAL_LM",
)

# Wrap the base model with the adapter and report the trainable parameter count.
model = get_peft_model(model, config)
model.print_trainable_parameters()

# Build the training texts. The archive has no `text` column, so each row is rendered
# as a one-turn conversation (question -> answer) with the model's own chat template.
print("📜 PREPARING THE SACRED TEXTS... ✨")


def render_training_text(inquiry, wisdom):
    """Render one archived QA pair as a single chat-formatted training string."""
    rendered = tokenizer.apply_chat_template(
        [
            {"role": "user", "content": inquiry},
            {"role": "assistant", "content": wisdom},
        ],
        tokenize=False,
    )
    # NOTE(review): the trainer is expected to add special tokens again when it
    # tokenizes the text field, so drop a leading BOS to avoid duplicating it —
    # confirm against the installed TRL version.
    bos_token = tokenizer.bos_token
    if bos_token and rendered.startswith(bos_token):
        rendered = rendered[len(bos_token):]
    return rendered


data = [
    {"text": render_training_text(inquiry, wisdom)}
    for inquiry, wisdom in zip(df['inquiry'].astype(str), df['wisdom'].astype(str))
]
dataset = Dataset.from_list(data)

# Training hyperparameters.
args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    learning_rate=learning_rate,
    report_to="wandb"  # log metrics to the wandb run started above
)

# Supervised fine-tuning on the rendered `text` field.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    dataset_text_field='text',
    max_seq_length=max_seq_length,
)

print("🌟 COMMENCING THE SACRED LORA TRAINING RITUAL... 🧙‍♂️")
trainer.train()

# Save the trained adapter weights.
print("💾 PRESERVING THE ENHANCED MODEL IN THE ARCANE ARCHIVES... 📦")
trainer.model.save_pretrained(output_dir)

# Close the wandb run.
wandb.finish()

print("🎉 THE SACRED LORA TRAINING RITUAL IS COMPLETE! 🎊")


In [None]:
### Once we have enough VRAM, we can do dynamic LoRA loading via https://docs.vllm.ai/en/latest/usage/lora.html#dynamically-serving-lora-adapters
### For now, this is not possible, and we need to restart the vllm server to load a new LoRA adapter... 