In [None]:
!pip install transformers
!pip install wandb
!pip install rouge_score
!pip install nltk
!pip install hf_xet
!pip install logger



In [None]:
import torch
from transformers import (
    BertModel, 
    BertTokenizer, 
    GPT2LMHeadModel, 
    GPT2Tokenizer,
    EncoderDecoderModel, 
    EncoderDecoderConfig,
    get_linear_schedule_with_warmup
)
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import json
import os
import numpy as np
import nltk
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction
import logging
import random

In [None]:
# Configure the root logger once: INFO and above, with timestamp, logger name and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
# Module-level logger shared by the functions defined in the cells below.
logger = logging.getLogger(__name__)

In [None]:
# Download the NLTK 'punkt' / 'punkt_tab' resources up front; nltk.word_tokenize is
# called later in calculate_bleu_score (presumably these are the tokenizer models it
# needs — confirm against the installed NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random generators with the same value.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and ``torch``
            (including every CUDA device when CUDA is available).
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # CUDA generators are only touched when a GPU is actually present.
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

In [10]:
class QuestionGenerationDataset(Dataset):
    """
    Context/question pairs prepared for training the BERT-encoder / GPT-2-decoder model.

    Every item encodes ``[CONTEXT] <context> [<DIFFICULTY>] [QUESTION]`` with the BERT
    tokenizer and one question of that context (followed by the GPT-2 EOS token) with
    the GPT-2 tokenizer.

    Args:
        data: Sequence of dicts with a ``"context"`` string and a non-empty
            ``"qa_pairs"`` list whose entries have ``"question"`` and ``"difficulty"``.
        bert_tokenizer: Tokenizer used for the encoder input.
        gpt2_tokenizer: Tokenizer used for the decoder target.
        max_encoder_length: Fixed (padded/truncated) encoder sequence length.
        max_decoder_length: Fixed (padded/truncated) decoder sequence length.
    """
    def __init__(self, data, bert_tokenizer, gpt2_tokenizer, max_encoder_length=512, max_decoder_length=64):
        self.data = data
        self.bert_tokenizer = bert_tokenizer
        self.gpt2_tokenizer = gpt2_tokenizer
        self.max_encoder_length = max_encoder_length
        self.max_decoder_length = max_decoder_length

        # add_special_tokens returns the number of tokens actually added, so the log
        # lines only fire when the tokenizers did not already know the tags.
        special_tokens = {"additional_special_tokens": ["[EASY]", "[MEDIUM]", "[HARD]", "[CONTEXT]", "[QUESTION]"]}
        if self.bert_tokenizer.add_special_tokens(special_tokens):
            logger.info("Added special tokens to BERT tokenizer")

        if self.gpt2_tokenizer.add_special_tokens(special_tokens):
            logger.info("Added special tokens to GPT2 tokenizer")

    def __len__(self):
        """Number of contexts (not QA pairs) in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return tensors and raw strings for one randomly chosen QA pair of context ``idx``.

        Returns:
            dict with ``input_ids`` / ``attention_mask`` (encoder), ``decoder_input_ids`` /
            ``decoder_attention_mask`` (decoder), ``labels`` (decoder ids with padding
            replaced by -100) and the raw ``raw_context``, ``raw_question``, ``difficulty``.
        """
        item = self.data[idx]
        context = item["context"]

        # One QA pair is sampled per access, so repeated reads of the same index can
        # yield different questions (driven by the global ``random`` state).
        qa_pair = random.choice(item["qa_pairs"])
        question = qa_pair["question"]
        difficulty = qa_pair["difficulty"]

        difficulty_tag = f"[{difficulty.upper()}]"
        formatted_input = f"[CONTEXT] {context} {difficulty_tag} [QUESTION]"

        # NOTE(review): the difficulty tag sits at the end of the input, so with
        # right-side truncation (the tokenizer default) a context longer than
        # max_encoder_length would lose it — confirm contexts fit.
        encoder_inputs = self.bert_tokenizer(
            formatted_input,
            max_length=self.max_encoder_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Append an explicit EOS so the target still carries a stop signal once the
        # padding positions are excluded from the loss below (the pad token may be
        # the very same token as EOS, as it is for GPT-2 in this notebook).
        eos_token = getattr(self.gpt2_tokenizer, "eos_token", None) or ""
        decoder_inputs = self.gpt2_tokenizer(
            question + eos_token,
            max_length=self.max_decoder_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        decoder_input_ids = decoder_inputs.input_ids.squeeze(0)
        decoder_attention_mask = decoder_inputs.attention_mask.squeeze(0)

        # Padding is identified through the attention mask rather than the token id,
        # because the real EOS and the pad token can share an id. -100 is the index
        # PyTorch's cross-entropy ignores.
        labels = decoder_input_ids.clone()
        labels[decoder_attention_mask == 0] = -100

        return {
            "input_ids": encoder_inputs.input_ids.squeeze(0),
            "attention_mask": encoder_inputs.attention_mask.squeeze(0),
            "decoder_input_ids": decoder_input_ids,
            "decoder_attention_mask": decoder_attention_mask,
            "labels": labels,
            "raw_context": context,
            "raw_question": question,
            "difficulty": difficulty
        }

In [11]:
def load_data(data_path):
    """Load the dataset from a JSON file.

    Args:
        data_path: ``str`` or ``os.PathLike`` pointing at a file whose name ends in
            ``.json`` (case-insensitive).

    Returns:
        The parsed JSON content.

    Raises:
        ValueError: If the file name does not end in ``.json``.
    """
    path_str = os.fspath(data_path)
    if not path_str.lower().endswith('.json'):
        raise ValueError("Unsupported file format. Please use a JSON file.")

    # JSON is UTF-8 by specification; do not rely on the platform's locale encoding.
    with open(path_str, 'r', encoding='utf-8') as f:
        return json.load(f)

In [12]:
def create_bert_gpt2_model():
    """Create the BERT-encoder / GPT-2-decoder model and its two tokenizers.

    Steps performed:
      * load ``bert-base-uncased`` and ``gpt2`` tokenizers, reusing EOS as the GPT-2
        pad token when none is defined;
      * register the difficulty/marker special tokens on both tokenizers and resize
        both embedding matrices accordingly;
      * set the generation-related ids and the vocabulary size on the shared config;
      * freeze every encoder parameter except layers 10 and 11 and the word-embedding
        matrix.

    Returns:
        tuple: ``(model, bert_tokenizer, gpt2_tokenizer)``.
    """
    bert_model_name = "bert-base-uncased"
    gpt2_model_name = "gpt2"

    bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)

    if gpt2_tokenizer.pad_token is None:
        gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

    model = EncoderDecoderModel.from_encoder_decoder_pretrained(
        bert_model_name,
        gpt2_model_name
    )

    model.config.add_cross_attention = True
    model.config.decoder.add_cross_attention = True

    model.config.decoder_start_token_id = gpt2_tokenizer.bos_token_id
    model.config.eos_token_id = gpt2_tokenizer.eos_token_id
    model.config.pad_token_id = gpt2_tokenizer.pad_token_id

    special_tokens = {"additional_special_tokens": ["[EASY]", "[MEDIUM]", "[HARD]", "[CONTEXT]", "[QUESTION]"]}
    bert_tokenizer.add_special_tokens(special_tokens)
    gpt2_tokenizer.add_special_tokens(special_tokens)

    model.encoder.resize_token_embeddings(len(bert_tokenizer))
    model.decoder.resize_token_embeddings(len(gpt2_tokenizer))

    # Read the vocabulary size only after the decoder embedding has been resized;
    # copying it earlier leaves the top-level config short by the added tokens.
    model.config.vocab_size = model.decoder.config.vocab_size

    frozen_count = 0
    for name, param in model.encoder.named_parameters():
        # The word-embedding matrix stays trainable: the rows created for the new
        # special tokens are freshly initialised and would otherwise never be learned.
        keep_trainable = (
            'layer.10' in name
            or 'layer.11' in name
            or 'word_embeddings' in name
        )
        if keep_trainable:
            continue
        param.requires_grad = False
        frozen_count += 1
        # Per-parameter detail is kept at DEBUG; a single summary goes to INFO.
        logger.debug(f"Freezing parameter: {name}")
    logger.info(f"Froze {frozen_count} encoder parameter tensors")

    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"Total params: {total_params:,} | Trainable params: {trainable_params:,} | "
               f"Frozen params: {total_params - trainable_params:,}")

    return model, bert_tokenizer, gpt2_tokenizer


In [13]:
def prepare_datasets(data, bert_tokenizer, gpt2_tokenizer, train_size=0.8, val_size=0.1, test_size=0.1):
    """Split ``data`` into train/validation/test parts and wrap each in a dataset.

    The split is done in two passes with a fixed ``random_state`` of 42: first the
    training part is separated from a hold-out part, then the hold-out part is
    divided into validation and test.

    Args:
        data: Sequence of examples accepted by ``QuestionGenerationDataset``.
        bert_tokenizer: Encoder tokenizer handed to every dataset.
        gpt2_tokenizer: Decoder tokenizer handed to every dataset.
        train_size, val_size, test_size: Fractions that must add up to 1.

    Returns:
        tuple: ``(train_dataset, val_dataset, test_dataset)``.
    """
    assert abs(train_size + val_size + test_size - 1.0) < 1e-6, "Split ratios must sum to 1"

    holdout_fraction = val_size + test_size
    train_data, holdout_data = train_test_split(data, test_size=holdout_fraction, random_state=42)

    # Fraction of the hold-out part (not of the full data) that becomes the test set.
    val_data, test_data = train_test_split(
        holdout_data,
        test_size=test_size / holdout_fraction,
        random_state=42
    )

    logger.info(f"Data split: Train: {len(train_data)}, Validation: {len(val_data)}, Test: {len(test_data)}")

    train_dataset, val_dataset, test_dataset = (
        QuestionGenerationDataset(split, bert_tokenizer, gpt2_tokenizer)
        for split in (train_data, val_data, test_data)
    )
    return train_dataset, val_dataset, test_dataset

In [14]:
def train_model(model, train_dataloader, val_dataloader, optimizer, scheduler, device, num_epochs=3):
    """Run the train/validate loop and checkpoint the weights with the lowest validation loss.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and ``labels``; its
            output must expose ``.loss``.
        train_dataloader: Batches used for optimisation.
        val_dataloader: Batches used for the per-epoch validation pass.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        device: Device the model and batch tensors are moved to.
        num_epochs: Number of passes over ``train_dataloader``.

    Returns:
        The model after the last epoch (the best weights are only written to
        ``bert_gpt2_qg_model/best_model.pt``, not restored here).
    """
    def _forward_loss(batch):
        # Shared by the training and validation passes.
        return model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
            labels=batch["labels"].to(device)
        ).loss

    model.to(device)
    lowest_val_loss = float('inf')

    for epoch in range(num_epochs):
        # ---- training pass ----
        model.train()
        train_loss_sum = 0
        train_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Training]")

        for batch in train_bar:
            optimizer.zero_grad()

            loss = _forward_loss(batch)
            loss.backward()

            # Gradient norm is capped at 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            step_loss = loss.item()
            train_loss_sum += step_loss
            train_bar.set_postfix({"loss": step_loss})

        avg_train_loss = train_loss_sum / len(train_dataloader)
        logger.info(f"Epoch {epoch+1}/{num_epochs} - Average training loss: {avg_train_loss:.4f}")

        # ---- validation pass ----
        model.eval()
        val_loss_sum = 0
        val_bar = tqdm(val_dataloader, desc=f"Epoch {epoch+1}/{num_epochs} [Validation]")

        with torch.no_grad():
            for batch in val_bar:
                step_loss = _forward_loss(batch).item()
                val_loss_sum += step_loss
                val_bar.set_postfix({"loss": step_loss})

        avg_val_loss = val_loss_sum / len(val_dataloader)
        logger.info(f"Epoch {epoch+1}/{num_epochs} - Average validation loss: {avg_val_loss:.4f}")

        # ---- checkpoint only on strict improvement ----
        if not (avg_val_loss < lowest_val_loss):
            continue

        lowest_val_loss = avg_val_loss
        logger.info(f"New best validation loss: {lowest_val_loss:.4f}")

        output_dir = "bert_gpt2_qg_model"
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        model_path = os.path.join(output_dir, "best_model.pt")
        torch.save(model.state_dict(), model_path)
        logger.info(f"Model saved to {model_path}")

    return model


In [15]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
def calculate_bleu_score(references, candidates):
    """
    Calculate the mean smoothed sentence-level BLEU score for generated questions.

    Args:
        references: Iterable of reference question strings.
        candidates: Iterable of generated question strings, paired with
            ``references`` position by position (extra items on either side are
            ignored, as with ``zip``).

    Returns:
        float: Average BLEU over all pairs, or ``0.0`` when there are no pairs
        (instead of the ``nan`` that averaging an empty list would produce).
    """
    pairs = list(zip(references, candidates))
    if not pairs:
        return 0.0

    smoothie = SmoothingFunction().method1

    # Both sides are lower-cased and word-tokenised before scoring.
    scores = [
        sentence_bleu(
            [nltk.word_tokenize(ref.lower())],
            nltk.word_tokenize(cand.lower()),
            smoothing_function=smoothie
        )
        for ref, cand in pairs
    ]

    return float(np.mean(scores))

In [16]:
def evaluate_model(model, test_dataloader, gpt2_tokenizer, device):
    """Generate a question for every test example and score the results with BLEU.

    Args:
        model: Model exposing ``generate``.
        test_dataloader: Batches containing ``input_ids``, ``attention_mask`` and
            ``raw_question``.
        gpt2_tokenizer: Tokenizer used to decode the generated ids.
        device: Device the model and inputs are moved to.

    Returns:
        tuple: ``(bleu_score, all_predictions, all_references)``.
    """
    model.to(device)
    model.eval()

    # Beam-search settings shared by every batch.
    generation_kwargs = dict(
        max_length=64,
        num_beams=4,
        early_stopping=True,
        no_repeat_ngram_size=2,
        length_penalty=1.0,
        repetition_penalty=1.2
    )

    all_predictions, all_references = [], []

    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc="Generating questions"):
            generated_ids = model.generate(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
                **generation_kwargs
            )

            all_predictions.extend(
                gpt2_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            )
            all_references.extend(batch["raw_question"])

    bleu_score = calculate_bleu_score(all_references, all_predictions)

    logger.info(f"BLEU Score: {bleu_score:.4f}")

    return bleu_score, all_predictions, all_references


In [17]:
def generate_questions(model, bert_tokenizer, gpt2_tokenizer, context, difficulty='medium', num_questions=3, device='cuda'):
    """
    Produce up to ``num_questions`` questions about ``context`` at the given difficulty.

    Model outputs are generated once per question style (with sampling settings that
    shift on every attempt), cleaned, de-duplicated and filtered. If too few survive,
    the list is topped up with template questions built from topics extracted from
    the context.

    Args:
        model: Model exposing ``generate``.
        bert_tokenizer: Tokenizer for the encoder prompt.
        gpt2_tokenizer: Tokenizer used to decode generated ids.
        context: Source passage.
        difficulty: One of ``'easy'``, ``'medium'``, ``'hard'``.
        num_questions: Maximum number of questions returned.
        device: Device used for the model and the inputs.

    Returns:
        list[str]: At most ``num_questions`` questions.
    """
    question_types = {
        "easy": [
            "factual", "definition", "identification", "basic concept", "simple explanation"
        ],
        "medium": [
            "analytical", "inference", "connection", "application", "comparison"
        ],
        "hard": [
            "evaluation", "synthesis", "critique", "hypothetical", "implications"
        ]
    }

    question_templates = {
        "easy": [
            "What is {topic}?",
            "Can you define {topic}?",
            "What does {topic} refer to?"
        ],
        "medium": [
            "How does {topic} impact {related_topic}?",
            "What connection exists between {topic} and {related_topic}?",
            "Why is {topic} important to understand?"
        ],
        "hard": [
            "What would happen if {topic} was fundamentally different?",
            "How might future developments in {topic} affect {related_topic}?",
            "What are the broader implications of {topic} for society?"
        ]
    }

    model.to(device)
    model.eval()

    generated_questions = []
    unique_questions = set()

    styles = question_types[difficulty]
    attempts = min(len(styles), num_questions * 2)

    for i in range(attempts):
        q_type = styles[i]

        formatted_input = f"[CONTEXT] {context} [{difficulty.upper()}] Generate a {q_type} question. [QUESTION]"

        encoder_inputs = bert_tokenizer(
            formatted_input,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        ).to(device)

        # Each attempt nudges length, nucleus size, temperature and repetition
        # penalty so that successive generations differ.
        sampling_kwargs = dict(
            max_length=48 + (i * 2),
            do_sample=True,
            top_p=0.92 - (i * 0.02),
            temperature=0.7 + (i * 0.05),
            num_beams=4,
            no_repeat_ngram_size=2,
            length_penalty=1.0,
            repetition_penalty=1.2 + (i * 0.1),
            num_return_sequences=1
        )

        with torch.no_grad():
            outputs = model.generate(
                input_ids=encoder_inputs.input_ids,
                attention_mask=encoder_inputs.attention_mask,
                **sampling_kwargs
            )

        candidate = clean_question(
            gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True)
        )

        if not candidate or candidate in unique_questions:
            continue
        unique_questions.add(candidate)
        generated_questions.append(candidate)

    filtered_questions = filter_questions(generated_questions, context)

    # Template fallback: consumes topics front to back until enough questions exist
    # or the topic list runs dry.
    topics = extract_topics(context)

    while len(filtered_questions) < num_questions and topics:
        template = random.choice(question_templates[difficulty])
        topic = topics.pop(0)
        related_topic = topics.pop(0) if topics else "related areas"

        templated_question = template.format(topic=topic, related_topic=related_topic)
        if templated_question in unique_questions:
            continue
        unique_questions.add(templated_question)
        filtered_questions.append(templated_question)

    return filtered_questions[:num_questions]


In [18]:
def clean_question(question):
    """Clean up a generated question.

    Processing order:
      1. keep only the text up to and including the first ``?``;
      2. cut the text at the first occurrence of any known boilerplate artifact;
      3. return ``""`` if nothing but question marks/whitespace is left;
      4. make sure the text ends with ``?``;
      5. drop a leading hedge such as "I think" when the remainder still contains a
         question word.

    Args:
        question: Raw decoded model output.

    Returns:
        str: The cleaned question, or an empty string when no usable text remains
        (so callers can discard it with a simple truthiness check).
    """
    if "?" in question:
        question = question.split("?")[0] + "?"

    artifacts = ["References", "Reply", "Delete", "Click", "Comments", "Contents",
                 "Editors", "Source", "credit", "Please see", "More", "Thank",
                 "credited", "Logged", "context", "*"]

    for artifact in artifacts:
        if artifact in question:
            question = question.split(artifact)[0].strip()

    # Without this guard an empty or artifact-only output would become the bare
    # string "?" in the next step and be treated as a real question.
    if not question.replace("?", "").strip():
        return ""

    if not question.endswith("?"):
        question += "?"

    common_prefixes = ["I think", "It seems", "Maybe", "Perhaps", "I believe", "Can you explain"]
    for prefix in common_prefixes:
        if question.startswith(prefix):
            rest_of_question = question[len(prefix):].strip()
            # The prefix is only removed when what follows still looks like a question.
            if any(q_word in rest_of_question.lower() for q_word in ["what", "why", "how", "when", "where", "which", "who", "can", "does", "is", "are"]):
                question = rest_of_question

    return question.strip()

In [19]:
def filter_questions(questions, context):
    """Keep only the questions that pass three simple quality checks.

    A question is kept when it (a) contains at least one topic extracted from
    ``context``, (b) contains one of the listed question words (substring match on
    the lower-cased text) and (c) is between 15 and 150 characters long.

    Args:
        questions: Candidate question strings.
        context: Passage the questions should relate to.

    Returns:
        list[str]: The surviving questions, in their original order.
    """
    question_words = ["what", "why", "how", "when", "where", "which", "who",
                      "can", "does", "is", "are"]
    topic_words = {word.lower() for word in extract_topics(context)}

    def _is_acceptable(question):
        lowered = question.lower()
        mentions_topic = any(topic in lowered for topic in topic_words)
        asks_something = any(q_word in lowered for q_word in question_words)
        return mentions_topic and asks_something and 15 <= len(question) <= 150

    return [question for question in questions if _is_acceptable(question)]

In [20]:
def extract_topics(context):
    """Extract up to ten key topics from ``context`` by word frequency.

    The text is lower-cased and split on whitespace; surrounding punctuation is
    stripped from every token so that "change." and "change," count as the same
    word and no punctuation leaks into the returned topics. Stopwords and tokens of
    three characters or fewer are ignored.

    Args:
        context: Passage to analyse.

    Returns:
        list[str]: At most ten tokens, most frequent first; ties keep the order of
        first appearance.
    """
    import string

    stopwords = {"the", "and", "is", "in", "it", "to", "that", "of", "for", "a", "an", "with", "by"}

    token_counts = {}
    for raw_token in context.lower().split():
        # Only leading/trailing punctuation is removed; inner characters such as the
        # hyphen in "heat-trapping" are kept.
        token = raw_token.strip(string.punctuation)
        if token in stopwords or len(token) <= 3:
            continue
        token_counts[token] = token_counts.get(token, 0) + 1

    # sorted() is stable, so equally frequent tokens stay in insertion order.
    ranked = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)

    return [token for token, _count in ranked[:10]]

In [None]:
from rouge_score import rouge_scorer

def calculate_rouge_scores(references, predictions):
    """Average ROUGE-1, ROUGE-2 and ROUGE-L F-measures over paired strings.

    Args:
        references: Reference question strings.
        predictions: Predicted question strings, paired with ``references`` by position.

    Returns:
        dict: ``{"rouge1": ..., "rouge2": ..., "rougeL": ...}``; each value is the
        mean F-measure, or ``0`` when there were no pairs.
    """
    metric_names = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True)

    # One list of per-pair F-measures for each metric.
    f_measures = {name: [] for name in metric_names}
    for ref, pred in zip(references, predictions):
        pair_scores = scorer.score(ref, pred)
        for name in metric_names:
            f_measures[name].append(pair_scores[name].fmeasure)

    return {
        name: (sum(values) / len(values) if values else 0)
        for name, values in f_measures.items()
    }

def evaluate_with_rouge(model, test_dataloader, gpt2_tokenizer, device):
    """Generate questions for every batch and report ROUGE overall and per difficulty.

    Batches whose generation raises ``RuntimeError`` are logged and skipped. Example
    reference/prediction pairs are logged for the first two batches and again for up
    to five examples at the end.

    Args:
        model: Model exposing ``generate``.
        test_dataloader: Batches containing ``input_ids``, ``attention_mask``,
            ``raw_question`` and ``difficulty``.
        gpt2_tokenizer: Tokenizer used to decode the generated ids.
        device: Device the model and inputs are moved to.

    Returns:
        tuple: ``(rouge_scores, all_predictions, all_references)``; ``({}, [], [])``
        when no batch produced a prediction.
    """
    model.to(device)
    model.eval()

    all_predictions, all_references, all_difficulties = [], [], []

    with torch.no_grad():
        for i, batch in enumerate(test_dataloader):
            logger.info(f"Processing batch {i+1}/{len(test_dataloader)}")

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            try:
                generated_ids = model.generate(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    max_length=150,
                    min_length=5,
                    num_beams=4,
                    no_repeat_ngram_size=3,
                    early_stopping=True
                )

                batch_predictions = gpt2_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
                batch_references = batch["raw_question"]
                batch_difficulties = batch["difficulty"]

                all_predictions.extend(batch_predictions)
                all_references.extend(batch_references)
                all_difficulties.extend(batch_difficulties)

                # Show a couple of examples for the first two batches only.
                if i < 2:
                    logger.info("\nBatch examples:")
                    for j in range(min(2, len(batch_predictions))):
                        logger.info(f"Difficulty: {batch_difficulties[j]}")
                        logger.info(f"Reference: {batch_references[j]}")
                        logger.info(f"Prediction: {batch_predictions[j]}")
                        logger.info("-" * 30)

            except RuntimeError as e:
                logger.error(f"Error in batch {i+1}: {e}")
                continue

    if not all_predictions:
        logger.error("No predictions were generated. Check the model and generation parameters.")
        return {}, [], []

    rouge_scores = calculate_rouge_scores(all_references, all_predictions)

    for metric, score in rouge_scores.items():
        logger.info(f"{metric.upper()}: {score:.4f}")

    # Per-difficulty breakdown; the comparison is an exact, case-sensitive match on
    # the difficulty strings carried by the batches.
    for level in ('easy', 'medium', 'hard'):
        selected = [idx for idx, d in enumerate(all_difficulties) if d == level]
        if not selected:
            continue

        level_references = [all_references[idx] for idx in selected]
        level_predictions = [all_predictions[idx] for idx in selected]

        level_scores = calculate_rouge_scores(level_references, level_predictions)
        logger.info(f"\n{level.upper()} questions:")
        for metric, score in level_scores.items():
            logger.info(f"{metric.upper()}: {score:.4f}")

    logger.info("\nFinal examples:")
    for idx in range(min(5, len(all_predictions))):
        logger.info(f"Difficulty: {all_difficulties[idx]}")
        logger.info(f"Reference: {all_references[idx]}")
        logger.info(f"Prediction: {all_predictions[idx]}")
        logger.info("-" * 40)

    return rouge_scores, all_predictions, all_references

In [31]:
def main():
    """
    Main function to run the full question generation pipeline:
    1. Set up model and data
    2. Train the model
    3. Evaluate the model using both BLEU and ROUGE scores
    4. Demonstrate question generation

    Side effects: writes the best checkpoint (via ``train_model``) and the files
    ``results/predictions.txt``, ``results/references.txt`` and
    ``results/rouge_scores.json``.
    """
    # Local import: only this function needs Subset.
    from torch.utils.data import Subset

    set_seed(42)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Using device: {device}")

    data_path = "Json_merged_with_difficulty.json"
    batch_size = 4
    learning_rate = 2e-5
    num_epochs = 1

    model, bert_tokenizer, gpt2_tokenizer = create_bert_gpt2_model()

    data = load_data(data_path)

    train_dataset, val_dataset, test_dataset = prepare_datasets(
        data, bert_tokenizer, gpt2_tokenizer,
        train_size=0.8, val_size=0.1, test_size=0.1
    )

    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

    # Only parameters left trainable by create_bert_gpt2_model are optimised.
    optimizer = AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=learning_rate,
        weight_decay=0.01,
        eps=1e-8
    )

    # Linear schedule with the first 10% of steps used for warm-up.
    total_steps = len(train_dataloader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps
    )

    model = train_model(
        model,
        train_dataloader,
        val_dataloader,
        optimizer,
        scheduler,
        device,
        num_epochs=num_epochs
    )

    best_model_path = os.path.join("bert_gpt2_qg_model", "best_model.pt")
    if os.path.exists(best_model_path):
        # map_location lets a checkpoint saved on GPU be restored on a CPU-only machine.
        model.load_state_dict(torch.load(best_model_path, map_location=device))
        logger.info(f"Loaded best model from {best_model_path}")

    logger.info("\n--- BLEU Score Evaluation ---")
    bleu_score, bleu_predictions, bleu_references = evaluate_model(
        model,
        test_dataloader,
        gpt2_tokenizer,
        device,
    )

    logger.info("\n--- ROUGE Score Evaluation ---")
    # The quick ROUGE check uses the first (up to) 10 held-out test examples. Slicing
    # the raw data instead would mix in examples the model was trained on, because
    # the train/val/test split is random.
    small_test_dataset = Subset(test_dataset, range(min(10, len(test_dataset))))
    small_test_dataloader = DataLoader(small_test_dataset, batch_size=2)

    rouge_scores, rouge_predictions, rouge_references = evaluate_with_rouge(
        model,
        small_test_dataloader,
        gpt2_tokenizer,
        device
    )

    output_dir = "results"
    os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8 so non-ASCII questions are written identically on every platform.
    with open(os.path.join(output_dir, "predictions.txt"), "w", encoding="utf-8") as f:
        for prediction in bleu_predictions:
            f.write(prediction + "\n")

    with open(os.path.join(output_dir, "references.txt"), "w", encoding="utf-8") as f:
        for reference in bleu_references:
            f.write(reference + "\n")

    with open(os.path.join(output_dir, "rouge_scores.json"), "w", encoding="utf-8") as f:
        json.dump(rouge_scores, f, indent=4)

    logger.info(f"Saved evaluation results to {output_dir}/")

    sample_context = """
    Climate change refers to long-term shifts in temperatures and weather patterns. 
    These shifts may be natural, but since the 1800s, human activities have been the 
    main driver of climate change, primarily due to the burning of fossil fuels 
    (like coal, oil, and gas), which produces heat-trapping gases. The consequences 
    of climate change include more frequent and intense droughts, storms, heat waves, 
    rising sea levels, melting glaciers, and warming oceans which can directly harm 
    animals, destroy the places they live, and disrupt people's livelihoods.
    """

    logger.info("\nDemonstrating question generation:")

    for difficulty in ['easy', 'medium', 'hard']:
        logger.info(f"\n{difficulty.upper()} questions:")
        questions = generate_questions(
            model,
            bert_tokenizer,
            gpt2_tokenizer,
            sample_context,
            difficulty=difficulty,
            num_questions=3,
            device=device
        )

        for i, question in enumerate(questions, 1):
            logger.info(f"{i}. {question}")

if __name__ == "__main__":
    main()

2025-04-25 04:12:52,660 - __main__ - INFO - Using device: cuda
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['transformer.h.0.crossattention.c_attn.bias', 'transformer.h.0.crossattention.c_attn.weight', 'transformer.h.0.crossattention.c_proj.bias', 'transformer.h.0.crossattention.c_proj.weight', 'transformer.h.0.crossattention.q_attn.bias', 'transformer.h.0.crossattention.q_attn.weight', 'transformer.h.0.ln_cross_attn.bias', 'transformer.h.0.ln_cross_attn.weight', 'transformer.h.1.crossattention.c_attn.bias', 'transformer.h.1.crossattention.c_attn.weight', 'transformer.h.1.crossattention.c_proj.bias', 'transformer.h.1.crossattention.c_proj.weight', 'transformer.h.1.crossattention.q_attn.bias', 'transformer.h.1.crossattention.q_attn.weight', 'transformer.h.1.ln_cross_attn.bias', 'transformer.h.1.ln_cross_attn.weight', 'transformer.h.10.crossattention.c_attn.bias', 'transformer.h.10.crossattention.c_attn.weight', 'trans