# In [None]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    T5ForConditionalGeneration, 
    T5Tokenizer,
    TrainingArguments,
    Trainer
)
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset as HFDataset
import wandb
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
# MODEL_TYPE selects which branch of run_space_biology_model_pipeline() runs.
MODEL_TYPE = "bert"  # or "t5"
# Hugging Face checkpoint names handed to from_pretrained() for each branch.
BERT_MODEL_NAME = "dmis-lab/biobert-v1.1"  # or "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
T5_MODEL_NAME = "razent/SciFive-base-Pubmed_PMC"  # or "t5-small"
# Token budget per example; the dataset classes pad and truncate to this length.
MAX_LENGTH = 512
# Per-device batch size used for both training and evaluation.
BATCH_SIZE = 8
# Fine-tuning learning-rate setting.
LEARNING_RATE = 2e-5
# Number of passes over the training set.
NUM_EPOCHS = 3
# Inference functions move the model and inputs to this device (GPU if present).
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Directory receiving the saved model and MODEL_CARD.md.
OUTPUT_DIR = "./space_biology_model"

# 1. Dataset Collection and Preprocessing

def create_dataset_catalog():
    """Build the biomedical and space-biology dataset catalogs and save them.

    Each catalog is written to the current working directory as a CSV file
    (``biomedical_datasets.csv`` and ``space_biology_datasets.csv``).

    Returns:
        A ``(biomedical_datasets, space_biology_datasets)`` tuple. Each element
        is a list of dicts with the keys ``name``, ``source``, ``data_type``,
        ``size``, ``url``, ``description`` and ``domains``.
    """

    def describe(name, source, data_type, size, url, description, domains):
        # The key order here determines the column order of the CSV files.
        return {
            "name": name,
            "source": source,
            "data_type": data_type,
            "size": size,
            "url": url,
            "description": description,
            "domains": domains,
        }

    biomedical_datasets = [
        describe(
            "PubMed Abstracts (Space Biology)",
            "PubMed",
            "text",
            "~10,000 abstracts",
            "https://pubmed.ncbi.nlm.nih.gov/",
            "Scientific abstracts related to space biology research",
            ["microgravity", "radiation", "space adaptation"],
        ),
        describe(
            "UniProt Space Biology Entries",
            "UniProt",
            "protein annotations",
            "~1,000 entries",
            "https://www.uniprot.org/",
            "Protein annotations for organisms studied in space",
            ["protein function", "structural biology"],
        ),
        # Add more datasets as needed
    ]

    space_biology_datasets = [
        describe(
            "NASA GeneLab",
            "NASA",
            "omics data + metadata",
            "500+ datasets",
            "https://genelab.nasa.gov/",
            "Omics data from space biology experiments",
            ["transcriptomics", "proteomics", "genomics"],
        ),
        describe(
            "OSDR WGS Dataset 466",
            "NASA OSDR",
            "genomic + metadata",
            "1 dataset",
            "https://osdr.nasa.gov/bio/repo/data/studies/OSD-466",
            "Whole Genome Sequencing data with descriptions",
            ["genomics", "microbiology"],
        ),
        # Add more datasets as needed
    ]

    # Persist both catalogs, one CSV file per catalog.
    csv_targets = (
        ("biomedical_datasets.csv", biomedical_datasets),
        ("space_biology_datasets.csv", space_biology_datasets),
    )
    for csv_name, entries in csv_targets:
        pd.DataFrame(entries).to_csv(csv_name, index=False)

    return biomedical_datasets, space_biology_datasets

def download_and_preprocess_data():
    """Create the simulated pre-training and fine-tuning tables and save them.

    No download happens here: a small hard-coded sample stands in for real
    data. Each table has a ``text`` column and a ``labels`` column, where the
    label dict is stored as its ``str()`` representation. The tables are also
    written to ``pretraining_data.csv`` and ``finetuning_data.csv`` in the
    current working directory.

    Returns:
        A ``(pretraining_df, finetuning_df)`` tuple of pandas DataFrames.
    """

    def to_frame(examples):
        # Dicts are stringified so that they survive the CSV round trip.
        return pd.DataFrame({
            "text": [sentence for sentence, _ in examples],
            "labels": [str(annotation) for _, annotation in examples],
        })

    # Simulated pre-training data (general biomedical text).
    pretraining_examples = [
        (
            "Rodents exposed to microgravity exhibit muscle atrophy and bone density loss.",
            {"organism": "Rodents", "condition": "microgravity", "effect": "muscle atrophy, bone density loss"},
        ),
        (
            "Arabidopsis thaliana shows altered gene expression in spaceflight conditions.",
            {"organism": "Arabidopsis thaliana", "condition": "spaceflight", "effect": "altered gene expression"},
        ),
        # Add more examples
    ]

    # Simulated fine-tuning data (space-biology specific).
    finetuning_examples = [
        (
            "Caenorhabditis elegans cultured on the ISS showed changes in metabolic pathways related to oxidative stress.",
            {"organism": "Caenorhabditis elegans", "condition": "ISS culture", "effect": "changes in metabolic pathways, oxidative stress"},
        ),
        (
            "Drosophila melanogaster reared in microgravity exhibited developmental delays and reduced lifespan.",
            {"organism": "Drosophila melanogaster", "condition": "microgravity", "effect": "developmental delays, reduced lifespan"},
        ),
        # Add more examples
    ]

    pretraining_df = to_frame(pretraining_examples)
    finetuning_df = to_frame(finetuning_examples)

    # Save to disk
    pretraining_df.to_csv("pretraining_data.csv", index=False)
    finetuning_df.to_csv("finetuning_data.csv", index=False)

    return pretraining_df, finetuning_df

# 2. Dataset Classes

class SpaceBiologyDataset(Dataset):
    """Torch dataset that tokenizes texts for a BERT-style classifier.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of labels aligned with ``texts``. A label may be a
            numeric class id, a dict of extracted fields, or the ``str()``
            form of either (as produced when labels are stored in a CSV).
        tokenizer: Callable tokenizer returning tensors with a leading batch
            dimension when called with ``return_tensors="pt"``.
        max_length: Length every example is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _decode_label(label):
        """Turn a stringified label back into the Python value it represents."""
        if not isinstance(label, str):
            return label

        import ast  # local import: only needed for string labels

        try:
            # literal_eval only accepts Python literals, so it is safe on data.
            return ast.literal_eval(label)
        except (ValueError, SyntaxError) as err:
            raise ValueError(
                f"Cannot interpret label {label!r} as a class id or field dict"
            ) from err

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx`` with a ``labels`` tensor."""
        text = self.texts[idx]
        # Labels loaded from the CSV/DataFrame are strings; torch.tensor()
        # cannot consume a str, so decode them first.
        label = self._decode_label(self.labels[idx])

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Remove batch dimension added by tokenizer
        encoding = {k: v.squeeze(0) for k, v in encoding.items()}

        # For BERT classification - you would need to define your label mapping
        # This is simplified for demonstration
        if isinstance(label, dict):
            # This would need to be adjusted based on your specific task
            encoding["labels"] = torch.tensor(1)  # Placeholder
        else:
            encoding["labels"] = torch.tensor(label)

        return encoding

class SpaceBiologyT5Dataset(Dataset):
    """Torch dataset producing text-to-text examples for a T5 model.

    Inputs are prefixed with ``"extract info: "``; targets are rendered as
    ``"organism: X, condition: Y, effect: Z"``.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of labels aligned with ``texts``. A label may be a
            dict of fields, the ``str()`` form of such a dict (as stored in a
            CSV), or any other value, which is used via ``str(label)``.
        tokenizer: Callable tokenizer exposing ``pad_token_id``.
        max_length: Length inputs and targets are padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    @staticmethod
    def _format_target(label):
        """Render a label as the ``key: value, key: value`` target string."""
        if isinstance(label, str):
            import ast  # local import: only needed for string labels

            # Labels read back from a CSV are stringified dicts; without this
            # step the model would be trained to emit a Python dict repr
            # instead of the format the inference parser expects.
            try:
                parsed = ast.literal_eval(label)
            except (ValueError, SyntaxError):
                parsed = None
            if isinstance(parsed, dict):
                label = parsed

        if isinstance(label, dict):
            return ", ".join(f"{k}: {v}" for k, v in label.items())
        return str(label)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]

        # For T5, we format the input as "extract info: {text}"
        input_text = f"extract info: {text}"
        target_text = self._format_target(label)

        input_encoding = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        target_encoding = self.tokenizer(
            target_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Remove batch dimension added by tokenizer
        input_ids = input_encoding["input_ids"].squeeze(0)
        attention_mask = input_encoding["attention_mask"].squeeze(0)
        target_ids = target_encoding["input_ids"].squeeze(0)

        # Replace padding token id with -100 so the loss ignores padded positions
        target_ids[target_ids == self.tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": target_ids
        }

# 3. Model Training Functions

def train_bert_model(train_dataset, val_dataset, model_name, output_dir):
    """Fine-tune a BERT-style sequence classifier with the Hugging Face Trainer.

    Args:
        train_dataset: Dataset yielding ``input_ids``, ``attention_mask`` and
            ``labels`` for training.
        val_dataset: Dataset of the same form, evaluated after every epoch.
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        output_dir: Directory for checkpoints and the final saved model.

    Returns:
        A ``(model, trainer)`` tuple.
    """

    # Load pretrained model
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=2  # Adjust based on your task
    )

    # Define training arguments
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        # Without this the configured LEARNING_RATE is ignored and the
        # library default is used instead.
        learning_rate=LEARNING_RATE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Keep the checkpoint with the lowest evaluation loss.
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Save the final model
    trainer.save_model(output_dir)

    return model, trainer

def train_t5_model(train_dataset, val_dataset, model_name, output_dir):
    """Fine-tune a T5 conditional-generation model with the Hugging Face Trainer.

    Args:
        train_dataset: Dataset yielding ``input_ids``, ``attention_mask`` and
            ``labels`` (target token ids) for training.
        val_dataset: Dataset of the same form, evaluated after every epoch.
        model_name: Checkpoint name or path passed to ``from_pretrained``.
        output_dir: Directory for checkpoints and the final saved model.

    Returns:
        A ``(model, trainer)`` tuple.
    """

    # Load pretrained model
    model = T5ForConditionalGeneration.from_pretrained(model_name)

    # Define training arguments
    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    training_args = TrainingArguments(
        output_dir=output_dir,
        num_train_epochs=NUM_EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        # Without this the configured LEARNING_RATE is ignored and the
        # library default is used instead.
        learning_rate=LEARNING_RATE,
        warmup_steps=500,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        # Keep the checkpoint with the lowest evaluation loss.
        load_best_model_at_end=True,
        metric_for_best_model="loss",
        greater_is_better=False
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # Train the model
    trainer.train()

    # Save the final model
    trainer.save_model(output_dir)

    return model, trainer

# 4. Inference Functions

def extract_info_bert(text, model, tokenizer, max_length=512):
    """Classify ``text`` with a BERT-style model and return the class index.

    Args:
        text: Input string.
        model: Sequence-classification model whose output exposes ``logits``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Length the input is padded/truncated to.

    Returns:
        The index of the highest-probability class as a Python int.
    """

    inputs = tokenizer(
        text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    model = model.to(DEVICE)
    # A model coming straight from training is still in train mode; switch to
    # eval mode so dropout is disabled and predictions are deterministic.
    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)

    # This implementation depends on your specific task
    # This is a placeholder for demonstration
    probabilities = torch.softmax(outputs.logits, dim=1)
    prediction = torch.argmax(probabilities, dim=1).item()

    return prediction

def _parse_extracted_fields(decoded_output):
    """Parse ``"key: value, key: value"`` text into a dict of stripped strings."""
    fields = {}
    last_key = None
    for fragment in decoded_output.split(", "):
        if ":" in fragment:
            key, value = fragment.split(":", 1)
            last_key = key.strip()
            fields[last_key] = value.strip()
        elif last_key is not None and fragment.strip():
            # A fragment without a colon is the continuation of a value that
            # itself contained ", " (e.g. "effect: muscle atrophy, bone loss").
            fields[last_key] = f"{fields[last_key]}, {fragment.strip()}"
    return fields


def extract_info_t5(text, model, tokenizer, max_length=512):
    """Generate structured information for ``text`` with a T5 model.

    The input is prefixed with ``"extract info: "``, decoded with beam search
    and parsed from ``"key: value, key: value"`` form.

    Args:
        text: Input string.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer matching ``model``.
        max_length: Maximum input length and maximum generated length.

    Returns:
        A dict mapping each generated field name to its value string.
    """

    input_text = f"extract info: {text}"

    inputs = tokenizer(
        input_text,
        max_length=max_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt"
    )

    inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
    model = model.to(DEVICE)
    # A model coming straight from training is still in train mode; switch to
    # eval mode so dropout does not perturb generation.
    model.eval()

    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_length,
            num_beams=4,
            early_stopping=True
        )

    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Parse the output into a structured format
    return _parse_extracted_fields(decoded_output)

# 5. Main Pipeline Function

def _render_model_card(eval_loss):
    """Return the Markdown model card describing the trained model."""
    import textwrap  # local import: only needed when the card is rendered

    is_bert = MODEL_TYPE == "bert"
    base_model = BERT_MODEL_NAME if is_bert else T5_MODEL_NAME
    # Recommend the class that keeps the task head; plain AutoModel drops it.
    model_class = (
        "AutoModelForSequenceClassification" if is_bert else "T5ForConditionalGeneration"
    )

    card = f"""
    # Space Biology Information Extraction Model

    ## Model Description
    - Model Type: {MODEL_TYPE.upper()}
    - Base Model: {base_model}
    - Task: Extract information about organisms, conditions, and effects from space biology text

    ## Training Data
    - Pretrained on biomedical literature
    - Fine-tuned on space biology datasets

    ## Performance
    - Evaluation Loss: {eval_loss:.4f}

    ## Usage
    ```python
    # Example usage code for inference
    from transformers import AutoTokenizer, {model_class}

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained("{OUTPUT_DIR}")
    model = {model_class}.from_pretrained("{OUTPUT_DIR}")

    # Run inference
    text = "Your space biology text here"
    # Follow the extract_info_{MODEL_TYPE} function for inference
    ```

    ## Limitations
    - This model is specifically designed for space biology text analysis
    - Performance may vary on general biological text
    """
    # Strip the source indentation: lines indented by four spaces would
    # otherwise render as one big code block in Markdown.
    return textwrap.dedent(card).lstrip("\n")


def run_space_biology_model_pipeline():
    """Run the complete pipeline for space biology model development.

    Steps: build the dataset catalog, create the simulated data, split the
    fine-tuning set, train the model selected by ``MODEL_TYPE``, run one
    sample inference, evaluate, and write the tokenizer and a model card to
    ``OUTPUT_DIR``.

    Returns:
        A ``(model, tokenizer, eval_results)`` tuple.

    Raises:
        ValueError: If ``MODEL_TYPE`` is neither ``"bert"`` nor ``"t5"``.
    """
    # Fail fast: otherwise an unknown type surfaces later as a NameError.
    if MODEL_TYPE not in ("bert", "t5"):
        raise ValueError(
            f"Unsupported MODEL_TYPE {MODEL_TYPE!r}; expected 'bert' or 't5'"
        )

    # Set up wandb for tracking experiments
    wandb.init(project="space-biology-model-zoo")

    try:
        # 1. Dataset Collection
        print("Creating dataset catalog...")
        biomedical_datasets, space_biology_datasets = create_dataset_catalog()

        # 2. Data Preprocessing
        print("Downloading and preprocessing data...")
        pretraining_df, finetuning_df = download_and_preprocess_data()

        # 3. Split data for fine-tuning
        train_texts, val_texts, train_labels, val_labels = train_test_split(
            finetuning_df["text"].tolist(),
            finetuning_df["labels"].tolist(),
            test_size=0.2,
            random_state=42
        )

        sample_text = "Astronauts exposed to microgravity show bone density loss after extended spaceflight."

        # 4. Model and Tokenizer Setup
        if MODEL_TYPE == "bert":
            print(f"Loading BERT model: {BERT_MODEL_NAME}")
            tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)

            # Create datasets
            train_dataset = SpaceBiologyDataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
            val_dataset = SpaceBiologyDataset(val_texts, val_labels, tokenizer, MAX_LENGTH)

            # Train model
            print("Training BERT model...")
            model, trainer = train_bert_model(
                train_dataset,
                val_dataset,
                BERT_MODEL_NAME,
                OUTPUT_DIR
            )

            # Example inference
            prediction = extract_info_bert(sample_text, model, tokenizer)
            print(f"Sample prediction (BERT): {prediction}")

        else:  # MODEL_TYPE == "t5" (validated above)
            print(f"Loading T5 model: {T5_MODEL_NAME}")
            tokenizer = T5Tokenizer.from_pretrained(T5_MODEL_NAME)

            # Create datasets
            train_dataset = SpaceBiologyT5Dataset(train_texts, train_labels, tokenizer, MAX_LENGTH)
            val_dataset = SpaceBiologyT5Dataset(val_texts, val_labels, tokenizer, MAX_LENGTH)

            # Train model
            print("Training T5 model...")
            model, trainer = train_t5_model(
                train_dataset,
                val_dataset,
                T5_MODEL_NAME,
                OUTPUT_DIR
            )

            # Example inference
            prediction = extract_info_t5(sample_text, model, tokenizer)
            print(f"Sample prediction (T5): {prediction}")

        # 5. Evaluation
        print("Evaluating model performance...")
        eval_results = trainer.evaluate()
        wandb.log({"eval_loss": eval_results["eval_loss"]})

        # 6. Save model and artifacts
        print(f"Saving model to {OUTPUT_DIR}")
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        # The model card tells users to load the tokenizer from OUTPUT_DIR,
        # so it has to be saved next to the model weights.
        tokenizer.save_pretrained(OUTPUT_DIR)

        # Create model card
        model_card = _render_model_card(eval_results["eval_loss"])
        with open(f"{OUTPUT_DIR}/MODEL_CARD.md", "w", encoding="utf-8") as f:
            f.write(model_card)

        print("Pipeline completed successfully!")
        return model, tokenizer, eval_results
    finally:
        # Close the wandb run even when a step above fails.
        wandb.finish()

# Script entry point: run the full pipeline only when executed directly.
if __name__ == "__main__":
    run_space_biology_model_pipeline()

# In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.applications import EfficientNetB0, ResNet50V2
from tensorflow.keras.layers import Dense, LSTM, Embedding, TimeDistributed, Attention, Input, Flatten, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Set random seed for reproducibility
np.random.seed(42)
tf.random.set_seed(42)

# Constants
IMG_SIZE = 224  # images are resized to IMG_SIZE x IMG_SIZE before entering the CNN
MAX_TEXT_LENGTH = 100  # token sequences are padded to this length
VOCAB_SIZE = 5000  # tokenizer word limit and size of the output softmax
EMBEDDING_DIM = 256  # width of the word embedding and of the projected image vector
BATCH_SIZE = 16  # batch size of the directory image generators
EPOCHS = 20  # training epochs per model

# Function to load and preprocess the dataset
def load_space_biology_dataset(csv_path, img_dir):
    """
    Load space biology image dataset with corresponding text descriptions

    Args:
        csv_path: Path to CSV file containing image filenames and descriptions
            (a ``description`` column is required)
        img_dir: Directory containing the images

    Returns:
        train_generator: Iterator over augmented training image batches
        validation_generator: Iterator over rescaled (non-augmented) image
            batches from the held-out 40% of the images
        padded_sequences: Array of token id sequences, one row per
            description, post-padded to MAX_TEXT_LENGTH
        tokenizer: Keras Tokenizer fitted on the descriptions
    """
    # Load metadata
    df = pd.read_csv(csv_path)

    # 40% will be used for validation+testing
    validation_fraction = 0.4

    # Image preprocessing with data augmentation for training
    train_datagen = ImageDataGenerator(
        rescale=1./255,
        rotation_range=20,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
        validation_split=validation_fraction
    )

    # Held-out images are only rescaled: augmenting them would distort the
    # validation/test metrics. The same validation_split keeps the two
    # generators on complementary subsets of the directory.
    eval_datagen = ImageDataGenerator(
        rescale=1./255,
        validation_split=validation_fraction
    )

    # Generate batches of augmented image data
    train_generator = train_datagen.flow_from_directory(
        img_dir,
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode=None,
        subset='training'
    )

    validation_generator = eval_datagen.flow_from_directory(
        img_dir,
        target_size=(IMG_SIZE, IMG_SIZE),
        batch_size=BATCH_SIZE,
        class_mode=None,
        subset='validation'
    )

    # Missing descriptions are read as NaN floats, which the tokenizer cannot
    # lower-case; treat them as empty strings instead.
    descriptions = df['description'].fillna('').astype(str).tolist()

    # Tokenize text descriptions
    tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token="<OOV>")
    tokenizer.fit_on_texts(descriptions)

    # Convert text to sequences
    sequences = tokenizer.texts_to_sequences(descriptions)
    padded_sequences = pad_sequences(sequences, maxlen=MAX_TEXT_LENGTH, padding='post')

    return train_generator, validation_generator, padded_sequences, tokenizer

# Model 1: CNN-LSTM (EfficientNet + LSTM)
def build_cnn_lstm_model():
    """
    Build an image captioning model using EfficientNetB0 and LSTM

    The model takes an image and a token sequence and outputs a softmax over
    VOCAB_SIZE word ids.

    Returns:
        model: Compiled Keras model
    """
    # Image feature extraction with EfficientNetB0
    base_model = EfficientNetB0(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))

    # Freeze base model layers
    for layer in base_model.layers:
        layer.trainable = False

    # Image input
    image_input = Input(shape=(IMG_SIZE, IMG_SIZE, 3))
    encoded_image = base_model(image_input)
    encoded_image = Flatten()(encoded_image)
    encoded_image = Dense(EMBEDDING_DIM, activation='relu')(encoded_image)

    # Text input
    text_input = Input(shape=(MAX_TEXT_LENGTH,))
    embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(text_input)

    # Encode the token sequence into a single vector
    decoder = LSTM(512, return_sequences=True)(embedding)
    decoder = LSTM(512)(decoder)

    # Combine image and text features.
    # A Concatenate layer is used instead of tf.concat because raw TensorFlow
    # ops cannot be applied to symbolic Keras tensors under Keras 3.
    decoder_combined = tf.keras.layers.Concatenate(axis=-1)([encoded_image, decoder])
    output = Dense(512, activation='relu')(decoder_combined)
    output = Dropout(0.3)(output)
    output = Dense(VOCAB_SIZE, activation='softmax')(output)

    model = Model(inputs=[image_input, text_input], outputs=output)

    # Compile model
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy']
    )

    return model

# Model 2: CNN-Transformer (ResNet + Transformer)
def build_cnn_transformer_model():
    """
    Build an image captioning model using ResNet50V2 and a simplified
    attention block

    The model takes an image and a token sequence and outputs a softmax over
    VOCAB_SIZE word ids.

    Returns:
        model: Compiled Keras model
    """
    # Image feature extraction with ResNet50V2
    base_model = ResNet50V2(weights='imagenet', include_top=False, input_shape=(IMG_SIZE, IMG_SIZE, 3))

    # Freeze base model layers
    for layer in base_model.layers:
        layer.trainable = False

    # Image input
    image_input = Input(shape=(IMG_SIZE, IMG_SIZE, 3))
    encoded_image = base_model(image_input)
    encoded_image = Flatten()(encoded_image)
    encoded_image = Dense(EMBEDDING_DIM, activation='relu')(encoded_image)

    # Text input
    text_input = Input(shape=(MAX_TEXT_LENGTH,))
    embedding = Embedding(VOCAB_SIZE, EMBEDDING_DIM, mask_zero=True)(text_input)

    # Transformer layers (simplified)
    # Note: For a full transformer implementation, you'd use multiple self-attention layers
    attention_output = Attention()([embedding, embedding])
    decoder = TimeDistributed(Dense(512, activation='relu'))(attention_output)
    decoder = Flatten()(decoder)

    # Combine image and text features.
    # A Concatenate layer is used instead of tf.concat because raw TensorFlow
    # ops cannot be applied to symbolic Keras tensors under Keras 3.
    decoder_combined = tf.keras.layers.Concatenate(axis=-1)([encoded_image, decoder])
    output = Dense(512, activation='relu')(decoder_combined)
    output = Dropout(0.3)(output)
    output = Dense(VOCAB_SIZE, activation='softmax')(output)

    model = Model(inputs=[image_input, text_input], outputs=output)

    # Compile model
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
        metrics=['accuracy']
    )

    return model

# Training function
def _collect_images(image_gen, limit):
    """Stack up to ``limit`` images pulled batch by batch from ``image_gen``."""
    batches = []
    collected = 0
    for batch_index in range(len(image_gen)):
        batch = image_gen[batch_index]
        batches.append(batch)
        collected += len(batch)
        if collected >= limit:
            break
    if not batches:
        return np.empty((0, IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)
    return np.concatenate(batches, axis=0)[:limit]


def _placeholder_text(num_samples):
    """Return placeholder token ids, one row of ones per sample."""
    return np.ones((num_samples, MAX_TEXT_LENGTH), dtype="int32")


def _placeholder_targets(num_samples):
    """Return placeholder targets: one integer word id per sample."""
    # sparse_categorical_crossentropy expects class ids of shape (N,),
    # not one row of VOCAB_SIZE values per sample.
    return np.ones((num_samples,), dtype="int32")


def _placeholder_batches(image_gen):
    """Yield ``((images, text), targets)`` batches from ``image_gen`` forever."""
    while True:
        for batch_index in range(len(image_gen)):
            images = image_gen[batch_index]
            count = len(images)
            yield (images, _placeholder_text(count)), _placeholder_targets(count)
        # Let the underlying iterator reshuffle between epochs.
        image_gen.on_epoch_end()


# Training function
def train_and_evaluate_models(train_gen, val_gen, text_data, tokenizer):
    """
    Train and evaluate both models

    Text inputs and targets are still placeholders (all ones): the image
    generators carry no link to the rows of ``text_data``, so real captions
    cannot be paired with images here. The placeholders are shaped so that
    training and evaluation run end to end.

    Args:
        train_gen: Training data generator (image batches only)
        val_gen: Validation data generator (image batches only)
        text_data: Processed text data (not consumed while placeholders are
            in use)
        tokenizer: Text tokenizer (not consumed while placeholders are in use)

    Returns:
        models: Dictionary containing trained models
        histories: Dictionary containing training histories

    Raises:
        ValueError: If the training generator is empty or fewer than two
            validation images are available to split.
    """
    if len(train_gen) == 0:
        raise ValueError("Training generator produced no batches; check the image directory.")

    # Initialize models
    cnn_lstm_model = build_cnn_lstm_model()
    cnn_transformer_model = build_cnn_transformer_model()

    # Split validation data into validation and test sets
    # 40 = 20 validation + 20 test samples (fewer if the directory is small)
    val_images = _collect_images(val_gen, limit=40)
    if len(val_images) < 2:
        raise ValueError("Need at least two validation images to form validation and test sets.")

    val_images_split, test_images = train_test_split(
        val_images, test_size=0.5, random_state=42
    )

    validation_data = (
        [val_images_split, _placeholder_text(len(val_images_split))],
        _placeholder_targets(len(val_images_split))
    )
    test_inputs = [test_images, _placeholder_text(len(test_images))]
    test_targets = _placeholder_targets(len(test_images))

    # Train CNN-LSTM model
    # The image generator is wrapped so every batch also carries the text
    # input and target; a generator nested inside a list is not valid input.
    cnn_lstm_history = cnn_lstm_model.fit(
        _placeholder_batches(train_gen),
        steps_per_epoch=len(train_gen),
        epochs=EPOCHS,
        validation_data=validation_data
    )

    # Train CNN-Transformer model
    cnn_transformer_history = cnn_transformer_model.fit(
        _placeholder_batches(train_gen),
        steps_per_epoch=len(train_gen),
        epochs=EPOCHS,
        validation_data=validation_data
    )

    # Evaluate on test set
    cnn_lstm_results = cnn_lstm_model.evaluate(test_inputs, test_targets)
    cnn_transformer_results = cnn_transformer_model.evaluate(test_inputs, test_targets)

    print("CNN-LSTM Test Results:", cnn_lstm_results)
    print("CNN-Transformer Test Results:", cnn_transformer_results)

    models = {
        "cnn_lstm": cnn_lstm_model,
        "cnn_transformer": cnn_transformer_model
    }

    histories = {
        "cnn_lstm": cnn_lstm_history,
        "cnn_transformer": cnn_transformer_history
    }

    return models, histories

# Generate captions from images
def generate_caption(model, image, tokenizer):
    """
    Produce a caption for one image by greedy word-by-word decoding

    Args:
        model: Trained model taking [image batch, token sequence]
        image: Input image (resized to IMG_SIZE x IMG_SIZE and divided by 255)
        tokenizer: Text tokenizer providing texts_to_sequences and index_word

    Returns:
        caption: Generated words joined by spaces, without the start token
    """
    # Resize, scale and add the batch axis in one go
    image_batch = tf.expand_dims(
        tf.image.resize(image, (IMG_SIZE, IMG_SIZE)) / 255.0, 0
    )

    # The running caption is seeded with the start token
    words = ['<start>']

    for _ in range(MAX_TEXT_LENGTH):
        # Encode everything generated so far
        token_ids = tokenizer.texts_to_sequences([' '.join(words)])[0]
        padded_ids = pad_sequences([token_ids], maxlen=MAX_TEXT_LENGTH, padding='post')

        # Pick the highest-scoring next word
        scores = model.predict([image_batch, padded_ids], verbose=0)
        next_word = tokenizer.index_word.get(np.argmax(scores), '')

        # An unknown id or the end token finishes the caption
        if next_word in ('', '<end>'):
            break

        words.append(next_word)

    # Drop the start token
    return ' '.join(words[1:])

# Main execution
def main():
    """Load the data, train both captioning models, and save the artifacts.

    Writes the accuracy comparison plot (``model_comparison.png``), both
    models in HDF5 format, and the pickled tokenizer to the current
    working directory.
    """
    import pickle

    # Path to dataset (example paths)
    csv_path = 'space_biology_dataset.csv'
    img_dir = 'space_biology_images'

    # Load and preprocess data
    train_gen, val_gen, text_data, tokenizer = load_space_biology_dataset(csv_path, img_dir)

    # Train and evaluate models
    models, histories = train_and_evaluate_models(train_gen, val_gen, text_data, tokenizer)

    # Plot training history: one panel per model, side by side
    plt.figure(figsize=(12, 6))
    panels = (
        (1, 'cnn_lstm', 'CNN-LSTM Model'),
        (2, 'cnn_transformer', 'CNN-Transformer Model'),
    )
    for position, model_key, panel_title in panels:
        curves = histories[model_key].history
        plt.subplot(1, 2, position)
        plt.plot(curves['accuracy'], label='Train Accuracy')
        plt.plot(curves['val_accuracy'], label='Val Accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy')
        plt.title(panel_title)
        plt.legend()

    plt.tight_layout()
    plt.savefig('model_comparison.png')

    # Save models
    saved_model_files = (
        ('cnn_lstm', 'cnn_lstm_space_biology.h5'),
        ('cnn_transformer', 'cnn_transformer_space_biology.h5'),
    )
    for model_key, file_name in saved_model_files:
        models[model_key].save(file_name)

    # Save tokenizer
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    print("Models and tokenizer saved successfully!")
    
# Script entry point: train and save the captioning models when executed directly.
if __name__ == "__main__":
    main()