In [1]:
!pip install -q transformers datasets peft bitsandbytes evaluate accelerate
!pip install -q ipywidgets
!nvidia-smi

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m122.4/122.4 MB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the followi

In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig
)
from peft import (
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model,
    PeftModel
)
import evaluate
import numpy as np
from time import time
import os
import psutil
from google.colab import drive
import gc
from tqdm.auto import tqdm

# Check GPU availability
print("GPU Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Model:", torch.cuda.get_device_name(0))
    print("Current GPU Memory Usage:", torch.cuda.memory_allocated(0)/1e9, "GB")


GPU Available: True
GPU Model: Tesla T4
Current GPU Memory Usage: 0.0 GB


In [3]:
drive.mount('/content/drive')
save_path = "/content/drive/MyDrive/nli_model"
os.makedirs(save_path, exist_ok=True)

Mounted at /content/drive


In [4]:
def clear_memory():
    """Clear GPU memory"""
    gc.collect()
    torch.cuda.empty_cache()

def get_gpu_memory():
    """Get current GPU memory usage"""
    return torch.cuda.memory_allocated()/1e9

def load_and_preprocess_data():
    print("Loading SNLI dataset...")
    dataset = load_dataset("snli")

    # Function to filter valid labels
    def is_valid_sample(example):
        # SNLI labels are typically 0 (entailment), 1 (neutral), 2 (contradiction)
        # -1 indicates invalid/missing label
        return example['label'] != -1

    # Filter out invalid labels before selection
    train_filtered = dataset["train"].filter(is_valid_sample)
    test_filtered = dataset["test"].filter(is_valid_sample)
    val_filtered = dataset["validation"].filter(is_valid_sample)

    # Select samples from filtered datasets
    train_indices = list(range(0, len(train_filtered), len(train_filtered)//1000))[:1000]
    test_indices = list(range(0, len(test_filtered), len(test_filtered)//100))[:100]
    val_indices = list(range(0, len(val_filtered), len(val_filtered)//100))[:100]

    train_dataset = train_filtered.select(train_indices)
    test_dataset = test_filtered.select(test_indices)
    validation_dataset = val_filtered.select(val_indices)

    print("Dataset statistics:")
    print(f"Train set - Total: {len(train_dataset)} samples")
    print(f"Test set - Total: {len(test_dataset)} samples")
    print(f"Validation set - Total: {len(validation_dataset)} samples")

    # Verify no invalid labels remain
    assert all(example['label'] != -1 for example in train_dataset), "Invalid labels found in train set"
    assert all(example['label'] != -1 for example in test_dataset), "Invalid labels found in test set"
    assert all(example['label'] != -1 for example in validation_dataset), "Invalid labels found in val set"

    return train_dataset, test_dataset, validation_dataset

def init_model_and_tokenizer():
    print("Initializing model and tokenizer...")
    model_name = "microsoft/phi-2"

    # Clear memory before loading model
    clear_memory()

    # Initialize tokenizer with proper padding settings
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"  # Ensure right-padding

    # Configure quantization
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    # Load model with proper padding token configuration
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        quantization_config=bnb_config,
        device_map="auto",
        torch_dtype=torch.float16,
        pad_token_id=tokenizer.pad_token_id  # Explicitly set pad token ID
    )

    # Set pad token for model config
    model.config.pad_token_id = tokenizer.pad_token_id

    model = prepare_model_for_kbit_training(model)

    # Configure LoRA
    lora_config = LoraConfig(
        r=16,
        lora_alpha=32,
        target_modules=["linear", "Linear", "dense", "Dense"],
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_CLS"
    )

    model = get_peft_model(model, lora_config)
    print(f"Current GPU memory after model loading: {get_gpu_memory():.2f} GB")

    return model, tokenizer

def preprocess_function(examples, tokenizer):
    # Combine premise and hypothesis with special tokens
    text_pairs = list(zip(examples["premise"], examples["hypothesis"]))

    # Tokenize with explicit padding
    encoded = tokenizer(
        text_pairs,
        max_length=128,
        padding="max_length",
        truncation=True,
        return_tensors=None,  # Changed from "pt" to None for dataset mapping
        return_token_type_ids=False
    )

    # Add labels to the encoded dict
    encoded["labels"] = examples["label"]

    return encoded

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)

    accuracy_metric = evaluate.load("accuracy")
    return accuracy_metric.compute(predictions=predictions, references=labels)

def analyze_predictions(trainer, dataset, num_examples=5):
    """Analyze model predictions including both successful and failed cases"""
    predictions = trainer.predict(dataset)
    pred_labels = np.argmax(predictions.predictions, axis=1)

    label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}

    print("\nAnalysis of Predictions:")
    print("------------------------")

    # Analyze both correct and incorrect predictions
    correct_count = 0
    incorrect_count = 0

    for idx, (pred, true) in enumerate(zip(pred_labels, dataset["label"])):
        if (pred == true and correct_count < num_examples) or \
           (pred != true and incorrect_count < num_examples):
            print(f"\nCase {idx}:")
            print(f"Premise: {dataset[idx]['premise']}")
            print(f"Hypothesis: {dataset[idx]['hypothesis']}")
            print(f"Predicted: {label_map[pred]}")
            print(f"True: {label_map[true]}")
            print(f"Status: {'Correct' if pred == true else 'Incorrect'}")

            if pred == true:
                correct_count += 1
            else:
                incorrect_count += 1

        if correct_count >= num_examples and incorrect_count >= num_examples:
            break


In [5]:
def main():
    start_time = time()

    # Load data
    train_dataset, test_dataset, validation_dataset = load_and_preprocess_data()

    # Initialize model and tokenizer
    model, tokenizer = init_model_and_tokenizer()

    print("Preprocessing datasets...")
    # Update label mapping - SNLI uses different label indices
    label_map = {"entailment": 0, "neutral": 1, "contradiction": 2}

    def preprocess_labels(examples):
        # Convert text labels to integers if necessary
        if isinstance(examples["label"], str):
            examples["label"] = label_map.get(examples["label"], -1)
        return examples

    # Apply label preprocessing first
    train_dataset = train_dataset.map(preprocess_labels)
    validation_dataset = validation_dataset.map(preprocess_labels)
    test_dataset = test_dataset.map(preprocess_labels)

    # Then apply tokenization
    train_dataset = train_dataset.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=train_dataset.column_names,
        batch_size=8
    )
    validation_dataset = validation_dataset.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=validation_dataset.column_names,
        batch_size=8
    )
    test_dataset = test_dataset.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=test_dataset.column_names,
        batch_size=8
    )

    # Set datasets format
    train_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"]
    )
    validation_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"]
    )
    test_dataset.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"]
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir=save_path,
        learning_rate=2e-4,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        num_train_epochs=5,
        weight_decay=0.01,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        report_to="none",
        gradient_accumulation_steps=8,
        logging_steps=10,
        remove_unused_columns=False  # Added to prevent label removal
    )

    # Initialize trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        compute_metrics=compute_metrics,
    )

    print("Starting training...")
    # Print sample batch to verify data format
    print("\nSample batch format:")
    sample_batch = next(iter(trainer.get_train_dataloader()))
    for k, v in sample_batch.items():
        print(f"{k}: {v.shape}")

    trainer.train()

    # Save final model
    print("Saving model...")
    trainer.save_model(f"{save_path}/final_model")

    # Calculate time taken
    training_time = time() - start_time

    # Evaluate on test set
    print("Evaluating model...")
    test_results = trainer.evaluate(test_dataset)

    # Print results
    print("\n=== Training Results ===")
    print(f"Training completed in {training_time/60:.2f} minutes")
    print(f"Test accuracy: {test_results['eval_accuracy']:.4f}")
    print(f"Peak GPU memory usage: {torch.cuda.max_memory_allocated()/1e9:.2f} GB")
    print(f"CPU Memory usage: {psutil.Process().memory_info().rss/1e9:.2f} GB")

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

    print(f"\n=== Model Parameters ===")
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

    return model, tokenizer, test_results

def validate_dataset(dataset, name="dataset"):
    print(f"\nValidating {name}:")
    print(f"Number of examples: {len(dataset)}")
    print("Features:", dataset.features)
    print("First example:")
    print(dataset[0])
    return dataset

In [6]:
if __name__ == "__main__":
    model, tokenizer, results = main()

Loading SNLI dataset...


README.md:   0%|          | 0.00/16.0k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/412k [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/413k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/19.6M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/550152 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10000 [00:00<?, ? examples/s]

Dataset statistics:
Train set - Total: 1000 samples
Test set - Total: 100 samples
Validation set - Total: 100 samples
Initializing model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of PhiForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Current GPU memory after model loading: 1.84 GB
Preprocessing datasets...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]



Starting training...

Sample batch format:
input_ids: torch.Size([1, 128])
attention_mask: torch.Size([1, 128])
labels: torch.Size([1])


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
1,0.6566,0.691662,0.72
2,0.4969,0.496304,0.79
3,0.1981,0.526545,0.86
4,0.1527,0.539806,0.85
5,0.0656,0.590282,0.88


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Saving model...
Evaluating model...



=== Training Results ===
Training completed in 88.93 minutes
Test accuracy: 0.8100
Peak GPU memory usage: 2.19 GB
CPU Memory usage: 1.91 GB

=== Model Parameters ===
Total parameters: 1,392,906,240
Trainable parameters: 2,629,120


In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
import torch
import os

def load_saved_model(model_path):
    """
    Load the saved model and tokenizer with proper error handling and configuration.

    Args:
        model_path (str): Path to the saved model directory

    Returns:
        tuple: (model, tokenizer) pair
    """
    print("Loading saved model and tokenizer...")

    try:
        # First check if the path exists
        if not os.path.exists(model_path):
            raise ValueError(f"Model path '{model_path}' does not exist")

        # Try to load config first to verify the model type
        config = AutoConfig.from_pretrained(
            model_path,
            num_labels=3,
            problem_type="single_label_classification"
        )

        # If config doesn't specify model_type, set it to a default
        if not hasattr(config, 'model_type'):
            config.model_type = 'bert'  # or whatever model type you're using

        # Load model with config
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=config
        )

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_path)

        # Move model to GPU if available
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = model.to(device)
        model.eval()

        return model, tokenizer

    except Exception as e:
        print(f"Error loading model: {str(e)}")
        print("\nTrying alternative loading method...")

        # Alternative loading method - specify model type explicitly
        try:
            config = AutoConfig.from_pretrained('bert-base-uncased')  # or your specific model type
            config.num_labels = 3
            config.problem_type = "single_label_classification"

            model = AutoModelForSequenceClassification.from_pretrained(
                model_path,
                config=config
            )

            tokenizer = AutoTokenizer.from_pretrained(
                'bert-base-uncased'  # or your specific model type
            )

            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            model = model.to(device)
            model.eval()

            return model, tokenizer

        except Exception as e:
            raise Exception(f"Failed to load model with both methods. Error: {str(e)}")

def run_inference(model, tokenizer, premise, hypothesis):
    """
    Run inference on a single premise-hypothesis pair with error handling.

    Args:
        model: The loaded model
        tokenizer: The loaded tokenizer
        premise (str): The premise text
        hypothesis (str): The hypothesis text

    Returns:
        dict: Prediction results and confidence scores
    """
    try:
        # Prepare input
        inputs = tokenizer(
            premise,
            hypothesis,
            padding=True,
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        # Move inputs to same device as model
        device = next(model.parameters()).device
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Run inference
        with torch.no_grad():
            outputs = model(**inputs)
            predictions = torch.softmax(outputs.logits, dim=1)
            predicted_class = torch.argmax(predictions, dim=1).item()

        # Map predictions to labels
        label_map = {0: "entailment", 1: "neutral", 2: "contradiction"}
        confidence_scores = predictions[0].tolist()

        return {
            "prediction": label_map[predicted_class],
            "confidence_scores": {
                "entailment": confidence_scores[0],
                "neutral": confidence_scores[1],
                "contradiction": confidence_scores[2]
            }
        }

    except Exception as e:
        raise Exception(f"Error during inference: {str(e)}")

def main():
    try:
        # Load the saved model
        save_path = "/content/drive/MyDrive/nli_model"  # Update this to your model path
        print(f"Attempting to load model from: {save_path}/final_model")
        model, tokenizer = load_saved_model(f"{save_path}/final_model")

        # Example test cases
        test_cases = [
            {
                "premise": "A person on a horse jumps over a broken down wall.",
                "hypothesis": "A person is outdoors, on a horse."
            },
            {
                "premise": "A soccer game with multiple males playing.",
                "hypothesis": "Some men are playing a sport."
            },
            {
                "premise": "A smiling costumed woman is holding an umbrella.",
                "hypothesis": "A sad woman is crying."
            }
        ]

        # Run inference on test cases
        print("\nRunning inference on test cases:")
        for i, test_case in enumerate(test_cases, 1):
            print(f"\nTest Case {i}:")
            print(f"Premise: {test_case['premise']}")
            print(f"Hypothesis: {test_case['hypothesis']}")

            result = run_inference(model, tokenizer, test_case['premise'], test_case['hypothesis'])

            print("\nResults:")
            print(f"Prediction: {result['prediction']}")
            print("Confidence Scores:")
            for label, score in result['confidence_scores'].items():
                print(f"  {label}: {score:.4f}")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")

if __name__ == "__main__":
    main()

Attempting to load model from: /content/drive/MyDrive/nli_model/final_model
Loading saved model and tokenizer...
Error loading model: Unrecognized model in /content/drive/MyDrive/nli_model/final_model. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: albert, align, altclip, audio-spectrogram-transformer, autoformer, bark, bart, beit, bert, bert-generation, big_bird, bigbird_pegasus, biogpt, bit, blenderbot, blenderbot-small, blip, blip-2, bloom, bridgetower, bros, camembert, canine, chameleon, chinese_clip, chinese_clip_vision_model, clap, clip, clip_text_model, clip_vision_model, clipseg, clvp, code_llama, codegen, cohere, conditional_detr, convbert, convnext, convnextv2, cpmant, ctrl, cvt, dac, data2vec-audio, data2vec-text, data2vec-vision, dbrx, deberta, deberta-v2, decision_transformer, deformable_detr, deit, depth_anything, deta, detr, dinat, dinov2, distilbert, donut-swin, dpr, dpt, efficientformer, efficientnet, electra, en

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/phi-2 and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weig

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


Running inference on test cases:

Test Case 1:
Premise: A person on a horse jumps over a broken down wall.
Hypothesis: A person is outdoors, on a horse.

Results:
Prediction: neutral
Confidence Scores:
  entailment: 0.2845
  neutral: 0.3871
  contradiction: 0.3284

Test Case 2:
Premise: A soccer game with multiple males playing.
Hypothesis: Some men are playing a sport.

Results:
Prediction: neutral
Confidence Scores:
  entailment: 0.2713
  neutral: 0.3714
  contradiction: 0.3572

Test Case 3:
Premise: A smiling costumed woman is holding an umbrella.
Hypothesis: A sad woman is crying.

Results:
Prediction: neutral
Confidence Scores:
  entailment: 0.2781
  neutral: 0.3711
  contradiction: 0.3508
