## Import libraries

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

## Check GPU availability and setup

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")

# Cell 4: Load the IMDB dataset
print("Loading IMDB dataset...")
dataset = load_dataset("imdb")
print(f"Train samples: {len(dataset['train'])}")
print(f"Test samples: {len(dataset['test'])}")

# Display sample data
print("\nSample movie reviews:")
for i in range(3):
    label_text = 'Positive' if dataset['test'][i]['label'] == 1 else 'Negative'
    print(f"\nReview {i+1} ({label_text}):")
    print(f"{dataset['test'][i]['text'][:300]}...")
    print("-" * 80)

Using device: cuda
GPU: NVIDIA RTX A6000
CUDA Version: 12.6
Loading IMDB dataset...
Train samples: 25000
Test samples: 25000

Sample movie reviews:

Review 1 (Negative):
I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, C...
--------------------------------------------------------------------------------

Review 2 (Negative):
Worth the entertainment value of a rental, especially if you like action movies. This one features the usual car chases, fights with the great Van Damme kick style, shooting battles with the 40 shell load shotgun, and even terrorist style bombs. All of this is entertaining and competently handled bu...
--------------------------------------------------------------------------------

Review 3 (Negative):
its a totally av

In [4]:
model_name = "Qwen/Qwen2.5-7B-Instruct"
print(f"Loading model: {model_name}")

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    cache_dir="/nfs/datz/olmo_models/al_exp",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None
)

# Add padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"Model loaded successfully!")
print(f"Tokenizer vocab size: {tokenizer.vocab_size:,}")
print(f"Model parameters: {model.num_parameters():,}")

Loading model: Qwen/Qwen2.5-7B-Instruct
The history saving thread hit an unexpected error (OperationalError('disk I/O error')).History will not be written to the database.


config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

Model loaded successfully!
Tokenizer vocab size: 151,643
Model parameters: 7,615,616,512


In [5]:
def create_sentiment_prompt(text):
    """Create a prompt for sentiment classification"""
    # Clean the text to avoid very long inputs
    text = text.strip()[:1500]  # Limit text length
    
    prompt = f"""<|im_start|>system
You are a helpful assistant that analyzes movie review sentiments. Your task is to classify movie reviews as either "positive" or "negative" based on the overall sentiment expressed.
<|im_end|>
<|im_start|>user
Please analyze the sentiment of this movie review and respond with only "positive" or "negative":

Review: {text}

Sentiment:<|im_end|>
<|im_start|>assistant
"""
    return prompt

# Test the prompt template
sample_text = dataset['test'][0]['text']
test_prompt = create_sentiment_prompt(sample_text)
print("Sample prompt structure:")
print(test_prompt[:500] + "...")

Sample prompt structure:
<|im_start|>system
You are a helpful assistant that analyzes movie review sentiments. Your task is to classify movie reviews as either "positive" or "negative" based on the overall sentiment expressed.
<|im_end|>
<|im_start|>user
Please analyze the sentiment of this movie review and respond with only "positive" or "negative":

Review: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really di...


## Sentiment Classification Function

In [6]:
def classify_sentiment(text, model, tokenizer, max_length=1024):
    """Classify sentiment of a single text using the model"""
    prompt = create_sentiment_prompt(text)
    
    # Tokenize input
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
        padding=True
    ).to(model.device)
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=5,  # We only need "positive" or "negative"
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
    
    # Decode the response
    response = tokenizer.decode(outputs[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True)
    response = response.lower().strip()
    
    # Extract sentiment with better parsing
    if 'positive' in response:
        return 1, 'positive', response
    elif 'negative' in response:
        return 0, 'negative', response
    else:
        # Default to negative if unclear (you can adjust this logic)
        return 0, 'unclear', response

In [7]:
print("Testing classification on sample reviews:")
print("=" * 80)

test_reviews = [
    "This movie was absolutely amazing! Great acting, wonderful plot, and beautiful cinematography.",
    "Terrible film. Boring storyline, bad acting, complete waste of time.",
    "It was okay, nothing special but not terrible either.",
]

for i, review in enumerate(test_reviews):
    pred_label, sentiment, raw_response = classify_sentiment(review, model, tokenizer)
    print(f"Review {i+1}: {review}")
    print(f"Predicted: {sentiment.upper()}")
    print(f"Raw response: '{raw_response}'")
    print("-" * 40)

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Testing classification on sample reviews:


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Review 1: This movie was absolutely amazing! Great acting, wonderful plot, and beautiful cinematography.
Predicted: POSITIVE
Raw response: 'positive'
----------------------------------------
Review 2: Terrible film. Boring storyline, bad acting, complete waste of time.
Predicted: NEGATIVE
Raw response: 'negative'
----------------------------------------
Review 3: It was okay, nothing special but not terrible either.
Predicted: NEGATIVE
Raw response: 'negative'
----------------------------------------


In [8]:
def evaluate_model_on_imdb(test_dataset, model, tokenizer, max_samples=500):
    """Evaluate the model on IMDB test dataset"""
    predictions = []
    true_labels = []
    raw_responses = []
    processing_times = []
    
    # Use a subset for evaluation
    test_subset = test_dataset.select(range(min(max_samples, len(test_dataset))))
    
    print(f"Evaluating on {len(test_subset)} IMDB test samples...")
    
    start_time = time.time()
    
    for i, example in enumerate(tqdm(test_subset)):
        example_start = time.time()
        
        text = example['text']
        true_label = example['label']
        
        pred_label, sentiment, raw_response = classify_sentiment(text, model, tokenizer)
        
        predictions.append(pred_label)
        true_labels.append(true_label)
        raw_responses.append(raw_response)
        processing_times.append(time.time() - example_start)
    
    total_time = time.time() - start_time
    avg_time = np.mean(processing_times)
    
    print(f"\nEvaluation completed in {total_time:.2f} seconds")
    print(f"Average time per sample: {avg_time:.3f} seconds")
    
    return predictions, true_labels, raw_responses

In [9]:
print("Starting evaluation on IMDB test set...")
predictions, true_labels, raw_responses = evaluate_model_on_imdb(
    dataset['test'], model, tokenizer, max_samples=200  # Adjust this number as needed
)

# Cell 10: Calculate and display metrics
accuracy = accuracy_score(true_labels, predictions)
print(f"\n{'='*60}")
print(f"IMDB SENTIMENT CLASSIFICATION RESULTS")
print(f"{'='*60}")
print(f"Total samples evaluated: {len(predictions)}")
print(f"Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

print(f"\nDetailed Classification Report:")
print(classification_report(true_labels, predictions, 
                          target_names=['Negative', 'Positive'], 
                          digits=4))

# Analyze by true label
pos_indices = [i for i, label in enumerate(true_labels) if label == 1]
neg_indices = [i for i, label in enumerate(true_labels) if label == 0]

pos_correct = sum(1 for i in pos_indices if predictions[i] == 1)
neg_correct = sum(1 for i in neg_indices if predictions[i] == 0)

print(f"\nBreakdown by True Label:")
print(f"Positive reviews accuracy: {pos_correct/len(pos_indices):.4f} ({pos_correct}/{len(pos_indices)})")
print(f"Negative reviews accuracy: {neg_correct/len(neg_indices):.4f} ({neg_correct}/{len(neg_indices)})")

# Cell 11: Visualize results
plt.figure(figsize=(12, 5))

# Confusion Matrix
plt.subplot(1, 2, 1)
cm = confusion_matrix(true_labels, predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['Negative', 'Positive'], 
            yticklabels=['Negative', 'Positive'])
plt.title(f'Confusion Matrix\n(Accuracy: {accuracy:.3f})')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')

# Accuracy by label
plt.subplot(1, 2, 2)
labels = ['Negative', 'Positive']
accuracies = [neg_correct/len(neg_indices), pos_correct/len(pos_indices)]
bars = plt.bar(labels, accuracies, color=['lightcoral', 'lightblue'])
plt.title('Accuracy by Sentiment')
plt.ylabel('Accuracy')
plt.ylim(0, 1)

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
             f'{acc:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Cell 12: Show example predictions
print(f"\n{'='*80}")
print("EXAMPLE PREDICTIONS")
print(f"{'='*80}")

# Show correct and incorrect predictions
correct_indices = [i for i, (p, t) in enumerate(zip(predictions, true_labels)) if p == t]
incorrect_indices = [i for i, (p, t) in enumerate(zip(predictions, true_labels)) if p != t]

print("✅ CORRECT PREDICTIONS:")
for i in correct_indices[:3]:  # Show first 3 correct
    text = dataset['test'][i]['text'][:200]
    true_label = 'Positive' if true_labels[i] == 1 else 'Negative'
    pred_label = 'Positive' if predictions[i] == 1 else 'Negative'
    
    print(f"\nReview: {text}...")
    print(f"True: {true_label} | Predicted: {pred_label} | Response: '{raw_responses[i]}'")

print(f"\n❌ INCORRECT PREDICTIONS:")
for i in incorrect_indices[:3]:  # Show first 3 incorrect
    text = dataset['test'][i]['text'][:200]
    true_label = 'Positive' if true_labels[i] == 1 else 'Negative'
    pred_label = 'Positive' if predictions[i] == 1 else 'Negative'
    
    print(f"\nReview: {text}...")
    print(f"True: {true_label} | Predicted: {pred_label} | Response: '{raw_responses[i]}'")


Starting evaluation on IMDB test set...
Evaluating on 200 IMDB test samples...


  0%|          | 0/200 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  0%|          | 1/200 [00:00<00:28,  6.98it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  2%|▏         | 3/200 [00:00<00:20,  9.80it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
  2%|▏         | 4/200 [00:00<00:21,  9.21it/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be igno


Evaluation completed in 19.54 seconds
Average time per sample: 0.097 seconds

IMDB SENTIMENT CLASSIFICATION RESULTS
Total samples evaluated: 200
Overall Accuracy: 0.9450 (94.50%)

Detailed Classification Report:
              precision    recall  f1-score   support

    Negative     1.0000    0.9450    0.9717       200
    Positive     0.0000    0.0000    0.0000         0

    accuracy                         0.9450       200
   macro avg     0.5000    0.4725    0.4859       200
weighted avg     1.0000    0.9450    0.9717       200


Breakdown by True Label:


ZeroDivisionError: division by zero

In [None]:
def test_custom_reviews():
    """Interactive function to test custom movie reviews"""
    print(f"\n{'='*60}")
    print("CUSTOM MOVIE REVIEW TESTING")
    print(f"{'='*60}")
    
    custom_reviews = [
        "This movie was absolutely incredible! The cinematography was breathtaking and the acting was superb. One of the best films I've seen this year.",
        
        "What a disappointment. The plot made no sense, the dialogue was terrible, and I couldn't wait for it to end. Complete waste of time and money.",
        
        "The movie had some good moments but overall it was just average. Not bad enough to walk out, but nothing I'd recommend to friends.",
        
        "Brilliant storytelling combined with outstanding performances. This film will stay with you long after the credits roll. Highly recommended!",
        
        "Boring and predictable. The characters were one-dimensional and the ending was completely unsatisfying. Skip this one."
    ]
    
    for i, review in enumerate(custom_reviews):
        pred_label, sentiment, raw_response = classify_sentiment(review, model, tokenizer)
        #confidence = "High" if raw_response.strip() in ['positive', 'negative'] else "Low"
        
        print(f"\nTest Review #{i+1}:")
        print(f"Text: {review}")
        print(f"Predicted Sentiment: {sentiment.upper()}")
        print(f"Raw Model Response: '{raw_response}'")
        #print(f"Confidence: {confidence}")
        print("-" * 60)

# Run interactive testing
test_custom_reviews()

The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



CUSTOM MOVIE REVIEW TESTING

Test Review #1:
Text: This movie was absolutely incredible! The cinematography was breathtaking and the acting was superb. One of the best films I've seen this year.
Predicted Sentiment: POSITIVE
Raw Model Response: 'positive'
------------------------------------------------------------

Test Review #2:
Text: What a disappointment. The plot made no sense, the dialogue was terrible, and I couldn't wait for it to end. Complete waste of time and money.
Predicted Sentiment: NEGATIVE
Raw Model Response: 'negative'
------------------------------------------------------------

Test Review #3:
Text: The movie had some good moments but overall it was just average. Not bad enough to walk out, but nothing I'd recommend to friends.
Predicted Sentiment: NEGATIVE
Raw Model Response: 'negative'
------------------------------------------------------------


The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.



Test Review #4:
Text: Brilliant storytelling combined with outstanding performances. This film will stay with you long after the credits roll. Highly recommended!
Predicted Sentiment: POSITIVE
Raw Model Response: 'positive'
------------------------------------------------------------

Test Review #5:
Text: Boring and predictable. The characters were one-dimensional and the ending was completely unsatisfying. Skip this one.
Predicted Sentiment: NEGATIVE
Raw Model Response: 'negative'
------------------------------------------------------------


In [12]:
def print_performance_summary():
    print(f"\n{'='*80}")
    print("PERFORMANCE SUMMARY & INSIGHTS")
    print(f"{'='*80}")
    
    print(f"🎯 Overall Performance:")
    print(f"   • Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
    print(f"   • Total samples: {len(predictions)}")
    print(f"   • Correct predictions: {sum(1 for p, t in zip(predictions, true_labels) if p == t)}")
    
    print(f"\n📊 Class-wise Performance:")
    print(f"   • Positive reviews: {pos_correct/len(pos_indices):.3f} accuracy")
    print(f"   • Negative reviews: {neg_correct/len(neg_indices):.3f} accuracy")
    
    # Analyze response quality
    clear_responses = sum(1 for r in raw_responses if r.strip() in ['positive', 'negative'])
    print(f"\n🔍 Response Quality:")
    print(f"   • Clear responses: {clear_responses}/{len(raw_responses)} ({clear_responses/len(raw_responses)*100:.1f}%)")
    
    print(f"\n💡 Key Insights:")
    print(f"   • The model performs zero-shot sentiment classification")
    print(f"   • No fine-tuning was required")
    print(f"   • Performance could be improved with few-shot prompting")
    print(f"   • Model shows good understanding of movie review sentiment")
    
    print(f"\n🚀 Potential Improvements:")
    print(f"   • Add few-shot examples in the prompt")
    print(f"   • Fine-tune on domain-specific data")
    print(f"   • Experiment with different prompt formats")
    print(f"   • Use ensemble methods with multiple models")

print_performance_summary()


PERFORMANCE SUMMARY & INSIGHTS
🎯 Overall Performance:
   • Accuracy: 0.940 (94.0%)
   • Total samples: 200
   • Correct predictions: 188

📊 Class-wise Performance:


ZeroDivisionError: division by zero

In [14]:
def save_results():
    """Save evaluation results to files"""
    results_df = pd.DataFrame({
        'review_text': [dataset['test'][i]['text'] for i in range(len(predictions))],
        'true_label': true_labels,
        'predicted_label': predictions,
        'true_sentiment': ['Positive' if l == 1 else 'Negative' for l in true_labels],
        'predicted_sentiment': ['Positive' if l == 1 else 'Negative' for l in predictions],
        'raw_response': raw_responses,
        'correct': [p == t for p, t in zip(predictions, true_labels)]
    })
    
    # Save to CSV
    results_df.to_csv('imdb_sentiment_results.csv', index=False)
    print("\n💾 Results saved to 'imdb_sentiment_results.csv'")
    
    # Save summary statistics
    summary = {
        'accuracy': float(accuracy),
        'total_samples': len(predictions),
        'correct_predictions': int(sum(1 for p, t in zip(predictions, true_labels) if p == t)),
        'positive_accuracy': float(pos_correct/len(pos_indices)),
        'negative_accuracy': float(neg_correct/len(neg_indices)),
        'clear_responses': int(sum(1 for r in raw_responses if r.strip() in ['positive', 'negative']))
    }
    
    with open('evaluation_summary.json', 'w') as f:
        import json
        json.dump(summary, f, indent=2)
    
    print("📊 Summary statistics saved to 'evaluation_summary.json'")
    return results_df

# Uncomment the line below to save results
# results_df = save_results()

print(f"\n{'='*80}")


