In [None]:
!pip install transformers torch datasets matplotlib

In [None]:
#Load Data and Preprocess
from datasets import load_dataset

# Fetch the IMDB dataset, then carve out small shuffled subsets
# (1k train / 200 test) so the demo runs quickly; seed=42 keeps the
# shuffles reproducible.
dataset = load_dataset("imdb")
train_data = dataset["train"].shuffle(seed=42).select(range(1000))
test_data = dataset["test"].shuffle(seed=42).select(range(200))

# Peek at the first training example and its label
first_example = train_data[0]
print("Sample review:", first_example["text"])
print("Label (0=negative, 1=positive):", first_example["label"])

In [None]:
#Load a Pre-Trained Model
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Checkpoint identifier shared by the tokenizer and the classifier so the two stay in sync
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# Instantiate the sequence-classification model and its matching tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
#Tokenize the Data
def tokenize(batch):
    """Tokenize the ``"text"`` field of a batch of examples.

    Args:
        batch: Mapping with a ``"text"`` entry, as passed by ``Dataset.map``
            when called with ``batched=True``.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No padding here: the DataCollatorWithPadding handed to the Trainer pads
    # every training/eval batch to its own longest sequence, so padding at
    # tokenization time would only store and process extra pad tokens.
    return tokenizer(batch["text"], truncation=True, max_length=512)

# Run the tokenizer over both splits, 16 examples per map batch
train_data = train_data.map(tokenize, batched=True, batch_size=16)
test_data = test_data.map(tokenize, batched=True, batch_size=16)

# Expose only the model inputs and the label, returned as PyTorch tensors
torch_columns = ["input_ids", "attention_mask", "label"]
for split in (train_data, test_data):
    split.set_format("torch", columns=list(torch_columns))

In [None]:
#Train the Model (Fine-Tuning)
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding

# Data collator that dynamically pads each batch to its longest sequence
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def compute_metrics(eval_pred):
    """Compute classification accuracy for ``Trainer.evaluate``.

    Args:
        eval_pred: ``EvalPrediction`` supplied by the Trainer; ``predictions``
            holds the model logits and ``label_ids`` the reference labels.

    Returns:
        dict with a single ``"accuracy"`` entry (fraction of correct predictions).
    """
    predicted_labels = eval_pred.predictions.argmax(axis=-1)
    return {"accuracy": float((predicted_labels == eval_pred.label_ids).mean())}


# Training setup. No evaluation schedule is configured here, so evaluation
# only happens when trainer.evaluate() is called explicitly.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,  # Keep it short for demo purposes
    per_device_train_batch_size=8,
    logging_dir="./logs",
    report_to=["none"],  # Disable wandb and other integrations
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # report accuracy in addition to the loss
)

# Train!
trainer.train()

In [None]:
#Evaluate the Model
# Score the model on the eval_dataset that was handed to the Trainer
eval_results = trainer.evaluate()
summary_line = f"Evaluation results: {eval_results}"
print(summary_line)

# Analyze a new customer review with the fine-tuned model
def analyze_sentiment(review_text):
    """Classify a single review as positive or negative.

    Args:
        review_text: The review to classify, as a string.

    Returns:
        dict with keys ``"sentiment"`` (``"positive"`` or ``"negative"``),
        ``"confidence"`` (softmax probability of the predicted class) and
        ``"review"`` (the input text, unchanged).
    """
    # Make sure dropout is switched off: the model may still be in training
    # mode if this is called right after fine-tuning.
    model.eval()

    inputs = tokenizer(
        review_text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    ).to(model.device)  # keep the inputs on the same device as the model

    with torch.no_grad():
        logits = model(**inputs).logits

    # Class probabilities; index 1 is reported as positive, anything else as negative
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    label = torch.argmax(probabilities, dim=1).item()
    score = probabilities[0][label].item()

    return {
        "sentiment": "positive" if label == 1 else "negative",
        "confidence": score,
        "review": review_text,
    }

# Example usage with Jumpstart fashion retail reviews
sample_reviews = [
    "I absolutely love this dress! The fabric is high quality and the fit is perfect.",
    "The shirt I ordered was too small and the color was different from what was shown online.",
    "Shipping was fast but the product quality was disappointing.",
    "These jeans are the best I've ever owned. Will definitely buy more!",
    "Customer service was unhelpful when I tried to return my order."
]

# Print a 50-character preview, the predicted sentiment and its confidence per review
separator = "-" * 50
for review in sample_reviews:
    result = analyze_sentiment(review)
    preview = result['review'][:50]
    sentiment, confidence = result['sentiment'], result['confidence']
    print(f"Review: {preview}...")
    print(f"Sentiment: {sentiment} (Confidence: {confidence:.2f})")
    print(separator)

In [None]:
# Visualize sentiment distribution
import matplotlib.pyplot as plt
import numpy as np

# Function to analyze a batch of reviews
def analyze_batch(reviews):
    """Run ``analyze_sentiment`` on every review and collect the results.

    Args:
        reviews: Iterable of review texts.

    Returns:
        list with one ``analyze_sentiment`` result per review, in input order.
    """
    return list(map(analyze_sentiment, reviews))

# Example: analyze a sample of the test split.
# The split is shuffled first (fixed seed for reproducibility) because the
# IMDB test split is stored ordered by label, so its first 100 rows would all
# come from one class and the chart would not show a real distribution.
test_sample = dataset["test"].shuffle(seed=42).select(range(100))
test_reviews = [example["text"] for example in test_sample]
sentiment_results = analyze_batch(test_reviews)

# Count positive and negative predictions
positive_count = sum(r["sentiment"] == "positive" for r in sentiment_results)
negative_count = sum(r["sentiment"] == "negative" for r in sentiment_results)

# Create visualization
labels = ['Positive', 'Negative']
counts = [positive_count, negative_count]

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(labels, counts, color=['green', 'red'])
ax.set_title('Sentiment Distribution in Customer Reviews')
ax.set_ylabel('Number of Reviews')
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Add count labels just above each bar
for i, count in enumerate(counts):
    ax.text(i, count + 0.5, str(count), ha='center')

fig.tight_layout()
fig.savefig('sentiment_distribution.png')  # save before show so the figure is still populated
plt.show()

In [None]:
# Persist the fine-tuned model together with its tokenizer in one directory
model_save_path = "./jumpstart_sentiment_model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")

In [None]:
import os
import shutil

from google.colab import drive

# Mount Google Drive into the Colab filesystem
drive.mount('/content/drive')

# Create a directory in your Google Drive
drive_path = "/content/drive/My Drive/jumpstart_sentiment_model"
os.makedirs(drive_path, exist_ok=True)

# Copy the saved model into Google Drive. shutil.copytree raises if the copy
# fails, so the confirmation below is only printed on success (a failing
# shell `cp` would have been ignored), and the destination comes from
# drive_path instead of being spelled out a second time.
shutil.copytree("./jumpstart_sentiment_model", drive_path, dirs_exist_ok=True)

print(f"Model saved to Google Drive at: {drive_path}")