<a href="https://colab.research.google.com/github/archikumari0770/sentiment_analysis2/blob/main/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm import tqdm

# Fetch the VADER lexicon resource via NLTK's downloader
nltk.download("vader_lexicon")

# Build the analyzer and score one illustrative sentence
sia = SentimentIntensityAnalyzer()
text = "I love this product! It's amazing and works perfectly."
sentiment = sia.polarity_scores(text)

# Show whatever polarity_scores returned for the example
print(sentiment)

In [None]:
!pip install transformers

from transformers import pipeline

# Load the sentiment analysis pipeline
sentiment_pipeline = pipeline("sentiment-analysis")

# Analyze sentiment
result = sentiment_pipeline("I hate when my phone battery dies quickly")[0]
print(f"Label: {result['label']}, with score: {round(result['score'], 4)}")


In [None]:
# Load a sample dataset
!pip install datasets
from datasets import load_dataset
from tqdm import tqdm # Import tqdm here

# Load the IMDb reviews dataset
dataset = load_dataset("imdb")
train_data = dataset["train"]
test_data = dataset["test"]

# Let's look at one example
print(train_data[0])

# Function to get sentiment
def get_sentiment(text):
    """Classify one text with the notebook-level ``sentiment_pipeline``.

    Truncation is delegated to the pipeline's tokenizer (``truncation=True``)
    so the input is cut at ``max_len`` *tokens*. Slicing the string to 512
    characters, as was done before, throws away most of a long review even
    though the limit being respected is expressed in tokens.

    Args:
        text: The raw text to classify.

    Returns:
        A ``(label, score)`` tuple taken from the first pipeline result.
    """
    # Maximum sequence length, in tokens, that the model is expected to accept
    # (512 per the original note in this notebook).
    max_len = 512
    result = sentiment_pipeline(text, truncation=True, max_length=max_len)[0]
    return result['label'], result['score']

# Apply to a sample of the dataset.
# The IMDb splits are stored grouped by label, so taking the first N rows
# yields reviews of a single class. Shuffle with a fixed seed first so the
# sample contains both classes and is the same on every run.
sample_size = 100
sample_rows = train_data.shuffle(seed=42).select(range(sample_size))

results = []
for row in tqdm(sample_rows):
    # Read each record once instead of indexing the dataset twice per row.
    text = row['text']
    label = row['label']
    pred_label, pred_score = get_sentiment(text)
    results.append({
        'text': text[:100] + "...",  # Show first 100 chars
        'true_label': 'positive' if label == 1 else 'negative',
        'pred_label': pred_label,
        'pred_score': pred_score
    })

# Build the DataFrame once from the collected records
results_df = pd.DataFrame(results)
print(results_df.head())

In [None]:
# NOTE(review): this pin runs after `datasets` was already installed and
# imported in an earlier cell; presumably the kernel has to be restarted for
# the downgraded fsspec to take effect - confirm, or move this install ahead
# of the cell that loads the dataset.
!pip install fsspec==2023.9.0  # Downgrade to a stable version


In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import matplotlib.pyplot as plt

# Encode both label columns as integers (0=negative, 1=positive).
# `.eq(...).astype(int)` is the vectorised form of the row-wise comparison.
results_df['true_num'] = results_df['true_label'].eq('positive').astype(int)
results_df['pred_num'] = results_df['pred_label'].eq('POSITIVE').astype(int)

# Compute accuracy
accuracy = accuracy_score(results_df['true_num'], results_df['pred_num'])
print(f"Accuracy: {accuracy:.2f}")

# Pass the class ids explicitly so the matrix is always 2x2, in a fixed
# order, even when the sample or the predictions contain only one class.
class_ids = [0, 1]
class_names = ['negative', 'positive']
cm = confusion_matrix(results_df['true_num'], results_df['pred_num'], labels=class_ids)

# Plot confusion matrix with named ticks so the figure can be read on its own
fig, ax = plt.subplots()
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
import torch

# Checkpoint to fine-tune; the classifier is created with two output labels
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Inputs are truncated and padded to the tokenizer's maximum length.

    Args:
        examples: A mapping whose ``"text"`` entry holds the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that batch.
    """
    encoded_batch = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
    )
    return encoded_batch

# The IMDb splits are stored grouped by label, so `select(range(N))` on the
# raw split returns reviews of one class only. Shuffle with a fixed seed
# before taking the subset so both classes are present and runs repeat.
SUBSET_SIZE = 1000
SUBSET_SEED = 42

small_train_dataset = (
    train_data.shuffle(seed=SUBSET_SEED)
    .select(range(SUBSET_SIZE))
    .map(tokenize_function, batched=True)
)
small_test_dataset = (
    test_data.shuffle(seed=SUBSET_SEED)
    .select(range(SUBSET_SIZE))
    .map(tokenize_function, batched=True)
)

# Guard against silently training or evaluating on a single class.
assert len(set(small_train_dataset["label"])) == 2, "training subset has only one class"
assert len(set(small_test_dataset["label"])) == 2, "evaluation subset has only one class"

def compute_metrics(eval_pred):
    """Compute accuracy from the Trainer's evaluation output.

    Args:
        eval_pred: The (logits, labels) pair the Trainer hands to its
            ``compute_metrics`` callback.

    Returns:
        A dict with a single ``"accuracy"`` entry (fraction of correct
        predictions as a float).
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit per example.
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": float(np.mean(predictions == labels))}


# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to="none",  # Disable Weights & Biases logging
)

# Initialize Trainer. Without `compute_metrics` the evaluation only reports
# the loss, which says little about classification quality.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_test_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()

# Evaluate on the held-out subset and show the resulting metrics
eval_result = trainer.evaluate()
print(f"Evaluation results: {eval_result}")


In [None]:
# Hand-written probes; the trailing comment on each is the sentiment the
# author expects.
test_sentences = [
    "I love this product! It's amazing!",  # Positive
    "This is the worst experience ever.",  # Negative
    "The weather is okay today.",           # Neutral
]

for sentence in test_sentences:
    # The pipeline returns one dict per input; keep the first (only) one.
    result = sentiment_pipeline(sentence)[0]
    print(
        f"Text: {sentence}",
        f"Predicted: {result['label']} (Score: {result['score']:.2f})\n",
        sep="\n",
    )

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Example: Test on 100 samples.
# The IMDb splits are stored grouped by label, so the first rows of the test
# split all belong to one class. Draw the sample from a seeded shuffle so both
# classes are represented and the evaluation is repeatable.
EVAL_SAMPLE_SIZE = 100
eval_sample = test_data.shuffle(seed=42).select(range(EVAL_SAMPLE_SIZE))

true_labels = []
pred_labels = []

for example in eval_sample:
    # Reuse the notebook's helper so truncation is handled in one place.
    pred_label, _ = get_sentiment(example["text"])

    true_labels.append(example["label"])  # 0=negative, 1=positive
    pred_labels.append(1 if pred_label == "POSITIVE" else 0)

# Calculate accuracy
accuracy = accuracy_score(true_labels, pred_labels)
print(f"Accuracy: {accuracy:.2f}")

# Detailed report. `labels` pins both classes so the two target names always
# line up (otherwise a single-class sample raises a ValueError), and
# `zero_division=0` keeps an absent class from emitting warnings.
print(
    classification_report(
        true_labels,
        pred_labels,
        labels=[0, 1],
        target_names=["Negative", "Positive"],
        zero_division=0,
    )
)

In [None]:
# Sentences chosen to probe harder cases; the trailing comment names the
# difficulty each one represents.
edge_cases = [
    "The movie was not bad.",               # Negation → Positive
    "Meh, it was okay I guess.",           # Weak sentiment
    "I hate this, but the design is good." # Mixed sentiment
]

for text in edge_cases:
    # Take the single result dict returned for this input.
    result = sentiment_pipeline(text)[0]
    label, score = result['label'], result['score']
    print(f"Text: {text}")
    print(f"Predicted: {label} (Score: {score:.2f})\n")