In [None]:
import requests
import tarfile
import os

# Define paths
base_path = os.path.join(os.getcwd(), "Dataset", "Text_Classification")
os.makedirs(base_path, exist_ok=True)  # Create the folder if it doesn't exist
dataset_path = os.path.join(base_path, "aclImdb_v1.tar.gz")
# The archive already contains a top-level "aclImdb" directory, so it is
# extracted into base_path; the train/ and test/ folders then sit directly
# inside extract_path, which is where the loading code looks for them.
extract_path = os.path.join(base_path, "aclImdb")
# Written only after a successful extraction, so an interrupted run is never
# mistaken for a complete dataset.
ready_marker = os.path.join(extract_path, ".extraction_complete")

# URL of the IMDB dataset
url = "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"

if os.path.exists(ready_marker):
    # Makes Restart & Run All cheap: the archive is not fetched again.
    print("Dataset already extracted, skipping download.")
else:
    try:
        # Download the dataset
        print("Downloading IMDB dataset...")
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail loudly on 4xx/5xx instead of saving an HTML error page
            # and crashing later inside tarfile.
            response.raise_for_status()
            with open(dataset_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        file.write(chunk)
        print("Download complete.")

        # Extract the dataset
        print("Extracting dataset...")
        with tarfile.open(dataset_path, "r:gz") as tar:
            if hasattr(tarfile, "data_filter"):
                # Extraction filters reject absolute paths, "..", and
                # special files; only available on patched Python versions.
                tar.extractall(path=base_path, filter="data")
            else:
                tar.extractall(path=base_path)
        with open(ready_marker, "w", encoding="utf-8") as marker:
            marker.write("ok\n")
        print("Extraction complete.")
    finally:
        # Clean up the tar.gz file (also after a failed or partial download)
        if os.path.exists(dataset_path):
            os.remove(dataset_path)

print("Dataset is ready at:", extract_path)

In [None]:
# Third-party imports used by the cells below
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import torch
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from torch.optim import AdamW  # transformers' own AdamW is deprecated/removed in recent releases
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertForSequenceClassification, BertTokenizer

# Set style for better visualizations.
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so use whichever alias this
# installation actually ships.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)
sns.set_palette("husl")

In [None]:
# Load the data
def load_imdb_data(base_path, split='train'):
    """Read the labelled reviews of one dataset split into memory.

    Args:
        base_path: Directory containing the split folder(s). Each split
            folder must hold a ``pos`` and a ``neg`` subfolder of ``.txt``
            review files.
        split: Name of the split folder to read. Defaults to ``'train'``.

    Returns:
        A ``(texts, labels)`` tuple of equal-length lists. Positive reviews
        (label ``1``) come first, followed by negative reviews (label ``0``);
        within each group files are read in sorted filename order.

    Raises:
        FileNotFoundError: If a ``pos``/``neg`` folder does not exist
            (propagated from ``os.listdir``).
    """
    texts = []
    labels = []

    for sentiment_dir, label in (('pos', 1), ('neg', 0)):
        review_dir = os.path.join(base_path, split, sentiment_dir)
        # os.listdir returns entries in arbitrary, filesystem-dependent
        # order; sorting keeps any seeded split made from this output
        # reproducible across machines.
        for filename in sorted(os.listdir(review_dir)):
            if filename.endswith('.txt'):
                with open(os.path.join(review_dir, filename), 'r', encoding='utf-8') as f:
                    texts.append(f.read())
                labels.append(label)

    return texts, labels

# Load the data
texts, labels = load_imdb_data(extract_path)
# Catch an empty or mismatched load here rather than as an opaque error
# deep inside the tokenizer or the training loop.
assert texts, f"No reviews found under {extract_path!r}"
assert len(texts) == len(labels)

# Split the data. Stratifying on the labels keeps the positive/negative ratio
# the same in the training and validation parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)
assert len(train_texts) + len(val_texts) == len(texts)

In [None]:
# Load the tokenizer that belongs to the pretrained checkpoint
pretrained_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(pretrained_name)

def tokenize_data(texts, labels, tokenizer, max_length=512):
    """Encode texts with ``tokenizer`` and pair them with a label tensor.

    Args:
        texts: Input passed straight to ``tokenizer`` (here: a list of
            review strings).
        labels: Class ids, one per text. May be a Python sequence or an
            existing tensor.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries when invoked with
            ``return_tensors='pt'``.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        ``(input_ids, attention_mask, labels)`` where ``labels`` is a
        ``torch.long`` tensor.
    """
    encodings = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors='pt',
    )
    # as_tensor accepts lists as well as tensors, so re-running a cell that
    # feeds an earlier result back in does not trigger torch.tensor's
    # copy-construct warning. The dtype is pinned to int64 explicitly.
    label_tensor = torch.as_tensor(labels, dtype=torch.long)
    return encodings['input_ids'], encodings['attention_mask'], label_tensor

# Tokenize train and validation data.
# The label tensors get their own names so that train_labels / val_labels
# remain the plain Python lists produced by the split. That keeps this cell
# idempotent on re-run and leaves the original labels available to later cells.
train_input_ids, train_attention_mask, train_label_tensor = tokenize_data(train_texts, train_labels, tokenizer)
val_input_ids, val_attention_mask, val_label_tensor = tokenize_data(val_texts, val_labels, tokenizer)

# Create DataLoaders
batch_size = 16

# Only the training loader shuffles; validation order stays fixed.
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_label_tensor)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_label_tensor)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

In [None]:
# Seed torch's global RNG so the freshly initialised classification head and
# the shuffling of the training loader start from the same state on every
# Restart & Run All. (Some GPU kernels may still be non-deterministic.)
torch.manual_seed(42)

# Initialize model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=2e-5)

# Create lists to store metrics (one entry appended per finished epoch)
training_losses = []
validation_accuracies = []
validation_losses = []

# Training loop
epochs = 3


def _model_inputs(batch, device):
    """Move an (input_ids, attention_mask, labels) batch to `device` as model kwargs."""
    input_ids, attention_mask, batch_labels = (tensor.to(device) for tensor in batch)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': batch_labels,
    }


def _train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimisation pass over `dataloader`; return the mean per-batch loss."""
    model.train()
    running_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        loss = model(**_model_inputs(batch, device)).loss
        running_loss += loss.item()
        loss.backward()
        optimizer.step()
    return running_loss / len(dataloader)


def _evaluate(model, dataloader, device):
    """Score `model` on `dataloader`; return (mean per-batch loss, accuracy)."""
    model.eval()
    running_loss = 0.0
    correct = 0
    seen = 0
    with torch.no_grad():
        for batch in dataloader:
            inputs = _model_inputs(batch, device)
            outputs = model(**inputs)
            running_loss += outputs.loss.item()
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == inputs['labels']).sum().item()
            seen += len(inputs['labels'])
    return running_loss / len(dataloader), correct / seen


for epoch in range(epochs):
    avg_train_loss = _train_one_epoch(model, train_dataloader, optimizer, device)
    avg_val_loss, val_accuracy = _evaluate(model, val_dataloader, device)

    # Store metrics
    training_losses.append(avg_train_loss)
    validation_losses.append(avg_val_loss)
    validation_accuracies.append(val_accuracy)

    print(f"Epoch {epoch + 1}:")
    print(f"Average training loss: {avg_train_loss:.4f}")
    print(f"Average validation loss: {avg_val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")
    print("----------------------------------------")

In [None]:
def predict_sentiment(text, model, tokenizer, device):
    """Classify a piece of text as "Positive" or "Negative".

    Args:
        text: Text to classify.
        model: Classifier whose output exposes ``.logits``; it is switched
            to eval mode before the forward pass.
        tokenizer: Tokenizer producing ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to before inference.

    Returns:
        ``"Positive"`` when the highest-scoring class index is 1,
        otherwise ``"Negative"``.
    """
    encoded = tokenizer(
        text,
        max_length=512,
        truncation=True,
        padding=True,
        return_tensors='pt',
    )
    ids_on_device = encoded['input_ids'].to(device)
    mask_on_device = encoded['attention_mask'].to(device)

    # Inference only: eval mode, no gradient tracking
    model.eval()
    with torch.no_grad():
        logits = model(ids_on_device, attention_mask=mask_on_device).logits
        predicted_class = logits.argmax(dim=1).item()

    if predicted_class == 1:
        return "Positive"
    return "Negative"

# Test the model on one hand-written review and show both the input and the
# predicted label.
test_text = "This movie was really great! The acting was superb and the plot was engaging."
result = predict_sentiment(test_text, model, tokenizer, device)
print(f"Test text: {test_text}")
print(f"Predicted sentiment: {result}")

In [None]:
def plot_training_metrics(train_losses=None, val_losses=None, val_accuracies=None):
    """Plot per-epoch loss curves and validation accuracy side by side.

    Args:
        train_losses: Per-epoch training losses. Defaults to the notebook's
            ``training_losses`` list.
        val_losses: Per-epoch validation losses. Defaults to the notebook's
            ``validation_losses`` list; the curve is skipped when empty.
        val_accuracies: Per-epoch validation accuracies. Defaults to the
            notebook's ``validation_accuracies`` list.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if train_losses is None:
        train_losses = training_losses
    if val_losses is None:
        val_losses = validation_losses
    if val_accuracies is None:
        val_accuracies = validation_accuracies

    def _epoch_axis(series):
        # Derive the x axis from the recorded values rather than from the
        # configured epoch count, so a run that stopped early still plots.
        return range(1, len(series) + 1)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

    # Plot training loss, with validation loss on the same axes so that
    # diverging curves (overfitting) are visible.
    ax1.plot(_epoch_axis(train_losses), train_losses, marker='o', label='Training')
    if len(val_losses) > 0:
        ax1.plot(_epoch_axis(val_losses), val_losses, marker='o', label='Validation')
    ax1.set_title('Loss Over Epochs')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()
    ax1.grid(True)

    # Plot validation accuracy
    ax2.plot(_epoch_axis(val_accuracies), val_accuracies, marker='o', color='green')
    ax2.set_title('Validation Accuracy Over Epochs')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.grid(True)

    plt.tight_layout()
    plt.show()

def plot_confusion_matrix():
    """Evaluate the notebook's ``model`` on ``val_dataloader`` and report errors.

    Draws the confusion matrix as a heatmap and prints a per-class
    classification report. Reads the notebook-level ``model``,
    ``val_dataloader`` and ``device``; returns None.
    """
    class_ids = [0, 1]
    class_names = ['Negative', 'Positive']

    # Get predictions for validation set
    model.eval()
    all_predictions = []
    all_labels = []

    with torch.no_grad():
        for input_ids, attention_mask, batch_labels in val_dataloader:
            # Labels are only collected, not fed to the model, so they do
            # not need to be moved to the device.
            outputs = model(
                input_ids=input_ids.to(device),
                attention_mask=attention_mask.to(device),
            )
            predictions = torch.argmax(outputs.logits, dim=1)
            all_predictions.extend(predictions.cpu().tolist())
            all_labels.extend(batch_labels.tolist())

    # Create confusion matrix. Pinning the class ids keeps it 2x2 (and the
    # report aligned with class_names) even if a class never occurs.
    cm = confusion_matrix(all_labels, all_predictions, labels=class_ids)

    plt.figure(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.show()

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(all_labels, all_predictions,
                                labels=class_ids,
                                target_names=class_names))

def plot_text_length_distribution():
    """Overlay histograms of review length (in whitespace-separated words).

    Reads the notebook-level ``train_texts`` and ``val_texts``, shows the
    figure and prints the mean length of each set. Returns None.
    """
    # Calculate text lengths
    train_lengths = [len(text.split()) for text in train_texts]
    val_lengths = [len(text.split()) for text in val_texts]

    # Use one set of bin edges for both histograms. With bins=50 on each call
    # separately, every set gets its own bin width and the overlaid bars
    # cannot be compared.
    shared_bins = np.histogram_bin_edges(train_lengths + val_lengths, bins=50)

    plt.figure(figsize=(12, 6))
    sns.histplot(data=train_lengths, label='Training Set', alpha=0.5, bins=shared_bins)
    sns.histplot(data=val_lengths, label='Validation Set', alpha=0.5, bins=shared_bins)
    plt.title('Distribution of Text Lengths')
    plt.xlabel('Number of Words')
    plt.ylabel('Count')
    plt.legend()
    plt.show()

    print(f"Average length in training set: {np.mean(train_lengths):.2f} words")
    print(f"Average length in validation set: {np.mean(val_lengths):.2f} words")

def plot_label_distribution():
    """Show how many negative (0) and positive (1) labels each split holds.

    Reads the notebook-level ``train_labels`` and ``val_labels`` and draws
    one count plot per split. Returns None.
    """
    def _label_array(values):
        # The label variables may hold Python lists or torch tensors depending
        # on which cells have run; normalise both to a NumPy array instead of
        # relying on seaborn to coerce tensors.
        if hasattr(values, 'detach'):
            values = values.detach().cpu()
        return np.asarray(values)

    fig, (train_ax, val_ax) = plt.subplots(1, 2, figsize=(10, 5))

    # Training set distribution (fixed category order keeps panels aligned)
    sns.countplot(x=_label_array(train_labels), order=[0, 1], ax=train_ax)
    train_ax.set_title('Label Distribution in Training Set')
    train_ax.set_xlabel('Sentiment (0=Negative, 1=Positive)')

    # Validation set distribution
    sns.countplot(x=_label_array(val_labels), order=[0, 1], ax=val_ax)
    val_ax.set_title('Label Distribution in Validation Set')
    val_ax.set_xlabel('Sentiment (0=Negative, 1=Positive)')

    plt.tight_layout()
    plt.show()

In [None]:
# Post-training report: print each section heading, then render its figure(s)
print("Generating visualizations...")

report_sections = (
    ("\n1. Training Metrics:", plot_training_metrics),
    ("\n2. Confusion Matrix and Classification Report:", plot_confusion_matrix),
    ("\n3. Text Length Distribution:", plot_text_length_distribution),
    ("\n4. Label Distribution:", plot_label_distribution),
)

for heading, render in report_sections:
    print(heading)
    render()