<a href="https://colab.research.google.com/github/akshayaa-403/imdb-sentiment-master/blob/main/IMDB_Sentiment_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip3 install -q transformers tensorflow nltk pandas scikit-learn datasets imblearn spacy tensorflow-addons

In [6]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import nltk
import spacy
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_curve
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import re
import os
from datetime import datetime

In [2]:
# Download required NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [3]:
# Load spaCy model
nlp = spacy.load('en_core_web_sm')

In [4]:
# Create output directory
output_dir = "/content/results"
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Constants
MAX_SEQUENCE_LENGTH = 200
VOCAB_SIZE = 10000
BATCH_SIZE = 32
EPOCHS = 5

In [6]:
def preprocess_text(text):
    """Basic text preprocessing"""
    # Convert to string and lowercase
    text = str(text).lower()

    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)

    # Remove special characters and digits
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespace
    text = ' '.join(text.split())

    return text

In [7]:
def load_and_prepare_data():
    """Load and prepare the IMDb dataset"""
    try:
        # Load IMDb dataset
        from datasets import load_dataset
        dataset = load_dataset('imdb')

        # Convert to pandas DataFrames
        train_df = pd.DataFrame(dataset['train'])
        test_df = pd.DataFrame(dataset['test'])

        # Combine train and test
        df = pd.concat([train_df, test_df], axis=0).reset_index(drop=True)

        # Preprocess texts
        print("Preprocessing texts...")
        df['processed_text'] = df['text'].apply(preprocess_text)

        return df

    except Exception as e:
        print(f"Error loading data: {str(e)}")
        raise

In [8]:
def create_model(vocab_size, embedding_dim=100):
    """Create the model architecture"""
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=MAX_SEQUENCE_LENGTH),
        tf.keras.layers.SpatialDropout1D(0.2),
        tf.keras.layers.Conv1D(64, 5, activation='relu'),
        tf.keras.layers.MaxPooling1D(pool_size=4),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

In [9]:
def plot_training_history(history):
    """Plot and save training history"""
    plt.figure(figsize=(12, 4))

    # Plot accuracy
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    # Plot loss
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Training Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, 'training_history.png'))
    plt.close()

In [10]:
def save_evaluation_results(y_true, y_pred, y_pred_proba):
    """Save evaluation metrics and plots"""
    # Classification Report
    report = classification_report(y_true, y_pred, output_dict=True)
    pd.DataFrame(report).transpose().to_csv(os.path.join(output_dir, 'classification_report.csv'))

    # Confusion Matrix
    plt.figure(figsize=(8, 6))
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
    plt.savefig(os.path.join(output_dir, 'confusion_matrix.png'))
    plt.close()

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_true, y_pred_proba)
    plt.figure(figsize=(8, 6))
    plt.plot(fpr, tpr)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.savefig(os.path.join(output_dir, 'roc_curve.png'))
    plt.close()

In [11]:
# Main execution
if __name__ == "__main__":
    try:
        print("Starting Text Intelligence Project...")
        print(f"Output directory: {output_dir}")

        # Load and prepare data
        print("Loading and preparing data...")
        df = load_and_prepare_data()

        # Create and configure the vectorizer
        print("Creating text vectorizer...")
        vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=VOCAB_SIZE,
            output_mode='int',
            output_sequence_length=MAX_SEQUENCE_LENGTH
        )

        # Replace the vectorization and data splitting section with this:

        # Adapt the vectorizer to the text data
        text_ds = tf.data.Dataset.from_tensor_slices(df['processed_text'].values)
        vectorizer.adapt(text_ds)

        # First split the data before vectorization
        X_train, X_test, y_train, y_test = train_test_split(
            df['processed_text'].values,
            df['label'].values,
            test_size=0.2,
            random_state=42
        )

        # Then vectorize the split data
        X_train = vectorizer(X_train)
        X_test = vectorizer(X_test)

        # Create model
        print("Building model...")
        model = create_model(VOCAB_SIZE)

        # Create callbacks
        callbacks = [
            tf.keras.callbacks.ModelCheckpoint(
                os.path.join(output_dir, 'best_model.h5'),
                save_best_only=True,
                monitor='val_loss'
            ),
            tf.keras.callbacks.EarlyStopping(
                monitor='val_loss',
                patience=3,
                restore_best_weights=True
            )
        ]

        # Train model
        print("Training model...")
        history = model.fit(
            X_train, y_train,
            validation_split=0.2,
            epochs=EPOCHS,
            batch_size=BATCH_SIZE,
            callbacks=callbacks
        )

        # Plot training history
        plot_training_history(history)

        # Evaluate model
        print("Evaluating model...")
        y_pred_proba = model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int)

        # Save evaluation results
        save_evaluation_results(y_test, y_pred, y_pred_proba)

        # Print final accuracy
        test_loss, test_accuracy = model.evaluate(X_test, y_test)
        print(f"\nFinal Test Accuracy: {test_accuracy:.4f}")

        # Save final model
        model.save(os.path.join(output_dir, 'final_model.h5'))

        print(f"Project completed. Results saved in {output_dir}")

    except Exception as e:
        print(f"Error in main execution: {str(e)}")
        raise

Starting Text Intelligence Project...
Output directory: /content/results
Loading and preparing data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Preprocessing texts...
Creating text vectorizer...
Building model...
Training model...
Epoch 1/5




[1m 997/1000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - accuracy: 0.5706 - loss: 0.6463



[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 18ms/step - accuracy: 0.5710 - loss: 0.6459 - val_accuracy: 0.8530 - val_loss: 0.3508
Epoch 2/5
[1m 998/1000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - accuracy: 0.8756 - loss: 0.3167



[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 17ms/step - accuracy: 0.8756 - loss: 0.3167 - val_accuracy: 0.8709 - val_loss: 0.3141
Epoch 3/5
[1m 998/1000[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 15ms/step - accuracy: 0.9097 - loss: 0.2398



[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 17ms/step - accuracy: 0.9097 - loss: 0.2398 - val_accuracy: 0.8712 - val_loss: 0.3078
Epoch 4/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 17ms/step - accuracy: 0.9278 - loss: 0.2000 - val_accuracy: 0.8590 - val_loss: 0.3579
Epoch 5/5
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 16ms/step - accuracy: 0.9427 - loss: 0.1662 - val_accuracy: 0.8686 - val_loss: 0.3419
Evaluating model...
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8707 - loss: 0.3088





Final Test Accuracy: 0.8730
Project completed. Results saved in /content/results


In [15]:
# Add this at the end to download results
from google.colab import files

# Zip the results directory
!zip -r /content/results.zip /content/results

# Download the zip file
files.download('/content/results.zip')

  adding: content/results/ (stored 0%)
  adding: content/results/final_model.h5 (deflated 7%)
  adding: content/results/roc_curve.png (deflated 13%)
  adding: content/results/best_model.h5 (deflated 7%)
  adding: content/results/training_history.png (deflated 8%)
  adding: content/results/confusion_matrix.png (deflated 19%)
  adding: content/results/classification_report.csv (deflated 41%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>