In [7]:
import numpy as np
from gensim.models import Word2Vec
from keras.datasets import imdb
from keras.layers import Dense, Dropout, Input
from keras.models import Sequential
from keras.optimizers import Adam

# Function to load and split the IMDB dataset
def load_and_split_data(num_words=20000):
    """
    Fetches the IMDB dataset through keras.datasets.imdb.

    'num_words' is forwarded to imdb.load_data to cap the vocabulary at the
    most frequent words.
    Returns a 4-tuple: (X_train, y_train, X_test, y_test).
    """
    train_split, test_split = imdb.load_data(num_words=num_words)
    # Flatten the two (features, labels) pairs into a single 4-tuple.
    train_features, train_labels = train_split
    test_features, test_labels = test_split
    return train_features, train_labels, test_features, test_labels
    
def decode_reviews(X, word_index):
    """
    Turns integer-encoded reviews back into space-separated text.

    'word_index' maps word -> integer id; it is inverted here so ids can be
    looked up. Every id in a review is shifted down by 3 before the lookup
    (the offset used for the special tokens), and any id without a matching
    word is rendered as '?'.
    Returns a list with one decoded string per review in X.
    """
    # Invert the mapping: integer id -> word.
    id_to_word = {idx: token for token, idx in word_index.items()}

    return [
        ' '.join(id_to_word.get(token_id - 3, '?') for token_id in encoded)
        for encoded in X
    ]


def train_word2vec_model(decoded_reviews, vector_size=100, window=5, min_count=1,
                         sg=0, save_path="word2vec_imdb.model"):
    """
    Trains a Word2Vec model on decoded reviews and optionally saves it to disk.

    Args:
        decoded_reviews: Iterable of review strings. Each review is tokenized
            on whitespace with str.split() before training.
        vector_size: Passed through to Word2Vec (embedding dimensionality).
        window: Passed through to Word2Vec (context window size).
        min_count: Passed through to Word2Vec (minimum word frequency).
        sg: Passed through to Word2Vec (training algorithm selector).
        save_path: File path handed to model.save(). Pass None to skip saving.

    Returns:
        The trained Word2Vec model.
    """
    sentences = [review.split() for review in decoded_reviews]  # Tokenize the reviews
    model = Word2Vec(
        sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
    )
    if save_path is not None:
        model.save(save_path)  # Save the model for later use
    return model
    

def vectorize_reviews_with_word2vec(model, reviews):
    """
    Converts each review into a single vector by averaging its word vectors.

    Args:
        model: Object exposing 'wv' (supports 'word in wv' and 'wv[word]')
            and 'vector_size', e.g. a trained Word2Vec model.
        reviews: Iterable of review strings; each is split on whitespace.

    Returns:
        A 2-D numpy array with one row per review and model.vector_size
        columns. Words missing from the vocabulary are ignored; a review with
        no known words becomes a zero vector. An empty 'reviews' yields an
        array of shape (0, model.vector_size) so callers can still read
        .shape[1].
    """
    keyed_vectors = model.wv
    dim = model.vector_size
    review_vectors = []

    for review in reviews:
        # Keep only the vectors of words present in the model's vocabulary.
        word_vectors = [keyed_vectors[word] for word in review.split() if word in keyed_vectors]

        if word_vectors:
            review_vectors.append(np.mean(word_vectors, axis=0))
        else:
            review_vectors.append(np.zeros(dim))  # No known words: fall back to a zero vector

    if not review_vectors:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D instead.
        return np.zeros((0, dim))

    return np.array(review_vectors)


def build_model(input_dim):
    """
    Builds and compiles a simple feed-forward network for binary classification.

    Architecture: Dense(150, relu) -> Dropout(0.4) -> Dense(75, relu) ->
    Dense(1, sigmoid). Compiled with binary cross-entropy loss, the Adam
    optimizer and the 'accuracy' metric.

    Args:
        input_dim: Number of features in each input sample.

    Returns:
        The compiled Sequential model.
    """
    model = Sequential()
    # Declare the input shape with an explicit Input layer instead of passing
    # input_dim to the first Dense layer, which makes Keras emit a warning.
    model.add(Input(shape=(input_dim,)))
    model.add(Dense(150, activation='relu'))  # First hidden layer with 150 neurons
    model.add(Dropout(0.4))  # Dropout layer to prevent overfitting
    model.add(Dense(75, activation='relu'))  # Second hidden layer with 75 neurons
    model.add(Dense(1, activation='sigmoid'))  # Output layer with sigmoid activation (binary classification)

    model.compile(loss='binary_crossentropy', optimizer=Adam(), metrics=['accuracy'])  # Compile the model
    return model

# Step 1: Fetch the integer-encoded IMDB reviews and their labels
X_train, y_train, X_test, y_test = load_and_split_data()

# Step 2: Map the integer ids back to words for both splits (train first, then test)
word_index = imdb.get_word_index()
decoded_train_reviews, decoded_test_reviews = (
    decode_reviews(encoded_split, word_index)
    for encoded_split in (X_train, X_test)
)

# Step 3: Fit the Word2Vec embeddings on the training reviews only
word2vec_model = train_word2vec_model(decoded_train_reviews)

# Step 4: Represent every review as the average of its word vectors
X_train_word2vec, X_test_word2vec = (
    vectorize_reviews_with_word2vec(word2vec_model, decoded_split)
    for decoded_split in (decoded_train_reviews, decoded_test_reviews)
)

# Step 5: Build the classifier sized to the review vectors and train it
input_dim = X_train_word2vec.shape[1]  # Number of features per review vector
model = build_model(input_dim)
# NOTE(review): the test split doubles as validation data here.
model.fit(
    X_train_word2vec,
    y_train,
    epochs=5,
    batch_size=64,
    validation_data=(X_test_word2vec, y_test),
)

# Step 6: Score the trained model on the test split.
# Despite its name, 'accuracy' holds the full evaluate() result; index 1 is the accuracy metric.
accuracy = model.evaluate(X_test_word2vec, y_test)
print(f"Test Accuracy: {accuracy[1] * 100:.2f}%")

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


391/391 ━━━━━━━━━━━━━━━━━━━━ 2s 2ms/step - accuracy: 0.7090 - loss: 0.5481 - val_accuracy: 0.8261 - val_loss: 0.3920
Epoch 2/5
391/391 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.8195 - loss: 0.4039 - val_accuracy: 0.8261 - val_loss: 0.3837
Epoch 3/5
391/391 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.8321 - loss: 0.3863 - val_accuracy: 0.8347 - val_loss: 0.3740
Epoch 4/5
391/391 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.8350 - loss: 0.3793 - val_accuracy: 0.8362 - val_loss: 0.3713
Epoch 5/5
391/391 ━━━━━━━━━━━━━━━━━━━━ 1s 2ms/step - accuracy: 0.8375 - loss: 0.3748 - val_accuracy: 0.8367 - val_loss: 0.3700
782/782 ━━━━━━━━━━━━━━━━━━━━ 1s 778us/step - accuracy: 0.8363 - loss: 0.3714
Test Accuracy: 83.67%
