# In [3]:
import numpy as np
from keras.datasets import imdb

# Fetch the IMDB dataset as four flat train/test objects
def load_and_split_data(num_words=10000):
    """
    Load the IMDB dataset, already divided into training and testing parts.

    The vocabulary is capped via 'num_words' (the most frequent words are kept),
    which is forwarded unchanged to imdb.load_data.
    Returns a flat 4-tuple: (X_train, y_train, X_test, y_test).
    """
    train_split, test_split = imdb.load_data(num_words=num_words)
    # Flatten the two (features, labels) pairs into a single tuple.
    X_train, y_train = train_split
    X_test, y_test = test_split
    return X_train, y_train, X_test, y_test
    
def decode_reviews(X, word_index, index_from=3):
    """
    Converts the integer-encoded reviews into human-readable text using the word index.

    Args:
        X: Iterable of reviews, each an iterable of integer token ids.
        word_index: Mapping from word to its integer rank.
        index_from: Offset that was added to every word rank when the reviews
            were encoded; smaller ids are reserved special tokens. Defaults
            to 3, which is also the default used by ``imdb.load_data``.

    Returns:
        A list with one space-joined string per review. Any token id that
        does not map back to a word is rendered as '?'.
    """
    # Invert word -> rank into rank -> word (for duplicate ranks the last word wins).
    reverse_word_index = {rank: word for word, rank in word_index.items()}

    return [
        # Undo the encoding offset before the lookup; special tokens and
        # unknown ids fall through to the '?' placeholder.
        ' '.join(reverse_word_index.get(token - index_from, '?') for token in review)
        for review in X
    ]


def vectorize_reviews_with_word2vec(model, reviews):
    """
    Converts the reviews into vectors by averaging the word vectors.

    Args:
        model: Object exposing ``wv`` (supporting ``word in model.wv`` and
            ``model.wv[word]``) and ``vector_size``. Presumably a trained
            gensim Word2Vec model -- confirm against the caller.
        reviews: Iterable of review strings; each one is split on whitespace.

    Returns:
        A 2-D NumPy array with one row per review. A row is the mean of the
        vectors of the review's in-vocabulary words, or all zeros when the
        review has none. An empty ``reviews`` yields shape (0, vector_size).
    """
    keyed_vectors = model.wv  # loop-invariant lookup, resolved once
    review_vectors = []

    for review in reviews:
        # Keep only the words the model actually has a vector for.
        word_vectors = [
            keyed_vectors[word] for word in review.split() if word in keyed_vectors
        ]

        if word_vectors:
            review_vectors.append(np.mean(word_vectors, axis=0))
        else:
            # No known words: fall back to a zero vector of the right width.
            review_vectors.append(np.zeros(model.vector_size))

    if not review_vectors:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D so
        # callers can always rely on the (n_reviews, vector_size) layout.
        return np.empty((0, model.vector_size))

    return np.array(review_vectors)

# Pull the encoded IMDB splits (default vocabulary cap applies)
X_train, y_train, X_test, y_test = load_and_split_data()

# Word -> rank vocabulary needed to turn the encoded reviews back into text
word_index = imdb.get_word_index()

# Decode both splits, training first and then test
decoded_train_reviews, decoded_test_reviews = [
    decode_reviews(encoded_split, word_index)
    for encoded_split in (X_train, X_test)
]

# Example usage (needs a trained Word2Vec model bound to `word2vec_model`):
# X_train_word2vec = vectorize_reviews_with_word2vec(word2vec_model, decoded_train_reviews)
# X_test_word2vec = vectorize_reviews_with_word2vec(word2vec_model, decoded_test_reviews)