In [None]:
# Import necessary libraries
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Download dataset
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
# unpack the dataset
!tar -xf aclImdb_v1.tar.gz

In [None]:
# Locations of the two dataset splits inside the extracted archive
train_path, test_path = "aclImdb/train/", "aclImdb/test/"

# Tunable parameters used by the cells below
max_words = 10000  # passed to the Tokenizer (num_words) and the Embedding layer
max_len = 200      # length every review is padded/truncated to
batch_size = 32    # batch size passed to model.fit
epochs = 10        # number of epochs passed to model.fit

In [None]:
# Function to read data from files
def read_data(path):
    """
    Load labelled movie reviews from a directory with "pos" and "neg" sub-directories.

    Every regular file inside ``<path>/pos`` and ``<path>/neg`` is read as UTF-8
    text and treated as one review.

    Args:
        path: Path to the data directory. A trailing slash is optional.

    Returns:
        reviews: List of movie reviews; all "pos" reviews come first, then all
            "neg" reviews, each group in sorted filename order.
        labels: List of sentiment labels aligned with ``reviews``
            (0 for negative, 1 for positive).

    Raises:
        OSError: If a sentiment sub-directory is missing or a file cannot be opened.
        UnicodeDecodeError: If a review file is not valid UTF-8.
    """
    reviews = []
    labels = []

    for sentiment, label in (("pos", 1), ("neg", 0)):
        # os.path.join works whether or not `path` ends with a separator.
        sentiment_path = os.path.join(path, sentiment)

        # os.listdir returns entries in arbitrary order; sorting keeps the
        # sample order identical across runs and machines.
        for filename in sorted(os.listdir(sentiment_path)):
            file_path = os.path.join(sentiment_path, filename)

            # Skip sub-directories and other non-file entries instead of
            # crashing in open().
            if not os.path.isfile(file_path):
                continue

            with open(file_path, "r", encoding="utf-8") as f:
                reviews.append(f.read())
            labels.append(label)

    return reviews, labels


In [None]:
# Load the reviews and their 0/1 labels for the training and test splits
train_reviews, train_labels = read_data(path=train_path)
test_reviews, test_labels = read_data(path=test_path)

In [None]:
# Fit the tokenizer on the training reviews only; the test reviews are
# encoded with this same fitted tokenizer further down.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_reviews)

# Training split: text -> integer sequences -> sequences padded to max_len
train_sequences = tokenizer.texts_to_sequences(train_reviews)
train_padded = pad_sequences(train_sequences, maxlen=max_len)

# Test split: same two steps
test_sequences = tokenizer.texts_to_sequences(test_reviews)
test_padded = pad_sequences(test_sequences, maxlen=max_len)

In [None]:
# Split data into training and validation sets (20% held out for validation).
# random_state pins the shuffle so the same reviews land in each set on every
# run; stratify keeps the positive/negative ratio the same in both sets.
# NOTE: train_labels is rebound to the training subset here, so re-run the
# data-loading cell before re-running this one.
train_data, val_data, train_labels, val_labels = train_test_split(
    train_padded,
    train_labels,
    test_size=0.2,
    random_state=42,
    stratify=train_labels,
)

# Assemble the network one layer at a time: embedding -> LSTM -> sigmoid output
model = keras.Sequential()
model.add(keras.layers.Embedding(max_words, 128, input_length=max_len))
model.add(keras.layers.LSTM(64))
model.add(keras.layers.Dense(1, activation="sigmoid"))

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Convert the label collections to NumPy arrays, then fit the model while
# monitoring the validation split after each epoch
train_labels = np.array(train_labels)
val_labels = np.array(val_labels)
history = model.fit(
    train_data,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(val_data, val_labels),
)

In [None]:
# Score the trained model on the test reviews and report its accuracy
test_labels = np.array(test_labels)
test_loss, test_acc = model.evaluate(x=test_padded, y=test_labels)
print("Test accuracy:", test_acc)

In [None]:
# Plot training and validation loss per epoch.
# matplotlib is imported in the notebook's import cell; an explicit Axes is
# used instead of pyplot's implicit "current figure" state.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='Train')
ax.plot(history.history['val_loss'], label='Validation')
ax.set(title='Model loss', xlabel='Epoch', ylabel='Loss')
ax.legend(loc='upper left')
plt.show()