In [1]:
# Get the data; the source is cited below.
# @InProceedings{maas-EtAl:2011:ACL-HLT2011,
#   author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
#   title     = {Learning Word Vectors for Sentiment Analysis},
#   booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
#   month     = {June},
#   year      = {2011},
#   address   = {Portland, Oregon, USA},
#   publisher = {Association for Computational Linguistics},
#   pages     = {142--150},
#   url       = {http://www.aclweb.org/anthology/P11-1015}
# }
import os
import glob
def load_data(directory):
    """Read every ``*.txt`` review under the ``neg`` and ``pos`` subfolders.

    Args:
        directory: Path to a folder that contains ``neg`` and ``pos``
            subdirectories holding one UTF-8 text file per review.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists. ``texts`` holds the
        full contents of each file; ``labels`` holds ``0`` for files found
        under ``neg`` and ``1`` for files found under ``pos``. All ``neg``
        entries come first, followed by all ``pos`` entries, and within each
        class the files appear in sorted path order. A missing subfolder
        simply contributes no entries.
    """
    texts = []
    labels = []
    # The position in this list doubles as the numeric label: neg -> 0, pos -> 1.
    for label, label_type in enumerate(['neg', 'pos']):
        pattern = os.path.join(directory, label_type, '*.txt')
        # glob.glob yields matches in arbitrary, filesystem-dependent order;
        # sorting makes the sample order reproducible across machines and runs.
        for fname in sorted(glob.glob(pattern)):
            with open(fname, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label)
    return texts, labels

# Read the training and test splits from the local copy of the dataset.
train_texts, train_labels = load_data(directory='../aclImdb_data/train')
test_texts, test_labels = load_data(directory='../aclImdb_data/test')

In [2]:
#Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Fetch the NLTK resources used by the preprocessing step below.
for _nltk_resource in ('omw-1.4', 'punkt', 'wordnet', 'stopwords'):
    nltk.download(_nltk_resource)

# Shared helpers for preprocess_text: the English stop-word set and a
# WordNet lemmatizer.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(texts):
    """Clean, tokenize, stop-word-filter and lemmatize each document.

    Args:
        texts: Iterable of raw document strings.

    Returns:
        A list with one string per input document: the surviving lemmatized
        tokens, lowercased and joined by single spaces.
    """
    preprocessed_texts = []
    for text in texts:
        # Drop HTML line breaks first. Otherwise the \W substitution below
        # strips only the angle brackets and slash and leaves a stray "br"
        # token (assumes the reviews may contain <br /> markup).
        text = re.sub(r'<br\s*/?>', ' ', text, flags=re.IGNORECASE)

        # Text cleaning: replace every non-word character with a space.
        # Note that \W keeps letters, digits and underscores.
        text = re.sub(r'\W', ' ', text)

        # Lowercase before filtering: the membership test against stop_words
        # is case-sensitive, so "The" or "And" would otherwise slip through
        # NLTK's lowercase English stop-word list.
        text = text.lower()

        # Tokenization
        words = nltk.word_tokenize(text)

        # Stop-word removal followed by lemmatization of the remaining tokens
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

        preprocessed_texts.append(' '.join(words))
    return preprocessed_texts
# Run the cleaning pipeline on both splits. The results rebind the raw-text
# names, so re-running this cell would preprocess already-cleaned text again.
train_texts = preprocess_text(texts=train_texts)
test_texts = preprocess_text(texts=test_texts)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
import numpy as np
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

# Fix the seed so weight initialisation and the shuffle below repeat across runs.
seed = 42
tf.random.set_seed(seed)

# Set the maximum number of words we want to keep based on frequency
max_words = 10000

# Initialize a tokenizer and fit it on the training texts only, so the
# vocabulary is not influenced by the test split.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_texts)

# Convert each document to a sequence of integer word indices
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

# Pad/truncate every sequence to the same length so they stack into one array
maxlen = 500
train_data = pad_sequences(train_sequences, maxlen=maxlen)
test_data = pad_sequences(test_sequences, maxlen=maxlen)

vocabulary_size = max_words
embedding_dim = 128

# Embedding -> single LSTM layer -> sigmoid unit for binary sentiment
model = Sequential()
model.add(Embedding(vocabulary_size, embedding_dim, input_length=maxlen))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

train_labels_array = np.array(train_labels)

# validation_split holds out the LAST fraction of the rows, taken before any
# shuffling. load_data appends every 'neg' review and then every 'pos' review,
# so without a shuffle the held-out tail is drawn from the 'pos' class only and
# the validation metrics are meaningless. Shuffle features and labels with the
# same permutation; train_data / train_labels_array keep their original order.
shuffle_idx = np.random.default_rng(seed).permutation(len(train_data))

# epochs=100 is only an upper bound: stop once validation loss has not improved
# for a few epochs and keep the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(train_data[shuffle_idx], train_labels_array[shuffle_idx],
          batch_size=32, epochs=100, validation_split=0.2,
          callbacks=[early_stopping])

Num GPUs Available:  0
Epoch 1/100


KeyboardInterrupt: 