In [36]:
#Get the data, the source is sited.
# @InProceedings{maas-EtAl:2011:ACL-HLT2011,
#   author    = {Maas, Andrew L.  and  Daly, Raymond E.  and  Pham, Peter T.  and  Huang, Dan  and  Ng, Andrew Y.  and  Potts, Christopher},
#   title     = {Learning Word Vectors for Sentiment Analysis},
#   booktitle = {Proceedings of the 49th Annual Meeting of the Association for Computational Linguistics: Human Language Technologies},
#   month     = {June},
#   year      = {2011},
#   address   = {Portland, Oregon, USA},
#   publisher = {Association for Computational Linguistics},
#   pages     = {142--150},
#   url       = {http://www.aclweb.org/anthology/P11-1015}
# }
import os
import glob

def load_data(directory):
    texts = []
    labels = []
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(directory, label_type)
        for fname in glob.glob(os.path.join(dir_name, '*.txt')):
            with open(fname, 'r', encoding='utf-8') as f:
                texts.append(f.read())
            if label_type == 'neg':
                labels.append(0)
            else:
                labels.append(1)
    return texts, labels

train_texts, train_labels = load_data('../aclImdb_data/train')
test_texts, test_labels = load_data('../aclImdb_data/test')

In [37]:
#Preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Download the NLTK data package
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialising word reducers and deactivators
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(texts):
    preprocessed_texts = []
    for text in texts:
        # Text cleaning: removes non-alphabetic characters
        text = re.sub(r'\W', ' ', text)

        # Tokenization
        words = nltk.word_tokenize(text)

        # Word Restoration and Deactivation Removal
        words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]

        preprocessed_texts.append(' '.join(words))
    return preprocessed_texts
# Pre-processed text
train_texts = preprocess_text(train_texts)
test_texts = preprocess_text(test_texts)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\76219\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
#Form the vocabulary list & Vectorize
from sklearn.feature_extraction.text import CountVectorizer
def vectorize(texts):
    vectorizer = CountVectorizer()

    vectorizer.fit(texts)
    # vocab = vectorizer.get_feature_names_out()

    sequences = vectorizer.transform(texts)

    return sequences

train_sequences = vectorize(train_texts)
test_sequences = vectorize(test_texts)
print(train_sequences[0])
print(test_sequences[0])

  (0, 1310)	2
  (0, 4702)	1
  (0, 6716)	1
  (0, 8552)	1
  (0, 10879)	1
  (0, 11722)	1
  (0, 12648)	1
  (0, 14389)	1
  (0, 14794)	1
  (0, 16999)	1
  (0, 19400)	1
  (0, 20809)	1
  (0, 21215)	1
  (0, 21230)	1
  (0, 21371)	1
  (0, 22512)	1
  (0, 23859)	1
  (0, 23896)	1
  (0, 24198)	1
  (0, 24664)	2
  (0, 25265)	1
  (0, 26194)	1
  (0, 26448)	1
  (0, 26641)	1
  (0, 31512)	1
  :	:
  (0, 49245)	1
  (0, 53437)	1
  (0, 54048)	1
  (0, 54719)	1
  (0, 54728)	1
  (0, 55301)	1
  (0, 56373)	1
  (0, 58616)	1
  (0, 58677)	1
  (0, 58733)	1
  (0, 59150)	1
  (0, 61362)	1
  (0, 61687)	1
  (0, 61857)	1
  (0, 62032)	1
  (0, 62051)	1
  (0, 62401)	1
  (0, 63949)	2
  (0, 64913)	1
  (0, 65194)	1
  (0, 66636)	1
  (0, 66683)	1
  (0, 68023)	1
  (0, 68784)	1
  (0, 69812)	1
  (0, 3461)	1
  (0, 3598)	1
  (0, 3989)	1
  (0, 4207)	1
  (0, 4215)	1
  (0, 6544)	1
  (0, 6583)	1
  (0, 9844)	3
  (0, 10745)	3
  (0, 12027)	2
  (0, 12110)	1
  (0, 12222)	1
  (0, 12528)	1
  (0, 13917)	4
  (0, 13960)	1
  (0, 18444)	1
  (0, 18628)	1
 