In [1]:
import nltk
from nltk.corpus import movie_reviews
import random

nltk.download('movie_reviews')
nltk.download('punkt')

docs = [(list(movie_reviews.words(fileid)), category)
        for category in movie_reviews.categories()
        for fileid in movie_reviews.fileids(category)]

random.shuffle(docs)

X_raw = [" ".join(words) for words, label in docs]
y = [1 if label == 'pos' else 0 for words, label in docs]

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\ARITRA\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ARITRA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
num_reviews = len(docs)
num_pos = sum(1 for _, label in docs if label == 'pos')
num_neg = sum(1 for _, label in docs if label == 'neg')
total_tokens = sum(len(words) for words, _ in docs)
avg_tokens = total_tokens / num_reviews

print(f"Total Reviews           : {num_reviews}")
print(f" - Positive Reviews     : {num_pos}")
print(f" - Negative Reviews     : {num_neg}")
print(f"Total Word Tokens       : {total_tokens}")
print(f"Average Tokens/Review   : {avg_tokens:.2f}")

Total Reviews           : 2000
 - Positive Reviews     : 1000
 - Negative Reviews     : 1000
Total Word Tokens       : 1583820
Average Tokens/Review   : 791.91


In [15]:
sample_review_words, sample_label = docs[0]
sample_text = " ".join(sample_review_words)
print(f"Label : {'Positive' if sample_label == 'pos' else 'Negative'}")
print("Text  :")
print(sample_text[:1000] + ("..." if len(sample_text) > 1000 else ""))

Label : Positive
Text  :
one fun activity for parents during the holidays is to suggest an old film and see if they can interest their kids . although black - and - white films are frequently viewed as suspect , ones in color are greeted with more of an open mind . and if you can find a colorful action film , even if it is from six decades ago , then there is a real possibility of a take home hit . so it was in our family when we wandered over to the classic section of our local video store the other day and picked up a copy of the adventures of robin hood , a high spirited version of the walter scott story . nominated for the 1938 academy award for best picture and winner of three oscars for erich wolfgang korngold ' s melodramatic music , ralph dawson ' s fast paced editing and carl jules weyl ' s lush sets , the film is probably best remembered for errol flynn ' s charismatic acting as sir robin of locksley , a . k . a . robin hood . flynn , with his handsome figure and toothy smile

In [5]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ARITRA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords')

stop_words = set(stopwords.words('english'))

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
    return tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ARITRA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import numpy as np

def sentence_vector(tokens, embeddings, dim=100):
    vectors = [embeddings[word] for word in tokens if word in embeddings]
    return np.mean(vectors, axis=0) if vectors else np.zeros(dim)

In [16]:
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

X_tokens = [preprocess(text) for text in X_raw]

w2v_cbow = Word2Vec(sentences=X_tokens, vector_size=100, window=5, sg=0, min_count=2)
w2v_skipgram = Word2Vec(sentences=X_tokens, vector_size=100, window=5, sg=1, min_count=2)

X_cbow = np.array([sentence_vector(tokens, w2v_cbow.wv, 100) for tokens in X_tokens])
X_train, X_test, y_train, y_test = train_test_split(X_cbow, y, test_size=0.3, random_state=42)

clf_cbow = LogisticRegression(max_iter=1000)
clf_cbow.fit(X_train, y_train)
print("CBOW Accuracy:", accuracy_score(y_test, clf_cbow.predict(X_test)))

X_skip = np.array([sentence_vector(tokens, w2v_skipgram.wv, 100) for tokens in X_tokens])
X_train, X_test, y_train, y_test = train_test_split(X_skip, y, test_size=0.3, random_state=42)

clf_skip = LogisticRegression(max_iter=1000)
clf_skip.fit(X_train, y_train)
print("SkipGram Accuracy:", accuracy_score(y_test, clf_skip.predict(X_test)))

CBOW Accuracy: 0.635
SkipGram Accuracy: 0.7266666666666667


In [17]:
import numpy as np

def load_glove_embeddings(path=r"C:\Users\ARITRA\Downloads\glove.6B (1)\glove.6B.100d.txt"):
    embeddings = {}
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            values = line.strip().split()
            word = values[0]
            vec = np.array(values[1:], dtype='float32')
            embeddings[word] = vec
    return embeddings

glove = load_glove_embeddings()

X_glove = np.array([sentence_vector(tokens, glove, 100) for tokens in X_tokens])

X_train, X_test, y_train, y_test = train_test_split(X_glove, y, test_size=0.3, random_state=42)
clf_glove = LogisticRegression(max_iter=1000)
clf_glove.fit(X_train, y_train)
print("GloVe Accuracy:", accuracy_score(y_test, clf_glove.predict(X_test)))

GloVe Accuracy: 0.7616666666666667


In [18]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer_bow = CountVectorizer(stop_words='english')
X_bow = vectorizer_bow.fit_transform(X_raw)

X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.3, random_state=42)
clf_bow = LogisticRegression(max_iter=1000)
clf_bow.fit(X_train, y_train)
print("BoW Accuracy:", accuracy_score(y_test, clf_bow.predict(X_test)))


BoW Accuracy: 0.855


In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer_tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer_tfidf.fit_transform(X_raw)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("TF-IDF Accuracy:", accuracy_score(y_test, clf_tfidf.predict(X_test)))

TF-IDF Accuracy: 0.835


Interesting thing: Here we have averaged all the word vectors to represent each review - this can probably explain why WordtoVec and GloVE give worse accuracies than BoW and TF-IDF. But there are several other better ways, which you will (hopefully) explore in the upcoming assignment. Some of these ways are - 

i) Max Pooling

ii) Weighted averaging (TF-IDF weighted)

iii) Doc2Vec

iv) RNNS and LSTMS on these word embeddings (in these the final hidden state is generally used for the representing each document (review in our case)).

