In [120]:
import gensim
import nltk
import json

from nltk.corpus import movie_reviews
from nltk.corpus import shakespeare
from nltk.corpus import reuters
# Fetch the NLTK resources used by this notebook, in a fixed order.
for resource in ('reuters', 'shakespeare', 'movie_reviews', 'punkt'):
    nltk.download(resource)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

[nltk_data] Downloading package reuters to
[nltk_data]     C:\Users\alex-\AppData\Roaming\nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package shakespeare to
[nltk_data]     C:\Users\alex-\AppData\Roaming\nltk_data...
[nltk_data]   Package shakespeare is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\alex-\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alex-\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [122]:
# Repeats two downloads already issued in the first cell; NLTK reports both
# packages as up-to-date, and the cell displays the last call's return value.
nltk.download('movie_reviews')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\alex-\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alex-\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [124]:
# Load movie reviews for classification

nltk.download('movie_reviews')

# Each entry pairs one review's word sequence with its category label.
reviews = [
    (movie_reviews.words(review_id), label)
    for label in movie_reviews.categories()
    for review_id in movie_reviews.fileids(label)
]

# Preprocess the dataset: rebuild every review as one space-separated string,
# keep the labels in the same order, then hold out 15% of the reviews for testing.

X = [' '.join(tokens) for tokens, _label in reviews]
y = [label for _tokens, label in reviews]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=1,
)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\alex-\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [125]:
# Tokenizer

def tokenize(corpus):
    """Lower-case every document in ``corpus`` and split it into word tokens.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A list with one token list per document, in input order.
    """
    tokenized_docs = []
    for text in corpus:
        tokenized_docs.append(nltk.word_tokenize(text.lower()))
    return tokenized_docs

# Train Word2Vec models
def train_word2vec(corpus, vector_size=100, window=5, sg=1, min_count=2, **word2vec_kwargs):
    """Tokenize ``corpus`` and train a gensim Word2Vec model on it.

    The defaults reproduce the configuration this notebook has always used
    (skip-gram, window of 5, words seen fewer than 2 times ignored), so existing
    calls behave exactly as before.

    Args:
        corpus: Iterable of document strings; each one is passed through
            ``tokenize`` before training.
        vector_size: Dimensionality of the learned word vectors.
        window: Context window size forwarded to Word2Vec.
        sg: Training algorithm flag forwarded to Word2Vec (1 selects skip-gram).
        min_count: Minimum word frequency forwarded to Word2Vec.
        **word2vec_kwargs: Any further ``gensim.models.Word2Vec`` options,
            e.g. ``seed`` or ``workers`` when a reproducible run is needed.

    Returns:
        The trained ``gensim.models.Word2Vec`` model.
    """
    tokenized_corpus = tokenize(corpus)
    return gensim.models.Word2Vec(
        sentences=tokenized_corpus,
        vector_size=vector_size,
        window=window,
        sg=sg,
        min_count=min_count,
        **word2vec_kwargs,
    )

# Represent movie reviews using averaged word embeddings
def get_document_vector(doc, model):
    """Embed ``doc`` as the mean of the vectors of its in-vocabulary words.

    Args:
        doc: Document text; it is lower-cased and tokenized with NLTK.
        model: Object exposing ``wv`` (word -> vector lookup that supports
            ``in``) and ``vector_size``.

    Returns:
        The element-wise mean of the found word vectors, or a zero vector of
        length ``model.vector_size`` when no token is in the vocabulary.
    """
    known_vectors = []
    for token in nltk.word_tokenize(doc.lower()):
        if token in model.wv:
            known_vectors.append(model.wv[token])

    # Without any known word there is nothing to average.
    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [149]:
# Train model on Shakespeare

# One space-joined string per file in the Shakespeare corpus.
shakespeare_corpus = [
    ' '.join(shakespeare.words(play_id))
    for play_id in shakespeare.fileids()
]
shakespeare_model = train_word2vec(shakespeare_corpus)

# Approx. number of words in Shakespeare Corpus
# (counted by splitting each joined document on single spaces)

num_words = sum(map(lambda play_text: len(play_text.split(' ')), shakespeare_corpus))
print(num_words)

240577


In [151]:
# Train model on a contemporary corpus, using roughly the same amount of data as with the Shakespeare corpus to make it a fair comparison

# Collect Reuters documents in corpus order until adding the next one would push
# the running word count above the Shakespeare word count (num_words). Building
# the list incrementally means that:
#   * every document is kept when the whole corpus fits within the budget
#     (slicing by the loop index used to drop the last one in that case),
#   * an empty corpus yields an empty list instead of an undefined loop index,
#   * documents beyond the budget are never loaded.
# contemporary_words ends up as the word count of the documents actually kept.
contemporary_corpus = []
contemporary_words = 0
for fileid in reuters.fileids():
    document = ' '.join(reuters.words(fileid))
    document_words = len(document.split(' '))
    if contemporary_words + document_words > num_words:
        break
    contemporary_corpus.append(document)
    contemporary_words += document_words

contemporary_model = train_word2vec(contemporary_corpus)

In [197]:
# Prepare features
# Every review is turned into one document vector, once per embedding model and
# once per data split (train first, then test).
X_train_shakespeare, X_test_shakespeare = (
    [get_document_vector(review, shakespeare_model) for review in reviews_split]
    for reviews_split in (X_train, X_test)
)

X_train_contemporary, X_test_contemporary = (
    [get_document_vector(review, contemporary_model) for review in reviews_split]
    for reviews_split in (X_train, X_test)
)

In [224]:
# Count the test labels equal to 'pos'; each matching comparison contributes 1.
positives = sum(label == 'pos' for label in y_test)

In [230]:
# Train and evaluate classifiers
def evaluate(X_train, X_test, y_train, y_test):
    """Fit a logistic-regression classifier and score it on the test split.

    Args:
        X_train: Training feature vectors.
        X_test: Test feature vectors.
        y_train: Labels for ``X_train``.
        y_test: Labels for ``X_test``.

    Returns:
        The value of ``accuracy_score`` for the predictions on ``X_test``.
    """
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))

# Score the same classifier setup on both feature sets.
accuracy_shakespeare = evaluate(X_train_shakespeare, X_test_shakespeare, y_train, y_test)
accuracy_contemporary = evaluate(X_train_contemporary, X_test_contemporary, y_train, y_test)

# Report the share of positive test labels next to both accuracies, one per line.
print(
    f" {(positives / len(y_test)) * 100:.2f}% of the test examples are positive",
    f"Accuracy using Shakespeare embeddings: {accuracy_shakespeare}",
    f"Accuracy using Contemporary embeddings: {accuracy_contemporary}",
    sep="\n",
)

 53.33% of the test examples are positive
Accuracy using Shakespeare embeddings: 0.5866666666666667
Accuracy using Contemporary embeddings: 0.62


We can see that out of all the movie reviews from the test set, 53% were positive. Of our two models trained:

The one trained on Shakespeare achieves an accuracy of 58.7%
The one trained on Reuters data achieves an accuracy of 62%

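Both are better than a simple coin toss (50-50 odds), and both also beat the 53.3% accuracy that always predicting "positive" would achieve on this test set, so there is some signal there. However, these models could improve a lot. Some of the reasons I suspect they are not doing very well are: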

1. Shakespeare's text uses archaic language that is rarely used today, so it does not resemble the language used in movie reviews. Although Reuters uses more contemporary language, it is still very niche and heavily focused on news, which is not quite the style of a movie review either. In both cases, word2vec learns its embeddings from a writing style that differs considerably from the style of the reviews they are later applied to, which probably makes the embeddings less meaningful for this task.
2. A logistic regression model might not be the best one to use for this classification task. A neural network or an ensemble model like Random Forest would probably do better.