In [1]:
import random
from nltk.corpus import movie_reviews

documents = []

for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

random.shuffle(documents)

In [2]:
import string
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
punctuations = list(string.punctuation)

stop_words.extend(punctuations)

In [3]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

In [4]:
from nltk.corpus import wordnet
from nltk import pos_tag

def simple_pos_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [5]:
def clean_review(review):
    output_review = []

    for word in review:
        if word.lower() not in stop_words:
            pos = pos_tag([word])

            lemmatizer_word = lemmatizer.lemmatize(word, pos=simple_pos_tag(pos[0][1]))

            output_review.append(lemmatizer_word)

    return output_review

In [6]:
documents = [(clean_review(doc), category) for doc, category in documents]

In [7]:
from nltk import FreqDist

all_words = []

for doc in documents:
    all_words += doc[0]

frequency = FreqDist(all_words)

common_words = frequency.most_common(3000)
vocab_words = [word[0] for word in common_words]

In [8]:
def get_document_features(document):
    document_words = set(document)
    features = {}

    for word in vocab_words:
        features[word] = (word in document_words)

    return features

In [9]:
training_set = [(get_document_features(doc), category) for doc, category in documents[:1600]]
test_set = [(get_document_features(doc), category) for doc, category in documents[1600:]]

In [10]:
from nltk import NaiveBayesClassifier

classifier = NaiveBayesClassifier.train(training_set)

In [11]:
from nltk.classify import accuracy

accuracy(classifier, test_set)

0.7575

In [12]:
from sklearn.ensemble import RandomForestClassifier
from nltk.classify.scikitlearn import SklearnClassifier

random_forest = SklearnClassifier(RandomForestClassifier())

random_forest.train(training_set)

<SklearnClassifier(RandomForestClassifier())>

In [28]:
accuracy(random_forest, test_set)

0.7925

In [29]:
category = [category for _, category in documents]

text_doc = [' '.join(doc) for doc, _ in documents]

In [30]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(text_doc, category, test_size=0.2)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

count_vectorizer = CountVectorizer(max_features=2000, ngram_range=(1, 2), min_df=5, max_df=0.8)
x_train_count_vector = count_vectorizer.fit_transform(x_train)
x_test_count_vector = count_vectorizer.transform(x_test)

In [None]:
count_vectorizer.get_feature_names_out()

array(['000', '10', '100', ..., 'young', 'young man', 'zero'],
      dtype=object)

In [None]:
from sklearn.svm import SVC

svm_count_vector_classifier = SVC()

svm_count_vector_classifier.fit(x_train_count_vector, y_train)

svm_count_vector_classifier.score(x_test_count_vector, y_test)

0.815

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2), min_df=5, max_df=0.8)
x_train_tfidf_vector = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf_vector = tfidf_vectorizer.transform(x_test)

In [54]:
svm_tfidf_vector_classifier = SVC()

svm_tfidf_vector_classifier.fit(x_train_tfidf_vector, y_train)

svm_tfidf_vector_classifier.score(x_test_tfidf_vector, y_test)

0.85