TO JEST BRUDNOPIS i historia porównywania klasyfikatorów

In [1]:
import nltk
from typing import List
from nltk.sentiment import SentimentIntensityAnalyzer
import re
import numpy as np
import pandas as pd
import pickle

In [2]:
sample_review = f"""Good. It IS a page turner. You can read this book in one day, two at the most, and the plot drives the whole book. The unreliable narrators (there are two besides the main character) are as unlikable as they are unreliable, and there isn't a nice male in the book. Entirely plot driven; the characters are paper thin. You can figure out who-dunnit by the middle of the book. The ending is weak. I can't imagine what all the fuss is about, except that it is quick and there are lots of twists and turns, and you can't trust anyone to tell the truth."""

In [3]:
nltk.download([
     "names",
     "stopwords",
     "averaged_perceptron_tagger",
     "vader_lexicon",
     "punkt",
], quiet=True);

In [4]:
def preprocess(review: str) -> List[List[str]]:
    """Function preprocesses the given text. It switches all the letters to lowercase,
    deletes the stopwords and divides the review by sentences.

    Args:
        review (str): Review text in start format

    Returns:
        List[List[str]]: List of sentences, divided by words, without stopwords and lowercase
    """

    stopwords: List[str] = nltk.corpus.stopwords.words("english")
    end_sentence_chars: List[str] = ["?", ".", "!"]

    review_tokenized: List[str] = [word.lower() for word in nltk.word_tokenize(sample_review) if word.isalpha() and word not in stopwords]
    review_by_sentences = re.split(r'[.?!] ', sample_review) # divide the string into sentences

    # review_tokenized: List[List[str]] = [] #list of sentences every one of which is split into lowercase words without stopwords

    # for sentence in review_by_sentences:
    #     review_tokenized.append([word.lower() for word in nltk.word_tokenize(sentence) if word.isalpha() and word not in stopwords])

    return review_by_sentences


In [5]:
def is_positive(review: List[List[str]])-> bool:
    """This function analyses the sentiment of a given review.

    Args:
        review (List[List[str]): Processed review (tokenized, lowercase and without stopwords, divided by sentences)

    Returns:
        bool: False if sentiment is negative, True if it is positive
    """
    sia = SentimentIntensityAnalyzer()
    for sentence in review:
        print(sentence)
        print(sia.polarity_scores(sentence)["compound"])
    sentences_scores = [sia.polarity_scores(sentence)["compound"] for sentence in review]

    return np.mean(sentences_scores) > 0


print(is_positive(preprocess(sample_review)))


Good
0.4404
It IS a page turner
0.0
You can read this book in one day, two at the most, and the plot drives the whole book
0.0
The unreliable narrators (there are two besides the main character) are as unlikable as they are unreliable, and there isn't a nice male in the book
-0.3252
Entirely plot driven; the characters are paper thin
0.0
You can figure out who-dunnit by the middle of the book
0.0
The ending is weak
-0.4404
I can't imagine what all the fuss is about, except that it is quick and there are lots of twists and turns, and you can't trust anyone to tell the truth.
-0.1032
False


In [6]:
unwanted = nltk.corpus.stopwords.words("english")
unwanted.extend([w.lower() for w in nltk.corpus.names.words()])

def skip_unwanted(pos_tuple):
    word, tag = pos_tuple
    if not word.isalpha() or word in unwanted:
        return False
    if tag.startswith("NN"):
        return False
    return True

positive_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
)]

negative_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
)]

positive_fd = nltk.FreqDist(positive_words)
negative_fd = nltk.FreqDist(negative_words)

common_set = set(positive_fd).intersection(negative_fd)

for word in common_set:
    del positive_fd[word]
    del negative_fd[word]

top_100_positive = {word for word, count in positive_fd.most_common(500)}
top_100_negative = {word for word, count in negative_fd.most_common(500)}

In [17]:
positive_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["pos"]))
)]

negative_words = [word for word, tag in filter(
    skip_unwanted,
    nltk.pos_tag(nltk.corpus.movie_reviews.words(categories=["neg"]))
)]

print(type(positive_words))

<class 'list'>


In [16]:

positive_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["pos"])
    if w.isalpha() and w not in unwanted])

negative_bigram_finder = nltk.collocations.BigramCollocationFinder.from_words([
    w for w in nltk.corpus.movie_reviews.words(categories=["neg"])
    if w.isalpha() and w not in unwanted])

print(type(positive_bigram_finder))

<nltk.collocations.BigramCollocationFinder object at 0x0000013275AA8FA0>


In [8]:
def extract_features(text):
    features = dict()
    wordcount = 0
    compound_scores = list()
    positive_scores = list()

    sia = SentimentIntensityAnalyzer()

    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            if word.lower() in top_100_positive:
                wordcount += 1
        compound_scores.append(sia.polarity_scores(sentence)["compound"])
        positive_scores.append(sia.polarity_scores(sentence)["pos"])

    # Adding 1 to the final compound score to always have positive numbers
    features["mean_compound"] = np.mean(compound_scores) + 1
    features["mean_positive"] = np.mean(positive_scores)
    features["wordcount"] = wordcount

    return features

In [23]:
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]

features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
])

print(type(positive_words))

<class 'list'>


In [10]:
from random import shuffle

train_count = 4*len(features) // 5
shuffle(features)
classifier = nltk.NaiveBayesClassifier.train(features[:train_count])
classifier.show_most_informative_features(10)
nltk.classify.accuracy(classifier, features[train_count:])

Most Informative Features
               wordcount = 8                 pos : neg    =     20.0 : 1.0
               wordcount = 7                 pos : neg    =     13.2 : 1.0
               wordcount = 5                 pos : neg    =      5.9 : 1.0
               wordcount = 0                 neg : pos    =      4.2 : 1.0
               wordcount = 4                 pos : neg    =      2.9 : 1.0
               wordcount = 1                 neg : pos    =      1.9 : 1.0
               wordcount = 3                 pos : neg    =      1.5 : 1.0
               wordcount = 2                 pos : neg    =      1.2 : 1.0
           mean_positive = 0.06466666666666666    neg : pos    =      1.0 : 1.0
           mean_positive = 0.09848780487804877    neg : pos    =      1.0 : 1.0


0.7375

In [11]:
from sklearn.naive_bayes import (
    BernoulliNB,
    ComplementNB,
    MultinomialNB,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

In [12]:
classifiers = {
    "BernoulliNB": BernoulliNB(),
    "ComplementNB": ComplementNB(),
    "MultinomialNB": MultinomialNB(),
    "KNeighborsClassifier": KNeighborsClassifier(),
    "DecisionTreeClassifier": DecisionTreeClassifier(),
    "RandomForestClassifier": RandomForestClassifier(),
    "LogisticRegression": LogisticRegression(),
    "MLPClassifier": MLPClassifier(max_iter=1000),
    "AdaBoostClassifier": AdaBoostClassifier(),
}

In [13]:
features = [
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "pos")
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]
features.extend([
    (extract_features(nltk.corpus.movie_reviews.raw(review)), "neg")
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
])

In [14]:
train_count = 4*len(features) // 5
shuffle(features)
for name, sklearn_classifier in classifiers.items():
    classifier = nltk.classify.SklearnClassifier(sklearn_classifier)
    classifier.train(features[:train_count])
    accuracy = nltk.classify.accuracy(classifier, features[train_count:])
    print(F"{accuracy:.2%} - {name}")

61.75% - BernoulliNB
69.50% - ComplementNB
69.50% - MultinomialNB
73.25% - KNeighborsClassifier
68.25% - DecisionTreeClassifier
73.25% - RandomForestClassifier
78.50% - LogisticRegression
77.00% - MLPClassifier
75.25% - AdaBoostClassifier


In [22]:
classifier = nltk.classify.SklearnClassifier(LogisticRegression())
classifier.train(features[:train_count])
accuracy = nltk.classify.accuracy(classifier, features[train_count:])

classifier.classify(extract_features(sample_review))

'neg'