In [1]:
from datasets import load_dataset

# Load the "imdb" dataset through Hugging Face `datasets`; its "train" and
# "test" splits are read in the next cell.
dataset = load_dataset("imdb")

Reusing dataset imdb (/home/atomesz/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# Texts ("text" column) and labels ("label" column) for each split, paired up
# so every x_* sits next to its matching y_*.
x_train, y_train = dataset["train"]["text"], dataset["train"]["label"]
x_test, y_test = dataset["test"]["text"], dataset["test"]["label"]

In [3]:
from multiprocessing import Pool
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

import functools


def get_gnb_score(preprocess_func, binary: bool = False, remove_stopwords: bool = False) -> None:
    """Fit a Gaussian naive Bayes model on bag-of-words features and print its test report.

    The documents come from the notebook-level ``x_train`` / ``x_test``
    variables and the labels from ``y_train`` / ``y_test``; they are not
    passed in as arguments.

    Args:
        preprocess_func: Callable invoked once per document as
            ``preprocess_func(text, remove_stopwords=remove_stopwords)``.
            It is dispatched through ``multiprocessing.Pool.map``, so it has
            to be picklable (e.g. a function defined at module level).
            Presumably returns the cleaned text as a string, since the result
            is fed straight to ``CountVectorizer`` -- confirm against the
            ``preprocessing`` module.
        binary: Forwarded to ``CountVectorizer(binary=...)``.
        remove_stopwords: Forwarded unchanged to ``preprocess_func``.

    Returns:
        None. The ``classification_report`` for the test split is printed.
    """
    # Build the per-document callable once instead of once per split.
    preprocess = functools.partial(preprocess_func, remove_stopwords=remove_stopwords)

    # A single worker pool handles both splits; starting a second pool later
    # would only repeat the process start-up cost for the same work.
    with Pool() as pool:
        preprocessed_x_train = pool.map(preprocess, x_train)
        preprocessed_x_test = pool.map(preprocess, x_test)

    # The vocabulary (capped at 1000 terms) is learned from the training
    # split only and then reused as-is for the test split.
    vectorizer = CountVectorizer(max_features=1000, binary=binary)
    # .toarray() densifies the sparse matrix before it is handed to GaussianNB.
    X = vectorizer.fit_transform(preprocessed_x_train).toarray()

    gnb = GaussianNB()
    gnb.fit(X, y_train)

    X_test = vectorizer.transform(preprocessed_x_test).toarray()
    y_pred = gnb.predict(X_test)

    print(classification_report(y_test, y_pred))

In [4]:
from preprocessing import *

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/atomesz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Without preprocessing

In [5]:
# Baseline: `basic` preprocessing with the function defaults
# (binary=False, remove_stopwords=False).
get_gnb_score(basic)

              precision    recall  f1-score   support

           0       0.78      0.84      0.81     12500
           1       0.83      0.77      0.80     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000



# Stopwords removal

In [6]:
# `basic` preprocessing with remove_stopwords=True; features stay non-binary.
get_gnb_score(basic, remove_stopwords=True)

              precision    recall  f1-score   support

           0       0.76      0.84      0.80     12500
           1       0.82      0.74      0.78     12500

    accuracy                           0.79     25000
   macro avg       0.79      0.79      0.79     25000
weighted avg       0.79      0.79      0.79     25000



# Stemming

In [7]:
# `stemming` preprocessing with remove_stopwords=False; features stay non-binary.
get_gnb_score(stemming, remove_stopwords=False)

              precision    recall  f1-score   support

           0       0.72      0.85      0.78     12500
           1       0.82      0.67      0.74     12500

    accuracy                           0.76     25000
   macro avg       0.77      0.76      0.76     25000
weighted avg       0.77      0.76      0.76     25000



# Stopwords removal + Stemming

In [8]:
# `stemming` preprocessing with remove_stopwords=True; features stay non-binary.
get_gnb_score(stemming, remove_stopwords=True)

              precision    recall  f1-score   support

           0       0.73      0.84      0.78     12500
           1       0.81      0.68      0.74     12500

    accuracy                           0.76     25000
   macro avg       0.77      0.76      0.76     25000
weighted avg       0.77      0.76      0.76     25000



# Lemmatization

In [9]:
# `lemming` preprocessing with remove_stopwords=False; features stay non-binary.
get_gnb_score(lemming, remove_stopwords=False)

              precision    recall  f1-score   support

           0       0.74      0.84      0.79     12500
           1       0.82      0.70      0.76     12500

    accuracy                           0.77     25000
   macro avg       0.78      0.77      0.77     25000
weighted avg       0.78      0.77      0.77     25000



# Stopwords removal + Lemmatization

In [11]:
# `lemming` preprocessing with remove_stopwords=True; features stay non-binary.
get_gnb_score(lemming, remove_stopwords=True)

              precision    recall  f1-score   support

           0       0.72      0.85      0.78     12500
           1       0.81      0.68      0.74     12500

    accuracy                           0.76     25000
   macro avg       0.77      0.76      0.76     25000
weighted avg       0.77      0.76      0.76     25000



# Same experiments with binary features (`binary=True`)

In [12]:
# Binary features (binary=True is forwarded to CountVectorizer) with `basic`
# preprocessing and the default remove_stopwords=False.
get_gnb_score(basic, binary=True)

              precision    recall  f1-score   support

           0       0.82      0.82      0.82     12500
           1       0.82      0.82      0.82     12500

    accuracy                           0.82     25000
   macro avg       0.82      0.82      0.82     25000
weighted avg       0.82      0.82      0.82     25000



In [13]:
# Binary features, `basic` preprocessing, remove_stopwords=True.
get_gnb_score(basic, binary=True, remove_stopwords=True)

              precision    recall  f1-score   support

           0       0.81      0.82      0.82     12500
           1       0.82      0.81      0.81     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000



In [14]:
# Binary features, `stemming` preprocessing, remove_stopwords=False.
get_gnb_score(stemming, binary=True, remove_stopwords=False)

              precision    recall  f1-score   support

           0       0.80      0.82      0.81     12500
           1       0.82      0.79      0.80     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000



In [15]:
# Binary features, `stemming` preprocessing, remove_stopwords=True.
get_gnb_score(stemming, binary=True, remove_stopwords=True)

              precision    recall  f1-score   support

           0       0.80      0.82      0.81     12500
           1       0.82      0.80      0.81     12500

    accuracy                           0.81     25000
   macro avg       0.81      0.81      0.81     25000
weighted avg       0.81      0.81      0.81     25000



In [16]:
# Binary features, `lemming` preprocessing, remove_stopwords=False.
get_gnb_score(lemming, binary=True, remove_stopwords=False)

              precision    recall  f1-score   support

           0       0.80      0.82      0.81     12500
           1       0.81      0.79      0.80     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000



In [17]:
# Binary features, `lemming` preprocessing, remove_stopwords=True.
get_gnb_score(lemming, binary=True, remove_stopwords=True)

              precision    recall  f1-score   support

           0       0.79      0.82      0.81     12500
           1       0.81      0.79      0.80     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000

