In [1]:
from datasets import load_dataset

# Load the "imdb" dataset through Hugging Face `datasets`; its "train" and
# "test" splits are read in the next cell.
dataset = load_dataset("imdb")

Reusing dataset imdb (/home/atomesz/.cache/huggingface/datasets/imdb/plain_text/1.0.0/e3c66f1788a67a89c7058d97ff62b6c30531e05b549de56d3ab91891f0561f9a)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
# Slicing a split with [:] materializes every column of that split in memory,
# so do it once per split and read both columns from the same result instead
# of materializing the split a second time for the labels.
train_columns = dataset["train"][:]
x_train = train_columns["text"]
y_train = train_columns["label"]

test_columns = dataset["test"][:]
x_test = test_columns["text"]
y_test = test_columns["label"]

In [3]:
import spacy

# Load spaCy's small English pipeline once; the spaCy-based preprocess_str
# variants further down reuse it for tokenization, stop-word flags and lemmas.
nlp = spacy.load("en_core_web_sm")

In [4]:
import re
from multiprocessing import Pool
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

# Matches a token made up of one or more word characters (\w) from start to
# end; the preprocess_str variants use it to discard punctuation and other
# non-word tokens.
re_word = re.compile(r"^\w+$")

def get_gnb_score(preprocess=None, max_features=1000):
    """Train and evaluate a Gaussian Naive Bayes classifier on bag-of-words counts.

    The notebook-level ``x_train``/``y_train`` and ``x_test``/``y_test`` are
    used as data. Every text is passed through ``preprocess``, vectorized with
    a ``CountVectorizer`` fitted on the training texts only, and the
    classification report for the test split is printed.

    Args:
        preprocess: Callable mapping one raw text to one preprocessed string.
            It is sent to worker processes, so it must be picklable by
            ``multiprocessing`` (e.g. a top-level function). Defaults to the
            notebook-level ``preprocess_str`` as it is defined at call time,
            which keeps the "redefine ``preprocess_str`` then call
            ``get_gnb_score()``" workflow working unchanged.
        max_features: Vocabulary size cap passed to ``CountVectorizer``.

    Returns:
        None. The report is printed.
    """
    if preprocess is None:
        # Resolved at call time so that each cell's redefinition is picked up.
        preprocess = preprocess_str

    # A single worker pool handles both splits, so the pool is not started a
    # second time after the dense training matrix has been allocated.
    with Pool() as pool:
        preprocessed_x_train = pool.map(preprocess, x_train)
        preprocessed_x_test = pool.map(preprocess, x_test)

    vectorizer = CountVectorizer(max_features=max_features)
    # The vocabulary is learned from the training texts only; the count
    # matrices are densified before being handed to GaussianNB.
    X_train = vectorizer.fit_transform(preprocessed_x_train).toarray()

    gnb = GaussianNB()
    gnb.fit(X_train, y_train)

    X_test = vectorizer.transform(preprocessed_x_test).toarray()
    y_pred = gnb.predict(X_test)

    print(classification_report(y_test, y_pred))

# Without preprocessing

In [5]:
def preprocess_str(text: str):
    """Baseline preprocessing: hand the text back exactly as received."""
    return text

# Baseline run: score the classifier with the pass-through preprocess_str defined just above.
get_gnb_score()

              precision    recall  f1-score   support

           0       0.78      0.84      0.81     12500
           1       0.83      0.77      0.80     12500

    accuracy                           0.80     25000
   macro avg       0.80      0.80      0.80     25000
weighted avg       0.80      0.80      0.80     25000



# Stopwords removal

In [6]:
def preprocess_str(text: str):
    """Lower-case ``text``, run it through the spaCy pipeline, and return the
    word-only tokens that are not flagged as stop words, joined by spaces."""
    kept = []
    for token in nlp(text.lower()):
        # Skip stop words and anything that is not purely word characters.
        if token.is_stop or not re_word.match(token.text):
            continue
        kept.append(str(token))
    return " ".join(kept)

# Score the classifier with the spaCy stop-word-removal preprocess_str defined just above.
get_gnb_score()

              precision    recall  f1-score   support

           0       0.76      0.84      0.80     12500
           1       0.82      0.74      0.78     12500

    accuracy                           0.79     25000
   macro avg       0.79      0.79      0.79     25000
weighted avg       0.79      0.79      0.79     25000



# Stemming

In [7]:
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
# Stop-word list used by the "stopwords removal + stemming" variant.
nltk.download('stopwords')
# word_tokenize relies on the Punkt tokenizer data; without it a fresh
# environment fails with LookupError. Older NLTK releases look for "punkt",
# newer ones for "punkt_tab", so fetch both (already-present packages are
# reported as up-to-date and skipped).
nltk.download('punkt')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer("english")

def preprocess_str(text: str):
    """Lower-case and tokenize ``text`` with NLTK, keep the word-only tokens,
    and return their Snowball stems joined by spaces."""
    words = (tok for tok in word_tokenize(text.lower()) if re_word.match(tok))
    return " ".join(map(stemmer.stem, words))

# Score the classifier with the Snowball-stemming preprocess_str defined just above.
get_gnb_score()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/atomesz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

           0       0.72      0.85      0.78     12500
           1       0.82      0.67      0.74     12500

    accuracy                           0.76     25000
   macro avg       0.77      0.76      0.76     25000
weighted avg       0.77      0.76      0.76     25000



# Stopwords removal + Stemming

In [8]:
def preprocess_str(text: str):
    """Lower-case and tokenize ``text`` with NLTK, drop stop words and
    non-word tokens, and return the Snowball stems of the rest joined by spaces."""
    stems = []
    for word in word_tokenize(text.lower()):
        # The stop-word check is done on the raw token, before stemming.
        if word in stop_words or not re_word.match(word):
            continue
        stems.append(stemmer.stem(word))
    return " ".join(stems)

# Score the classifier with the stop-word removal + stemming preprocess_str defined just above.
get_gnb_score()

              precision    recall  f1-score   support

           0       0.73      0.84      0.78     12500
           1       0.81      0.68      0.74     12500

    accuracy                           0.76     25000
   macro avg       0.77      0.76      0.76     25000
weighted avg       0.77      0.76      0.76     25000



# Lemmatization

In [9]:
def preprocess_str(text: str):
    """Lower-case ``text``, run it through the spaCy pipeline, and return the
    lemmas of its word-only tokens joined by spaces."""
    doc = nlp(text.lower())
    return " ".join(tok.lemma_ for tok in doc if re_word.match(tok.text))

# Score the classifier with the spaCy lemmatization preprocess_str defined just above.
get_gnb_score()

              precision    recall  f1-score   support

           0       0.74      0.84      0.79     12500
           1       0.82      0.70      0.76     12500

    accuracy                           0.77     25000
   macro avg       0.78      0.77      0.77     25000
weighted avg       0.78      0.77      0.77     25000



# Stopwords removal + Lemmatization

In [10]:
def preprocess_str(text: str):
    """Lower-case ``text``, run it through the spaCy pipeline, and return the
    lemmas of the word-only, non-stop-word tokens joined by spaces."""
    def keep(tok):
        # Same filter as the original: word-only text and not a stop word.
        return bool(re_word.match(tok.text)) and not tok.is_stop

    return " ".join(tok.lemma_ for tok in filter(keep, nlp(text.lower())))

# Score the classifier with the stop-word removal + lemmatization preprocess_str defined just above.
get_gnb_score()

              precision    recall  f1-score   support

           0       0.72      0.85      0.78     12500
           1       0.81      0.68      0.74     12500

    accuracy                           0.76     25000
   macro avg       0.77      0.76      0.76     25000
weighted avg       0.77      0.76      0.76     25000

