In [1]:
import pandas as pd
from os import getcwd

# Path of the reviews CSV, built from the directory the kernel is currently
# running in. NOTE: despite the `_DIR` suffix this is a file path, not a folder;
# the name is kept so any other cell that refers to it keeps working.
REVIEWS_DIR = getcwd() + '/data/book_reviews_nl.csv'

# Read the whole CSV into a DataFrame.
reviews_df = pd.read_csv(filepath_or_buffer=REVIEWS_DIR)

In [2]:
from sklearn.model_selection import train_test_split

# Model input is the 'text' column; the prediction target is the 'sentiment' column.
X, y = reviews_df['text'], reviews_df['sentiment']

# Hold out 40% of the rows for evaluation. The fixed random_state makes the
# split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

In [3]:
from Stemmer import Stemmer
from sklearn.feature_extraction.text import TfidfVectorizer

class StemmedTfidfVectorizer(TfidfVectorizer):
    """TfidfVectorizer variant that stems every token produced by the analyzer.

    The analyzer built by the parent class is wrapped so that its output is
    passed through ``Stemmer('nl').stemWords`` before being counted.
    """

    def build_analyzer(self):
        """Return a callable mapping one document to its list of stemmed tokens."""
        # Zero-argument super() resolves to the direct parent class. The
        # previous super(TfidfVectorizer, self) started the lookup *after*
        # TfidfVectorizer in the MRO, which would silently bypass any
        # build_analyzer defined on TfidfVectorizer itself.
        analyzer = super().build_analyzer()
        # Create the stemmer once per analyzer instead of once per document.
        stem_words = Stemmer('nl').stemWords
        return lambda doc: stem_words(analyzer(doc))

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from multiprocessing import cpu_count
from stop_words import get_stop_words
from numpy import logspace
from time import time

class Model:
    """Grid-searched text classifier: stemmed TF-IDF features + logistic regression.

    The estimator is a ``GridSearchCV`` wrapped around a two-step pipeline
    (``StemmedTfidfVectorizer`` -> ``LogisticRegression``), stored on
    ``self.grid_pipeline``. ``train`` fits it, ``test`` prints evaluation
    reports, and ``predict`` returns predictions for new inputs.
    """

    def __init__(self):
        """Build the (not yet fitted) grid search over the pipeline."""
        self.grid_pipeline = GridSearchCV(
            # Make the pipeline
            make_pipeline(
                StemmedTfidfVectorizer(
                    use_idf=True,
                    min_df=1,
                    stop_words=get_stop_words('dutch'),
                    analyzer='word',
                    ngram_range=(1, 1)
                ),
                LogisticRegression(
                    random_state=42
                )
            ),
            # Specify the hyperparametrization
            {
                'logisticregression__penalty': ['l1', 'l2'],
                'logisticregression__C': logspace(-4, 4, 20),
                'logisticregression__solver': ['liblinear']
            },
            scoring='accuracy',
            cv=5,
            # Leave one CPU free, but never request fewer than one worker:
            # on a single-CPU machine `cpu_count() - 1` is 0, which is not a
            # valid n_jobs value and makes fit() fail.
            n_jobs=max(1, cpu_count() - 1),
        )

    def train(self, X=None, y=None):
        """Fit the grid search and print the elapsed time and best parameters.

        Args:
            X: Training inputs. When omitted, the notebook-level ``X_train``
                is used.
            y: Training targets. When omitted, the notebook-level ``y_train``
                is used.
        """
        # Fall back to the notebook globals so existing `train()` calls keep working.
        if X is None:
            X = X_train
        if y is None:
            y = y_train

        t1 = time()
        # Mention that the training process has started
        print('\n', 'Started training')
        self.grid_pipeline.fit(X, y)
        # Print the time that has elapsed
        print('\n', 'Training time elapsed: %s' % str(time() - t1))
        # Print the best parameters
        print('\n', 'Best parameters:', self.grid_pipeline.best_params_)

    def test(self, X=None, y=None):
        """Print a classification report and a confusion matrix.

        Args:
            X: Evaluation inputs. When omitted, the notebook-level ``X_test``
                is used.
            y: True targets for ``X``. When omitted, the notebook-level
                ``y_test`` is used.
        """
        # Fall back to the notebook globals so existing `test()` calls keep working.
        if X is None:
            X = X_test
        if y is None:
            y = y_test

        predictions = self.grid_pipeline.predict(X)
        # Test the pipeline
        print(
            '\n',
            'Classification report:',
            '\n',
            classification_report(y, predictions))
        print(
            '\n',
            'Confusion matrix:',
            '\n',
            confusion_matrix(y, predictions)
        )

    def predict(self, _X):
        """Return the grid search's predictions for the inputs ``_X``."""
        return self.grid_pipeline.predict(_X)