# Preprocessing and Estimator Fine Tuning
In this notebook we will cover two stages that are essential for proper training. \
These are **data preprocessing** and **vectorizer**/**classifier fine-tuning**. 

# 1. Data Retrieval & Preprocessing


### 1.1. Load Datasets
- Load Rotten IMDB Dataset to train subjectivity detector.
- Load Movie Reviews Dataset to extract its vocabulary.
  
*note: as it's carried out in the original paper*

#### Functions

In [8]:
# Imports
import os
import nltk
from nltk.sentiment.util import mark_negation


def load_rotten_imdb(path):      #---> utils.preprocessing
    """Load the Rotten-IMDB subjectivity dataset stored under ``path``.

    Two files are read from the directory, one sentence per line:
    ``quote.tok.gt9.5000`` (subjective sentences) and
    ``plot.tok.gt9.5000`` (objective sentences).

    :param path: directory containing the two dataset files.
    :return: tuple ``(subj, obj)``; each item is a list of sentences with
        leading/trailing whitespace stripped.
    :raises OSError: if either file cannot be opened.
    """
    subjective_sentences = "quote.tok.gt9.5000"
    objective_sentences = "plot.tok.gt9.5000"

    def read_sentences(filename):
        # Build the list directly (no comprehension used only for its side
        # effects) and stream the file instead of calling readlines().
        # NOTE(review): the files are opened with the platform default encoding.
        with open(os.path.join(path, filename), 'r') as f:
            return [sent.strip() for sent in f]

    subj = read_sentences(subjective_sentences)
    obj = read_sentences(objective_sentences)
    return subj, obj


def lol2str(doc):       #---> utils.preprocessing
    """Flatten a document given as a list of sentences (each a list of
    words) into one string, with words separated by single spaces."""
    words = []
    for sentence in doc:
        words.extend(sentence)
    return " ".join(words)


def mr2str(dataset):    #---> utils.preprocessing
    """Convert every document of the Movie Reviews Dataset (or a slice of it)
    into a block of text, returning the list of resulting strings."""
    return list(map(lol2str, dataset))


def get_movie_reviews_dataset(mark_negs:bool = True) -> tuple:    #---> utils.preprocessing
    """Uses the nltk library to download the "Movie Reviews" dataset,
    splitting it into positive reviews and negative reviews.

    :param mark_negs: when True, every sentence of every review is passed
        through ``nltk.sentiment.util.mark_negation``.
    :return: tuple ``(pos, neg)`` with the documents of the "pos" and "neg"
        categories, in that order.
    """
    nltk.download("movie_reviews")
    from nltk.corpus import movie_reviews

    def load(category):
        # One code path for both categories keeps the negation handling identical.
        docs = movie_reviews.paras(categories=category)
        if mark_negs:
            docs = [[mark_negation(sent) for sent in doc] for doc in docs]
        return docs

    neg = load("neg")
    pos = load("pos")
    return pos, neg

In [9]:
# Directory that must contain the two Rotten-IMDB files read by load_rotten_imdb.
PATH_TO_IMDB = 'data/rotten_imdb/'

subj, obj = load_rotten_imdb(path=PATH_TO_IMDB)
# Full corpus: subjective sentences first, objective sentences after.
imdb_ds = subj + obj

In [10]:
# NOTE(review): PATH_TO_MOVIE_REVIEWS is not used in this cell — the corpus is
# obtained through nltk inside get_movie_reviews_dataset.
PATH_TO_MOVIE_REVIEWS = 'data/movie_reviews/'

pos, neg = get_movie_reviews_dataset(mark_negs=True)
# Turn every review from list-of-lists into a single string.
pos = mr2str(pos)
neg = mr2str(neg)
# Full corpus: positive reviews first, negative reviews after.
movie_reviews_ds = pos + neg

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/matteoambrosini/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


# 2. Word Vectorization

## 2.1. Define DiffPosNeg as in the original paper

#### Functions

In [37]:
# Imports
import sklearn
import numpy as np
from nltk.tokenize import word_tokenize


def hconcat(X1: np.ndarray, X2: np.ndarray) -> np.ndarray:      #---> utils.preprocessing
    """Place X2 to the right of X1 and return the resulting matrix.

    Both inputs must be 2-dimensional and have the same number of rows;
    the result is a newly allocated array of numpy's default float dtype.
    """
    both_are_matrices = len(X1.shape) == 2 and len(X2.shape) == 2
    assert both_are_matrices, "function 'hconcat' only works with matrices (np.array with 2 dimensions)."
    rows, left_cols = X1.shape
    assert rows == X2.shape[0], "In order to hconcat matrices, they must have the same number of rows."
    # Allocate the output once, then fill the left and right column ranges.
    out = np.empty((rows, left_cols + X2.shape[1]))
    out[:, :left_cols] = X1
    out[:, left_cols:] = X2
    return out


################################
########## DIFFPOSNEG ##########
################################
# Imports:
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet
import multiprocessing as mp
import time


# Maps the universal POS tags requested from nltk.pos_tag in valence_count to the
# single-letter codes that lesk compares against each synset's pos().
pos2wn = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}


def lesk(context_sentence, ambiguous_word, pos=None, synsets=None):
    """Choose the synset whose definition overlaps most with the context.

    :param iter context_sentence: words of the sentence in which the
         ambiguous word occurs.
    :param str ambiguous_word: the word to disambiguate; only used to look up
         candidate synsets in WordNet when ``synsets`` is not given.
    :param str pos: optional Part-of-Speech filter; ``'a'`` keeps synsets whose
         pos is either ``'a'`` or ``'s'``.
    :param iter synsets: optional pre-computed candidate synsets.
    :return: the candidate with the largest number of definition words also
         present in the context, or ``None`` when no candidate is left.
    """
    context_words = set(context_sentence)
    candidates = wordnet.synsets(ambiguous_word) if synsets is None else synsets

    if pos:
        # Requesting adjectives also accepts the 's' tag.
        accepted = ('a', 's') if pos == 'a' else (pos,)
        candidates = [ss for ss in candidates if str(ss.pos()) in accepted]

    if not candidates:
        return None

    # Rank by (overlap size, synset); the synset itself breaks ties.
    scored = [
        (len(context_words.intersection(ss.definition().split())), ss)
        for ss in candidates
    ]
    return max(scored)[1]


def valence_count(sent, tokenizer, memory, update_mem):
    """Count the positive and negative tokens of the string :param: sent.

    ``memory`` caches the label already assigned to a token; it is consulted
    first and, when ``update_mem`` is true, extended with newly computed labels.
    A token is labelled "pos" when a synset is found for it and its
    SentiWordNet positive score is at least its negative score; every other
    token (including those with no synset) is labelled "neg".

    :return: dict with the keys "pos" and "neg" holding the token counts.
    """
    words = tokenizer(sent)
    counts = {"pos": 0, "neg": 0}
    for word, universal_tag in nltk.pos_tag(words, tagset="universal"):
        label = memory.get(word, None)
        if label is None:
            # Unknown token: disambiguate it in the sentence, then score it.
            wn_pos = pos2wn.get(universal_tag, None)
            synset = lesk(words, word, pos=wn_pos)
            label = "neg"
            if synset:
                scores = swn.senti_synset(synset.name())
                if scores.pos_score() >= scores.neg_score():
                    label = "pos"
            if update_mem:
                memory[word] = label
        counts[label] += 1
    return counts


def swn_sentence_classification(sent, tokenizer, memory, update_mem):
    """Classify a sentence from its token valence counts: returns 0 when
    negative tokens outnumber positive ones, 1 otherwise (ties included)."""
    counts = valence_count(sent, tokenizer, memory, update_mem)
    if counts["neg"] > counts["pos"]:
        return 0
    return 1


class DiffPosNegVectorizer(BaseEstimator, TransformerMixin):
    """Class for implementing the DiffPosNeg feature as described in https://aclanthology.org/I13-1114/
    through scikit-learn APIs.

    Every document becomes a row ``[distance, prevalence]``:
    - ``distance``: gap between the number of positive and negative sentences,
      min-max scaled with the statistics seen while fitting, multiplied by 100
      and cast to int16;
    - ``prevalence``: 1 when positive sentences are at least as many as
      negative ones, 0 otherwise.

    Fitted attributes: ``memory_`` (token -> "pos"/"neg" cache built while
    fitting), ``min_`` and ``max_`` (range of the unscaled distances).
    """

    def __init__(self, tokenizer=word_tokenize, lb=0, ub=1):
        """
        - :param tokenizer: Callable parameter, used to extract tokens from documents
        when vectorizing;
        - :param lb: lower bound for clipping absolute values of numerical distances once scaled;
        - :param ub: same as :param lb:, but upper bound.
        """
        # One cooperative call walks the whole MRO; super(BaseEstimator, self)
        # would instead start *after* BaseEstimator.
        super().__init__()
        self.tokenizer = tokenizer
        self.lb = lb
        self.ub = ub

    def diff_pos_neg_feature(self, doc, memory, update_mem=False, as_ratio=True) -> list:
        """Returns the DiffPosNeg feature of :param: doc.
        The feature is defined as the numerical distance between sentences
        with a positive orientation and sentences with a negative orientation.

        :param doc: document as a string; it is split with ``sent_tokenize``.
        :param memory: token -> label cache forwarded to the sentence classifier.
        :param update_mem: whether newly labelled tokens are written to ``memory``.
        :param as_ratio: when True the distance is divided by the number of sentences.
        :return: ``[distance, prevalence]`` with prevalence 1 if positive sentences
            are at least as many as negative ones, else 0. A document without
            sentences yields ``[0.0, 1]`` (``[0, 1]`` when ``as_ratio`` is False).
        """
        pos_count, neg_count = 0, 0
        for sent in sent_tokenize(doc):
            sent_cls = swn_sentence_classification(
                sent, self.tokenizer, memory, update_mem)
            if sent_cls == 0:
                neg_count += 1
            else:
                pos_count += 1
        distance = abs(pos_count - neg_count)
        if as_ratio:
            total = pos_count + neg_count
            # Guard against documents with no sentences (would divide by zero).
            distance = distance / total if total else 0.0
        return [distance, 1 if pos_count >= neg_count else 0]

    def _extract_features(self, X, memory, update_mem):
        """Computes diff_pos_neg_feature for every document of X in a process pool,
        returning an array with one ``[distance, prevalence]`` row per document."""
        with mp.Pool(processes=mp.cpu_count()) as pool:
            feats = np.array(pool.starmap(
                self.diff_pos_neg_feature, [(doc, memory, update_mem) for doc in X]))
        # No-op for non-empty input; keeps the 2-column layout when X is empty.
        return feats.reshape(-1, 2)

    def _fit_features(self, X):
        """Extracts the features of X while building ``memory_``, then stores the
        min/max of the unscaled distances. Returns the raw feature array."""
        self.memory_ = {}
        # A manager dict lets the worker processes share one token cache.
        with mp.Manager() as manager:
            mem = manager.dict()
            diff_pos_neg_feats = self._extract_features(X, mem, True)
            # Copy to a plain dict before the manager shuts down.
            self.memory_ = dict(mem.items())
        distances = diff_pos_neg_feats[:, 0]
        self.min_ = np.amin(distances)
        self.max_ = np.amax(distances)
        return diff_pos_neg_feats

    def _rescale(self, distances):
        """Min-max scales the distances with the fitted statistics (eps avoids 0/0)."""
        return (distances - self.min_) / \
            (self.max_ - self.min_ + np.finfo(float).eps)

    @staticmethod
    def _assemble(distances, prevalences):
        """Converts scaled distances to integer percentages and joins the two
        feature components column-wise."""
        distances = np.int16(distances*100)
        distances = np.expand_dims(distances, axis=-1)
        prevalences = np.expand_dims(np.array(prevalences), axis=-1)
        return hconcat(distances, prevalences)

    def fit(self, X, y=None, **fit_params):
        """Learns the token cache and the distance range from the documents in X.
        ``y`` and ``fit_params`` are accepted for API compatibility and ignored."""
        self._fit_features(X)
        return self

    def transform(self, X):
        """Vectorizes the documents in X with the fitted cache and statistics.
        Scaled distances are clipped to ``[lb, ub]`` before being expressed in
        the range [0,100]."""
        in_time = time.time()
        # The cache is read-only here (update_mem=False), so a plain copy is
        # enough: no manager process is needed.
        mem = dict(self.memory_)
        diff_pos_neg_feats = self._extract_features(X, mem, False)
        distances = diff_pos_neg_feats[:, 0]
        prevalences = diff_pos_neg_feats[:, -1]

        # values outside the fitted min/max are handled by clipping
        distances = np.clip(self._rescale(distances),
                            a_min=self.lb, a_max=self.ub)

        print(f"Transformed {len(X)} documents in {time.time()-in_time:.2f}s")
        return self._assemble(distances, prevalences)

    def fit_transform(self, X, y=None, **fit_params):
        """Fits on X and returns its vectorization, extracting the features once.
        ``y`` and ``fit_params`` are accepted for API compatibility and ignored."""
        in_time = time.time()
        diff_pos_neg_feats = self._fit_features(X)
        distances = diff_pos_neg_feats[:, 0]
        prevalences = diff_pos_neg_feats[:, -1]
        print("Number of positive documents: {}".format(
            np.count_nonzero(prevalences)))

        # scaling the values of the distances in the range [0, 1]
        distances = self._rescale(distances)

        print(
            f"Fitted Model and transformed {len(X)} documents in {time.time()-in_time:.2f}s")
        return self._assemble(distances, prevalences)


def switch_vectorizer(vectorizer_name="count"):     #---> utils.miscellaneous
    """Return a new vectorizer for the requested representation.

    :param vectorizer_name: one of "count", "tfidf", "diffposneg", "bert".
    :return: a ``CountVectorizer`` / ``TfidfVectorizer`` (both tokenizing with
        ``word_tokenize``), a ``DiffPosNegVectorizer``, or ``None`` for "bert",
        for which no vectorizer is built here.
    :raises AssertionError: if the name is not one of the accepted values.
    """
    assert vectorizer_name in ("count", "tfidf", "diffposneg", "bert")
    if vectorizer_name == "diffposneg":
        return DiffPosNegVectorizer()
    if vectorizer_name == "bert":
        # Accepted by the assertion but no vectorizer exists for it in this
        # function; make the previously implicit None explicit.
        return None
    # Import the submodule explicitly: a bare `import sklearn` is not
    # guaranteed to expose `sklearn.feature_extraction.text` as an attribute.
    from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
    if vectorizer_name == "count":
        return CountVectorizer(tokenizer=word_tokenize)
    return TfidfVectorizer(tokenizer=word_tokenize)

def classification_report_csv(save_path, report):
    '''Saves the per-class rows of a textual classification report as csv.

    The report is expected in the layout printed by
    ``sklearn.metrics.classification_report``: a header line, a blank line,
    one row per class, a blank line and then the summary rows. Only the
    per-class rows are exported; summary rows are skipped.

    :param save_path: directory in which ``classification_report.csv`` is
        written; it is created when missing.
    :param report: the report as a single string.
    '''
    # Local import: in this notebook pandas is only imported in a later cell.
    import pandas as pd

    columns = ['class', 'precision', 'recall', 'f1_score', 'support']
    report_data = []
    for line in report.split('\n')[2:]:
        if not line.strip():
            # The first blank line after the class rows starts the summary section.
            break
        # Split on any run of whitespace: column widths depend on the label length,
        # so a fixed-width separator does not line up with the columns.
        fields = line.split()
        if len(fields) < 5:
            continue
        try:
            precision, recall, f1_score, support = (float(v) for v in fields[-4:])
        except ValueError:
            continue
        report_data.append({
            # Everything before the four numeric columns is the (possibly multi-word) label.
            'class': ' '.join(fields[:-4]),
            'precision': precision,
            'recall': recall,
            'f1_score': f1_score,
            'support': support,
        })
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    dataframe = pd.DataFrame(report_data, columns=columns)
    dataframe.to_csv(os.path.join(save_path, 'classification_report.csv'), index = False)

## 2.2. Testing Vectorizer and Classifier

#### Params

In [80]:
# TODO: wrap this up into a proper python file
# Text representation handed to switch_vectorizer (this section accepts 'count' or 'tfidf').
#representation = 'count'
representation = 'tfidf'
# Naive Bayes variant to train ('multinomial' or 'bernoulli').
#classifier = 'multinomial'
classifier = 'bernoulli'
# NOTE(review): the stored output of the final "save" cell shows 'count_multinomial'
# file names, so it seems to come from a run with other values — re-run all cells to refresh.

In [81]:
# Fail early on a typo in the parameters chosen above.
assert representation in ('count', 'tfidf')
assert classifier in ('multinomial', 'bernoulli')

#### 2.2.1. Instantiate Vectorizer

In [82]:
# instantiate the (still unfitted) vectorizer matching the chosen representation
vectorizer = switch_vectorizer(representation)

#### 2.2.2. Instantiate Classifier

In [83]:
# Imports
from sklearn.naive_bayes import MultinomialNB, BernoulliNB


# pick the Naive Bayes variant selected in the params cell
clf = MultinomialNB() if classifier == 'multinomial' else BernoulliNB()

#### 2.2.3. Fit Vectorizer
- At this step we first fit the vectorizer on the Movie Reviews dataset to bind it to that dataset's vocabulary. This is done bearing in mind that *MovieReviews* will be the target dataset for the final evaluation!
- Then we vectorize the RottenIMDB Dataset under the vocabulary constraints just extracted from the Movie Reviews dataset.

In [84]:
# fit vectorizer on Movie Reviews
vectorizer.fit(movie_reviews_ds)

# vectorize the RottenIMDB Dataset with the vocab constraints from Movie Reviews
vectors = vectorizer.transform(imdb_ds)
# labels follow the order of imdb_ds (subj + obj): 1 = subjective, 0 = objective
labels = [1]*len(subj) + [0]*len(obj)



#### 2.2.4. N-Fold Cross Validation
- At this step we perform cross validation and grab the best estimator.

In [85]:
# Imports
from sklearn.model_selection import cross_validate, StratifiedKFold

# 10-fold stratified cross validation scored with micro-averaged F1;
# return_estimator=True keeps the model fitted on each fold.
scores = cross_validate(
    estimator=clf, X=vectors, y=labels, 
    cv=StratifiedKFold(n_splits=10), 
    scoring=['f1_micro'],
    return_estimator=True,
    n_jobs=-1
)

# keep the fold model with the highest test score
# NOTE(review): this model was fitted on the training part of its fold only, not on the full dataset.
estimator = scores['estimator'][np.argmax(np.array(scores['test_f1_micro']))]


In [86]:
# Displaying cross validation results: mean of the per-fold test scores
average = sum(scores['test_f1_micro'])/len(scores['test_f1_micro'])
print("Average F1 Score from cross validation: {:.2f}".format(average))

Average F1 Score from cross validation: 0.92


#### 2.2.5. Get Best Estimator
Evaluate the best estimator (selected through cross validation) on the IMDB dataset.

In [87]:
# Imports
from sklearn.metrics import classification_report
import pandas as pd


# NOTE(review): `vectors` is the same data the cross-validation folds were drawn
# from, so this report includes samples the selected estimator was trained on.
y_pred = estimator.predict(vectors)
report = classification_report(labels, y_pred)
# classification_report_csv('tmp/tuning/', report)  # optional: persist the report as csv
print(report)

              precision    recall  f1-score   support

           0       0.97      0.93      0.95      4916
           1       0.93      0.98      0.95      4916

    accuracy                           0.95      9832
   macro avg       0.95      0.95      0.95      9832
weighted avg       0.95      0.95      0.95      9832



#### 2.2.6. Save Best Estimator and Vectorizer
Both the vectorizer and the estimator are saved for future use.

In [13]:
# Imports
from joblib import dump

# save estimator
path_to_subj_detector = f'tmp/models/{representation}_{classifier}_subj_det_model.joblib'

# exist_ok=True creates the folder only when missing, without a separate
# existence check that could race with another process.
os.makedirs(os.path.dirname(path_to_subj_detector), exist_ok=True)
print("Saving model at: ", path_to_subj_detector)
dump(estimator, path_to_subj_detector)


# save vectorizer
path_to_vectorizer = f'tmp/models/{representation}_{classifier}_subj_det_vectorizer.joblib'
os.makedirs(os.path.dirname(path_to_vectorizer), exist_ok=True)
print("Saving vectorizer at: ", path_to_vectorizer)
dump(vectorizer, path_to_vectorizer)

Saving model at:  tmp/models/count_multinomial_subj_det_model.joblib
Saving vectorizer at:  tmp/models/count_multinomial_subj_det_vectorizer.joblib


['tmp/models/count_multinomial_subj_det_vectorizer.joblib']

#### Sets up `nltk` packages for [main](02-main.py)

In [None]:
import nltk
nltk.download('movie_reviews')
nltk.download('sentiwordnet')
# sentiwordnet scores are looked up through WordNet synsets (see `lesk`),
# so the wordnet corpus itself must be available too.
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
# needed by nltk.pos_tag(..., tagset="universal") used in `valence_count`
nltk.download('universal_tagset')