## Best text variant notebook

### Group 8 Members
#### Spring Semester 2024-2025
- Alexandre Gonçalves - 20240738
- Bráulio Damba - 20240007
- Hugo Fonseca - 20240520
- Ricardo Pereira - 20240745
- Victoria Goon - 20240550

## 1 - Imports

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import scale
from sklearn.model_selection import GridSearchCV
import re
import string
import nltk
from sklearn.metrics.pairwise import cosine_distances
from collections import Counter
from nltk.corpus import stopwords
from wordcloud import WordCloud
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import re
import string

from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Text extraction 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_distances
from collections import Counter

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from gensim.models import FastText

import scipy.sparse
from scipy import sparse

import contractions
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
import gensim.downloader as api

from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from gensim.utils import simple_preprocess

import pickle
from sklearn.utils import resample

from itertools import product

from sentence_transformers import SentenceTransformer

# Deep Learning libraries
from keras.models import Sequential,Model
from keras.layers import Dense, Activation, Dropout, Flatten, Input
from keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, LSTM, Bidirectional, Dropout, Flatten, GRU
from tensorflow.keras.optimizers import Adam

from transformers import AutoTokenizer, AutoModel
import torch

# Set pd options to display all columns and rows
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 30)
pd.set_option('display.max_colwidth', None)  # Show full text without truncation


# Download required resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/ricardo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ricardo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/ricardo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/ricardo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
# Define the base directory (where the notebook is)
BASE_DIR = os.path.dirname(os.path.abspath("__file__"))

# Construct full paths to the CSV files
train_path = os.path.join(BASE_DIR, "data", "train.csv")
test_path = os.path.join(BASE_DIR, "data", "test.csv")

# Load the datasets
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

## 2 - Pre-Processing

In [3]:
# Source: https://www.nltk.org/api/nltk.stem.WordNetLemmatizer.html?highlight=wordnet
lemmatizer = WordNetLemmatizer()

# Source: https://www.nltk.org/api/nltk.tokenize.casual.html
# Difference between TweetTokenizer and Word_Tokenize: https://stackoverflow.com/questions/61919670/how-nltk-tweettokenizer-different-from-nltk-word-tokenize
tokenizer = TweetTokenizer()

# Source: https://www.nltk.org/_modules/nltk/stem/porter.html
stemmer = PorterStemmer()

# Set of English stop words from NLTK
stop_words = set(stopwords.words('english'))

In [4]:
def clean_text_column(text,lemmatizer=None, stemmer=None, remove_stopwords=None):
    text = text.lower()

    # Replace URLs and user mentions
    text = re.sub(r"http\S+|www\.\S+", "URL", text)
    text = re.sub(r"@\w+", "USER", text)

    # Expand contractions (we use contractions library for this)
    # Contractions library Source: https://pypi.org/project/contractions/
    text = contractions.fix(text)

    # # Replace numbers with [NUM]
    # text = re.sub(r"\d+(\.\d+)?", "[NUM]", text)

    # Convert to tickers (e.g., $AAPL to [TICKER])
    text = re.sub(r"\$[a-z]{1,5}", "[TICKER]", text)

    #Remove numbers
    text = re.sub(r"\d+", "", text)

    # Normalize punctuation repetitions
    text = re.sub(r"([!?\.])\1+", r"\1", text)

    # Tokenize
    tokens = tokenizer.tokenize(text)

    # Remove stop words and punctuation
    if remove_stopwords:
        tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    else:
        tokens = [token for token in tokens if token not in string.punctuation]
    
    # Lemmatization OR stemming 
    if lemmatizer is not None and stemmer is None:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    elif stemmer is not None and lemmatizer is None:
        tokens = [stemmer.stem(token) for token in tokens]
    elif lemmatizer is not None and stemmer is not None:
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Else, leave tokens as is

    # Source: https://www.nltk.org/api/nltk.tokenize.treebank.html 
    # TreebankWordDetokenizer from NLTK takes care of the correct spacing and formatting, 
    # and we get a well-formed sentence that looks like natural English (e.g. without TreebankWordDetokinzer: This is an example tweet ! , With: This is an example tweet!)
    return TreebankWordDetokenizer().detokenize(tokens)

In [5]:
df_train_copy = df_train.copy()
df_test_copy = df_test.copy()

In [6]:
# Define the different pre-processing combinations to try

preproc_combinations = []

for lem, stm, rm_stop in product([None, lemmatizer], [None, stemmer], [False, True]):
    name = []
    name.append('lemma' if lem else 'no_lemma')
    name.append('stem' if stm else 'no_stem')
    name.append('no_stopwords' if rm_stop else 'with_stopwords')
    preproc_combinations.append({
        "lemmatizer": lem,
        "stemmer": stm,
        "remove_stopwords": rm_stop,
        "name": '_'.join(name)
    })

In [8]:
def apply_preproc_combinations(df, combinations, text_col="text"):
    for combo in combinations:
        column_name = f"text_{combo['name']}"
        print(f"Processing {column_name}...")
        df[column_name] = df[text_col].apply(
            lambda x: clean_text_column(
                x, 
                lemmatizer=combo['lemmatizer'], 
                stemmer=combo['stemmer'], 
                remove_stopwords=combo['remove_stopwords']
            )
        )
    return df

In [9]:
df_train_copy = apply_preproc_combinations(df_train_copy, preproc_combinations)

Processing text_no_lemma_no_stem_with_stopwords...
Processing text_no_lemma_no_stem_no_stopwords...
Processing text_no_lemma_stem_with_stopwords...
Processing text_no_lemma_stem_no_stopwords...
Processing text_lemma_no_stem_with_stopwords...
Processing text_lemma_no_stem_no_stopwords...
Processing text_lemma_stem_with_stopwords...
Processing text_lemma_stem_no_stopwords...


In [10]:
df_train_cleaned = df_train_copy.copy()

In [11]:
# Using stratify to maintain the distribution of classes in the train, validation, and test sets 
# Change this to 80% train, 10% val and 10% test
train_df, val_test_df = train_test_split(df_train_cleaned, test_size=0.3, stratify=df_train_cleaned['label'], random_state=42)
val_df, test_df = train_test_split(val_test_df, test_size=0.5, stratify=val_test_df['label'], random_state=42)

In [12]:
y_train = train_df['label']
y_val = val_df['label']
y_test = test_df['label']

In [13]:
train_df.drop(columns=['label'])

Unnamed: 0,text,text_no_lemma_no_stem_with_stopwords,text_no_lemma_no_stem_no_stopwords,text_no_lemma_stem_with_stopwords,text_no_lemma_stem_no_stopwords,text_lemma_no_stem_with_stopwords,text_lemma_no_stem_no_stopwords,text_lemma_stem_with_stopwords,text_lemma_stem_no_stopwords
7384,Today in Brexit: European Union members are ratcheting up their negotiating demands https://t.co/Qnh48BCc2l,today in brexit european union members are ratcheting up their negotiating demands URL,today brexit european union members ratcheting negotiating demands URL,today in brexit european union member are ratchet up their negoti demand url,today brexit european union member ratchet negoti demand url,today in brexit european union member are ratcheting up their negotiating demand URL,today brexit european union member ratcheting negotiating demand URL,today in brexit european union member are ratcheting up their negotiating demand URL,today brexit european union member ratcheting negotiating demand URL
8465,Did Service Corporation International's (NYSE:SCI) Share Price Deserve to Gain 96%?,did service corporation international's nyse sci share price deserve to gain,service corporation international's nyse sci share price deserve gain,did servic corpor international' nyse sci share price deserv to gain,servic corpor international' nyse sci share price deserv gain,did service corporation international's nyse sci share price deserve to gain,service corporation international's nyse sci share price deserve gain,did service corporation international's nyse sci share price deserve to gain,service corporation international's nyse sci share price deserve gain
4010,4 Money Moves That'll Make You Richer in 2020,money moves that will make you richer in,money moves make richer,money move that will make you richer in,money move make richer,money move that will make you richer in,money move make richer,money move that will make you richer in,money move make richer
3915,#HunterBiden is just killing my life .... it's so hard to be White now in America ..... it's like being William hun… https://t.co/uHk95qx1Ig,#hunterbiden is just killing my life it is so hard to be white now in america it is like being william hun … URL,#hunterbiden killing life hard white america like william hun … URL,#hunterbiden is just kill my life it is so hard to be white now in america it is like be william hun … url,#hunterbiden kill life hard white america like william hun … url,#hunterbiden is just killing my life it is so hard to be white now in america it is like being william hun … URL,#hunterbiden killing life hard white america like william hun … URL,#hunterbiden is just killing my life it is so hard to be white now in america it is like being william hun … URL,#hunterbiden killing life hard white america like william hun … URL
1022,Seattle Genetics earns drug approval in Canada,seattle genetics earns drug approval in canada,seattle genetics earns drug approval canada,seattl genet earn drug approv in canada,seattl genet earn drug approv canada,seattle genetics earns drug approval in canada,seattle genetics earns drug approval canada,seattle genetics earns drug approval in canada,seattle genetics earns drug approval canada
...,...,...,...,...,...,...,...,...,...
2722,Edited Transcript of LEE earnings conference call or presentation 12-Dec-19 3:00pm GMT,edited transcript of lee earnings conference call or presentation dec:p m gmt,edited transcript lee earnings conference call presentation dec:p gmt,edit transcript of lee earn confer call or present dec:p m gmt,edit transcript lee earn confer call present dec:p gmt,edited transcript of lee earnings conference call or presentation dec:p m gmt,edited transcript lee earnings conference call presentation dec:p gmt,edited transcript of lee earnings conference call or presentation dec:p m gmt,edited transcript lee earnings conference call presentation dec:p gmt
5170,The Best Books of 2019,the best books of,best books,the best book of,best book,the best book of,best book,the best book of,best book
5618,$HG1:COM $FCX $TECK - Copper pops to biggest gain in six years on stimulus bet https://t.co/RMjaEyvsZ8,TICKER]: com TICKER TICKER copper pops to biggest gain in six years on stimulus bet URL,TICKER]: com TICKER TICKER copper pops biggest gain six years stimulus bet URL,ticker]: com ticker ticker copper pop to biggest gain in six year on stimulu bet url,ticker]: com ticker ticker copper pop biggest gain six year stimulu bet url,TICKER]: com TICKER TICKER copper pop to biggest gain in six year on stimulus bet URL,TICKER]: com TICKER TICKER copper pop biggest gain six year stimulus bet URL,TICKER]: com TICKER TICKER copper pop to biggest gain in six year on stimulus bet URL,TICKER]: com TICKER TICKER copper pop biggest gain six year stimulus bet URL
8867,took $PLAY #4,took TICKER,took TICKER,took ticker,took ticker,took TICKER,took TICKER,took TICKER,took TICKER


### 3.1 - Statistical Methods

### 3.1.1 - Bag of Words

In [14]:
combinations = [
    ("text_no_lemma_no_stem_with_stopwords", train_df["text_no_lemma_no_stem_with_stopwords"]),
    ("text_lemma_no_stem_with_stopwords", train_df["text_lemma_no_stem_with_stopwords"]),
    ("text_no_lemma_stem_with_stopwords", train_df["text_no_lemma_stem_with_stopwords"]),
    ("text_no_lemma_no_stem_no_stopwords", train_df["text_no_lemma_no_stem_no_stopwords"]),
    ("text_lemma_no_stem_no_stopwords", train_df["text_lemma_no_stem_no_stopwords"]),
    ("text_no_lemma_stem_no_stopwords", train_df["text_no_lemma_stem_no_stopwords"]),
    ("text_lemma_stem_with_stopwords", train_df["text_lemma_stem_with_stopwords"]),
    ("text_lemma_stem_no_stopwords", train_df["text_lemma_stem_no_stopwords"]),
]

In [None]:
bow_vectors = {}

for column_name, _ in combinations:
    print(f"Fitting BOW vectorizer for {column_name}...")

    bow_vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=15_000)
    bow_vectorizer.fit(train_df[column_name]) 

    bow_vectors[column_name] = {
        "train": bow_vectorizer.transform(train_df[column_name]),
        "val": bow_vectorizer.transform(val_df[column_name]),
        "test": bow_vectorizer.transform(test_df[column_name]),
    }

bow_labels = {
"train": y_train,
"val": y_val,
"test": y_test
}

Fitting BOW vectorizer for text_no_lemma_no_stem_with_stopwords...
Fitting BOW vectorizer for text_lemma_no_stem_with_stopwords...
Fitting BOW vectorizer for text_no_lemma_stem_with_stopwords...
Fitting BOW vectorizer for text_no_lemma_no_stem_no_stopwords...
Fitting BOW vectorizer for text_lemma_no_stem_no_stopwords...
Fitting BOW vectorizer for text_no_lemma_stem_no_stopwords...
Fitting BOW vectorizer for text_lemma_stem_with_stopwords...
Fitting BOW vectorizer for text_lemma_stem_no_stopwords...


In [None]:
models = {
    "SVC": SVC(class_weight='balanced', random_state=42),
    "XGB": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=300, class_weight='balanced', random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5)
}

In [None]:
results_bow = []

for col in combinations:
    column_name = col[0]
    print(f"\nEvaluating text variant: {column_name}")

    X_train = bow_vectors[column_name]["train"]
    X_val   = bow_vectors[column_name]["val"]

    for model_name, model in models.items():
        print(f"Training {model_name} on original data...")

        model_instance = clone(model)
        model_instance.fit(X_train, y_train)
        y_pred = model_instance.predict(X_val)

        report = classification_report(y_val, y_pred, output_dict=True)

        results_bow.append({
            "variant": column_name,
            "model": model_name,
            "accuracy": report["accuracy"],
            "macro_f1": report["macro avg"]["f1-score"],
            "macro_precision": report["macro avg"]["precision"],
            "macro_recall": report["macro avg"]["recall"],
            "weighted_f1": report["weighted avg"]["f1-score"],
            "weighted_precision": report["weighted avg"]["precision"],
            "weighted_recall": report["weighted avg"]["recall"],
        })

traditional_ml_bow = pd.DataFrame(results_bow)


Evaluating text variant: text_no_lemma_no_stem_with_stopwords
Training SVC on original data...
Training XGB on original data...
Training LogisticRegression on original data...
Training KNN on original data...

Evaluating text variant: text_lemma_no_stem_with_stopwords
Training SVC on original data...
Training XGB on original data...
Training LogisticRegression on original data...
Training KNN on original data...

Evaluating text variant: text_no_lemma_stem_with_stopwords
Training SVC on original data...
Training XGB on original data...
Training LogisticRegression on original data...
Training KNN on original data...

Evaluating text variant: text_no_lemma_no_stem_no_stopwords
Training SVC on original data...
Training XGB on original data...
Training LogisticRegression on original data...
Training KNN on original data...

Evaluating text variant: text_lemma_no_stem_no_stopwords
Training SVC on original data...
Training XGB on original data...
Training LogisticRegression on original data

In [None]:
# Sort and show as a full DataFrame
sorted_results_bow = traditional_ml_bow.sort_values(by="macro_f1", ascending=False)

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)

# Now display the full DataFrame
display(sorted_results_bow)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
10,text_no_lemma_stem_with_stopwords,LogisticRegression,0.804333,0.729893,0.733927,0.726412,0.803083,0.802119,0.804333
2,text_no_lemma_no_stem_with_stopwords,LogisticRegression,0.804333,0.728305,0.734357,0.722844,0.802591,0.801212,0.804333
6,text_lemma_no_stem_with_stopwords,LogisticRegression,0.801537,0.72402,0.730592,0.718254,0.799584,0.798097,0.801537
26,text_lemma_stem_with_stopwords,LogisticRegression,0.801537,0.72402,0.730592,0.718254,0.799584,0.798097,0.801537
22,text_no_lemma_stem_no_stopwords,LogisticRegression,0.796646,0.722437,0.723031,0.721913,0.796537,0.79647,0.796646
18,text_lemma_no_stem_no_stopwords,LogisticRegression,0.789658,0.711204,0.7145,0.708107,0.788621,0.787707,0.789658
30,text_lemma_stem_no_stopwords,LogisticRegression,0.789658,0.711204,0.7145,0.708107,0.788621,0.787707,0.789658
4,text_lemma_no_stem_with_stopwords,SVC,0.795248,0.708807,0.733814,0.690534,0.788879,0.78682,0.795248
24,text_lemma_stem_with_stopwords,SVC,0.795248,0.708807,0.733814,0.690534,0.788879,0.78682,0.795248
14,text_no_lemma_no_stem_no_stopwords,LogisticRegression,0.786862,0.707967,0.712828,0.703438,0.78543,0.784258,0.786862


2) TF-IDF

In [20]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=15000)
tfidf_vectors = {}

for column_name, _ in combinations:
    print(f"Fitting TF-IDF vectorizer for {column_name}...")
    tfidf_vectorizer.fit(train_df[column_name])

    tfidf_vectors[column_name] = {
        "train": tfidf_vectorizer.transform(train_df[column_name]),
        "val": tfidf_vectorizer.transform(val_df[column_name]),
        "test": tfidf_vectorizer.transform(test_df[column_name])
    }

Fitting TF-IDF vectorizer for text_no_lemma_no_stem_with_stopwords...
Fitting TF-IDF vectorizer for text_lemma_no_stem_with_stopwords...
Fitting TF-IDF vectorizer for text_no_lemma_stem_with_stopwords...
Fitting TF-IDF vectorizer for text_no_lemma_no_stem_no_stopwords...
Fitting TF-IDF vectorizer for text_lemma_no_stem_no_stopwords...
Fitting TF-IDF vectorizer for text_no_lemma_stem_no_stopwords...
Fitting TF-IDF vectorizer for text_lemma_stem_with_stopwords...
Fitting TF-IDF vectorizer for text_lemma_stem_no_stopwords...


In [21]:
# Model training and evaluation
results_tfidf = []

for column_name, _ in combinations:
    print(f"\nEvaluating TF-IDF for: {column_name}")
    X_train = tfidf_vectors[column_name]["train"]
    X_val = tfidf_vectors[column_name]["val"]

    for model_name, model in models.items():
        print(f"Training {model_name}...")
        model_instance = clone(model)
        model_instance.fit(X_train, y_train)
        y_pred = model_instance.predict(X_val)

        report = classification_report(y_val, y_pred, output_dict=True)

        results_tfidf.append({
            "variant": column_name,
            "model": model_name,
            "accuracy": report["accuracy"],
            "macro_f1": report["macro avg"]["f1-score"],
            "macro_precision": report["macro avg"]["precision"],
            "macro_recall": report["macro avg"]["recall"],
            "weighted_f1": report["weighted avg"]["f1-score"],
            "weighted_precision": report["weighted avg"]["precision"],
            "weighted_recall": report["weighted avg"]["recall"],
        })

# Results DataFrame
traditional_ml_tfidf = pd.DataFrame(results_tfidf)


Evaluating TF-IDF for: text_no_lemma_no_stem_with_stopwords
Training SVC...
Training XGB...
Training LogisticRegression...
Training KNN...

Evaluating TF-IDF for: text_lemma_no_stem_with_stopwords
Training SVC...
Training XGB...
Training LogisticRegression...
Training KNN...

Evaluating TF-IDF for: text_no_lemma_stem_with_stopwords
Training SVC...
Training XGB...
Training LogisticRegression...
Training KNN...

Evaluating TF-IDF for: text_no_lemma_no_stem_no_stopwords
Training SVC...
Training XGB...
Training LogisticRegression...
Training KNN...

Evaluating TF-IDF for: text_lemma_no_stem_no_stopwords
Training SVC...
Training XGB...
Training LogisticRegression...
Training KNN...

Evaluating TF-IDF for: text_no_lemma_stem_no_stopwords
Training SVC...
Training XGB...
Training LogisticRegression...
Training KNN...

Evaluating TF-IDF for: text_lemma_stem_with_stopwords
Training SVC...
Training XGB...
Training LogisticRegression...
Training KNN...

Evaluating TF-IDF for: text_lemma_stem_no_s

In [22]:
sorted_results_tfidf = traditional_ml_tfidf.sort_values(by="macro_f1", ascending=False)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
pd.set_option("display.max_colwidth", None)
display(sorted_results_tfidf)

Unnamed: 0,variant,model,accuracy,macro_f1,macro_precision,macro_recall,weighted_f1,weighted_precision,weighted_recall
8,text_no_lemma_stem_with_stopwords,SVC,0.815514,0.731998,0.778293,0.703326,0.806934,0.808865,0.815514
10,text_no_lemma_stem_with_stopwords,LogisticRegression,0.794549,0.726905,0.717443,0.737918,0.797449,0.80158,0.794549
2,text_no_lemma_no_stem_with_stopwords,LogisticRegression,0.791055,0.722747,0.714299,0.73255,0.793628,0.797305,0.791055
6,text_lemma_no_stem_with_stopwords,LogisticRegression,0.787561,0.720361,0.710561,0.731926,0.790626,0.795116,0.787561
26,text_lemma_stem_with_stopwords,LogisticRegression,0.787561,0.720361,0.710561,0.731926,0.790626,0.795116,0.787561
0,text_no_lemma_no_stem_with_stopwords,SVC,0.809224,0.718657,0.778195,0.68669,0.798557,0.803592,0.809224
4,text_lemma_no_stem_with_stopwords,SVC,0.806429,0.716831,0.769087,0.687642,0.796493,0.799723,0.806429
24,text_lemma_stem_with_stopwords,SVC,0.806429,0.716831,0.769087,0.687642,0.796493,0.799723,0.806429
22,text_no_lemma_stem_no_stopwords,LogisticRegression,0.781971,0.710177,0.70095,0.721531,0.785385,0.790555,0.781971
18,text_lemma_no_stem_no_stopwords,LogisticRegression,0.779175,0.703476,0.695583,0.712576,0.782105,0.786011,0.779175


We chose text_no_lemma_stem_with_stopwords by majority vote.