In [10]:
from utils import evaluation, composition

import pandas as pd
import numpy as np

import os
import nltk

from transformers import AutoTokenizer

from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings('ignore')

from gensim.models import KeyedVectors

In [11]:
# Global variables
N_FOLDS = 5
RANDOM_SEED = 32
DATA_PATH = './data'

In [12]:
# Read the data to be processed
data = pd.read_csv(os.path.join(DATA_PATH, 'gold_standard_27maig2020.csv'), sep='|')
data.head()

Unnamed: 0,source_id,text,constructive,toxic,toxicity_degree,sarcasm_irony,mockery_ridicule,insults,argument_discussion,negative_toxic_lang,aggressiveness,intolerance,type
0,1,Con un poco de suerte si esto sigue a este rit...,NO,SÍ,2,NO,NO,NO,NO,NO,NO,SÍ,INMIGRACIÓN
1,2,Telita con el artículo. No sé bien si lo que s...,SÍ,NO,1,NO,NO,NO,SÍ,NO,NO,NO,INMIGRACIÓN
2,2-bis,"Que eso ocurra en algunos casos es posible, cl...",SÍ,SÍ,2,NO,NO,NO,SÍ,NO,NO,NO,INMIGRACIÓN
3,3,"Al menos en Francia tienen el Frente Nacional,...",NO,SÍ,3,NO,SÍ,SÍ,NO,SÍ,NO,NO,INMIGRACIÓN
4,3-bis,Cuando dejen de soplar los vientos de cola ( p...,NO,SÍ,2,NO,NO,NO,SÍ,NO,NO,SÍ,INMIGRACIÓN


In [4]:
# Set the general evaluation configuration
eval_config = {
    'n_folds': N_FOLDS,
    'basic_manual_both': 0,
    'log': False
}

In [5]:
model_name = 'Random Forest'
params = {'name': [model_name],
        'n_trees': [75, 100, 150],
        'criterion': ['gini', 'entropy'],
        'n_feats': ['sqrt', 'log2'],
        'PCA_components': [0.2, 0.5, 0.8],
        'svd_solver': ['full'],
        'bootstrap': [True],
        'strip_accents': [None],
        'stop_words': [None]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.2, 'bootstrap': True, 'criterion': 'gini', 'n_feats': 'sqrt', 'n_trees': 75, 'name': 'Random Forest', 'svd_solver': 'full'}
Total Prediction Accuracy: 0.19264520452058162 ± 0.006437195293397951


In [6]:
model_name = 'SVC'
params = {'name': [model_name],
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': ['auto', 'scale'],
    'decision_func': ['ovr', 'ovo'],
    'PCA_components': [0.5],
    'svd_solver': ['full'],
    'penalty': [1, 100],
    'strip_accents': [None],
    'stop_words': [None]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.5, 'decision_func': 'ovr', 'gamma': 'scale', 'kernel': 'rbf', 'name': 'SVC', 'penalty': 100, 'svd_solver': 'full'}
Total Prediction Accuracy: 0.25309968755591344 ± 0.015773275614319453


In [7]:
model_name = 'Logistic Regression'
params = {'name': [model_name],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear'],
    'multi_class': ['auto'],
    'PCA_components': [0.5, 0.8],
    'svd_solver': ['full'],
    'strip_accents': [None],
    'stop_words': [None]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.8, 'multi_class': 'auto', 'name': 'Logistic Regression', 'penalty': 'l2', 'solver': 'lbfgs', 'svd_solver': 'full'}
Total Prediction Accuracy: 0.19517586120574204 ± 0.008952963441622845


## What happens if the number of PCA components is drastically reduced? 
### (accuracy is still the performance score in this section)

In [15]:
# Now let's try diminishing the number of PCA components drastically
model_name = 'Random Forest'

params = {}
model_config = {'criterion': [],
                'n_trees': [],
                
    'name': model_name,
    'n_trees': 150,
    'criterion': 'gini',
    'n_feats': 3,
    'PCA_components': 10,
    'svd_solver': 'full',
    'bootstrap': True
}

evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)

Accuracy for Fold 1 is: 0.5692
Accuracy for Fold 2 is: 0.5613
Accuracy for Fold 3 is: 0.5692
Accuracy for Fold 4 is: 0.5992
Accuracy for Fold 5 is: 0.6032
Total Prediction Accuracy is: 0.5804 ± 0.0173


In [16]:
model_name = 'SVC'
model_config = {
    'name': model_name,
    'kernel': 'linear',
    'gamma': 'auto',
    'decision_func': 'ovr',
    'PCA_components': 10,
    'svd_solver': 'full',
    'penalty': 100
}

evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)

Accuracy for Fold 1 is: 0.5889
Accuracy for Fold 2 is: 0.5889
Accuracy for Fold 3 is: 0.5929
Accuracy for Fold 4 is: 0.5952
Accuracy for Fold 5 is: 0.5952
Total Prediction Accuracy is: 0.5922 ± 0.0028


In [17]:
model_name = 'Logistic Regression'
model_config = {
    'name': model_name,
    'penalty': 'l2',
    'solver': 'lbfgs',
    'PCA_components': 10,
    'svd_solver': 'full',
    'multi_class': 'auto'
}

evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)

Accuracy for Fold 1 is: 0.5889
Accuracy for Fold 2 is: 0.5889
Accuracy for Fold 3 is: 0.5889
Accuracy for Fold 4 is: 0.5952
Accuracy for Fold 5 is: 0.5952
Total Prediction Accuracy is: 0.5915 ± 0.0031


## Manual Features

In [4]:
# Set the general evaluation configuration
eval_config = {
    'n_folds': N_FOLDS,
    'basic_manual_both': 1,
    'log': False
}

In [5]:
model_name = 'Random Forest'
params = {'name': [model_name],
        'n_trees': [75, 100, 150],
        'criterion': ['gini', 'entropy'],
        'n_feats': ['sqrt', 'log2'],
        'PCA_components': [0.2, 0.5, 0.8],
        'svd_solver': ['full'],
        'bootstrap': [True],
        'strip_accents': [None],
        'stop_words': [None]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.2, 'bootstrap': True, 'criterion': 'gini', 'n_feats': 'sqrt', 'n_trees': 75, 'name': 'Random Forest', 'svd_solver': 'full'}
Total Prediction Accuracy: 0.6469567043511599 ± 0.029735920331835315


In [6]:
model_name = 'SVC'
params = {'name': [model_name],
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': ['auto', 'scale'],
    'decision_func': ['ovr', 'ovo'],
    'PCA_components': [0.5],
    'svd_solver': ['full'],
    'penalty': [1, 100],
    'strip_accents': [None],
    'stop_words': [None]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.5, 'decision_func': 'ovr', 'gamma': 'auto', 'kernel': 'linear', 'name': 'SVC', 'penalty': 100, 'svd_solver': 'full'}
Total Prediction Accuracy: 0.7131239361859932 ± 0.024950245497339693


In [7]:
model_name = 'Logistic Regression'
params = {'name': [model_name],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear'],
    'multi_class': ['auto'],
    'PCA_components': [0.5, 0.8],
    'svd_solver': ['full'],
    'strip_accents': [None],
    'stop_words': [None]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.5, 'multi_class': 'auto', 'name': 'Logistic Regression', 'penalty': 'l2', 'solver': 'lbfgs', 'svd_solver': 'full'}
Total Prediction Accuracy: 0.6980215743408402 ± 0.030822866545072287


## Tf-Idf plus Manual Features

In [4]:
# Set the general evaluation configuration
eval_config = {
    'n_folds': N_FOLDS,
    'basic_manual_both': 2,
    'log': False
}

In [5]:
model_name = 'Random Forest'
params = {'name': [model_name],
        'n_trees': [75, 100, 150],
        'criterion': ['gini', 'entropy'],
        'n_feats': ['sqrt', 'log2'],
        'PCA_components': [0.2, 0.5, 0.8],
        'svd_solver': ['full'],
        'bootstrap': [True],
        'strip_accents': [None],
        'stop_words': [None]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.2, 'bootstrap': True, 'criterion': 'entropy', 'n_feats': 'log2', 'n_trees': 150, 'name': 'Random Forest', 'svd_solver': 'full'}
Total Prediction Accuracy: 0.6107953119440055 ± 0.038566543239510906


In [6]:
model_name = 'SVC'
params = {'name': [model_name],
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': ['auto', 'scale'],
    'decision_func': ['ovr', 'ovo'],
    'PCA_components': [0.5],
    'svd_solver': ['full'],
    'penalty': [1, 100],
    'strip_accents': [None],
    'stop_words': [None]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.5, 'decision_func': 'ovr', 'gamma': 'auto', 'kernel': 'sigmoid', 'name': 'SVC', 'penalty': 100, 'svd_solver': 'full'}
Total Prediction Accuracy: 0.6958708872673602 ± 0.03383274832104306


In [7]:
model_name = 'Logistic Regression'
params = {'name': [model_name],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear'],
    'multi_class': ['auto'],
    'PCA_components': [0.5, 0.8],
    'svd_solver': ['full'],
    'strip_accents': [None],
    'stop_words': [None]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.5, 'multi_class': 'auto', 'name': 'Logistic Regression', 'penalty': 'l2', 'solver': 'lbfgs', 'svd_solver': 'full'}
Total Prediction Accuracy: 0.7053333878582009 ± 0.04219655242386391


## Tf-Idf stripping accents and removing stopwords

In [13]:
sw = nltk.corpus.stopwords.words('spanish')

In [14]:
# Set the general evaluation configuration
eval_config = {
    'n_folds': N_FOLDS,
    'basic_manual_both': 0,
    'log': False
}

In [18]:
model_name = 'Random Forest'
params = {'name': [model_name],
        'n_trees': [75, 100, 150],
        'criterion': ['gini', 'entropy'],
        'n_feats': ['sqrt', 'log2'],
        'PCA_components': [0.2, 0.5, 0.8],
        'svd_solver': ['full'],
        'bootstrap': [True],
        'strip_accents': ['unicode'],
        'stop_words': [sw]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config
        
best_config.pop('stop_words')

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.2, 'bootstrap': True, 'criterion': 'gini', 'n_feats': 'sqrt', 'n_trees': 100, 'name': 'Random Forest', 'strip_accents': 'unicode', 'svd_solver': 'full'}
Total Prediction Accuracy: 0.19288771888606082 ± 0.0038792641586980602


In [19]:
model_name = 'SVC'
params = {'name': [model_name],
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'gamma': ['auto', 'scale'],
    'decision_func': ['ovr', 'ovo'],
    'PCA_components': [0.5],
    'svd_solver': ['full'],
    'penalty': [1, 100],
    'strip_accents': ['unicode'],
    'stop_words': [sw]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config
        
best_config.pop('stop_words')

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.5, 'decision_func': 'ovr', 'gamma': 'auto', 'kernel': 'linear', 'name': 'SVC', 'penalty': 100, 'strip_accents': 'unicode', 'svd_solver': 'full'}
Total Prediction Accuracy: 0.24120306526027363 ± 0.03470674848440035


In [20]:
model_name = 'Logistic Regression'
params = {'name': [model_name],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear'],
    'multi_class': ['auto'],
    'PCA_components': [0.5, 0.8],
    'svd_solver': ['full'],
    'strip_accents': ['unicode'],
    'stop_words': [sw]
}
best_accuracy = 0
best_std = 0
for model_config in ParameterGrid(params):
    mean_accuracy, std_accuracy = evaluation.evaluate_model(data, model_config, eval_config, seed=RANDOM_SEED)
    
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_std = std_accuracy
        best_config = model_config

best_config.pop('stop_words')

print('The best config is: {}\nTotal Prediction Accuracy: {} ± {}'.format(best_config, best_accuracy, best_std))

The best config is: {'PCA_components': 0.8, 'multi_class': 'auto', 'name': 'Logistic Regression', 'penalty': 'l2', 'solver': 'lbfgs', 'strip_accents': 'unicode', 'svd_solver': 'full'}
Total Prediction Accuracy: 0.19345404952344478 ± 0.013175796711970217


## BERT Models

In [5]:
# Check the BERT model
bert_name = 'bert-base-multilingual-cased'
bert_config = {
    'name': bert_name,
    'tokenizer': AutoTokenizer.from_pretrained(bert_name),
    'max_len': 100,
    'batch_size': 8,
    'learning_rate': 2e-5,
    'epochs': 10
}

evaluation.evaluate_model(data, bert_config, eval_config, seed=RANDOM_SEED)

Epoch 1/10
----------
Train loss 1.0489672952749598 accuracy 0.5752475247524752
Test  accuracy 0.5889328063241106

Epoch 2/10
----------
Train loss 0.9043727873817203 accuracy 0.6138613861386139
Test  accuracy 0.592885375494071

Epoch 3/10
----------
Train loss 0.7391209623475713 accuracy 0.7188118811881188
Test  accuracy 0.5968379446640316

Epoch 4/10
----------
Train loss 0.5361537444075262 accuracy 0.807920792079208
Test  accuracy 0.592885375494071

Epoch 5/10
----------
Train loss 0.4800312879209326 accuracy 0.8435643564356435
Test  accuracy 0.608695652173913

Epoch 6/10
----------
Train loss 0.34010526078742176 accuracy 0.8782178217821782
Test  accuracy 0.5849802371541502

Epoch 7/10
----------
Train loss 0.30596852426483173 accuracy 0.897029702970297
Test  accuracy 0.5691699604743082

Epoch 8/10
----------
Train loss 0.23009958306710623 accuracy 0.904950495049505
Test  accuracy 0.5889328063241106

Epoch 9/10
----------
Train loss 0.1944414880264213 accuracy 0.9188118811881189
Tes

In [5]:
# Check the BERT model but adding weighting so that the penalization for mistaken the smaller classes is bigger
bert_name = 'bert-base-multilingual-cased'
bert_config = {
    'name': bert_name,
    'tokenizer': AutoTokenizer.from_pretrained(bert_name),
    'max_len': 100,
    'batch_size': 8,
    'learning_rate': 2e-5,
    'epochs': 10
}

evaluation.evaluate_model(data, bert_config, eval_config, seed=RANDOM_SEED)

Epoch 1/10
----------
Train loss 1.1296878790292215 accuracy 0.48217821782178216
Test  accuracy 0.5810276679841897

Epoch 2/10
----------
Train loss 0.9449049937208807 accuracy 0.5702970297029704
Test  accuracy 0.5494071146245059

Epoch 3/10
----------
Train loss 0.6662546022846474 accuracy 0.7108910891089109
Test  accuracy 0.5691699604743082

Epoch 4/10
----------
Train loss 0.4131142388933932 accuracy 0.8148514851485149
Test  accuracy 0.5494071146245059

Epoch 5/10
----------
Train loss 0.25978799330668273 accuracy 0.8693069306930693
Test  accuracy 0.5849802371541502

Epoch 6/10
----------
Train loss 0.2455331687244869 accuracy 0.8831683168316832
Test  accuracy 0.5573122529644269

Epoch 7/10
----------
Train loss 0.24195834103509725 accuracy 0.8900990099009901
Test  accuracy 0.541501976284585

Epoch 8/10
----------
Train loss 0.1154627173714611 accuracy 0.9118811881188119
Test  accuracy 0.5059288537549407

Epoch 9/10
----------
Train loss 0.07009019493469125 accuracy 0.92673267326732

## Compositionality

In [23]:
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

In [24]:
WV_FILE = 'SBW-vectors-300-min5.bin'

word2vecs = KeyedVectors.load_word2vec_format(os.path.join(DATA_PATH, WV_FILE), binary=True)

In [11]:
comps = np.array(data.text.apply(lambda s: composition.compose(nltk.word_tokenize(s), word2vecs)).tolist())

In [12]:
skf = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED)
accuracies = []
fold = 1

for train_index, test_index in skf.split(comps, data['toxicity_degree'].values):
    train_x = comps[train_index, :]
    test_x = comps[test_index, :]
    train_y = data['toxicity_degree'].values[train_index]
    test_y = data['toxicity_degree'].values[test_index]
    
    mlp = MLPClassifier(activation='tanh', hidden_layer_sizes=(128, 64), solver='lbfgs', learning_rate='adaptive',
                        learning_rate_init=2e-3, random_state=RANDOM_SEED, max_iter=100)
    mlp.fit(train_x, train_y)
    pred_y = mlp.predict(test_x)
    
    accuracies.append(f1_score(test_y, pred_y, labels=data['toxicity_degree'].unique(), average='macro'))
    
print('Total Prediction Accuracy is:', np.round(np.mean(accuracies), 4), '\u00B1', np.round(np.std(accuracies), 4))

Total Prediction Accuracy is: 0.3373 ± 0.0154


---

What I've observed from tweeking the parameters is that the lbfgs solver increases noticeably the f1-score (it is recommended in the sklearn documentation for small datasets), together with the number of iterations/epochs that we let the model train. The learning rate does not have a really big impact on performance when changed from invscaling to adaptive. The best activation function so far seems to be tanh.

---

In [13]:
skf = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED)
accuracies = []
fold = 1

for train_index, test_index in skf.split(comps, data['toxicity_degree'].values):
    train_x = comps[train_index, :]
    test_x = comps[test_index, :]
    train_y = data['toxicity_degree'].values[train_index]
    test_y = data['toxicity_degree'].values[test_index]
    
    lr = LogisticRegression(penalty='l2', C=100, class_weight=None, random_state=RANDOM_SEED, solver='lbfgs')
    lr.fit(train_x, train_y)
    pred_y = lr.predict(test_x)
    
    accuracies.append(f1_score(test_y, pred_y, labels=data['toxicity_degree'].unique(), average='macro'))
    
print('Total Prediction Accuracy is:', np.round(np.mean(accuracies), 4), '\u00B1', np.round(np.std(accuracies), 4))

Total Prediction Accuracy is: 0.3501 ± 0.0325


---

It is interesting that the result actually gets worse when the class_weight parameter is set to balance to account for imbalanced distribution in the data. The results slightly improve when the penalty value is increased to 100. With regard to the solver, lbfgs does a good work for this model and dataset again.

---

### Now, let's see the effect of using other compositionality function such as the average

In [7]:
avg_comps = np.array(data.text.apply(lambda s: composition.compose(nltk.word_tokenize(s), word2vecs, 
                                                                   comp_func=composition._average)).tolist())

In [20]:
skf = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED)
accuracies = []
fold = 1

for train_index, test_index in skf.split(avg_comps, data['toxicity_degree'].values):
    train_x = avg_comps[train_index, :]
    test_x = avg_comps[test_index, :]
    train_y = data['toxicity_degree'].values[train_index]
    test_y = data['toxicity_degree'].values[test_index]
    
    mlp = MLPClassifier(activation='identity', hidden_layer_sizes=(256, 64), solver='lbfgs', 
                        learning_rate='invscaling', learning_rate_init=2e-3, random_state=RANDOM_SEED, max_iter=100)
    mlp.fit(train_x, train_y)
    pred_y = mlp.predict(test_x)
    
    accuracies.append(f1_score(test_y, pred_y, labels=data['toxicity_degree'].unique(), average='macro'))
    
print('Total Prediction Accuracy is:', np.round(np.mean(accuracies), 4), '\u00B1', np.round(np.std(accuracies), 4))

Total Prediction Accuracy is: 0.2994 ± 0.0575


---

Using the same parameters as in the previous case, the result gets worse when the compositionality function is the average of the word_vectors. Increasing the size of the hidden layer slightly improves the accuracy score.

---

In [25]:
skf = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED)
accuracies = []
fold = 1

for train_index, test_index in skf.split(avg_comps, data['toxicity_degree'].values):
    train_x = avg_comps[train_index, :]
    test_x = avg_comps[test_index, :]
    train_y = data['toxicity_degree'].values[train_index]
    test_y = data['toxicity_degree'].values[test_index]
    
    lr = LogisticRegression(penalty='l2', C=100, class_weight=None, random_state=RANDOM_SEED, solver='lbfgs')
    lr.fit(train_x, train_y)
    pred_y = lr.predict(test_x)
    
    accuracies.append(f1_score(test_y, pred_y, labels=data['toxicity_degree'].unique(), average='macro'))
    
print('Total Prediction Accuracy is:', np.round(np.mean(accuracies), 4), '\u00B1', np.round(np.std(accuracies), 4))

Total Prediction Accuracy is: 0.3029 ± 0.05


### What about removing the stopwords?

In [25]:
sw = nltk.corpus.stopwords.words('spanish')

In [35]:
comps = data.text.apply(lambda s: composition.compose(nltk.word_tokenize(s), word2vecs, 
                                                      comp_func=composition._average,
                                                      stopwords=sw)).tolist()
comps = [np.zeros(300,) if c is None else c for c in comps]
comps = np.array(comps)

In [36]:
skf = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED)
accuracies = []
fold = 1

for train_index, test_index in skf.split(comps, data['toxicity_degree'].values):
    train_x = comps[train_index, :]
    test_x = comps[test_index, :]
    train_y = data['toxicity_degree'].values[train_index]
    test_y = data['toxicity_degree'].values[test_index]
    
    mlp = MLPClassifier(activation='identity', hidden_layer_sizes=(256, 64), solver='lbfgs', 
                        learning_rate='invscaling', learning_rate_init=2e-3, random_state=RANDOM_SEED, max_iter=100)
    mlp.fit(train_x, train_y)
    pred_y = mlp.predict(test_x)
    
    accuracies.append(f1_score(test_y, pred_y, labels=data['toxicity_degree'].unique(), average='macro'))
    
print('Total Prediction Accuracy is:', np.round(np.mean(accuracies), 4), '\u00B1', np.round(np.std(accuracies), 4))

Total Prediction Accuracy is: 0.276 ± 0.0266


In [37]:
skf = StratifiedKFold(n_splits=5, random_state=RANDOM_SEED)
accuracies = []
fold = 1

for train_index, test_index in skf.split(comps, data['toxicity_degree'].values):
    train_x = comps[train_index, :]
    test_x = comps[test_index, :]
    train_y = data['toxicity_degree'].values[train_index]
    test_y = data['toxicity_degree'].values[test_index]
    
    lr = LogisticRegression(penalty='l2', C=100, class_weight=None, random_state=RANDOM_SEED, solver='lbfgs')
    lr.fit(train_x, train_y)
    pred_y = lr.predict(test_x)
    
    accuracies.append(f1_score(test_y, pred_y, labels=data['toxicity_degree'].unique(), average='macro'))
    
print('Total Prediction Accuracy is:', np.round(np.mean(accuracies), 4), '\u00B1', np.round(np.std(accuracies), 4))

Total Prediction Accuracy is: 0.2813 ± 0.0316
