# Detection of TOXicity in comments in Spanish (DETOXIS 2021)

## SESIÓN 2.4: Combinación de clasificadores

### Realizado por Álvaro Mazcuñán y Miquel Marín

#### Librerías

In [33]:
import pandas as pd
import re
import string
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import StackingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

import warnings

In [6]:
warnings.filterwarnings("ignore")

In [7]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [8]:
df = pd.read_csv("DATASET_DETOXIS.csv")
df

Unnamed: 0,topic,thread_id,comment_id,reply_to,comment_level,comment,argumentation,constructiveness,positive_stance,negative_stance,target_person,target_group,stereotype,sarcasm,mockery,insult,improper_language,aggressiveness,intolerance,toxicity,toxicity_level
0,CR,0_000,0_002,0_002,1,Pensó: Zumo para restar.,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1
1,CR,0_001,0_003,0_003,1,Como les gusta el afeitado en seco a esta gente.,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1
2,CR,0_002,0_004,0_004,1,"asi me gusta, que se maten entre ellos y en al...",0,0,0,0,0,1,0,0,0,0,0,1,1,1,2
3,CR,0_003,0_005,0_005,1,"Loss mas valientes, los que mejor cortan nuest...",0,0,0,0,1,1,0,1,1,0,0,0,0,1,1
4,CR,0_004,0_006,0_006,1,Costumbres...,0,0,0,0,0,1,1,0,0,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3458,MI,20_134,20_164,20_164,1,Ya decía yo que veía menos moros,0,0,0,0,0,1,0,0,1,1,0,0,0,1,1
3459,MI,20_006,20_165,20_008,2,+1. Como lo sabes...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
3460,MI,20_135,20_166,20_166,1,"Seguirán cobrando paguitas en Marruecos,expoli...",0,0,0,0,0,1,1,0,0,0,0,0,1,1,1
3461,MI,20_136,20_167,20_167,1,"pobres, se arriesgan en pateras porque huyen d...",0,0,0,0,0,1,0,0,1,0,0,0,0,1,1


In [24]:
sample_data = df[["comment", "toxicity","toxicity_level"]]
sample_data

Unnamed: 0,comment,toxicity,toxicity_level
0,Pensó: Zumo para restar.,1,1
1,Como les gusta el afeitado en seco a esta gente.,1,1
2,"asi me gusta, que se maten entre ellos y en al...",1,2
3,"Loss mas valientes, los que mejor cortan nuest...",1,1
4,Costumbres...,1,1
...,...,...,...
3458,Ya decía yo que veía menos moros,1,1
3459,+1. Como lo sabes...,0,0
3460,"Seguirán cobrando paguitas en Marruecos,expoli...",1,1
3461,"pobres, se arriesgan en pateras porque huyen d...",1,1


In [25]:
def tweet_preprocessing_not_tokenized(tweet):
    tweet = tweet.lower() # Se empieza pasando todos los mensajes a minúsculas
    tweet = re.sub(r"http\S+|www\S+|https\S+", "" ,tweet , flags=re.MULTILINE) # Quitar URLs
    tweet = re.sub(r"\@\w+|\#", "", tweet) # Quitar @ y #
    tweet = re.sub(r"[\U00010000-\U0010ffff]|:\)|:\(|XD|xD|;\)|:,\(|:D|D:", "", tweet) # Quitar emojis y emoticones
    tweet = tweet.translate(str.maketrans('', '', string.punctuation)) # Quitar signos de puntuación
    tokenized_tweets = word_tokenize(tweet)
    filtered_tweets = [word for word in tokenized_tweets if not word in set(stopwords.words('spanish'))] # Quitar stopwords y filtrar
    
    stemming = PorterStemmer() # Inicializamos PorterStemmer para obtener la raíz de cada una de las palabras
    stemmed_tweets = [stemming.stem(word) for word in filtered_tweets]
    lemmatization = WordNetLemmatizer() # Inicializamos el Lemmatizer para obtener los lemas de las palabras
    lemma_tweets = [lemmatization.lemmatize(word, pos='a') for word in stemmed_tweets] 
    return " ".join(lemma_tweets) # NO TOKENIZADO

preprocessing = lambda x: tweet_preprocessing_not_tokenized(x)

In [27]:
sample_data['comment'] = pd.DataFrame(sample_data["comment"].apply(preprocessing))

In [28]:
train_X, test_X, train_Y, test_Y = train_test_split(sample_data['comment'], sample_data['toxicity'], test_size=0.3)

In [29]:
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(sample_data['comment'])
train_X_Tfidf = tfidf_vect.transform(train_X)
test_X_Tfidf = tfidf_vect.transform(test_X)

In [30]:
svm_clf = SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
svm_clf.fit(train_X_Tfidf,train_Y)

y_train_pred = svm_clf.predict(train_X_Tfidf)
y_test_pred = svm_clf.predict(test_X_Tfidf)

# Training set performance
svm_train_accuracy = accuracy_score(train_Y, y_train_pred) # Calculate Accuracy
svm_train_f1 = f1_score(train_Y, y_train_pred, average='weighted') # Calculate F1-score

# Test set performance
svm_test_accuracy = accuracy_score(test_Y, y_test_pred) # Calculate Accuracy
svm_test_f1 = f1_score(test_Y, y_test_pred, average='weighted') # Calculate F1-score

print('Model performance for Training set')
print('- Accuracy: %s' % svm_train_accuracy)
print('- F1 score: %s' % svm_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % svm_test_accuracy)
print('- F1 score: %s' % svm_test_f1)

Model performance for Training set
- Accuracy: 0.933993399339934
- F1 score: 0.9320929712412499
----------------------------------
Model performance for Test set
- Accuracy: 0.7353224254090471
- F1 score: 0.6974106156358149


In [31]:
tree_clf = DecisionTreeClassifier(max_depth=4)
tree_clf.fit(train_X_Tfidf,train_Y)

y_train_pred = tree_clf.predict(train_X_Tfidf)
y_test_pred = tree_clf.predict(test_X_Tfidf)

# Training set performance
dt_train_accuracy = accuracy_score(train_Y, y_train_pred) # Calculate Accuracy
dt_train_f1 = f1_score(train_Y, y_train_pred, average='weighted') # Calculate F1-score

# Test set performance
dt_test_accuracy = accuracy_score(test_Y, y_test_pred) # Calculate Accuracy
dt_test_f1 = f1_score(test_Y, y_test_pred, average='weighted') # Calculate F1-score

print('Model performance for Training set')
print('- Accuracy: %s' % dt_train_accuracy)
print('- F1 score: %s' % dt_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % dt_test_accuracy)
print('- F1 score: %s' % dt_test_f1)

Model performance for Training set
- Accuracy: 0.7116336633663366
- F1 score: 0.6260278325544746
----------------------------------
Model performance for Test set
- Accuracy: 0.6948989412897016
- F1 score: 0.6129627496326245


In [34]:
rf = RandomForestClassifier(n_estimators=10)
rf.fit(train_X_Tfidf, train_Y)

y_train_pred = rf.predict(train_X_Tfidf)
y_test_pred = rf.predict(test_X_Tfidf)

# Training set performance
rf_train_accuracy = accuracy_score(train_Y, y_train_pred) # Calculate Accuracy
rf_train_f1 = f1_score(train_Y, y_train_pred, average='weighted') # Calculate F1-score

# Test set performance
rf_test_accuracy = accuracy_score(test_Y, y_test_pred) # Calculate Accuracy
rf_test_f1 = f1_score(test_Y, y_test_pred, average='weighted') # Calculate F1-score

print('Model performance for Training set')
print('- Accuracy: %s' % rf_train_accuracy)
print('- F1 score: %s' % rf_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % rf_test_accuracy)
print('- F1 score: %s' % rf_test_f1)

Model performance for Training set
- Accuracy: 0.9727722772277227
- F1 score: 0.9724511354696957
----------------------------------
Model performance for Test set
- Accuracy: 0.7276227141482194
- F1 score: 0.6847379428336633


In [35]:
mlp = MLPClassifier(alpha=1, max_iter=1000)
mlp.fit(train_X_Tfidf, train_Y)

y_train_pred = mlp.predict(train_X_Tfidf)
y_test_pred = mlp.predict(test_X_Tfidf)

# Training set performance
mlp_train_accuracy = accuracy_score(train_Y, y_train_pred) # Calculate Accuracy
mlp_train_f1 = f1_score(train_Y, y_train_pred, average='weighted') # Calculate F1-score

# Test set performance
mlp_test_accuracy = accuracy_score(test_Y, y_test_pred) # Calculate Accuracy
mlp_test_f1 = f1_score(test_Y, y_test_pred, average='weighted') # Calculate F1-score

print('Model performance for Training set')
print('- Accuracy: %s' % mlp_train_accuracy)
print('- F1 score: %s' % mlp_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % mlp_test_accuracy)
print('- F1 score: %s' % mlp_test_f1)

Model performance for Training set
- Accuracy: 0.7735148514851485
- F1 score: 0.730719729604149
----------------------------------
Model performance for Test set
- Accuracy: 0.6920115495668913
- F1 score: 0.6041002053158424


In [37]:
estimator_list = [
    ('svm_clf',svm_clf),
    ('tree_clf',tree_clf),
    ('rf',rf),
    ('mlp',mlp) ]


stack_model = StackingClassifier(
    estimators=estimator_list, final_estimator=LogisticRegression()
)


stack_model.fit(train_X_Tfidf, train_Y)


y_train_pred = stack_model.predict(train_X_Tfidf)
y_test_pred = stack_model.predict(test_X_Tfidf)

# Training set model performance
stack_model_train_accuracy = accuracy_score(train_Y, y_train_pred) # Calculate Accuracy
stack_model_train_f1 = f1_score(train_Y, y_train_pred, average='weighted') # Calculate F1-score

# Test set model performance
stack_model_test_accuracy = accuracy_score(test_Y, y_test_pred) # Calculate Accuracy
stack_model_test_f1 = f1_score(test_Y, y_test_pred, average='weighted') # Calculate F1-score

print('Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- F1 score: %s' % stack_model_test_f1)

Model performance for Training set
- Accuracy: 0.9698844884488449
- F1 score: 0.969570201181669
----------------------------------
Model performance for Test set
- Accuracy: 0.7439846005774784
- F1 score: 0.7131512657393546


In [44]:
acc_train_list = {
'svm_rbf': svm_train_accuracy,
'tree_clf': dt_train_accuracy,
'rf': rf_train_accuracy,
'mlp': mlp_train_accuracy,
'stack_model': stack_model_train_accuracy}


f1_train_list = {
'svm_rbf': svm_train_f1,
'tree_clf': dt_train_f1,
'rf': rf_train_f1,
'mlp': mlp_train_f1,
'stack_model': stack_model_train_f1}

In [46]:
acc_test_list = {
'svm_rbf': svm_test_accuracy,
'tree_clf': dt_test_accuracy,
'rf': rf_test_accuracy,
'mlp': mlp_test_accuracy,
'stack_model': stack_model_test_accuracy}


f1_test_list = {
'svm_rbf': svm_test_f1,
'tree_clf': dt_test_f1,
'rf': rf_test_f1,
'mlp': mlp_test_f1,
'stack_model': stack_model_test_f1}

In [45]:
acc_df = pd.DataFrame.from_dict(acc_train_list, orient='index', columns=['Accuracy'])
f1_df = pd.DataFrame.from_dict(f1_train_list, orient='index', columns=['F1'])
df = pd.concat([acc_df, f1_df], axis=1)
df

Unnamed: 0,Accuracy,F1
svm_rbf,0.933993,0.932093
tree_clf,0.711634,0.626028
rf,0.972772,0.972451
mlp,0.773515,0.73072
stack_model,0.969884,0.96957


In [48]:
acc_df_test = pd.DataFrame.from_dict(acc_test_list, orient='index', columns=['Accuracy'])
f1_df_test = pd.DataFrame.from_dict(f1_test_list, orient='index', columns=['F1'])
df_test = pd.concat([acc_df_test, f1_df_test], axis=1)
df_test

Unnamed: 0,Accuracy,F1
svm_rbf,0.735322,0.697411
tree_clf,0.694899,0.612963
rf,0.727623,0.684738
mlp,0.692012,0.6041
stack_model,0.743985,0.713151
