# Imports Iniciais

In [1]:
!pip install scikit-multilearn



In [0]:
import pandas as pd 
import numpy as np

# Leitura e Exploração dos Dados

In [3]:
# Path to the train.csv file (left blank here; fill it in before running)
df = pd.read_csv('  ')

# Preview the first ten rows
df.head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0


In [4]:
# Number of instances (rows) in the dataset
len(df)

159571

In [5]:
# Label frequencies: every column from position 2 onward is treated as a label column
labels = df.columns[2:].values

for label_name in labels:
  label_counts = df[label_name].value_counts()
  print("Label: ", label_name)
  print(label_counts, '\n')

Label:  toxic
0    144277
1     15294
Name: toxic, dtype: int64 

Label:  severe_toxic
0    157976
1      1595
Name: severe_toxic, dtype: int64 

Label:  obscene
0    151122
1      8449
Name: obscene, dtype: int64 

Label:  threat
0    159093
1       478
Name: threat, dtype: int64 

Label:  insult
0    151694
1      7877
Name: insult, dtype: int64 

Label:  identity_hate
0    158166
1      1405
Name: identity_hate, dtype: int64 



In [6]:
# Missing values per column
print('Quantidade de valores faltantes:\n{}'.format(df.isna().sum()))

Quantidade de valores faltantes:
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64


In [7]:
# Number of rows whose comment_text is null (empty strings are not null, so they are not counted here)
print('Quantidade de comentários vazios: {}'.format(len(df[df['comment_text'].isnull()])))

Quantidade de comentários vazios: 0


In [8]:
# Rows whose label columns sum to zero, i.e. comments that carry none of the labels
comments_unlabelled = df[df[labels].sum(axis=1) == 0]

# Share of such rows in the whole frame, as a percentage
print('Percentual de comentários sem classificação: {}%'.format(len(comments_unlabelled) / len(df)*100))

Percentual de comentários sem classificação: 89.83211235124176%


In [0]:
# Select a sample of 10,000 rows for the experiments (fixed random_state for repeatability).
# NOTE: this rebinds `df` to the sample, so the full dataset is no longer reachable through `df`
# afterwards; re-running this cell without reloading the CSV samples from the already reduced frame.
df_idx = df.sample(n=10000, random_state=42).index
df = df.loc[df_idx, :]

In [10]:
# Sanity check: number of rows left in `df` after sampling
len(df)

10000

# Pré-processamento

In [0]:
import re

def clean_text(text):
    """Normalize a raw comment into lowercase, ASCII-only, space-separated words.

    Steps, in order: drop non-ASCII characters, lowercase, remove links,
    hash marks and @mentions, expand common English contractions, replace
    every non-word character and every run of digits with a space, collapse
    whitespace and strip the ends.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        Cleaned text with words separated by single spaces.
    """
    # Ordered (pattern, replacement) rules. Order matters: specific
    # contractions must run before the generic "'s" / "n't" rules, which
    # would otherwise consume them first.
    rules = (
        (r'http\S+', ' '),            # links
        (r'#+', ' '),                 # hash marks
        (r'@[A-Za-z0-9]+', ' '),      # @mentions
        # --- contractions: specific forms first ---
        (r"\'scuse", " excuse "),     # must precede the generic "'s" rules
        (r"what's", "what is "),
        (r"([A-Za-z]+)'s", r"\1 is"),
        (r"\'s", " "),                # remaining "'s" not preceded by a letter
        (r"won't", "will not "),
        (r"can't", "can not "),
        (r"isn't", "is not "),
        (r"n't", " not "),            # generic negation, after the special cases
        (r"i'm", "i am "),
        (r"\'ve", " have "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # --- final character-level cleanup ---
        (r'\W', ' '),                 # anything that is not a word character
        (r'\d+', ' '),                # digits go before whitespace collapsing
        (r'\s+', ' '),                # so no runs of spaces are left behind
    )

    # Drop characters that cannot be represented in ASCII, then lowercase.
    text = text.encode('ascii', errors='ignore').decode()
    text = text.lower()

    for pattern, replacement in rules:
        text = re.sub(pattern, replacement, text)

    # Remove the leading/trailing space the substitutions may have produced.
    return text.strip(' ')

In [12]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

def remove_stopwords(text):
  """Tokenize `text` and drop English stop words.

  The stop-word set is built once, on the first call, and cached on the
  function object, instead of being rebuilt from the NLTK corpus for every
  comment.

  Parameters
  ----------
  text : str
      Text to tokenize (in this notebook, the output of `clean_text`).

  Returns
  -------
  list of str
      Tokens produced by `word_tokenize`, in order, without the tokens
      listed in NLTK's English stop-word list.
  """
  stop_words = getattr(remove_stopwords, '_stop_words', None)
  if stop_words is None:
    # Built lazily so the function can be defined before the corpus is downloaded.
    stop_words = frozenset(stopwords.words('english'))
    remove_stopwords._stop_words = stop_words

  return [token for token in word_tokenize(text) if token not in stop_words]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [13]:
from tqdm import tqdm 

# Step 1: clean the raw comment text; tqdm.pandas enables progress_apply with a progress bar
tqdm.pandas(desc='Limpando o texto')
df['text_tokens'] = df['comment_text'].progress_apply(clean_text)

# Step 2: tokenize the cleaned text and drop stop words.
# This overwrites `text_tokens`: it goes from a cleaned string to a list of tokens.
tqdm.pandas(desc='Removendo as stopwords e tokenizando o texto')
df['text_tokens'] = df['text_tokens'].progress_apply(remove_stopwords)

# Compare the raw comment with its token list
df[['comment_text', 'text_tokens']].head()

Limpando o texto: 100%|██████████| 10000/10000 [00:01<00:00, 8872.67it/s]
Removendo as stopwords e tokenizando o texto: 100%|██████████| 10000/10000 [00:05<00:00, 1991.98it/s]


Unnamed: 0,comment_text,text_tokens
119105,"Geez, are you forgetful! We've already discus...","[geez, forgetful, already, discussed, marx, an..."
131631,Carioca RFA \n\nThanks for your support on my ...,"[carioca, rfa, thanks, support, request, admin..."
125326,"""\n\n Birthday \n\nNo worries, It's what I do ...","[birthday, worries, enjoy, ur, day, talk, e]"
111256,Pseudoscience category? \n\nI'm assuming that ...,"[pseudoscience, category, assuming, article, p..."
83590,"(and if such phrase exists, it would be provid...","[phrase, exists, would, provided, search, engi..."


# Processo de Estratificação

## Tratamento do X e do Y

In [14]:
# Vocabulary size: number of distinct tokens across all tokenized comments
text_tokens = [token for tokens in df['text_tokens'].values for token in tokens]

num_words = len(set(text_tokens))

num_words

32969

In [15]:
# Turn each token list into a sequence of integers (each integer is the index of a token
# in the tokenizer's dictionary). Importantly, index 0 is never assigned to a word.
from keras.preprocessing.text import Tokenizer

# Word indices start at 1 and the Tokenizer only keeps indices strictly below `num_words`,
# so passing the exact vocabulary size would silently drop the least frequent token.
# Passing vocabulary size + 1 keeps every token.
tok = Tokenizer(num_words=num_words + 1)
tok.fit_on_texts(df['text_tokens'].values)

df['X'] = tok.texts_to_sequences(df['text_tokens'])

df[['comment_text', 'text_tokens', 'X']].head()

Using TensorFlow backend.


Unnamed: 0,comment_text,text_tokens,X
119105,"Geez, are you forgetful! We've already discus...","[geez, forgetful, already, discussed, marx, an...","[11345, 11346, 149, 614, 6469, 4335, 186, 422,..."
131631,Carioca RFA \n\nThanks for your support on my ...,"[carioca, rfa, thanks, support, request, admin...","[16363, 938, 19, 260, 249, 2498, 769, 2703, 46..."
125326,"""\n\n Birthday \n\nNo worries, It's what I do ...","[birthday, worries, enjoy, ur, day, talk, e]","[3319, 3525, 636, 1906, 147, 4, 186]"
111256,Pseudoscience category? \n\nI'm assuming that ...,"[pseudoscience, category, assuming, article, p...","[6472, 357, 1401, 1, 6472, 357, 1164, 3320, 10..."
83590,"(and if such phrase exists, it would be provid...","[phrase, exists, would, provided, search, engi...","[1294, 1094, 5, 470, 471, 1958, 26, 306, 2, 54..."


In [16]:
# Token count per comment
df['num_words'] = df['text_tokens'].apply(len)

# Length of the longest tokenized comment
max_num_words = df['num_words'].max()

# Show the five longest comments
df[['comment_text', 'text_tokens', 'num_words']].nlargest(5, 'num_words')

Unnamed: 0,comment_text,text_tokens,num_words
18183,Hey guys I love chicken rice=)Hey guys I love ...,"[hey, guys, love, chicken, rice, hey, guys, lo...",834
104994,mamas boy mamas boy mamas boy mamas boymamas b...,"[mamas, boy, mamas, boy, mamas, boy, mamas, bo...",793
94644,ACCESS DENEID!! \n\nACCESS DENEID!!ACCESS DENE...,"[access, deneid, access, deneid, access, denei...",662
128573,"teabag! \n\nTeabag, teabag, teabag, teabag, te...","[teabag, teabag, teabag, teabag, teabag, teaba...",621
109038,PUZZLE CURIOUS GOGGLE FOR EARTH SOLUTION CLIMA...,"[puzzle, curious, goggle, earth, solution, cli...",470


In [17]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
 
# Pad every index sequence at the end with 0 up to the length of the longest comment
# (0 is free to use as padding because no word is mapped to it)
X = pad_sequences(maxlen=max_num_words, sequences=df['X'], value=0, padding='post', truncating='post')
# Label matrix: one column per entry of `labels`
y = df[labels].values
 
print('Dimensão do X: {}'.format(X.shape))
print('Dimensão do y: {}'.format(y.shape))

Dimensão do X: (10000, 834)
Dimensão do y: (10000, 6)


## Iterative Train-Test Split

In [0]:
# from sklearn.model_selection import train_test_split
# X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42, test_size=0.2, stratify=y)

# ValueError: The least populated class in y has only 1 member, which is too few. 
# The minimum number of groups for any class cannot be less than 2.

In [0]:
# Baseline: scikit-learn's train_test_split without stratification, 20% held out for testing
from sklearn.model_selection import train_test_split

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42, test_size=0.2)

In [20]:
# Scikit-learn split: shapes of the (X, y) train and test partitions
(X_tr.shape, y_tr.shape), (X_te.shape, y_te.shape)

(((8000, 834), (8000, 6)), ((2000, 834), (2000, 6)))

In [0]:
from skmultilearn.model_selection import iterative_train_test_split

# Seed NumPy's global RNG before splitting (presumably the iterative split draws from it — TODO confirm)
np.random.seed(42)
# Note the unpacking order: X_train, y_train, X_test, y_test (different from the scikit-learn call)
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size=0.2)

In [22]:
# Scikit-multilearn split: shapes of the (X, y) train and test partitions
(X_train.shape, y_train.shape), (X_test.shape, y_test.shape)

(((8000, 834), (8000, 6)), ((2000, 834), (2000, 6)))

## Verificando a Proporção dos Dados

In [0]:
def count_occurrences_labels(matrix_y):
  """Sum each label column of a label matrix.

  Parameters
  ----------
  matrix_y : iterable of row vectors
      Each row holds one value per label, in the order toxic, severe_toxic,
      obscene, threat, insult, identity_hate.

  Returns
  -------
  dict
      Label name -> sum of that label's values over all rows, as Python ints.
  """
  label_order = ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
  totals = dict.fromkeys(label_order, 0)

  for row in matrix_y:
    # Pair each value with its label by position within the row
    for name, flag in zip(label_order, row):
      totals[name] += int(flag)

  return totals

def calculates_labels_proportion(DICT_CLASSES, labels_names):
  """Express each label count as a fraction of the sum of all counts.

  Parameters
  ----------
  DICT_CLASSES : dict
      Label name -> count (e.g. the output of `count_occurrences_labels`).
  labels_names : iterable of str
      Labels to report, in the desired column order.

  Returns
  -------
  pandas.DataFrame
      Single-row frame with one column per requested label. When all
      counts are zero the proportions are undefined and reported as NaN
      instead of raising ZeroDivisionError.

  Raises
  ------
  KeyError
      If a requested label is missing from `DICT_CLASSES`.
  """
  # The denominator does not depend on the label, so compute it once.
  total = sum(DICT_CLASSES.values())

  proportions = {}
  for label in labels_names:
    count = DICT_CLASSES[label]
    proportions[label] = count / total if total else float('nan')

  return pd.DataFrame([proportions])

def result_prop(data, y_train, y_test):
  """Build a table comparing label proportions across three label matrices.

  Each matrix is reduced with `count_occurrences_labels` and converted to
  proportions with `calculates_labels_proportion`, using the notebook-level
  `labels` for the column order.

  Returns
  -------
  pandas.DataFrame
      Rows 'dataset', 'train_set' and 'test_set'; one column per label.
  """
  rows = [
      calculates_labels_proportion(count_occurrences_labels(matrix), labels)
      for matrix in (data, y_train, y_test)
  ]

  summary = pd.concat(rows, ignore_index=True)

  # Replace the positional index 0/1/2 with readable row names
  return summary.rename(index={0: 'dataset', 1: 'train_set', 2: 'test_set'})

In [24]:
from collections import Counter
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix

# Occurrences of each order-1 label combination (shown as 1-tuples such as "(0,)") in the
# train and test sets produced by the iterative split; combinations absent from one set are filled with 0.0
pd.DataFrame({
    'train ML': Counter(str(combination) for row in get_combination_wise_output_matrix(y_train, order=1) for combination in row),
    'test ML' : Counter(str(combination) for row in get_combination_wise_output_matrix(y_test, order=1) for combination in row)
}).T.fillna(0.0)

Unnamed: 0,"(5,)","(0,)","(2,)","(4,)","(1,)","(3,)"
train ML,73,754,428,409,83,16
test ML,18,188,107,102,19,4


In [25]:
# Same count of order-1 label combinations, this time for the plain scikit-learn split (y_tr / y_te)
pd.DataFrame({
    'train': Counter(str(combination) for row in get_combination_wise_output_matrix(y_tr, order=1) for combination in row),
    'test' : Counter(str(combination) for row in get_combination_wise_output_matrix(y_te, order=1) for combination in row)
}).T.fillna(0.0)

Unnamed: 0,"(5,)","(0,)","(2,)","(4,)","(1,)","(3,)"
train,76,743,422,398,74,17
test,15,199,113,113,28,3


In [26]:
# Per-label totals in the test set produced by the iterative split
count_occurrences_labels(y_test)

{'identity_hate': 18,
 'insult': 102,
 'obscene': 107,
 'severe_toxic': 19,
 'threat': 4,
 'toxic': 188}

In [27]:
# Label proportions: sampled dataset vs. train/test sets from the iterative split
result_prop(df[labels].values, y_train, y_test)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
dataset,0.427987,0.046343,0.243071,0.009087,0.232167,0.041345
train_set,0.42768,0.047079,0.242768,0.009075,0.231991,0.041407
test_set,0.429224,0.043379,0.244292,0.009132,0.232877,0.041096


In [28]:
# Label proportions: sampled dataset vs. train/test sets from the plain scikit-learn split
result_prop(df[labels].values, y_tr, y_te)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
dataset,0.427987,0.046343,0.243071,0.009087,0.232167,0.041345
train_set,0.42948,0.042775,0.243931,0.009827,0.230058,0.043931
test_set,0.422505,0.059448,0.239915,0.006369,0.239915,0.031847


# Classificação Multi-label

In [0]:
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [0]:
# clf = SVC()
# clf.fit(X_train, y_train)

# ValueError: bad input shape (8000, 6)

In [31]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.metrics import accuracy_score

# BinaryRelevance wrapper around an SVC base classifier, trained and evaluated on the iterative split
BR = BinaryRelevance(classifier=SVC(), require_dense=[False, True])

BR.fit(X_train, y_train)

y_pred = BR.predict(X_test)

# NOTE(review): with a 2-D label matrix accuracy_score is presumably exact-match (subset) accuracy.
# Most comments carry no label at all, so compare against an all-zeros baseline — TODO confirm.
print("Acurácia = ", accuracy_score(y_test, y_pred))

Acurácia =  0.9005


In [32]:
# Refit the same BinaryRelevance object on the plain scikit-learn split, for comparison
BR.fit(X_tr, y_tr)

y_pr = BR.predict(X_te)

print("Acurácia = ", accuracy_score(y_te, y_pr))

Acurácia =  0.8945


In [33]:
from skmultilearn.problem_transform import ClassifierChain

# ClassifierChain wrapper around an XGBClassifier base classifier, on the iterative split
CC = ClassifierChain(classifier=XGBClassifier(), require_dense=[False, True])

CC.fit(X_train, y_train)

# Rebinds `y_pred`, replacing the predictions of the previous model
y_pred = CC.predict(X_test)

print("Acurácia = ", accuracy_score(y_test, y_pred))

Acurácia =  0.9


In [34]:
# Refit the same ClassifierChain object on the plain scikit-learn split, for comparison
CC.fit(X_tr, y_tr)

y_pr = CC.predict(X_te)

print("Acurácia = ", accuracy_score(y_te, y_pr))

Acurácia =  0.8955


In [35]:
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier

# LabelPowerset wrapper around a 100-tree RandomForestClassifier, on the iterative split.
# NOTE(review): no random_state is passed to the forest, so this score may vary between runs — TODO confirm.
LP = LabelPowerset(classifier=RandomForestClassifier(n_estimators=100), require_dense=[False, True])

LP.fit(X_train, y_train)

# Rebinds `y_pred`, replacing the predictions of the previous model
y_pred = LP.predict(X_test)

print("Acurácia = ", accuracy_score(y_test, y_pred))

Acurácia =  0.9005


In [36]:
# Refit the same LabelPowerset object on the plain scikit-learn split, for comparison
LP.fit(X_tr, y_tr)

y_pr = LP.predict(X_te)

print("Acurácia = ", accuracy_score(y_te, y_pr))

Acurácia =  0.8925
