##Import and Install Libraries

In [1]:
#install all necessary libraries
! export MACOSX_DEPLOYMET_TARGET=10.9
! pip install extremetext
! pip install scikit-multilearn
! pip install nltk

! pip install -U spacy
! python -m spacy download pt_core_news_lg

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting extremetext
  Downloading extremetext-0.8.4.tar.gz (66 kB)
[K     |████████████████████████████████| 66 kB 2.1 MB/s 
[?25hCollecting pybind11>=2.2
  Downloading pybind11-2.10.2-py3-none-any.whl (222 kB)
[K     |████████████████████████████████| 222 kB 12.7 MB/s 
Building wheels for collected packages: extremetext
  Building wheel for extremetext (setup.py) ... [?25lerror
[31m  ERROR: Failed building wheel for extremetext[0m
[?25h  Running setup.py clean for extremetext
Failed to build extremetext
Installing collected packages: pybind11, extremetext
    Running setup.py install for extremetext ... [?25l[?25hdone
[33m  DEPRECATION: extremetext was installed using the legacy 'setup.py install' method, because a wheel could not be built for it. A possible replacement is to fix the wheel build issue reported above. You can find discussion regarding this at https://github.c

In [2]:
# https://github.com/mwydmuch/extremeText
# https://arxiv.org/pdf/1810.11671v1.pdf

In [3]:
import pandas as pd
import numpy as np
import re
import string 

#train/model
import extremeText
from skmultilearn.model_selection import iterative_train_test_split

# pre-processing
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
nltk.download('stopwords')

stopwords = nltk.corpus.stopwords.words('portuguese')
punctuation = set(string.punctuation)

#lemma in portuguese
import spacy
nlp = spacy.load("pt_core_news_lg")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
#connect to drive

from google.colab import drive
drive.mount('/content/drive',force_remount = False)

import os
base_dir = '/content/drive/My Drive/hatespeech_hc' #Yaffa
# base_dir = '/content/drive/Shared with me/hatespeech_hc' #Livia and Rose and Professor 
os.chdir(base_dir)

Mounted at /content/drive


##Get Data

In [5]:
# Data should follow this format from extreme text

train_data_format = """
__label__mariadb-galera __label__mariadb55-mariadb __label__mysql55-mysql mariadb mariadb mysql solaris vulnerability oracle mysql server users availability vectors keys oracle com technetwork topics security html http secunia com http www oracle com technetwork topics security http lists security announce msg00016 html http www oracle com technetwork topics security html http secunia com http www securityfocus security gentoo glsa xml mariadb-galera mariadb55-mariadb-devel ruby-mysql openshift-origin-cartridge-mysql rh-mariadb100-mariadb mariadb-apb-role query-mysql mariadb55-mariadb-test rh-mysql57-mysql rh-mariadb101-mariadb rh-mysql56-mysql mysql mysql-connector-java mariadb55-mariadb-bench mysql55-mysql mysql-apb-role mysql mariadb55-mariadb-server mysql-binuuid-rails rh-mysql80-mysql com.github.brandtg switchboard-mysql rh-mariadb102-mariadb mariadb mariadb55-mariadb rhn-solaris-bootstrap mariadb55-mariadb-libs
"""
# https://github.com/automated-library/ICPC_2022_Automated-Identification-of-Libraries-from-Vulnerability-Data/tree/main/extremeText/dataset

In [6]:
#get hierarchical data

df = pd.read_csv('https://raw.githubusercontent.com/paulafortuna/Portuguese-Hate-Speech-Dataset/master/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv')
print(df.shape)
df.head(2)

(5668, 80)


Unnamed: 0,text,Hate.speech,Sexism,Body,Racism,Ideology,Homophobia,Origin,Religion,Health,...,Thin.women,Arabic,East.europeans,Africans,South.Americans,Brazilians,Migrants,Homossexuals,Thin.people,Ageing
0,"""não come mel, morde marimbondo""",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,"não tem pinto, tem orgulho !",0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Preprocessing

In [7]:
#preprocessing funtion

stopwords_manual = ['http?', 'mais', 'is?o', 'es[st]?*', 'quan[dt]?', ' ', '\n', '...', 'de o', 'em o', 'rt', 'ter', 'pra', 'a o', 'q', '  ', '..', 'por 0', 'fazer', 'dizer', 'vc']

def preprocessing(text):

    text = re.sub('@[\w]+','',text) #remove usernames
    text = re.sub(r'(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w \.-]*)', '', text) #remove links
    text = [w.lemma_ for w in nlp(text)] #This lemma also performs tokenization
    text = [word for word in text if word not in punctuation]
    text = [w.lower() for w in text]
    text = [word for word in text if word not in stopwords and word not in stopwords_manual]
    text = ' '.join([str(word) for word in text])

    return text

In [8]:
df['text'] = df.text.apply(preprocessing)

In [9]:
# delete hatespeech since this is hierarchical classification, not binary
df.pop("Hate.speech")

# replace "." with "-"
df.columns = df.columns.str.replace('.', '-')

cols = df.columns.tolist()
#remove text from list of columns since it is not a classification
cols.pop(0)

  df.columns = df.columns.str.replace('.', '-')


'text'

## Split Data

In itterative train test split, X (text) is supposed to contain strings and Y (classifications) is supposed to conatin 0,1. So first split the data and then converted to correct format for extreme text.

In [10]:
#http://scikit.ml/stratification.html
# https://datascience.stackexchange.com/questions/45174/how-to-use-sklearn-train-test-split-to-stratify-data-for-multi-label-classificat

#reformat text as array of arrays so it is in expected format for iterative_train_test_split
def toArray(text):
  text_array = []
  for row in text:
    text_array.append(np.array([row]))
  return np.asarray(text_array)

In [11]:
#X is text and y is classifications
my_X = toArray(df['text'].values)
my_y = df[cols].values

#split the data for training and testing
X_train, y_train, X_test, y_test = iterative_train_test_split(
    my_X,
    my_y,
    test_size = 0.2
)

In [12]:
#add text back
cols.insert(0,'text')
print(cols)

['text', 'Sexism', 'Body', 'Racism', 'Ideology', 'Homophobia', 'Origin', 'Religion', 'Health', 'OtherLifestyle', 'Aborting-women', 'Agnostic', 'Argentines', 'Asians', 'Autists', 'Black-Women', 'Blond-women', 'Brazilians-women', 'Chinese', 'Criminals', 'Egyptians', 'Fat-people', 'Football-players-women', 'Gamers', 'Homeless', 'Homeless-women', 'Indigenous', 'Iranians', 'Japaneses', 'Jews', 'Jornalists', 'Latins', 'Left-wing-ideology', 'Men-Feminists', 'Mexicans', 'Muslims-women', 'Nordestines', 'Old-people', 'Polyamorous', 'Poor-people', 'Rural-people', 'Russians', 'Sertanejos', 'Street-artist', 'Ucranians', 'Vegetarians', 'White-people', 'Young-people', 'Old-women', 'Ugly-people', 'Venezuelans', 'Angolans', 'Black-people', 'Disabled-people', 'Fat-women', 'Feminists', 'Gays', 'Immigrants', 'Islamists', 'Lesbians', 'Men', 'Muslims', 'Refugees', 'Trans-women', 'Travestis', 'Women', 'Bissexuals', 'Transexuals', 'Ugly-women', 'Thin-women', 'Arabic', 'East-europeans', 'Africans', 'South-Amer

In [13]:
#convert data to dataframes
trainData = pd.DataFrame(np.hstack((X_train, y_train)),None,cols)
testData = pd.DataFrame(np.hstack((X_test, y_test)),None,cols)

In [14]:
#remove text
cols.pop(0)

'text'

##Format for Extreme Text

In [15]:
#Modeling

#convert data to layout accepted by extreme text
def label_value(value, col):
    if value == "1":
    #if value == 1:
        return f' __label__{col}'
    else:
        return ''

#loop through each column for formating
def loopColumns(text, df):
  for col in text:
    return df[text.name].apply(label_value, args=(text.name,))

In [16]:
#replace empty lable with a non hate speech lable 
trainData['label_total'] = trainData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

testData['label_total'] = testData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

#Add final column used for extreme text consisiting off __lables__ and text
trainData['extremeText_label'] = trainData['label_total'].astype(str) + ' ' + trainData['text'].astype(str)
testData['extremeText_label'] = testData['label_total'].astype(str) + ' ' + testData['text'].astype(str)

trainData = trainData["extremeText_label"]
testData = testData["extremeText_label"]

In [17]:
#write data to file
with open('./Data/train_hs.txt', 'a') as the_file:
    for item in trainData:
        the_file.write(f"{item}\n")
        
with open('./Data/test_hs.txt', 'a') as the_file:
    for item in testData:
        the_file.write(f"{item}\n")

## Modeling
https://github.com/automated-library/ICPC_2022_Automated-Identification-of-Libraries-from-Vulnerability-Data/blob/main/extremeText/extremetext_train.py

In [18]:
# https://github.com/automated-library/ICPC_2022_Automated-Identification-of-Libraries-from-Vulnerability-Data/blob/main/extremeText/extremetext_train.py
# https://github.com/mwydmuch/extremeText/blob/master/python/extremeText/ExtremeText.py


def model_training(train_data):
    # train_supervised uses the same arguments and defaults as the fastText/extremeText cli

    print("Supervised Training")

    # paper supervised training
    model = extremeText.train_supervised(
        input=train_data, epoch=300, lr=0.05, verbose=3, wordNgrams=2, minCount=1, l2=0.003, arity=2, dim=100, tfidfWeights=True
    )
    model.save_model("./Model/xt_supervised.bin")
    return model

In [19]:
#"https://raw.githubusercontent.com/lclarete/hatespeech_hc/main/extremeText/data/train_hs.txt"

model = model_training('./Data/train_hs.txt')

Supervised Training


In [20]:
# 'https://raw.githubusercontent.com/lclarete/hatespeech_hc/main/extremeText/data/test_hs.txt'

#print results
res = []

for k in (1,5,10,15,20,25,30,35,40):
  x, accuracy, precision, recall = model.test('./Data/test_hs.txt', k=k)
  f1 = 2*((precision*recall)/(precision+recall))
  res.append([k,accuracy,precision,recall, f1])

results = pd.DataFrame(res, columns =['k', 'Accuracy@k', 'Precision@k', 'Recall@k', 'f1@k'])

print("Pre-processed all categories")
results.head(8)

Pre-processed all categories


Unnamed: 0,k,Accuracy@k,Precision@k,Recall@k,f1@k
0,1,0.66009,0.561832,0.066667,0.11919
1,5,0.169865,0.722901,0.146667,0.243858
2,10,0.09148,0.778626,0.173333,0.283545
3,15,0.063916,0.816031,0.266667,0.401974
4,20,0.049641,0.845038,0.28,0.420627
5,25,0.040466,0.861069,0.32,0.466598
6,30,0.033812,0.863359,0.32,0.466933
7,35,0.029238,0.870992,0.373333,0.522645


In [21]:
#predict hatespeech
model.predict('O mundo das sapatao é mais ligado')

(('__label__Lesbians',), array([-1.04908144]))

##Try with removing rare categories 5+

In [22]:
df = pd.read_csv('https://raw.githubusercontent.com/paulafortuna/Portuguese-Hate-Speech-Dataset/master/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv')

for column in df.columns:
  if column != "text":
    if df[column].sum() < 5:
      df.drop(column, inplace=True, axis=1)

In [23]:
df['text'] = df.text.apply(preprocessing)

In [24]:
df.pop("Hate.speech")

df.columns = df.columns.str.replace('.', '-')

cols = df.columns.tolist()
cols.pop(0)

  df.columns = df.columns.str.replace('.', '-')


'text'

In [25]:
my_X = toArray(df['text'].values)
my_y = df[cols].values

X_train, y_train, X_test, y_test = iterative_train_test_split(
    my_X,
    my_y,
    test_size = 0.2
)

In [26]:
cols.insert(0,'text')

trainData = pd.DataFrame(np.hstack((X_train, y_train)),None,cols)
testData = pd.DataFrame(np.hstack((X_test, y_test)),None,cols)

cols.pop(0)

trainData['label_total'] = trainData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

testData['label_total'] = testData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

trainData['extremeText_label'] = trainData['label_total'].astype(str) + ' ' + trainData['text'].astype(str)
testData['extremeText_label'] = testData['label_total'].astype(str) + ' ' + testData['text'].astype(str)

In [27]:
#write data to file
trainData = trainData["extremeText_label"]
testData = testData["extremeText_label"]

with open('./Data/train_hs_common5.txt', 'a') as the_file:
    for item in trainData:
        the_file.write(f"{item}\n")
        
with open('./Data/test_hs_common5.txt', 'a') as the_file:
    for item in testData:
        the_file.write(f"{item}\n")

In [28]:
model = model_training('./Data/train_hs_common5.txt')

Supervised Training


In [29]:
res = []

for i in (1,5,10,15,20,25,30,35,40):
  x, y, z, a = model.test('./Data/test_hs_common5.txt', k=i)
  f1 = 2*((z*a)/(z+a))
  res.append([i,y,z,a,f1])

results = pd.DataFrame(res, columns =['k', 'Accuracy@k', 'Precision@k', 'Recall@k', 'f1@k'])

print("Only categories that occur 5+ times")
results.head(8)

Only categories that occur 5+ times


Unnamed: 0,k,Accuracy@k,Precision@k,Recall@k,f1@k
0,1,0.689408,0.590315,0.153846,0.244081
1,5,0.169838,0.727133,0.307692,0.432408
2,10,0.091023,0.7794,0.410256,0.537557
3,15,0.063495,0.815527,0.487179,0.609973
4,20,0.049237,0.843198,0.512821,0.637763
5,25,0.040431,0.865488,0.641026,0.736535
6,30,0.03453,0.88701,0.717949,0.793575
7,35,0.030546,0.91545,0.74359,0.820618


In [30]:
model.predict('O mundo das sapatao é mais ligado')

(('__label__Lesbians',), array([-0.98692816]))

##Try with removing rare categories 100+

In [31]:
df = pd.read_csv('https://raw.githubusercontent.com/paulafortuna/Portuguese-Hate-Speech-Dataset/master/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv')

for column in df.columns:
  if column != "text":
    if df[column].sum() < 100:
      df.drop(column, inplace=True, axis=1)

In [32]:
df['text'] = df.text.apply(preprocessing)

In [33]:
# delete hatespeech since this is hierarchical classification, not binary
df.pop("Hate.speech")

# replace "." with "-"
df.columns = df.columns.str.replace('.', '-')

cols = df.columns.tolist()
cols.pop(0)

  df.columns = df.columns.str.replace('.', '-')


'text'

In [34]:
#split the data

my_X = toArray(df['text'].values)
my_y = df[cols].values

#but this returns issues too many indices for array: array is 1-dimensional, but 2 were indexed
X_train, y_train, X_test, y_test = iterative_train_test_split(
    my_X,
    my_y,
    test_size = 0.2
)

In [35]:
cols.insert(0,'text')

trainData = pd.DataFrame(np.hstack((X_train, y_train)),None,cols)
testData = pd.DataFrame(np.hstack((X_test, y_test)),None,cols)

cols.pop(0)

trainData['label_total'] = trainData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

testData['label_total'] = testData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

trainData['extremeText_label'] = trainData['label_total'].astype(str) + ' ' + trainData['text'].astype(str)
testData['extremeText_label'] = testData['label_total'].astype(str) + ' ' + testData['text'].astype(str)

In [36]:
#write data to file

trainData = trainData["extremeText_label"]
testData = testData["extremeText_label"]

with open('./Data/train_hs_common100.txt', 'a') as the_file:
    for item in trainData:
        the_file.write(f"{item}\n")
        
with open('./Data/test_hs_common100.txt', 'a') as the_file:
    for item in testData:
        the_file.write(f"{item}\n")

In [37]:
model = model_training('./Data/train_hs_common100.txt')

Supervised Training


In [38]:
res = []

for i in (1,5,10,15,20,25,30,35,40):
  x, y, z, a = model.test('./Data/test_hs_common100.txt', k=i)
  f1 = 2*((z*a)/(z+a))
  res.append([i,y,z,a,f1])

results = pd.DataFrame(res, columns =['k', 'Accuracy@k', 'Precision@k', 'Recall@k', 'f1@k'])

print("Only categories that occur 100+ times")
results.head(8)

Only categories that occur 100+ times


Unnamed: 0,k,Accuracy@k,Precision@k,Recall@k,f1@k
0,1,0.775986,0.730802,0.272727,0.397217
1,5,0.185305,0.872574,0.727273,0.793325
2,10,0.098387,0.926582,0.818182,0.869015
3,15,0.09653,1.0,0.818182,0.9
4,20,0.09653,1.0,0.818182,0.9
5,25,0.09653,1.0,0.818182,0.9
6,30,0.09653,1.0,0.818182,0.9
7,35,0.09653,1.0,0.818182,0.9


In [39]:
model.predict('O mundo das sapatao é mais ligado')

(('__label__Homossexuals',), array([-0.96877891]))

##Text without Preprocessing

In [40]:
df = pd.read_csv('https://raw.githubusercontent.com/paulafortuna/Portuguese-Hate-Speech-Dataset/master/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv')

In [41]:
# delete hatespeech since this is hierarchical classification, not binary
df.pop("Hate.speech")

# replace "." with "-"
df.columns = df.columns.str.replace('.', '-')

cols = df.columns.tolist()
cols.pop(0)

  df.columns = df.columns.str.replace('.', '-')


'text'

In [42]:
#split the data

my_X = toArray(df['text'].values)
my_y = df[cols].values

#but this returns issues too many indices for array: array is 1-dimensional, but 2 were indexed
X_train, y_train, X_test, y_test = iterative_train_test_split(
    my_X,
    my_y,
    test_size = 0.2
)

In [43]:
cols.insert(0,'text')

trainData = pd.DataFrame(np.hstack((X_train, y_train)),None,cols)
testData = pd.DataFrame(np.hstack((X_test, y_test)),None,cols)

cols.pop(0)

trainData['label_total'] = trainData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

testData['label_total'] = testData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

trainData['extremeText_label'] = trainData['label_total'].astype(str) + ' ' + trainData['text'].astype(str)
testData['extremeText_label'] = testData['label_total'].astype(str) + ' ' + testData['text'].astype(str)

In [44]:
#write data to file

trainData = trainData["extremeText_label"]
testData = testData["extremeText_label"]

with open('./Data/train_hs_notProcessed.txt', 'a') as the_file:
    for item in trainData:
        the_file.write(f"{item}\n")
        
with open('./Data/test_hs_notProcessed.txt', 'a') as the_file:
    for item in testData:
        the_file.write(f"{item}\n")

In [45]:
model = model_training('./Data/train_hs_notProcessed.txt')

Supervised Training


In [46]:
res = []

for i in (1,5,10,15,20,25,30,35,40):
  x, y, z, a = model.test('./Data/test_hs_notProcessed.txt', k=i)
  f1 = 2*((z*a)/(z+a))
  res.append([i,y,z,a,f1])

results = pd.DataFrame(res, columns =['k', 'Accuracy@k', 'Precision@k', 'Recall@k', 'f1@k'])

print("All Categories, no pre-processing")
results.head(8)

All Categories, no pre-processing


Unnamed: 0,k,Accuracy@k,Precision@k,Recall@k,f1@k
0,1,0.725089,0.617892,0.053333,0.098191
1,5,0.173488,0.739196,0.106667,0.186431
2,10,0.09484,0.808188,0.12,0.208972
3,15,0.066845,0.854435,0.186667,0.306396
4,20,0.051557,0.878696,0.253333,0.393281
5,25,0.041922,0.893101,0.28,0.426337
6,30,0.035261,0.90144,0.32,0.472329
7,35,0.030452,0.908264,0.32,0.473261


In [47]:
model.predict('O mundo das sapatao é mais ligado')

(('__label__Lesbians',), array([-0.85724556]))

##Use only second level categories, see if it improves results, sometimes too many or too few option can make results worse

Ageing
Body
Health
Homophobia
Ideology
Migrants
Origin
Other Lifestyle
Racism
Religion
Sexism


In [48]:
df = pd.read_csv('https://raw.githubusercontent.com/paulafortuna/Portuguese-Hate-Speech-Dataset/master/2019-05-28_portuguese_hate_speech_hierarchical_classification.csv')

column_list = ['text', 'Hate.speech', 'Ageing', 'Body', 'Health', 'Homophobia', 'Ideology', 'Migrants', 'Origin', 'OtherLifestyle', 'Racism', 'Religion', 'Sexism']

for column in df.columns:
  if column not in column_list:
    df.drop(column, inplace=True, axis=1)

In [49]:
df['text'] = df.text.apply(preprocessing)

In [50]:
df.pop("Hate.speech")

df.columns = df.columns.str.replace('.', '-')

cols = df.columns.tolist()
cols.pop(0)

  df.columns = df.columns.str.replace('.', '-')


'text'

In [51]:
my_X = toArray(df['text'].values)
my_y = df[cols].values

X_train, y_train, X_test, y_test = iterative_train_test_split(
    my_X,
    my_y,
    test_size = 0.2
)

In [52]:
cols.insert(0,'text')

trainData = pd.DataFrame(np.hstack((X_train, y_train)),None,cols)
testData = pd.DataFrame(np.hstack((X_test, y_test)),None,cols)

cols.pop(0)

trainData['label_total'] = trainData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

testData['label_total'] = testData[cols].apply(loopColumns, args=(trainData,)).agg(''.join, axis=1).replace('', '__label__not_Hate-speech')

trainData['extremeText_label'] = trainData['label_total'].astype(str) + ' ' + trainData['text'].astype(str)
testData['extremeText_label'] = testData['label_total'].astype(str) + ' ' + testData['text'].astype(str)

In [53]:
#write data to file
trainData = trainData["extremeText_label"]
testData = testData["extremeText_label"]

with open('./Data/train_2_level.txt', 'a') as the_file:
    for item in trainData:
        the_file.write(f"{item}\n")
        
with open('./Data/test_2_level.txt', 'a') as the_file:
    for item in testData:
        the_file.write(f"{item}\n")

In [54]:
model = model_training('./Data/train_2_level.txt')

Supervised Training


In [55]:
res = []

for i in (1,5,10,15,20,25,30,35,40):
  x, y, z, a = model.test('./Data/test_2_level.txt', k=i)
  f1 = 2*((z*a)/(z+a))
  res.append([i,y,z,a,f1])

results = pd.DataFrame(res, columns =['k', 'Accuracy@k', 'Precision@k', 'Recall@k', 'f1@k'])

print("Only categories that are in the second level")
results.head(8)



Only categories that are in the second level


Unnamed: 0,k,Accuracy@k,Precision@k,Recall@k,f1@k
0,1,0.688,0.679543,0.333333,0.44727
1,5,0.177956,0.878841,0.916667,0.897355
2,10,0.095467,0.942932,0.916667,0.929614
3,15,0.08437,1.0,0.916667,0.956522
4,20,0.08437,1.0,0.916667,0.956522
5,25,0.08437,1.0,0.916667,0.956522
6,30,0.08437,1.0,0.916667,0.956522
7,35,0.08437,1.0,0.916667,0.956522


In [56]:
model.predict('O mundo das sapatao é mais ligado')

(('__label__Homophobia',), array([9.99994973e-06]))