In [5]:
# Library Default
import  pandas   as pd
import  numpy    as np
import  re
import  joblib
from    unidecode             import unidecode

# Models
from    gensim.models.doc2vec import Doc2Vec, TaggedDocument
from    sklearn.ensemble      import HistGradientBoostingClassifier

# Others
from    sklearn.model_selection import train_test_split
from    sklearn.metrics         import confusion_matrix

### Treino e teste

In [10]:
# Load the labelled corpus; this cell reads the `texto` (raw text) and `alvo` (label) columns.
df_texto = pd.read_excel("texto_bs2.xlsx")

# Train/test split (70% / 30%) with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(df_texto.texto, df_texto.alvo, test_size=0.3, random_state=42)

# Pre-processing: one compiled pattern shared by both partitions. Every match
# (non-word characters, e-mail-like tokens, URLs, digit runs, literal "\n", ª/º/_)
# is replaced by a single space.
substituir = re.compile(r"(\W)|(\S*@\S*\s?)|(http\S+)|(www\S+)|(\d+)|(\\n)|[ªº_]")


def limpar_texto(textos):
    """Return a new Series of token lists built from a Series of raw strings.

    Each string has every `substituir` match replaced by a space, is transliterated
    to ASCII with `unidecode`, and is then split on whitespace. The input Series is
    not modified (no `inplace`), so the slices returned by `train_test_split` are
    never mutated.
    """
    return textos.replace(substituir, " ", regex=True).apply(lambda x: unidecode(x).split())


X_train = limpar_texto(X_train)
X_test = limpar_texto(X_test)

# doc2vec - training: fit the embedding model on the training documents only.
# NOTE(review): no `seed` is passed to Doc2Vec, so vectors (and the scores below)
# may vary slightly between runs - confirm whether exact reproducibility is required.
tagged_doc_train = [TaggedDocument(texto, [i]) for i, texto in enumerate(X_train)]
d2v_model = Doc2Vec(tagged_doc_train, vector_size=1500, min_count = 1, epochs = 20, dm=0, window=5)
# Training features are inferred with the same infer_vector settings used for the test set.
x_treino = np.array([d2v_model.infer_vector(doc.words, alpha=0.3, min_alpha=0.07) for doc in tagged_doc_train])

# doc2vec - test: vectors are inferred with the model trained above (no refit on test data).
tagged_doc_test = [TaggedDocument(texto, [i]) for i, texto in enumerate(X_test)]
x_teste = np.array([d2v_model.infer_vector(doc.words, alpha=0.3, min_alpha=0.07) for doc in tagged_doc_test])

# Classification algorithm
classifier = HistGradientBoostingClassifier(max_iter = 1000, learning_rate=0.1, warm_start=True, random_state=40, max_depth=5)
classifier.fit(x_treino, y_train)
treino_cls = classifier.predict(x_treino)
teste_cls = classifier.predict(x_teste)

print(classifier)
print("\nTreino:")
print("model score train: %.2f%%" % (classifier.score(x_treino, y_train) * 100.0))
print(confusion_matrix(y_train, treino_cls))

print("\nTeste:")
# Fixed label: this line reports the held-out test accuracy, not the training accuracy.
print("model score test: %.2f%%" % (classifier.score(x_teste, y_test) * 100.0))
print(confusion_matrix(y_test, teste_cls))

HistGradientBoostingClassifier(max_depth=5, max_iter=1000, random_state=40,
                               warm_start=True)

Treino:
model score train: 99.87%
[[3455    0    4]
 [   0  321    2]
 [   9    0 7363]]

Teste:
model score train: 98.05%
[[1505    0   14]
 [   0  122    5]
 [  73    1 3061]]


Após encontrar os melhores resultados, crie um modelo usando todos os dados.

In [12]:
# Pre-processing: reload the full corpus and clean ONLY the text column.
df_texto = pd.read_excel("texto_bs2.xlsx")
substituir = re.compile(r"(\W)|(\S*@\S*\s?)|(http\S+)|(www\S+)|(\d+)|(\\n)|[ªº_]")
# The regex is applied to `texto` alone (not to the whole DataFrame), so the label
# column `alvo` and any other string columns are left untouched. The raw `texto`
# column is preserved; the token lists go to `texto_limpo`.
df_texto['texto_limpo'] = (
    df_texto.texto
    .replace(substituir, " ", regex=True)
    .apply(lambda x: unidecode(x).split())
)

# doc2vec: fit the embedding model on every document.
# NOTE(review): no `seed` is passed to Doc2Vec, so results may vary between runs.
tagged_doc = [TaggedDocument(texto, [i]) for i, texto in enumerate(df_texto.texto_limpo)]
d2v_model = Doc2Vec(tagged_doc, vector_size=1500, min_count = 1, epochs = 20, dm=0, window=5)

# Feature matrix: one inferred vector per document; target: the `alvo` column.
x = np.array([d2v_model.infer_vector(doc.words, alpha=0.3, min_alpha=0.07) for doc in tagged_doc])
y = df_texto.alvo

# Classification algorithm
classifier = HistGradientBoostingClassifier(max_iter = 1000, learning_rate=0.1, warm_start=True, random_state=40, max_depth=5)
classifier.fit(x, y)
# Predictions on the same data the model was fitted on, so the score below is a
# resubstitution (training) score, not an estimate of generalisation.
teste_cls = classifier.predict(x)

print(classifier)
print("model score: %.2f%%" % (classifier.score(x, y) * 100.0))
print(confusion_matrix(y, teste_cls))

HistGradientBoostingClassifier(max_depth=5, max_iter=1000, random_state=40,
                               warm_start=True)
model score: 99.89%
[[ 4974     0     4]
 [    0   449     1]
 [   12     0 10495]]


### Salvando o modelo

In [None]:
# Persist both fitted objects with joblib: the Doc2Vec embedding model first,
# then the gradient-boosting classifier trained on its vectors.
joblib.dump(value=d2v_model, filename=r'model_d2v.pkl')
joblib.dump(value=classifier, filename=r'model_clf2.pkl')