<a href="https://colab.research.google.com/github/cassioHilario/TCC2023/blob/main/notebooks/modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive in the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import pickle
import nltk
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from sklearn.metrics import accuracy_score


In [None]:
# Load the pre-processed dataset from the CSV file.
df = pd.read_csv('../base/amostra_base_v4.1.csv')

# Preview the first five entries of the 'frase' column.
print(df['frase'].head(5))

In [None]:

# Pull the text column (model input) and the label column (target) out of the frame.
frases_tokenizadas, labels = df["frase"], df['label']

# Quick look at the first five rows of each series.
print(frases_tokenizadas.head(5))
print(labels.head(5))


In [None]:
# Split the data into training and test sets: 30% is held out for testing,
# and the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    frases_tokenizadas,
    labels,
    test_size=0.3,
    random_state=42,
)

In [None]:

# TF-IDF vectorisation of the sentences.
# The vectorizer is fitted on the training split only; the test split is
# transformed with that same fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a multinomial Naive Bayes classifier on the TF-IDF features.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)


In [None]:

# Predict labels for the held-out test split.
y_pred = clf.predict(X_test_tfidf)

# Evaluation metrics, each comparing the true test labels with the predictions.
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Report the metrics.
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)
print("Confusion Matrix:\n", confusion)


In [None]:
# Draw the confusion matrix as an annotated heatmap (integer cell format),
# then label the Axes object that seaborn returns.
heatmap_ax = sns.heatmap(confusion, annot=True, fmt='d', cmap='Blues')
heatmap_ax.set_title("Confusion Matrix")
heatmap_ax.set_xlabel("Predicted Class")
heatmap_ax.set_ylabel("True Class")
plt.show()

In [None]:
# Single-bar chart showing the accuracy score.
plt.bar(x=['Accuracy'], height=[accuracy])
plt.title('Accuracy')
plt.ylabel('Score')
plt.show()

In [None]:

# Plot the ROC curve (binary classification only).
#
# The classes come from the fitted model rather than a hard-coded [0, 1]:
# label_binarize then marks clf.classes_[1] as the positive class, which is
# the class whose probability sits in column 1 of predict_proba. This keeps
# targets and scores aligned whatever the label encoding is (0/1, 1/2, strings).
# The extra check on clf.classes_ skips the plot when the model was trained on
# more than two classes, where column 1 would not be "the" positive class.
if len(set(y_test)) == 2 and len(clf.classes_) == 2:
    # ravel() turns the (n, 1) column returned for binary problems into a 1-D vector.
    y_test_bin = label_binarize(y_test, classes=clf.classes_).ravel()
    # Probability of the positive class, computed once and reused.
    y_scores = clf.predict_proba(X_test_tfidf)[:, 1]

    fpr, tpr, _ = roc_curve(y_test_bin, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:0.2f})')
    # Diagonal reference line: performance of a random classifier.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()


In [None]:

# Plot the Precision-Recall curve (binary classification only).
#
# This cell builds its own binarised targets and scores, so it does not
# depend on variables left behind by another cell. The classes come from the
# fitted model so that the positive class is clf.classes_[1], i.e. the class
# whose probability is in column 1 of predict_proba.
if len(set(y_test)) == 2 and len(clf.classes_) == 2:
    # ravel() turns the (n, 1) column returned for binary problems into a 1-D vector.
    y_test_bin = label_binarize(y_test, classes=clf.classes_).ravel()
    y_scores = clf.predict_proba(X_test_tfidf)[:, 1]

    precision, recall, _ = precision_recall_curve(y_test_bin, y_scores)
    # Average precision must be computed from the continuous scores, not from
    # hard predict() labels; otherwise it collapses to a single operating
    # point and does not summarise the curve drawn below.
    average_precision = average_precision_score(y_test_bin, y_scores)

    plt.figure()
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    # step='post' makes the shaded area follow the step curve above.
    plt.fill_between(recall, precision, alpha=0.2, color='b', step='post')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title(f'Precision-Recall curve: AP={average_precision:0.2f}')
    plt.show()
