<a href="https://colab.research.google.com/github/cassioHilario/TCC2023/blob/main/notebooks/tf_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab runtime under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#! pip install tensorflow scikit-learn pandas numpy pickle5

In [None]:
#!pip install nltk

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import itertools
import nltk

from nltk import sent_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.metrics import confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
import pickle5 as pickle

In [None]:
# Load the sample dataset (path is relative to the current working directory)
# and print its first rows as a sanity check.
df = pd.read_csv('../base/amostra_base_v10.1.csv')
print(df.head())

In [None]:
# Keep only the sentence text and its numeric label. The explicit copy makes
# this an independent frame, so adding a column below does not raise pandas'
# SettingWithCopyWarning.
df = df[['frase', 'label']].copy()
# Map the numeric label to a class name: 1 -> 'positive', anything else -> 'negative'.
df['sentiment'] = df['label'].apply(lambda x: 'positive' if x == 1 else 'negative')
df = df[['frase', 'sentiment']]
# Shuffle the rows (unseeded, so the order differs between runs) and renumber the index.
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
# Build the vocabulary from the sentences; only the 5000 most frequent words
# are kept when converting text to sequences, the rest map to '<OOV>'.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(df['frase'])
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(df['frase'])
# Pad/truncate to 100 tokens: this must equal the Embedding layer's
# input_length and the maxlen used by predict_sentiment at inference time.
# (The previous value of 10000 disagreed with both and made every training
# sample 100x longer than the model was declared to accept.)
padded_sequences = pad_sequences(sequences, maxlen=100, truncating='post')

In [None]:
# One-hot encode the class names into a 2-D array. Later cells treat column 0
# as 'negative' and column 1 as 'positive'.
sentiment_labels = pd.get_dummies(df['sentiment']).to_numpy()

In [None]:
# Hold out 30% of the samples for evaluation; the split is random and unseeded.
x_train, x_test, y_train, y_test = train_test_split(
    padded_sequences,
    sentiment_labels,
    test_size=0.3,
)

In [None]:
# Text classifier: embedding -> 1-D convolution -> global max pooling ->
# dense layer with dropout -> 2-way softmax output.
model = Sequential([
    Embedding(5000, 100, input_length=100),
    Conv1D(64, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 100 epochs in mini-batches of 32. The held-out split is also used
# as the validation set, so the validation metrics reported during training
# are computed on the same data as the final evaluation.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    batch_size=32,
    epochs=100,
)

In [None]:
# Quick check: compare predicted class indices with the true class indices.
y_pred = model.predict(x_test).argmax(axis=-1)
print("Accuracy:", accuracy_score(y_test.argmax(axis=-1), y_pred))

In [None]:

# Class indices predicted by the model and the matching ground-truth indices
y_pred = model.predict(x_test).argmax(axis=-1)
true_classes = y_test.argmax(axis=-1)

# Overall accuracy
accuracy = accuracy_score(true_classes, y_pred)
print("Accuracy:", accuracy)

# Per-class metrics report
class_report = classification_report(
    true_classes,
    y_pred,
    target_names=['negative', 'positive'],
)
print("Classification Report:\n", class_report)

# Confusion matrix of true versus predicted classes
conf_matrix = confusion_matrix(true_classes, y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Plot confusion matrix
def plot_confusion_matrix(conf_matrix, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map and show it.

    Args:
        conf_matrix: 2-D NumPy array of counts; rows are drawn on the
            'True label' axis and columns on the 'Predicted label' axis.
        classes: Tick labels, one per class, in matrix order.
        normalize: When true, every row is divided by its sum so each cell
            shows a fraction of that row; rows that sum to zero stay at zero.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used to color the cells.
    """
    if normalize:
        # Normalize BEFORE drawing so the cell colors, the color bar and the
        # text annotations all describe the same values.
        row_totals = conf_matrix.sum(axis=1)[:, np.newaxis].astype('float')
        # Avoid 0/0 -> NaN for classes that have no samples.
        row_totals[row_totals == 0] = 1.0
        conf_matrix = conf_matrix.astype('float') / row_totals

    plt.figure(figsize=(6, 6))
    plt.imshow(conf_matrix, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text so it stays readable.
    thresh = conf_matrix.max() / 2.
    for i, j in itertools.product(range(conf_matrix.shape[0]), range(conf_matrix.shape[1])):
        plt.text(j, i, format(conf_matrix[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if conf_matrix[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Lay out last, once the ticks and axis labels that need room exist.
    plt.tight_layout()
    plt.show()

# Plot the confusion matrix
plot_confusion_matrix(conf_matrix, classes=['negative', 'positive'])

In [None]:
# Threshold-based curves need a continuous score per sample. Use the predicted
# probability of class 1 ('positive') rather than the hard argmax labels:
# with 0/1 labels as scores each curve collapses to a single operating point
# and the reported AUC values are not meaningful.
y_true = np.argmax(y_test, axis=-1)
y_score = model.predict(x_test)[:, 1]

# Generate precision-recall curve
precision, recall, _ = precision_recall_curve(y_true, y_score)

plt.figure(figsize=(8, 6))
plt.plot(recall, precision, color='b', lw=2)
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid()

# Calculate and print area under the curve (AUC) for precision-recall curve
pr_auc = auc(recall, precision)
print("Precision-Recall AUC:", pr_auc)

plt.show()

# Generate ROC curve
fpr, tpr, _ = roc_curve(y_true, y_score)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.grid()

# Calculate and print area under the curve (AUC) for ROC curve
roc_auc = auc(fpr, tpr)
print("ROC AUC:", roc_auc)

plt.show()


In [None]:
# Persist the trained network and the fitted tokenizer to the working
# directory; inference needs both to reproduce the same preprocessing.
model.save('nominal_agreement_analysis_model_v2.h5')
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, pickle.HIGHEST_PROTOCOL)

In [None]:
import keras

# Reload the model from the same relative path it was saved to. The previous
# absolute '/content/...' path only matched when the working directory
# happened to be /content, while the tokenizer was already read relatively.
model = keras.models.load_model('nominal_agreement_analysis_model_v2.h5')
# NOTE: unpickling can execute arbitrary code; only load a tokenizer file
# that this notebook (or another trusted source) produced.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
def predict_sentiment(text):
    """Classify one sentence with the module-level tokenizer and model.

    Args:
        text: Raw sentence to classify.

    Returns:
        'Negative' when output index 0 has the highest score, 'Positive' when
        index 1 does, and None for any other index (not expected with the
        two-unit output layer defined in this notebook).
    """
    # Tokenize and pad the input text. Over-long inputs are cut from the end,
    # matching the truncating='post' used when the training data was padded
    # (the default would instead drop the beginning of the sentence).
    text_sequence = tokenizer.texts_to_sequences([text])
    text_sequence = pad_sequences(text_sequence, maxlen=100, truncating='post')

    # Make a prediction using the trained model; [0] selects the single sample.
    predicted_rating = model.predict(text_sequence)[0]
    class_names = {0: 'Negative', 1: 'Positive'}
    return class_names.get(int(np.argmax(predicted_rating)))

In [None]:
# Download NLTK's 'punkt' resource. Presumably intended for the sent_tokenize
# import at the top of the notebook; no cell shown here calls it (TODO confirm
# whether this download is still needed).
nltk.download('punkt')

In [None]:
# Probe sentence: masculine singular noun "carro" with the feminine singular
# adjective "bonita".
text_input = "O carro é bonita"
predicted_sentiment = predict_sentiment(text_input)
print(predicted_sentiment)

In [None]:
# Probe sentence: masculine singular noun "carro" with the masculine singular
# adjective "bonito".
text_input = "O carro é bonito"
predicted_sentiment = predict_sentiment(text_input)
print(predicted_sentiment)

In [None]:
# Probe sentence: masculine singular noun "carro" with the masculine plural
# adjective "bonitos".
text_input = "O carro é bonitos"
predicted_sentiment = predict_sentiment(text_input)
print(predicted_sentiment)

In [None]:
# Probe sentence: masculine singular noun "carro" with the feminine plural
# adjective "bonitas".
text_input = "O carro é bonitas"
predicted_sentiment = predict_sentiment(text_input)
print(predicted_sentiment)