In [None]:
# Mount the user's Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Path note left by the author (presumably a dataset location on the mounted
# Drive -- it is not referenced by the code visible here):
#/content/drive/MyDrive/Datos.csv

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import math
import re
from collections import defaultdict

import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

class NaiveBayesClassifier:
    """Multinomial Naive Bayes text classifier with Laplace smoothing.

    Scores are accumulated in log space, so long documents cannot underflow
    and the class with the highest score is the most probable one.

    Args:
        stopwords: Optional iterable of lower-case words to ignore while
            training. When omitted, a module-level ``stopwords`` collection
            is used if one exists at training time (kept for backward
            compatibility with existing callers); otherwise nothing is
            filtered.
    """

    def __init__(self, stopwords=None):
        # Number of training documents seen per class.
        self.class_counts = defaultdict(int)
        # word_counts[label][word] -> occurrences of `word` in class `label`.
        self.word_counts = defaultdict(lambda: defaultdict(int))
        # Total number of counted tokens per class (smoothing denominator).
        self.class_word_totals = defaultdict(int)
        self.classes = set()
        self.total_documents = 0
        self.vocabulary = set()
        self.stopwords = None if stopwords is None else frozenset(stopwords)

    def _active_stopwords(self):
        """Return the stop-word collection that applies right now."""
        if self.stopwords is not None:
            return self.stopwords
        # Backward compatibility: older callers define a module-level
        # `stopwords` after the class and before calling train().
        return globals().get('stopwords', frozenset())

    def clean_text(self, text):
        """Split `text` into lower-case word tokens.

        Every non-word character (regex ``\\W``) is treated as a separator.

        Args:
            text: The raw document string.

        Returns:
            A list of lower-case tokens, in document order.
        """
        text = re.sub(r'\W', ' ', text)
        return text.lower().split()

    def train(self, documents):
        """Accumulate counts from labelled documents.

        May be called several times; counts are added to the existing model.

        Args:
            documents: Iterable of ``(document_text, label)`` pairs.
        """
        ignored = self._active_stopwords()
        for document, label in documents:
            self.total_documents += 1
            self.classes.add(label)
            self.class_counts[label] += 1
            for word in self.clean_text(document):
                if word in ignored:
                    continue
                self.word_counts[label][word] += 1
                self.class_word_totals[label] += 1
                self.vocabulary.add(word)

    def predict(self, document):
        """Return the most probable class label for `document`.

        Each distinct known word of the document contributes
        ``log((count(word, label) + 1) / (tokens(label) + |V|))`` and the
        class prior contributes ``log(docs(label) / total_docs)``.

        Args:
            document: The raw document string to classify.

        Returns:
            The label with the highest log-posterior score. Ties are broken
            deterministically by the string form of the label.

        Raises:
            ValueError: If the classifier has not been trained yet.
        """
        if not self.classes:
            raise ValueError('predict() called before train(): no classes known')

        # Only words seen during training carry information. Stop words were
        # never added to the vocabulary, so they are skipped here as well.
        features = {
            word for word in self.clean_text(document) if word in self.vocabulary
        }
        vocabulary_size = len(self.vocabulary)

        scores = {}
        for label in self.classes:
            # Use .get() so that prediction never inserts keys into the model.
            label_counts = self.word_counts.get(label, {})
            denominator = self.class_word_totals.get(label, 0) + vocabulary_size
            score = math.log(self.class_counts[label] / self.total_documents)
            for word in features:
                score += math.log((label_counts.get(word, 0) + 1) / denominator)
            scores[label] = score

        # Sorting first makes tie-breaking independent of set iteration order.
        return max(sorted(scores, key=str), key=scores.get)

# Locations of the training and test CSV files on the mounted Drive.
train_csv_file_path = '/content/drive/MyDrive/Train_data.csv'
test_csv_file_path = '/content/drive/MyDrive/Test_data.csv'


def normalize_sentiment(raw_label):
    """Collapse a raw sentiment label to 'positivo', 'neutro' or 'negativo'.

    The match is a case-insensitive substring test; any label containing
    neither 'positivo' nor 'neutro' is mapped to 'negativo'.
    """
    lowered = raw_label.lower()
    if 'positivo' in lowered:
        return 'positivo'
    if 'neutro' in lowered:
        return 'neutro'
    return 'negativo'


df = pd.read_csv(train_csv_file_path)
test_df = pd.read_csv(test_csv_file_path)

# Apply the same label normalization to both the training and the test set.
df['sentiment'] = df['sentiment'].apply(normalize_sentiment)
test_df['sentiment'] = test_df['sentiment'].apply(normalize_sentiment)

# Spanish stop words that the classifier skips while counting words.
stopwords = {
    'de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'del', 'se', 'las', 'por',
    'un', 'para', 'con', 'una', 'su', 'al', 'es', 'lo', 'como', 'más', 'pero',
    'sus',
}

# Build the classifier and fit it on (review, sentiment) pairs.
classifier = NaiveBayesClassifier()
labelled_reviews = zip(df['review'], df['sentiment'])
classifier.train(labelled_reviews)

# Predict a label for every review in the test set.
test_predictions = list(map(classifier.predict, test_df['review']))

# Ground-truth labels of the test set, as a plain list.
true_labels = test_df['sentiment'].tolist()

# Print up to the first MAX_PREVIEW_ROWS predictions next to their true
# labels. zip() stops at the shortest input, so a test set with fewer rows
# no longer raises IndexError.
MAX_PREVIEW_ROWS = 200
preview_rows = zip(
    test_df['review'].iloc[:MAX_PREVIEW_ROWS], test_predictions, true_labels
)
for review_text, predicted_label, true_label in preview_rows:
    print(f'Comentario: {review_text}')
    print(f'Predicción: {predicted_label} - Etiqueta Verdadera: {true_label}')
    print('\n')

# Compute precision, recall and F1 with the 'weighted' averaging mode.
scorers = (precision_score, recall_score, f1_score)
precision, recall, f1 = (
    scorer(true_labels, test_predictions, average='weighted')
    for scorer in scorers
)

# Print each metric rounded to 4 decimal places.
print(f'Precision: {round(precision, 4)}')
print(f'Recall: {round(recall, 4)}')
print(f'F1 Score: {round(f1, 4)}')

