In [None]:
# import necessary libraries
import re, string, unicodedata
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import nltk
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer

# import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score

from langdetect import detect
from langdetect import DetectorFactory, LangDetectException


# Single seed shared by every stochastic step in the notebook.
SEED = 42

# langdetect's detection is randomised; pinning its seed makes the Spanish-only
# filter keep the same rows on every run.
DetectorFactory.seed = SEED

In [None]:
# Display settings: do not truncate cell contents and allow wide output lines.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

In [None]:
# Load the data.
df_original = pd.read_csv('data/tipo1_entrenamiento_estudiantes.csv')

# Entendimiento

In [None]:
# Dataset dimensions as (rows, columns).
df_original.shape

In [None]:
# Column dtypes and non-null counts.
df_original.info()

In [None]:

# Peek at five random rows; seeded so the same rows appear on every run.
df_original.sample(5, random_state=SEED)

In [None]:
# Copy of the raw frame with one extra column: the number of
# whitespace-separated words in each review.
df_stats = df_original.assign(
    Word_Count=df_original['Review'].map(lambda review: len(review.split()))
)
df_stats

In [None]:
# Summary statistics of the per-review word counts.
word_count_stats = df_stats['Word_Count'].describe()
word_count_stats

In [None]:
# Number of missing values per column.
df_original.isnull().sum()

No hay registros nulos, los datos son completos.

In [None]:
# Distribution of the target variable.
df_original['Class'].value_counts().plot(kind='bar')

Todos los valores de la columna 'Class' son válidos.

In [None]:
# Number of rows that exactly repeat an earlier row.
df_original.duplicated().sum()

Hay 71 registros duplicados; estos se deben eliminar del dataset.

In [None]:
# Inspect the rows flagged as duplicates.
duplicated_rows = df_original[df_original.duplicated()]
duplicated_rows


# Procesamiento y Preparación

In [None]:
# Drop the duplicated rows.
df_prep = df_original.drop_duplicates()

In [None]:
# Shape after removing duplicates.
df_prep.shape

In [None]:
# TODO: balance the classes (target variable)

In [None]:
# Drop rows whose text is written in a language other than Spanish.
def detect_language(text):
    """Return True when ``text`` is detected as Spanish, False otherwise.

    Args:
        text: Review text to classify. Non-string or blank values are
            reported as not Spanish without calling langdetect.

    Returns:
        bool: True only if langdetect reports the language code ``'es'``.
        False is also returned when langdetect raises
        ``LangDetectException`` because it cannot decide on a language.
    """
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        return detect(text) == 'es'
    except LangDetectException:
        # Best effort: undetectable text is filtered out rather than aborting
        # the whole filtering pass.
        return False

In [None]:
# Keep only the reviews that detect_language flags as Spanish.
is_spanish = df_prep['Review'].map(detect_language)
df_prep = df_prep.loc[is_spanish]

In [None]:
# Shape after keeping only the reviews detected as Spanish.
df_prep.shape

In [None]:
# TODO: save the data after the cleaning transformations
# df_prep.to_csv('data/tipo1_entrenamiento_estudiantes_prep.csv', index=False)

## División en entrenamiento y prueba

In [None]:
# Hold out 30% of the reviews for testing, stratified on the class label and
# seeded so the partition is reproducible.
reviews = df_prep["Review"]
labels = df_prep["Class"]
X_train, X_test, y_train, y_test = train_test_split(
    reviews,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=SEED,
)

In [None]:
# Sizes of the four partitions produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Tokenización / Vectorización

"Feature engineering"

* Bag of Words / Count Tokenizer
* Tf-IDF

In [None]:
# NLTK resources: the stop-word corpus and the Punkt models used by word_tokenize.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the 'punkt_tab' resource instead of the pickled
# 'punkt' models, so fetch it too.
# NOTE(review): on older releases this id may be unknown; the download is then
# expected to report an error and return False rather than raise — confirm.
nltk.download('punkt_tab')

In [None]:
# Spanish stop words from NLTK, stored as a set.
stop_words = set(stopwords.words('spanish'))

In [None]:
# Built once instead of on every document.
_DIGITS_RE = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_SPANISH_STEMMER = None  # created on the first tokenize_text call, then reused


def tokenize_text(text):
    """Normalise a review and return its list of Spanish word stems.

    Steps, in order:
      1. NFKD-normalise and drop every non-ASCII character. This strips
         accents ("también" -> "tambien") and removes emoji and any other
         symbol with no ASCII form, so no separate emoji regex is needed.
      2. Lowercase.
      3. Remove digits and ASCII punctuation. This runs after step 1 so that
         compatibility characters which fold to ASCII (fullwidth "！",
         superscript "²") are removed as well.
      4. Tokenise with NLTK ``word_tokenize``.
      5. Reduce each token to its stem with the Spanish Snowball stemmer.

    Args:
        text (str): Raw review text.

    Returns:
        list[str]: Stemmed tokens; empty when nothing survives the cleaning.
    """
    global _SPANISH_STEMMER
    if _SPANISH_STEMMER is None:
        _SPANISH_STEMMER = SnowballStemmer('spanish')

    # Fold to ASCII first so the later filters see the final characters.
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
    text = text.lower()
    text = _DIGITS_RE.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)

    return [_SPANISH_STEMMER.stem(token) for token in word_tokenize(text)]

In [None]:
# The vectorizers compare stop words against the tokens that tokenize_text
# returns (ASCII-folded and stemmed). The NLTK list is therefore passed through
# the same function; otherwise an accented entry such as "también" never
# matches its token "tambien" and stays in the vocabulary.
stop_words_tokenized = sorted({token for word in stop_words for token in tokenize_text(word)})

# token_pattern=None: the default regex is unused once a custom tokenizer is given.
bow_vectorizer = CountVectorizer(stop_words=stop_words_tokenized, tokenizer=tokenize_text, token_pattern=None)
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words_tokenized, tokenizer=tokenize_text, token_pattern=None)

In [None]:
# Learn the bag-of-words vocabulary on the training reviews and vectorise them.
X_train_bow = bow_vectorizer.fit_transform(X_train)

In [None]:
# Number of distinct terms in the bag-of-words vocabulary.
len(bow_vectorizer.vocabulary_)

In [None]:
# Learn the TF-IDF vocabulary and weights on the training reviews and vectorise them.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [None]:
# Number of distinct terms in the TF-IDF vocabulary.
len(tfidf_vectorizer.vocabulary_)

In [None]:
# Dense DataFrame view of the bag-of-words matrix, one column per term.
# toarray() materialises the whole matrix in memory.
df_bow_sklearn = pd.DataFrame(X_train_bow.toarray(),columns=bow_vectorizer.get_feature_names_out())
df_bow_sklearn

In [None]:
# Dense DataFrame view of the TF-IDF matrix, one column per term.
# toarray() materialises the whole matrix in memory.
df_tfid_sklearn = pd.DataFrame(X_train_tfidf.toarray(),columns=tfidf_vectorizer.get_feature_names_out())
df_tfid_sklearn

### Eliminación de palabras con baja frecuencia

In [None]:
# Term -> column-index mapping learned by the bag-of-words vectorizer.
vocabulario = bow_vectorizer.vocabulary_

# Column sums of the training matrix: total occurrences of each term.
frecuencia_total = X_train_bow.sum(axis=0)

# Look up each term's total through its column index.
frecuencias = {}
for palabra, indice in vocabulario.items():
    frecuencias[palabra] = frecuencia_total[0, indice]

# Tabulate the totals, most frequent term first, and display them.
df_frecuencias = (
    pd.DataFrame(list(frecuencias.items()), columns=['Palabra', 'Frecuencia'])
    .sort_values(by='Frecuencia', ascending=False)
)
df_frecuencias

In [None]:
# Keep the terms that occur more than 50 times in the training set and
# summarise their frequencies.
is_frequent = df_frecuencias['Frecuencia'].gt(50)
df_palabras_altafrec = df_frecuencias.loc[is_frequent]
df_palabras_altafrec.describe()

In [None]:
# Box plot of the totals of the high-frequency terms.
fig, ax = plt.subplots(figsize=(10, 6))
ax.boxplot(df_palabras_altafrec['Frecuencia'])
ax.set_title('Boxplot de la frecuencia de las palabras')
ax.set_ylabel('Frecuencia')
plt.show()

In [None]:
# High-frequency terms as a plain list, in the order of df_palabras_altafrec.
palabras_altafrecuencia = list(df_palabras_altafrec['Palabra'])

# Restrict the bag-of-words frame to those columns.
df_bow_sklearn_filtrado = df_bow_sklearn.loc[:, palabras_altafrecuencia]

# Number of columns kept.
df_bow_sklearn_filtrado.shape[1]

In [None]:
# Convert the filtered DataFrame back to a NumPy array.
X_train_bow_filtered = df_bow_sklearn_filtrado.to_numpy()
X_train_bow_filtered.shape

# Entrenamiento y evaluación de modelos

## [nombre algoritmo]

Desarrollado por:

[descripción]

### Conclusiones

## Naive Bayes

Desarrollado por: Maria Castro Iregui

[descripción]

In [None]:
from datetime import datetime

# Wall-clock timestamp (HH:MM:SS) taken when this cell runs.
current_time = f"{datetime.now():%H:%M:%S}"
print("Current time:", current_time)


### Entrenamiento sin filtro de palabras

In [None]:
# Multinomial Naive Bayes fitted on the full (unfiltered) bag-of-words matrix.
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train_bow, y_train)


In [None]:
# Per-feature difference between rows 1 and 0 of feature_log_prob_ after
# exponentiation (log-probabilities turned back into probabilities).
# NOTE(review): this value is not read by any cell visible here — confirm it is still needed.
feature_importances = np.exp(naive_bayes.feature_log_prob_)[1,:] - np.exp(naive_bayes.feature_log_prob_)[0,:]

In [None]:
# Top 20 terms by log-probability in row 1 of feature_log_prob_.
# The columns of feature_log_prob_ follow get_feature_names_out(); the keys of
# the vocabulary_ dict are not guaranteed to iterate in column order, so using
# the dict as index would attach the wrong term to each bar.
pd.Series(naive_bayes.feature_log_prob_[1], index=bow_vectorizer.get_feature_names_out()).sort_values().tail(20).plot.barh(figsize=(15, 10))

In [None]:
# Predictions on the training matrix and on the test reviews, the latter
# vectorised with the already-fitted bag-of-words vectorizer.
y_train_naive_predict = naive_bayes.predict(X_train_bow)
y_test_naive_predict = naive_bayes.predict(bow_vectorizer.transform(X_test))

In [None]:
# Summary of the fitted model: classes, feature count, class priors and
# per-class feature log-probabilities.
# NOTE(review): count_nonzero counts every entry of feature_log_prob_ that is
# not exactly 0 — confirm this is the intended meaning of "relevant" features.
print('Clases', len(naive_bayes.classes_))
print('Etiquetas:', naive_bayes.classes_)
print('Features:', len(naive_bayes.feature_log_prob_[0]))
print('Features relevantes', np.count_nonzero(naive_bayes.feature_log_prob_))
print('Prior probabilities:', naive_bayes.class_log_prior_)
print('Feature log probabilities:', naive_bayes.feature_log_prob_)

In [None]:
# Confusion matrix on the training set.
ConfusionMatrixDisplay.from_predictions(y_train, y_train_naive_predict)

In [None]:
# Training-set metrics. score() must receive the bag-of-words matrix the model
# was fitted on; X_train holds the raw review text and cannot be scored.
print("accuracy:", naive_bayes.score(X_train_bow, y_train))
print("precision:", precision_score(y_train, y_train_naive_predict, average='macro'))
print("recall:", recall_score(y_train, y_train_naive_predict, average='macro'))
print("f1:", f1_score(y_train, y_train_naive_predict, average='macro'))

In [None]:
# Confusion matrix on the test set.
ConfusionMatrixDisplay.from_predictions(y_test, y_test_naive_predict)

In [None]:
# Test-set metrics. The model was fitted on bag-of-words counts, so accuracy is
# computed on the bag-of-words transform of X_test, the same features used for
# y_test_naive_predict, and not on TF-IDF weights.
print("accuracy:", naive_bayes.score(bow_vectorizer.transform(X_test), y_test))
print("precision:", precision_score(y_test, y_test_naive_predict, average='macro'))
print("recall:", recall_score(y_test, y_test_naive_predict, average='macro'))
print("f1:", f1_score(y_test, y_test_naive_predict, average='macro'))

### Entrenamiento con filtro de palabras

In [None]:
# Multinomial Naive Bayes fitted on the high-frequency columns only.
naive_bayes_filtered = MultinomialNB()
naive_bayes_filtered.fit(X_train_bow_filtered, y_train)

In [None]:
# Per-feature difference between rows 1 and 0 of the filtered model's
# feature_log_prob_ after exponentiation.
# NOTE(review): this value is not read by any cell visible here — confirm it is still needed.
feature_importances = np.exp(naive_bayes_filtered.feature_log_prob_)[1,:] - np.exp(naive_bayes_filtered.feature_log_prob_)[0,:]

In [None]:
# Top 20 high-frequency terms by log-probability in row 1 of the filtered
# model's feature_log_prob_, labelled with the 'Palabra' column.
pd.Series(naive_bayes_filtered.feature_log_prob_[1], index = df_palabras_altafrec['Palabra']).sort_values().tail(20).plot.barh(figsize = (15, 10))

In [None]:
# Training-set predictions must use the frequency-filtered columns the model
# was fitted on; the unfiltered X_train_bow has a different number of features.
y_train_naive_predict = naive_bayes_filtered.predict(X_train_bow_filtered)

# Apply the same low-frequency filter to the test set: vectorise X_test, wrap
# it in a DataFrame so columns can be selected by term, and keep only the
# high-frequency terms.
X_test_bow = bow_vectorizer.transform(X_test)
df_X_test_bow = pd.DataFrame(X_test_bow.toarray(), columns=bow_vectorizer.get_feature_names_out())

df_X_test_bow_filtrado = df_X_test_bow[palabras_altafrecuencia]
X_test_bow_filtrado = df_X_test_bow_filtrado.to_numpy()

y_test_naive_predict = naive_bayes_filtered.predict(X_test_bow_filtrado)

In [None]:
# Summary of the filtered model: classes, feature count, class priors and
# per-class feature log-probabilities.
# NOTE(review): count_nonzero counts every entry of feature_log_prob_ that is
# not exactly 0 — confirm this is the intended meaning of "relevant" features.
print('Clases', len(naive_bayes_filtered.classes_))
print('Etiquetas:', naive_bayes_filtered.classes_)
print('Features:', len(naive_bayes_filtered.feature_log_prob_[0]))
print('Features relevantes', np.count_nonzero(naive_bayes_filtered.feature_log_prob_))
print('Prior probabilities:', naive_bayes_filtered.class_log_prior_)
print('Feature log probabilities:', naive_bayes_filtered.feature_log_prob_)

In [None]:
# Confusion matrix on the training set, using the most recent training predictions.
ConfusionMatrixDisplay.from_predictions(y_train, y_train_naive_predict)

In [None]:
# Training-set metrics for the filtered model (macro-averaged precision, recall and F1).
print("accuracy:", naive_bayes_filtered.score(X_train_bow_filtered, y_train))
print("precision:", precision_score(y_train, y_train_naive_predict, average='macro'))
print("recall:", recall_score(y_train, y_train_naive_predict, average='macro'))
print("f1:", f1_score(y_train, y_train_naive_predict, average='macro'))

In [None]:
# Test-set metrics for the filtered model (macro-averaged precision, recall and F1).
print("accuracy:", naive_bayes_filtered.score(X_test_bow_filtrado, y_test))
print("precision:", precision_score(y_test, y_test_naive_predict, average='macro'))
print("recall:", recall_score(y_test, y_test_naive_predict, average='macro'))
print("f1:", f1_score(y_test, y_test_naive_predict, average='macro'))

### Conclusiones

## [nombre algoritmo]

Desarrollado por:

[descripción]

### Conclusiones

## Selección del modelo final