# 1. Importación de librerías

In [2]:
# Librerías para manejo de datos
import pandas as pd
pd.set_option('display.max_columns', 25) # Número máximo de columnas a mostrar
pd.set_option('display.max_rows', 50) # Numero máximo de filas a mostar
import numpy as np
np.random.seed(3301)
import pandas as pd
# Para preparar los datos
from sklearn.preprocessing import LabelEncoder
# Para crear el arbol de decisión 
from sklearn.tree import DecisionTreeClassifier 
# Para usar KNN como clasificador
from sklearn.neighbors import KNeighborsClassifier
# Para realizar la separación del conjunto de aprendizaje en entrenamiento y test.
from sklearn.model_selection import train_test_split
# Para evaluar el modelo
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score, accuracy_score, ConfusionMatrixDisplay
# Para búsqueda de hiperparámetros
from sklearn.model_selection import GridSearchCV
# Para la validación cruzada
from sklearn.model_selection import KFold 
#Librerías para la visualización
import matplotlib.pyplot as plt
# Seaborn
import seaborn as sns 
from sklearn import tree

# 2. Limpieza y Perfilamiento de Datos

## 2.1 Importación de librerías

In [None]:
# Librería Natural Language Toolkit, usada para trabajar con textos.

import nltk
nltk.download('stopwords')

In [None]:
#El lemmatizer de NLTK NO funciona en español, por lo que se usará el de Stanza.

import stanza
stanza.download('es')

In [None]:
# Build the Spanish Stanza pipeline once; the requested processors are
# tokenize, mwt, pos and lemma (the lemma processor is what the notebook uses later).
nlp = stanza.Pipeline(lang='es', processors='tokenize,mwt,pos,lemma')

In [None]:
# Instalación de librerias
import sys
import re, string, unicodedata

from nltk.tokenize import WordPunctTokenizer 
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.base import BaseEstimator, ClassifierMixin

## 2.2 Lectura de los datos

In [None]:
# Read the training reviews from a comma-separated, UTF-8 encoded CSV file.
data=pd.read_csv('train_reviews.csv', sep=',', encoding = 'utf-8')
# NOTE: this binds a second name to the same DataFrame object; it does not copy the data.
data_train=data

In [None]:
# Show the loaded DataFrame as the cell output.
data_train

In [None]:
# Print a summary of the loaded DataFrame (columns, dtypes, non-null counts).
data_train.info()

## 2.3 Entendimiento de los datos

In [None]:
from scipy import stats as st

# Work on a copy so the frame read from disk is left untouched.
textos = data_train.copy()
# 'Conteo' holds the number of characters of each review.
textos['Conteo'] = list(map(len, textos['Review']))

#Por ahora: La moda no da información relevante
"""def moda(textos):
    for i in textos['Review']: 
        dict = {}
        for x in i.split(' '): 
            print(i)
        
            if x in dict.keys():
                dict[x] += 1
            else:
                dict[x] = 1

        max_key = max(dict, key=dict.get)
        print(max_key)

moda(textos)
"""
#textos['Moda'] =
# 'Max' holds the length of the longest space-separated chunk of each review.
textos['Max'] = [max(map(len, review.split(' '))) for review in textos['Review']]
# 'Min' holds the length of the shortest space-separated chunk of each review.
textos['Min'] = [min(map(len, review.split(' '))) for review in textos['Review']]

def frecuenciaPalabras(texto):
    """Count how often each space-separated token occurs in a collection of texts.

    Args:
        texto: iterable of strings (for example a pandas Series of reviews).

    Returns:
        dict mapping each token produced by ``str.split(' ')`` to its total
        number of occurrences, with keys in first-seen order.
    """
    # The local name differs from the function name so the function is not shadowed.
    frecuencias = {}
    for documento in texto:
        # split(' ') is kept as in the original: consecutive spaces yield '' tokens.
        for palabra in documento.split(' '):
            frecuencias[palabra] = frecuencias.get(palabra, 0) + 1
    # The dictionary is returned without being printed: dumping the whole
    # vocabulary floods the cell output and adds nothing the caller cannot display.
    return frecuencias

In [None]:
# Word -> count over the raw reviews.
dictFrec = frecuenciaPalabras(textos['Review'])

# One row per distinct word, with its count in 'frecuencia'.
df_Frecuencias = pd.DataFrame.from_dict(dictFrec, orient='index', columns=['frecuencia'])

# 'palabra' holds the word with accents/non-ASCII characters stripped
# (NFKD decomposition followed by an ASCII encode that ignores what is left).
df_Frecuencias['palabra'] = [
    unicodedata.normalize('NFKD', palabra).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    for palabra in df_Frecuencias.index
]
# The words now live in their own column, so the index becomes a plain range.
df_Frecuencias.reset_index(drop=True, inplace=True)

In [None]:
# Thirty rows with the highest counts, highest first (sort_values already returns a new frame).
sortedFirst = df_Frecuencias.sort_values(by='frecuencia', ascending=False).iloc[:30]

fig = plt.figure(figsize=(10, 6))
plt.barh(y=sortedFirst['palabra'], width=sortedFirst['frecuencia'], color='green')
plt.title('Distribución de frecuencia de las palabras')
plt.ylabel('Palabra')
plt.xlabel('Frecuencia')
plt.show()

In [None]:
# Last thirty rows of the descending sort, i.e. the lowest counts.
sortedLast = df_Frecuencias.sort_values(by='frecuencia', ascending=False).iloc[-30:]

fig2 = plt.figure(figsize=(10, 6))
plt.barh(y=sortedLast['palabra'], width=sortedLast['frecuencia'], color='green')
plt.title('Distribución de frecuencia de las palabras')
plt.ylabel('Palabra')
plt.xlabel('Frecuencia')
plt.show()

In [None]:
# Show the working frame with the added 'Conteo', 'Max' and 'Min' columns.
textos

In [None]:
"""import ydata_profiling
from ydata_profiling import ProfileReport
ProfileReport(textos)"""

## 2.4 Limpieza de los datos

## 2.4.1 Duplicados

In [None]:
# Count the rows that belong to a group of fully identical rows
# (keep=False flags every member of the group, not only the repeats).
textos.duplicated(keep = False).sum()

In [None]:
# Drop repeated rows in place, keeping the first occurrence of each.
textos.drop_duplicates(keep='first', inplace=True)
# Re-count to confirm no duplicated rows remain.
textos.duplicated(keep = False).sum()

In [None]:
# Show one review before and after the clean-up.
# NOTE(review): this is a label-based lookup; it raises KeyError if the row
# labelled 1201 was removed by the duplicate drop — confirm the label exists.
print(textos['Review'][1201])
# Remove every number in a single pass. The pattern matches an integer
# optionally followed by ',' or '.' separated digit groups (e.g. 12, 3,5, 3.5, 1.000,50).
# The previous three-step version stripped plain digits before looking for
# decimals, so the decimal pattern could never match and a stray '.' was
# left behind; its '.' was also unescaped and matched any character.
textos['Review'] = textos['Review'].replace(r'\d+(?:[.,]\d+)*', '', regex=True)
print(textos['Review'][1201])

In [None]:
# Spanish stop-word list from the NLTK corpus; remove_stopwords() reads this module-level name.
spanish_stopwords = stopwords.words('spanish')
# Printed once for inspection.
print(spanish_stopwords)

def remove_non_ascii(words):
    """Strip non-ASCII characters from every token of a tokenized text.

    Each token is NFKD-normalized and then encoded to ASCII ignoring whatever
    cannot be represented, so accented letters lose their accent.
    ``None`` entries are skipped.

    Args:
        words: iterable of tokens (strings or ``None``).

    Returns:
        New list with the ASCII-only version of every non-``None`` token.
    """
    return [
        unicodedata.normalize('NFKD', token).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        for token in words
        if token is not None
    ]

def to_lowercase(words):
    """Convert every token of a tokenized text to lowercase.

    ``None`` entries are skipped, matching remove_non_ascii() and
    remove_punctuation(). This function runs first in preprocessing(), so
    without the guard a ``None`` token raised AttributeError before the
    other helpers could discard it.

    Args:
        words: iterable of tokens (strings or ``None``).

    Returns:
        New list with the lowercase version of every non-``None`` token.
    """
    return [word.lower() for word in words if word is not None]
    

def remove_punctuation(words):
    """Remove punctuation from every token of a tokenized text.

    Every character that is neither a word character nor whitespace is
    deleted. ``None`` entries and tokens that end up empty are dropped.

    Args:
        words: iterable of tokens (strings or ``None``).

    Returns:
        New list with the cleaned, non-empty tokens.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words if token is not None)
    return [token for token in stripped if token != '']

#def replace_numbers(words):
#    """Replace all interger occurrences in list of tokenized words with textual representation"""
#    p = inflect.engine()
#    print(words)
#    new_words = []
#    for word in words:
#        if word.isdigit():
#            new_word = p.number_to_words(word)
#            new_words.append(new_word)
#            print("if " + new_word)
#        else:
#            new_words.append(word)
#    return new_words

def remove_stopwords(words):
    """Drop the tokens that appear in the module-level ``spanish_stopwords`` list.

    Args:
        words: iterable of tokens.

    Returns:
        New list with the tokens that are not stop words, in their original order.
    """
    return [token for token in words if token not in spanish_stopwords]

def preprocessing(words):
    """Clean a tokenized text: lowercase, strip punctuation, drop stop words.

    Number replacement and non-ASCII stripping are not part of the pipeline.

    Args:
        words: list of tokens.

    Returns:
        New list of cleaned tokens.
    """
    en_minusculas = to_lowercase(words)
    sin_puntuacion = remove_punctuation(en_minusculas)
    return remove_stopwords(sin_puntuacion)

## 2.4.2 Tokenización

In [None]:
# Tokenize every review with a single WordPunctTokenizer instance.
textos['tokens'] = textos['Review'].map(WordPunctTokenizer().tokenize)

# Inspect the tokens of the row at position 1201.
textos.iloc[1201]['tokens']

In [None]:
# Display the 'tokens' column without missing entries.
# The result is not assigned, so `textos` itself is left unchanged.
textos['tokens'].dropna()

## 2.4.3 Eliminación de ruido

In [None]:
textos['tokens']=textos['tokens'].apply(preprocessing)  # Run the noise-removal pipeline on every token list

textos.head()

In [None]:
# Join each token list back into one space-separated string.
textos['tokens'] = textos['tokens'].apply(lambda x: ' '.join(map(str, x)))
textos.head()

## 2.4.4 Normalización

In [None]:
"""stemmer = SnowballStemmer('spanish')

textos['tokens'] =  [ [stemmer.stem(word) for word in tokens] for tokens in textos['tokens']]
i = 1201
print(textos['tokens'][i])
print(textos['Review'][i])"""

In [None]:
def lemmatizer(review):
    """Lemmatize one review with the module-level Stanza pipeline ``nlp``.

    Args:
        review: text of a single review.

    Returns:
        Flat list with the lemma of every word, sentence after sentence.
    """
    # The review is no longer printed: one print per row floods the output.
    doc = nlp(review)
    return [
        word.lemma
        for sent in doc.sentences
        for word in sent.words
        # NOTE(review): the downstream helpers guard against None tokens, which
        # suggests a word can come back without a lemma; such words are skipped
        # here so callers always receive strings — confirm against Stanza's output.
        if word.lemma is not None
    ]

# Lemmatize review by review. Passing the whole array of reviews to
# lemmatizer() handed the pipeline something that is not a single text and
# produced one flat list for the entire column, which cannot be assigned
# row-wise; .apply() yields one lemma list per row instead.
textos['tokens'] = textos['tokens'].apply(lemmatizer)

In [None]:
"""def oneSentence(list):
    complete = []
    for sent in list:
        for word in sent:
            complete.append(word)

    return complete

textos['tokens'] = [oneSentence(i) for i in textos['tokens']]"""

In [None]:
# Save the lemmatized frame to CSV so the lemmatization step does not have to be re-run.
# NOTE: to_csv is called without index=False, so the index is written as an extra first column.
textos.to_csv('lemaSinStopWords.csv')

In [None]:
# Read back the previously saved lemma file.
from ast import literal_eval
textosLemas = pd.read_csv('lemaSinStopWords.csv', sep=',', encoding = 'utf-8')
# CSV stores every cell as text; literal_eval parses each 'tokens' cell back
# into a Python object (a list of lemmas is expected here).
textosLemas['tokens'] = textosLemas['tokens'].apply(literal_eval)

In [None]:
textosLemas['tokens']=textosLemas['tokens'].apply(preprocessing)  # Run the noise-removal pipeline on the lemma lists

## 2.4.5 Selección de campos

In [None]:
# Preview the first rows of the lemmatized frame.
textosLemas.head()

In [None]:
# Join each lemma list into one space-separated string, the input format the vectorizers receive later.
textosLemas['tokens'] = textosLemas['tokens'].apply(lambda x: ' '.join(map(str, x)))
textosLemas.head()

## 2.5 División en conjuntos train, test y validación

In [None]:
# First split: 45% of the rows go to the test set, the remaining 55% stay in x_train.
x_train, x_test = train_test_split(textosLemas, test_size=0.45, random_state=1) 
print(x_train.shape, x_test.shape)
# Second split: 25% of that remainder becomes the validation set
# (about 13.75% of all rows), leaving about 41.25% for training.
x_train, x_val = train_test_split(x_train, test_size=0.25, random_state=1)

print( x_test.shape,x_train.shape, x_val.shape)

## 2.6 Segundo profile tras aplicación de preprocessing

In [None]:
"""import ydata_profiling
from ydata_profiling import ProfileReport
ProfileReport(textosLemas)"""

In [None]:
# Word -> count over the cleaned, lemmatized texts.
dictFrecuenciasTokenizado = frecuenciaPalabras(textosLemas['tokens'])

# One row per distinct word, with its count in 'frecuencia'.
df_Frecuencias = pd.DataFrame.from_dict(dictFrecuenciasTokenizado, orient='index', columns=['frecuencia'])

# 'palabra' holds the word with accents/non-ASCII characters stripped
# (NFKD decomposition followed by an ASCII encode that ignores what is left).
df_Frecuencias['palabra'] = [
    unicodedata.normalize('NFKD', palabra).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    for palabra in df_Frecuencias.index
]
# The words now live in their own column, so the index becomes a plain range.
df_Frecuencias.reset_index(drop=True, inplace=True)

In [None]:
# Thirty rows with the highest counts after preprocessing, highest first.
sortedFirst = df_Frecuencias.sort_values(by='frecuencia', ascending=False).iloc[:30]

fig = plt.figure(figsize=(10, 6))
plt.barh(y=sortedFirst['palabra'], width=sortedFirst['frecuencia'], color='green')
plt.title('Distribución de frecuencia de las palabras')
plt.ylabel('Palabra')
plt.xlabel('Frecuencia')
plt.show()

In [None]:
# Last thirty rows of the descending sort, i.e. the lowest counts after preprocessing.
sortedLast = df_Frecuencias.sort_values(by='frecuencia', ascending=False).iloc[-30:]

fig2 = plt.figure(figsize=(10, 6))
plt.barh(y=sortedLast['palabra'], width=sortedLast['frecuencia'], color='green')
plt.title('Distribución de frecuencia de las palabras')
plt.ylabel('Palabra')
plt.xlabel('Frecuencia')
plt.show()

In [None]:
# For each split, keep the 'tokens' text as the feature and 'Class' as the label.
# NOTE: x_train, x_val and x_test are rebound from DataFrames to Series here,
# so this cell cannot be re-run without re-running the split cell first.
x_train, y_train = x_train['tokens'],x_train['Class']
x_val, y_val = x_val['tokens'],x_val['Class']
x_test, y_test = x_test['tokens'],x_test['Class']

## 2.7 Embedding del texto

## 2.7.1 Count Vectorizer

In [None]:
# Learn the vocabulary on the training split only and reuse it for the other
# splits. Fitting a new vectorizer per split gave each matrix its own
# vocabulary, so the validation/test columns did not correspond to the
# features a model trained on x_train_countVectorizer was fitted on.
count = CountVectorizer()
x_train_countVectorizer = count.fit_transform(x_train)
print(x_train_countVectorizer.shape)

x_val_countVectorizer = count.transform(x_val)
print(x_val_countVectorizer.shape)

x_test_countVectorizer = count.transform(x_test)
print(x_test_countVectorizer.shape)

# Show one test row as a dense vector; only that row is densified instead of the whole matrix.
x_test_countVectorizer[3].toarray()[0]

## 2.7.2 TF-IDF

In [None]:
# Fit the TF-IDF vocabulary and IDF weights on the training split.
tfidf = TfidfVectorizer()
x_train_tfidfVectorizer = tfidf.fit_transform(x_train)
print(x_train_tfidfVectorizer.shape)
# The mid-cell `.toarray()[3]` was removed: it densified the whole matrix and
# discarded the result (only a cell's last expression is displayed).

# TF-IDF weights of the first training document, highest first.
df_train = pd.DataFrame(x_train_tfidfVectorizer[0].T.todense(),
    	index=tfidf.get_feature_names_out(), columns=["TF-IDF"])
df_train = df_train.sort_values('TF-IDF', ascending=False)
df_train.head(20)

In [None]:
# Vectorize the test split with a vocabulary and IDF weights learned from the
# TRAINING split. Fitting on x_test produced a different feature space from
# the one the classifier is trained on. The vectorizer is fitted on x_train
# again here so this cell does not depend on what `tfidf` was last fitted on;
# it must use the same settings as the vectorizer that built x_train_tfidfVectorizer.
tfidf = TfidfVectorizer()
tfidf.fit(x_train)
x_test_tfidfVectorizer = tfidf.transform(x_test)
# Print the shape of the matrix just built (the train shape was printed before).
print(x_test_tfidfVectorizer.shape)

# TF-IDF weights of the first test document, highest first.
df_test = pd.DataFrame(x_test_tfidfVectorizer[0].T.todense(),
    	index=tfidf.get_feature_names_out(), columns=["TF-IDF"])
df_test = df_test.sort_values('TF-IDF', ascending=False)
df_test.head(20)

In [None]:
# Vectorize the validation split with a vocabulary and IDF weights learned
# from the TRAINING split. Fitting on x_val produced a different feature
# space from the one the classifier is trained on. The vectorizer is fitted on
# x_train again here so this cell does not depend on what `tfidf` was last fitted on;
# it must use the same settings as the vectorizer that built x_train_tfidfVectorizer.
tfidf = TfidfVectorizer()
tfidf.fit(x_train)
x_val_tfidfVectorizer = tfidf.transform(x_val)
# Print the shape of the matrix just built (the train shape was printed before).
print(x_val_tfidfVectorizer.shape)

# TF-IDF weights of the first validation document, highest first.
df_val = pd.DataFrame(x_val_tfidfVectorizer[0].T.todense(),
    	index=tfidf.get_feature_names_out(), columns=["TF-IDF"])
df_val = df_val.sort_values('TF-IDF', ascending=False)
df_val.head(20)

# 3. Algoritmo KNN (K-Nearest Neighbors)

In [None]:
# One classifier per text representation. fit() returns the estimator itself,
# so fitting a single object twice left neigh_cV and neigh_tfi pointing at the
# same model, trained only on the TF-IDF features.
neigh_cV = KNeighborsClassifier(n_neighbors=3).fit(x_train_countVectorizer, y_train)
# `neigh` stays bound to the TF-IDF model, as it was before.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh_tfi = neigh.fit(x_train_tfidfVectorizer, y_train)

In [None]:
# Predict the validation split from its count-vector representation.
y_pred_cV = neigh_cV.predict(x_val_countVectorizer)

In [None]:
# Predict the validation split from its TF-IDF representation.
y_pred_tfi = neigh_tfi.predict(x_val_tfidfVectorizer)

In [None]:
# Confusion matrix for the count-vector model. y_pred_cV holds predictions for
# the validation split, so it is compared with y_val; y_test belongs to a
# different split with a different number of rows.
cm_cV = confusion_matrix(y_val, y_pred_cV)

In [None]:
# Confusion matrix for the TF-IDF model. y_pred_tfi holds predictions for the
# validation split, so it is compared with y_val; y_test belongs to a
# different split with a different number of rows.
cm_tfi = confusion_matrix(y_val, y_pred_tfi)

In [None]:
# Plot the confusion matrix of the count-vector model.

disp = ConfusionMatrixDisplay(confusion_matrix=cm_cV)
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Plot the confusion matrix of the TF-IDF model.
disp = ConfusionMatrixDisplay(confusion_matrix=cm_tfi)
disp.plot(cmap=plt.cm.Blues)
plt.show()

In [None]:
# Classification report for the count-vector model. The predictions were made
# on the validation split, so they are scored against y_val rather than y_test.
print(classification_report(y_val, y_pred_cV))

In [None]:
# Classification report for the TF-IDF model. The predictions were made on
# the validation split, so they are scored against y_val rather than y_test.
print(classification_report(y_val, y_pred_tfi))