## Preprocesamiento de textos
Juan Manuel Silva y Cristian Sarmiento

In [1]:
import pandas as pd
import unicodedata
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

## Lectura del archivo
with open('alice_in_wonderland.txt', 'r', encoding='utf-8') as f:
    contents = f.read()
    
## primera visualización de los datos:
print(contents[835:1400])

[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     certificate chain too long (_ssl.c:1006)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     certificate chain too long (_ssl.c:1006)>


Alice’s Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents

 CHAPTER I.     Down the Rabbit-Hole
 CHAPTER II.    The Pool of Tears
 CHAPTER III.   A Caucus-Race and a Long Tale
 CHAPTER IV.    The Rabbit Sends in a Little Bill
 CHAPTER V.     Advice from a Caterpillar
 CHAPTER VI.    Pig and Pepper
 CHAPTER VII.   A Mad Tea-Party
 CHAPTER VIII.  The Queen’s Croquet-Ground
 CHAPTER IX.    The Mock Turtle’s Story
 CHAPTER X.     The Lobster Quadrille
 CHAPTER XI.    Who Stole the Tarts?
 CHAPTER XII.   Alice’s Evidence







[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     certificate chain too long (_ssl.c:1006)>


## 1. Preprocesamiento básico

### 1.1 Eliminación de mayúsculas

In [2]:
contents = contents.lower()
print(contents[:100])

the project gutenberg ebook of alice's adventures in wonderland
    
this ebook is for the use of a


### 1.2 Tokenización

In [3]:
## Se extraen las palabras de los contenidos usando tokenización de 
tokenizer = nltk.tokenize.RegexpTokenizer('\w+')
tokens = tokenizer.tokenize(contents)
print(tokens[:10])

['the', 'project', 'gutenberg', 'ebook', 'of', 'alice', 's', 'adventures', 'in', 'wonderland']


In [4]:
print('Total palabras: ', len(tokens))
print('Total unicas: ', len(set(tokens)))

Total palabras:  30591
Total unicas:  3143


### 1.3 Stopwords

In [5]:
stop_words = nltk.corpus.stopwords.words('english')

print(stop_words[:10])

words_filtered = [word for word in tokens if word not in stop_words]
print(words_filtered[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
['project', 'gutenberg', 'ebook', 'alice', 'adventures', 'wonderland', 'ebook', 'use', 'anyone', 'anywhere']


In [6]:
print('Total palabras quitando stopwords: ', len(words_filtered))
print('Total unicas quitando stopwords: ', len(set(words_filtered)))

Total palabras quitando stopwords:  14298
Total unicas quitando stopwords:  2998


### 1.4 Eliminación de abreviaciones y acentos

In [7]:
text = 'Eliminación de acentos'
unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

'Eliminacion de acentos'

In [8]:
contents = unicodedata.normalize('NFKD', contents).encode('ascii', 'ignore').decode('utf-8')

### 1.5 Lematización

In [9]:
# Inicializar el lematizador
lemmatizer = WordNetLemmatizer()

# Matizar cada palabra
lemmatized_words = [lemmatizer.lemmatize(word) for word in words_filtered]

print(lemmatized_words[:15])

['project', 'gutenberg', 'ebook', 'alice', 'adventure', 'wonderland', 'ebook', 'use', 'anyone', 'anywhere', 'united', 'state', 'part', 'world', 'cost']


In [10]:
print('Total palabras lematizadas: ', len(lemmatized_words))
print('Total unicas lematizadas: ', len(set(lemmatized_words)))

Total palabras lematizadas:  14298
Total unicas lematizadas:  2772


### 1.6 Steaming

In [11]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words_filtered]

print(stemmed_words[:15])

['project', 'gutenberg', 'ebook', 'alic', 'adventur', 'wonderland', 'ebook', 'use', 'anyon', 'anywher', 'unit', 'state', 'part', 'world', 'cost']


In [12]:
print('Total palabras steam: ', len(stemmed_words))
print('Total unicas steam: ', len(set(stemmed_words)))

Total palabras steam:  14298
Total unicas steam:  2278


## 2. Técnicas de representación

### 2.1 Bag of words

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(lemmatized_words)

bow_array = X.toarray()
feature_names = vectorizer.get_feature_names_out()

# Resultado
print("Palabras:", feature_names[:100])
print("Bag of Words:\n", bow_array[:10])

Palabras: ['000' '11' '1500' '1887' '20' '2001' '2008' '2024' '27' '30' '50' '501'
 '596' '60' '6221541' '64' '801' '809' '84116' '90' '_alice' '_all'
 '_all_' '_and' '_are_' '_at' '_before' '_beg_' '_began_' '_best_' '_can_'
 '_could' '_could_' '_curtseying_' '_don' '_ever_' '_everybody_' '_fit_'
 '_hated_' '_have_' '_he_' '_her_' '_here_' '_him_' '_his_' '_how' '_i'
 '_i_' '_in_' '_inside_' '_is_' '_it' '_it_' '_less_' '_little_' '_me_'
 '_mine_' '_more_' '_must_' '_myself_' '_never_' '_no_' '_not' '_not_'
 '_one_' '_ours_' '_outside_' '_please_' '_plenty_' '_poison_'
 '_precious_' '_proves_' '_quite_' '_red_' '_said' '_said_' '_she' '_she_'
 '_sit_' '_some_' '_somebody_' '_something_' '_somewhere_' '_speaker_'
 '_stolen' '_that' '_that_' '_their_' '_then_' '_there_' '_these' '_they'
 '_think_' '_this' '_this_' '_through_' '_tis' '_took' '_turtle'
 '_twinkle']
Bag of Words:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ..

### 2.2 Bigramas

In [14]:
from nltk import bigrams

bigram_list = list(bigrams(lemmatized_words))
bigram_list[:10]

[('project', 'gutenberg'),
 ('gutenberg', 'ebook'),
 ('ebook', 'alice'),
 ('alice', 'adventure'),
 ('adventure', 'wonderland'),
 ('wonderland', 'ebook'),
 ('ebook', 'use'),
 ('use', 'anyone'),
 ('anyone', 'anywhere'),
 ('anywhere', 'united')]

### 2.3 Term Frequency-Inverse Document Frequency

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(lemmatized_words)

# Mostrar la matriz TF-IDF
print("Matriz TF-IDF:\n", tfidf_matrix.toarray())

# Mostrar los términos
print("Términos:\n", vectorizer.get_feature_names_out())

Matriz TF-IDF:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Términos:
 ['000' '11' '1500' ... 'youth' 'zealand' 'zigzag']


## 3. Técnicas de representación adicionales

### 3.1. Word2Vec

In [16]:
from gensim.models import Word2Vec

## Generar las frases tokenizadas
sentences = [tokenizer.tokenize(sentence.strip()) for sentence in contents.split('\n') 
             if sentence.strip() != '']

# Entrenar el modelo Word2Vec
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Obtener el vector para una palabra específica (por ejemplo, "alice")
vector = model.wv['alice']

print("Vector para 'alice':\n", vector)

Vector para 'alice':
 [-0.41106218  0.5566622   0.07799479  0.07226732  0.17316268 -0.8077865
  0.07842701  0.969804   -0.61568147 -0.51299405  0.01843098 -0.6103687
 -0.39528337  0.31936285 -0.03298039 -0.30175847  0.20804974 -0.08221999
 -0.12310431 -0.96926713 -0.07051158  0.2594048   0.16562517 -0.45268953
 -0.32171124  0.08117118 -0.37572294 -0.53216845 -0.29094487  0.33375967
  0.73074555 -0.24172832  0.25742027 -0.61648834 -0.22345953  0.7510103
 -0.10204238 -0.25358525 -0.12527071 -0.8315402   0.29418647 -0.47284278
 -0.0396327   0.25796854  0.40965822  0.04216215 -0.24819313 -0.3133194
  0.01558769  0.24121435  0.3692049  -0.2222646   0.25920743 -0.0865428
 -0.5565296   0.246616    0.08230811 -0.08938461 -0.26653388  0.04362943
  0.01691168  0.03610986  0.28622842 -0.0209935  -0.36599684  0.37502635
  0.32099614  0.4826203  -0.89844006  0.6007489  -0.21620585  0.64274967
  0.6853746  -0.00388802  0.7212434   0.1303567  -0.267348    0.0743857
 -0.47566378  0.03929089 -0.3374517

### 3.1. Word2Vec

In [23]:
from glove import Corpus, Glove

# Crear un corpus
corpus = Corpus()
corpus.fit(sentences, window=10)

# Create and train the GloVe model
glove = Glove(no_components=100, learning_rate=0.05)
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)

## 4. Tabla de análisis

![Example Image](tabla_resumen.png)

Las técnicas de preprocesamiento, como la normalización, tokenización y eliminación de stop words, son esenciales para transformar el texto en un formato que los modelos de NLP puedan entender. Esto mejora la calidad de los datos y la precisión de los resultados obtenidos.

Métodos como Bag of Words (BoW) y bigramas son fáciles de implementar, pero ignoran el contexto y las relaciones semánticas profundas entre las palabras. Esto puede resultar en una pérdida significativa de información, lo que limita la efectividad del modelo en tareas más complejas.

Técnicas más avanzadas como Word2Vec y GloVe ofrecen representaciones densas que capturan relaciones semánticas y sintácticas, mejorando así la comprensión del lenguaje. Sin embargo, requieren grandes volúmenes de datos para entrenar y pueden ser computacionalmente intensivas, lo que plantea desafíos en su implementación.