In [46]:
import pandas as pd
import unicodedata
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

## Lectura del archivo
with open('alice_in_wonderland.txt', 'r', encoding='utf-8') as f:
    contents = f.read()
    
## primera visualización de los datos:
print(contents[835:1400])

Alice’s Adventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents

 CHAPTER I.     Down the Rabbit-Hole
 CHAPTER II.    The Pool of Tears
 CHAPTER III.   A Caucus-Race and a Long Tale
 CHAPTER IV.    The Rabbit Sends in a Little Bill
 CHAPTER V.     Advice from a Caterpillar
 CHAPTER VI.    Pig and Pepper
 CHAPTER VII.   A Mad Tea-Party
 CHAPTER VIII.  The Queen’s Croquet-Ground
 CHAPTER IX.    The Mock Turtle’s Story
 CHAPTER X.     The Lobster Quadrille
 CHAPTER XI.    Who Stole the Tarts?
 CHAPTER XII.   Alice’s Evidence







[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A0860770\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\A0860770\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 1. Preprocesamiento básico

### 1.1 Eliminación de mayúsculas

In [25]:
contents = contents.lower()
print(contents[:100])

the project gutenberg ebook of alice's adventures in wonderland
    
this ebook is for the use of a


### 1.2 Tokenización

In [26]:
## Se extraen las palabras de los contenidos usando tokenización de 
tokenizer = nltk.tokenize.RegexpTokenizer('\w+')
tokens = tokenizer.tokenize(contents)
print(tokens[:10])

['the', 'project', 'gutenberg', 'ebook', 'of', 'alice', 's', 'adventures', 'in', 'wonderland']


In [27]:
print('Total palabras: ', len(tokens))
print('Total unicas: ', len(set(tokens)))

Total palabras:  30591
Total unicas:  3143


### 1.3 Stopwords

In [28]:
stop_words = nltk.corpus.stopwords.words('english')

print(stop_words[:10])

words_filtered = [word for word in tokens if word not in stop_words]
print(words_filtered[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]
['project', 'gutenberg', 'ebook', 'alice', 'adventures', 'wonderland', 'ebook', 'use', 'anyone', 'anywhere']


In [31]:
print('Total palabras quitando stopwords: ', len(words_filtered))
print('Total unicas quitando stopwords: ', len(set(words_filtered)))

Total palabras quitando stopwords:  14298
Total unicas quitando stopwords:  2998


### 1.4 Eliminación de abreviaciones y acentos

In [34]:
text = 'Eliminación de acentos'
unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8')

'Eliminacion de acentos'

In [35]:
contents = unicodedata.normalize('NFKD', contents).encode('ascii', 'ignore').decode('utf-8')

### 1.5 Lematización

In [49]:
# Inicializar el lematizador
lemmatizer = WordNetLemmatizer()

# Matizar cada palabra
lemmatized_words = [lemmatizer.lemmatize(word) for word in words_filtered]

print(lemmatized_words[:15])

['project', 'gutenberg', 'ebook', 'alice', 'adventure', 'wonderland', 'ebook', 'use', 'anyone', 'anywhere', 'united', 'state', 'part', 'world', 'cost']


In [50]:
print('Total palabras lematizadas: ', len(lemmatized_words))
print('Total unicas lematizadas: ', len(set(lemmatized_words)))

Total palabras lematizadas:  14298
Total unicas lematizadas:  2772


### 1.6 Steaming

In [53]:
stemmer = PorterStemmer()
stemmed_words = [stemmer.stem(word) for word in words_filtered]

print(stemmed_words[:15])

['project', 'gutenberg', 'ebook', 'alic', 'adventur', 'wonderland', 'ebook', 'use', 'anyon', 'anywher', 'unit', 'state', 'part', 'world', 'cost']


In [54]:
print('Total palabras steam: ', len(stemmed_words))
print('Total unicas steam: ', len(set(stemmed_words)))

Total palabras steam:  14298
Total unicas steam:  2278


## 2. Técnicas de representación

### 2.1 Bag of words

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(lemmatized_words)

bow_array = X.toarray()
feature_names = vectorizer.get_feature_names_out()

# Resultado
print("Palabras:", feature_names[:100])
print("Bag of Words:\n", bow_array[:10])

Palabras: ['000' '11' '1500' '1887' '20' '2001' '2008' '2024' '27' '30' '50' '501'
 '596' '60' '6221541' '64' '801' '809' '84116' '90' '_alice' '_all'
 '_all_' '_and' '_are_' '_at' '_before' '_beg_' '_began_' '_best_' '_can_'
 '_could' '_could_' '_curtseying_' '_don' '_ever_' '_everybody_' '_fit_'
 '_hated_' '_have_' '_he_' '_her_' '_here_' '_him_' '_his_' '_how' '_i'
 '_i_' '_in_' '_inside_' '_is_' '_it' '_it_' '_less_' '_little_' '_me_'
 '_mine_' '_more_' '_must_' '_myself_' '_never_' '_no_' '_not' '_not_'
 '_one_' '_ours_' '_outside_' '_please_' '_plenty_' '_poison_'
 '_precious_' '_proves_' '_quite_' '_red_' '_said' '_said_' '_she' '_she_'
 '_sit_' '_some_' '_somebody_' '_something_' '_somewhere_' '_speaker_'
 '_stolen' '_that' '_that_' '_their_' '_then_' '_there_' '_these' '_they'
 '_think_' '_this' '_this_' '_through_' '_tis' '_took' '_turtle'
 '_twinkle']
Bag of Words:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ..

### 2.2 Bigramas

In [68]:
from nltk import bigrams

bigram_list = list(bigrams(lemmatized_words))
bigram_list[:10]

[('project', 'gutenberg'),
 ('gutenberg', 'ebook'),
 ('ebook', 'alice'),
 ('alice', 'adventure'),
 ('adventure', 'wonderland'),
 ('wonderland', 'ebook'),
 ('ebook', 'use'),
 ('use', 'anyone'),
 ('anyone', 'anywhere'),
 ('anywhere', 'united')]

### 2.3 Term Frequency-Inverse Document Frequency

In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(lemmatized_words)

# Mostrar la matriz TF-IDF
print("Matriz TF-IDF:\n", tfidf_matrix.toarray())

# Mostrar los términos
print("Términos:\n", vectorizer.get_feature_names_out())

Matriz TF-IDF:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Términos:
 ['000' '11' '1500' ... 'youth' 'zealand' 'zigzag']


## 3. Técnicas de representación adicionales

### 3.1. Word2Vec

In [78]:
from gensim.models import Word2Vec

# Entrenar el modelo Word2Vec
model = Word2Vec(lemmatized_words, vector_size=100, window=5, min_count=1, workers=4)

# Obtener el vector para una palabra específica (por ejemplo, "a")
vector = model.wv['a']

print("Vector para 'a':\n", vector)

Vector para 'a':
 [-0.06919146  0.1513098   0.09496016  0.00594849  0.03454124 -0.09025294
  0.17801334  0.33231354 -0.09633935 -0.22529984  0.12589228 -0.12489654
 -0.03427171 -0.04333304  0.01649248 -0.13542858  0.2535273   0.02516163
 -0.1801012  -0.306724    0.04267503 -0.04209262  0.29679888  0.03007597
 -0.13919196  0.05714276 -0.04823051  0.13802154 -0.09887528  0.10815766
  0.04286549 -0.14140335  0.04100087 -0.2178936  -0.05808378  0.05006877
  0.12832662 -0.05626721 -0.11570751 -0.00452293  0.05365253 -0.01522636
 -0.07815081  0.12274932  0.09837615 -0.04236872 -0.03447138 -0.06334791
  0.02812665  0.08939759  0.01165137 -0.08046526 -0.05688585 -0.18015818
 -0.11757441 -0.09710056  0.06164008 -0.00909479  0.02324123  0.05992176
 -0.01159922 -0.07553179  0.22043096 -0.01771142 -0.08230908  0.15010083
  0.0737887   0.2543724  -0.20494641 -0.07147556  0.13142613  0.16014366
  0.14667675  0.13136758  0.08621339  0.03798676  0.1356768   0.12125632
 -0.09168289 -0.12834999 -0.22136