<a href="https://colab.research.google.com/github/antonellagambarte/procesamiento_del_lenguaje_I/blob/main/Desafio2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
import os
import platform
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tensorflow.keras.preprocessing.text import text_to_word_sequence

import multiprocessing
try:
  from gensim.models import Word2Vec
except:
  !pip install gensim
  from gensim.models import Word2Vec

from gensim.models.callbacks import CallbackAny2Vec

from sklearn.decomposition import IncrementalPCA
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

In [2]:
# Descarga de carpeta de dataset
if os.access('./songs_dataset', os.F_OK) is False:
    if os.access('songs_dataset.zip', os.F_OK) is False:
        if platform.system() == 'Windows':
            !curl https://raw.githubusercontent.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/main/datasets/songs_dataset.zip -o songs_dataset.zip
        else:
            !wget songs_dataset.zip https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/datasets/songs_dataset.zip
    !unzip -q songs_dataset.zip
else:
    print("El dataset ya se encuentra descargado")

--2025-11-11 00:35:18--  http://songs_dataset.zip/
Resolving songs_dataset.zip (songs_dataset.zip)... failed: Name or service not known.
wget: unable to resolve host address ‘songs_dataset.zip’
--2025-11-11 00:35:18--  https://github.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/raw/main/datasets/songs_dataset.zip
Resolving github.com (github.com)... 140.82.116.3
Connecting to github.com (github.com)|140.82.116.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/main/datasets/songs_dataset.zip [following]
--2025-11-11 00:35:18--  https://raw.githubusercontent.com/FIUBA-Posgrado-Inteligencia-Artificial/procesamiento_lenguaje_natural/main/datasets/songs_dataset.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.

In [3]:
os.listdir("./songs_dataset/")

['kanye-west.txt',
 'blink-182.txt',
 'ludacris.txt',
 'lorde.txt',
 'eminem.txt',
 'bruce-springsteen.txt',
 'leonard-cohen.txt',
 'nirvana.txt',
 'cake.txt',
 'beatles.txt',
 'notorious-big.txt',
 'missy-elliott.txt',
 'nursery_rhymes.txt',
 'nicki-minaj.txt',
 'lady-gaga.txt',
 'adele.txt',
 'disney.txt',
 'lil-wayne.txt',
 'r-kelly.txt',
 'janisjoplin.txt',
 'kanye.txt',
 'dj-khaled.txt',
 'dickinson.txt',
 'rihanna.txt',
 'bjork.txt',
 'Lil_Wayne.txt',
 'al-green.txt',
 'paul-simon.txt',
 'bob-marley.txt',
 'bieber.txt',
 'jimi-hendrix.txt',
 'drake.txt',
 'michael-jackson.txt',
 'johnny-cash.txt',
 'patti-smith.txt',
 'dr-seuss.txt',
 'Kanye_West.txt',
 'britney-spears.txt',
 'bruno-mars.txt',
 'bob-dylan.txt',
 'radiohead.txt',
 'nickelback.txt',
 'joni-mitchell.txt',
 'alicia-keys.txt',
 'amy-winehouse.txt',
 'notorious_big.txt',
 'dolly-parton.txt',
 'lin-manuel-miranda.txt',
 'prince.txt']

In [4]:
df = pd.read_csv('songs_dataset/michael-jackson.txt', sep='/n', header=None)
df.head()

  df = pd.read_csv('songs_dataset/michael-jackson.txt', sep='/n', header=None)


Unnamed: 0,0
0,[Spoken Intro:]
1,You ever want something
2,that you know you shouldn't have
3,"The more you know you shouldn't have it,"
4,The more you want it


In [5]:
print("Cantidad de documentos:", df.shape[0])

Cantidad de documentos: 9484


## **Creación de vectores**

Preprocesamiento:

In [6]:
sentence_tokens = []
# Recorrer todas las filas y transformar las oraciones
# en una secuencia de palabras (esto podría realizarse con NLTK o spaCy también)
for _, row in df[:None].iterrows():
    sentence_tokens.append(text_to_word_sequence(row[0]))


sentence_tokens[:2]

[['spoken', 'intro'], ['you', 'ever', 'want', 'something']]

In [7]:
# Durante el entrenamiento gensim por defecto no informa el "loss" en cada época
# Sobrecargamos el callback para poder tener esta información
class callback(CallbackAny2Vec):
    """
    Callback to print loss after each epoch
    """
    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss- self.loss_previous_step))
        self.epoch += 1
        self.loss_previous_step = loss

In [8]:
# Crearmos el modelo generador de vectores
# En este caso utilizaremos la estructura modelo Skipgram
w2v_model = Word2Vec(min_count=5,    # frecuencia mínima de palabra para incluirla en el vocabulario
                     window=2,       # cant de palabras antes y desp de la predicha
                     vector_size=300,       # dimensionalidad de los vectores
                     negative=20,    # cantidad de negative samples... 0 es no se usa
                     workers=1,      # si tienen más cores pueden cambiar este valor
                     sg=1)           # modelo 0:CBOW  1:skipgram

In [9]:
# Obtener el vocabulario con los tokens
w2v_model.build_vocab(sentence_tokens)

In [10]:
# Cantidad de filas/docs encontradas en el corpus
print("Cantidad de docs en el corpus:", w2v_model.corpus_count)

Cantidad de docs en el corpus: 9484


In [11]:
# Cantidad de words encontradas en el corpus
print("Cantidad de words distintas en el corpus:", len(w2v_model.wv.index_to_key))

Cantidad de words distintas en el corpus: 1000


In [12]:
# Entrenamos el modelo generador de vectores
# Utilizamos nuestro callback
w2v_model.train(sentence_tokens,
                 total_examples=w2v_model.corpus_count,
                 epochs=20,
                 compute_loss = True,
                 callbacks=[callback()]
                 )

Loss after epoch 0: 332423.65625
Loss after epoch 1: 227544.78125
Loss after epoch 2: 227830.0625
Loss after epoch 3: 222362.0
Loss after epoch 4: 193124.875
Loss after epoch 5: 183437.75
Loss after epoch 6: 176325.125
Loss after epoch 7: 169745.875
Loss after epoch 8: 166790.125
Loss after epoch 9: 160752.5
Loss after epoch 10: 146619.75
Loss after epoch 11: 141943.75
Loss after epoch 12: 139831.5
Loss after epoch 13: 138176.5
Loss after epoch 14: 137774.75
Loss after epoch 15: 135772.25
Loss after epoch 16: 136182.25
Loss after epoch 17: 134676.0
Loss after epoch 18: 135812.75
Loss after epoch 19: 134378.75


(626341, 993780)

## **Términos de interés**

In [14]:
# Palabras más similares
print("Similares a 'love':")
w2v_model.wv.most_similar(positive=["love"], topn=10)

Similares a 'love':


[('rarest', 0.6610762476921082),
 ('summer', 0.6503308415412903),
 ('perfect', 0.6333413124084473),
 ("fallin'", 0.6228702068328857),
 ('farewell', 0.6222076416015625),
 ('power', 0.616187572479248),
 ("makin'", 0.613447904586792),
 ('caress', 0.6112262606620789),
 ('bear', 0.6075263023376465),
 ('strong', 0.6069183945655823)]

In [15]:
print("\nSimilares a 'dance':")
w2v_model.wv.most_similar(positive=["dance"], topn=10)



Similares a 'dance':


[('floor', 0.8131090402603149),
 ('across', 0.8122112154960632),
 ('carpet', 0.7644079327583313),
 ('boogie', 0.7616882920265198),
 ('earth', 0.7575291395187378),
 ('screen', 0.7472867965698242),
 ('greatest', 0.7444616556167603),
 ('round', 0.7375797033309937),
 ('bloodstains', 0.7356144189834595),
 ("speedin'", 0.733298122882843)]

In [16]:
print("\nSimilares a 'baby':")
w2v_model.wv.most_similar(positive=["baby"], topn=10)



Similares a 'baby':


[('loves', 0.6341243982315063),
 ("cryin'", 0.6338416337966919),
 ('caress', 0.6260278224945068),
 ('regret', 0.618771493434906),
 ('babe', 0.6183763146400452),
 ('maria', 0.6020016074180603),
 ('honey', 0.5938025116920471),
 ('factual', 0.5910568833351135),
 ("beggin'", 0.5898691415786743),
 ('laughing', 0.5896839499473572)]

In [17]:
# Palabras menos relacionadas
print("\nMenos similares a 'love':")
w2v_model.wv.most_similar(negative=["love"], topn=10)



Menos similares a 'love':


[('uuh', 0.08635728806257248),
 ('refrain', 0.06611773371696472),
 ('siedah', 0.039845868945121765),
 ('cheater', 0.007223140448331833),
 ('stevie', -0.02029196172952652),
 ('pitbull', -0.022470083087682724),
 ('out', -0.054199814796447754),
 ('got', -0.11881646513938904),
 ("doin'", -0.1278565526008606),
 ('lib', -0.13371868431568146)]

## **Reducción de dimensionalidad**

In [20]:
def reduce_dimensions(model, num_dimensions = 2 ):

    vectors = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)

    tsne = TSNE(n_components=num_dimensions, random_state=0)
    vectors = tsne.fit_transform(vectors)

    return vectors, labels

vecs, labels = reduce_dimensions(w2v_model)

In [21]:
MAX_WORDS=200
fig = px.scatter(x=vecs[:MAX_WORDS,0], y=vecs[:MAX_WORDS,1], text=labels[:MAX_WORDS])
fig.show(renderer="colab") # esto para plotly en colab