# ElMo
Vamos a representar nuestro texto (El imperio final + El pozo de la ascension) con Embeddings de ELMO

**Este script debe ser ejecutado en google collab**

In [34]:
!pip install allennlp

from allennlp.modules.elmo import Elmo, batch_to_ids



In [35]:
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
options_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json"
weight_file = "https://allennlp.s3.amazonaws.com/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5"

# Creacion de embeddings


In [37]:
elmo = Elmo(options_file, weight_file, 1, dropout=0)

## Funciones de ayuda

In [38]:
from torch.nn.functional import cosine_similarity
import nltk.data

def average_embedding(sentence_embedding):
    non_zero_rows_mask = (sentence_embedding != 0).any(dim=1)
    non_zero_rows = sentence_embedding[non_zero_rows_mask]
    average = non_zero_rows.mean(dim=0)
    return average

def get_elmo_embeddings(sentences):
    character_ids = batch_to_ids(sentences)
    embeddings = elmo(character_ids)
    return embeddings['elmo_representations'][0]


nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Carga de tokens
Vamos a utilizar punkt tokenizer para dividir el texto en oraciones


In [39]:
import os
TFE_id = "17C9moA9qfHtse4eubcKc_haHIrWjjd2d"
#TWoA_id = "187p_orA1lJUOTIBiCAdT70bQQ0LZtR47"

TFE_destination = "/content/TheFinalEmpire.txt"
#TWoA_destination = "/content/TheWellOfAscension.txt"

file_ids = [TFE_id]
file_destinations = [TFE_destination]

for file_id, file_destination in zip(file_ids, file_destinations):
    os.system(f"gdown --id {file_id} -O {file_destination}")

In [40]:
from nltk import sent_tokenize, wordpunct_tokenize

tokens = []
page_text = []

with open('../content/TheFinalEmpire.txt', 'r', encoding='utf-8') as f:
    page_text.extend(f.readlines())

with open('../content/TheWellOfAscension.txt', 'r', encoding='utf-8') as f:
    page_text.extend(f.readlines())

# Unir las líneas en una sola cadena sin caracteres '\n'
page_text = ''.join(page_text)
## -- Tokenizado NLTK --##
pageTokens = sent_tokenize(page_text)

for sentence in pageTokens:
  tokens.append(wordpunct_tokenize(sentence))



In [41]:
tokens[:3]

[['ASH', 'FELL', 'FROM', 'THE', 'SKY', '.'],
 ['Lord',
  'Tresting',
  'frowned',
  ',',
  'glancing',
  'up',
  'at',
  'the',
  'ruddy',
  'midday',
  'sky',
  'as',
  'his',
  'servants',
  'scuttled',
  'forward',
  ',',
  'opening',
  'a',
  'parasol',
  'over',
  'Tresting',
  'and',
  'his',
  'distinguished',
  'guest',
  '.'],
 ['Ashfalls',
  'weren',
  '’',
  't',
  'that',
  'uncommon',
  'in',
  'the',
  'Final',
  'Empire',
  ',',
  'but',
  'Tresting',
  'had',
  'hoped',
  'to',
  'avoid',
  'getting',
  'soot',
  'stains',
  'on',
  'his',
  'fine',
  'new',
  'suit',
  'coat',
  'and',
  'red',
  'vest',
  ',',
  'which',
  'had',
  'just',
  'arrived',
  'via',
  'canal',
  'boat',
  'from',
  'Luthadel',
  'itself',
  '.']]

In [42]:
len(tokens)

49321

In [43]:
tokens = tokens[:int(0.03 * len(tokens))]

In [44]:
len(tokens)

1479

In [45]:
tokens = [sublista for sublista in tokens if len(sublista) <= 20 and len(sublista) >= 17]

In [46]:
len(tokens)

172

## Hacemos el embedding

In [47]:
embeddings = get_elmo_embeddings(tokens)

print(embeddings.shape)
print(embeddings[0].shape)

torch.Size([172, 20, 1024])
torch.Size([20, 1024])


## Embedings más similares

In [48]:
from itertools import combinations
import numpy as np

def get_top_similar_pairs(embeddings, tokens):
    # Calcular todas las posibles combinaciones de pares de embeddings
    pairs = list(combinations(range(len(embeddings)), 2))

    # Calcular la distancia de coseno para cada par de embeddings
    similarities = []
    for pair in pairs:
        if pair[0] == pair[1]:
          continue
        else:
          # print(tokens[pair[0]])
          # print(tokens[pair[1]])
          emb1 = (embeddings[pair[0]][:20])
          emb2 = (embeddings[pair[1]][:20])
          similarity  = cosine_similarity(average_embedding(emb1), average_embedding(emb2), dim=0).item()
          similarities.append((pair, similarity, tokens[pair[0]],tokens[pair[1]], emb1, emb2 ))

    # Ordenar las distancias de coseno y obtener las 10 principales
    top_similar_pairs = sorted(similarities, key=lambda x: x[1])[:10]

    return top_similar_pairs


In [49]:
top_similar_pairs = get_top_similar_pairs(embeddings, tokens)
print("Embedings más similares")
top_similar_pairs
i = 0
for  pair, distance, w1, w2, emb1, emb2 in (top_similar_pairs):
    i+=1
    print(f"{i}. - \t[Frase 1]:\n\t{' '.join(w1)}\n\t[Frase 2]:\n\t{' '.join(w2)}\n\t[Distancia]: {distance}\n--------------------------------------------------------------------")
    # print(f"{emb1}")
    # print(f"{emb2}")
    print("#######################################")

Embedings más similares
1. - 	[Frase 1]:
	Skaa cleaning crews were already back at work on the streets below , brushing up the dark ash .
	[Frase 2]:
	He ’ s a good Smoker , but he ’ s not a good enough man ."
	[Distancia]: 0.2278267741203308
--------------------------------------------------------------------
#######################################
2. - 	[Frase 1]:
	Intricate , with rows of spearlike spires or deep archways , these were the homes of the high nobility .
	[Frase 2]:
	He ’ s a good Smoker , but he ’ s not a good enough man ."
	[Distancia]: 0.23699983954429626
--------------------------------------------------------------------
#######################################
3. - 	[Frase 1]:
	Kelsier watched the sun , his eyes following the giant red disk as it crept toward the western horizon .
	[Frase 2]:
	He ’ s a good Smoker , but he ’ s not a good enough man ."
	[Distancia]: 0.24141870439052582
--------------------------------------------------------------------
############