In [1]:
import matplotlib.pyplot as plt 
import argparse
from torch.utils.data import DataLoader, Dataset 
from dataset import *

# 1. Data Loading

In [2]:
data = pd.read_csv("dataset/poems.csv")
data.head()

Unnamed: 0,author,content,title
0,Leopoldo Lugones,\n\nEn el parque confuso\nQue con lánguidas br...,LA MUERTE DE LA LUNA
1,Marilina Rébora,"\n\nPorque si tú no velas, vendré como ladrón;...",PORQUE SI TÚ NO VELAS
2,Antonio Colinas,"\n\nPequeña de mis sueños, por tu piel las pal...",POEMA DE LA BELLEZA CAUTIVA QUE PERDÍ
3,José María Hinojosa,\n\nLos dedos de la nieve\nrepiquetearon\nen e...,SENCILLEZ
4,Rubén Izaguirre Fiallos,"Naciste en Armenia,\npero te fuiste a vivir al...",Breve Carta a Consuelo Suncín


In [3]:
print('Poems: ', data['title'].count())

Poems:  5131


In [4]:
data['author'].value_counts()

Pablo Neruda                     357
Luis de Góngora                  218
Mario Benedetti                  167
Federico García Lorca            134
Ramón López Velarde              126
                                ... 
Delmira Agustini                   1
Gioconda Belli                     1
Antonio Plaza Llamas               1
Manuel Bretón de los Herreros      1
Meira Delmar                       1
Name: author, Length: 267, dtype: int64

# 2. Data Cleaning 

We need to remove: 
- **punctuations** 
- **lower casing** 

In [5]:
# let's start with a sample 
sample_poem = data.content[:200]
sent_corpus = [line.lower() for poem in sample_poem for line in str(poem).split("\n")]

By loading the `es_core_news_sm model`, you can perform various NLP tasks on Spanish text using the functionalities provided by spaCy, including **tokenization** specific to the Spanish language.

In [6]:
tokenizer = get_tokenizer('spacy', language='es_core_news_sm')
list_sent_corpus = [tokenizer(x) for x in sent_corpus if x != ''] # remove the spaces

In [7]:
print('Number of sentences:' , len(list_sent_corpus))

Number of sentences: 5410


In [8]:
def tokens_to_text(st_text, start_token, end_token): 
    l=[]
    for s in st_text: # loop through each sentence
        s = [x for x in s if x.isalnum()]
        s.insert(0, start_token) # insert start/end tokens
        s.append(end_token)
        l.extend(s)  
    return l

In [9]:
word_corpus = tokens_to_text(list_sent_corpus, '<SOS>', '<EOS>')

In [10]:
print("Words in the corpus:", len(word_corpus))

Words in the corpus: 41345


In [11]:
word_corpus[:6]

['<SOS>', 'en', 'el', 'parque', 'confuso', '<EOS>']

In [12]:
word_count = Counter(word_corpus)
sorted_word_count = sorted(word_count, key=word_count.get, reverse=True)
print('Number of unique tokens: ', len(sorted_word_count))

Number of unique tokens:  7506


In [13]:
word_count.most_common(2)

[('<SOS>', 5410), ('<EOS>', 5410)]

In [14]:
index_to_word = {index: word for index, word in enumerate(sorted_word_count)}
# {word: index}
word_to_index = {word: index for index, word in enumerate(sorted_word_count)}

words_indexes = [word_to_index[w] for w in word_corpus] # word_corpus with the index of the words instead of the words themselves

In [15]:
print(len(words_indexes))

41345


In [16]:
DIR_PATH = "dataset/poems.csv"
START_TOKEN = "<SOV>" # start of verse
END_TOKEN = "<EOV>" # end of verse

parser = argparse.ArgumentParser()
parser.add_argument('--max-epochs', type=int, default=0)
parser.add_argument('--batch-size', type=int, default=256)
parser.add_argument('--sequence_length', type=int, default=5)
args, unknown = parser.parse_known_args()

dataset = SpanishPoemsDataset(DIR_PATH, START_TOKEN, END_TOKEN, args)
print(dataset.__getitem__(0))
print('Vocab:', len(dataset.unique_words))

(tensor([   0,    7,    5, 2449, 1375]), tensor([   7,    5, 2449, 1375,    1]))
Vocab: 7506


In [17]:
for i in range(1,5):
    print(dataset.__getitem__(i))

(tensor([   7,    5, 2449, 1375,    1]), tensor([   5, 2449, 1375,    1,    0]))
(tensor([   5, 2449, 1375,    1,    0]), tensor([2449, 1375,    1,    0,    6]))
(tensor([2449, 1375,    1,    0,    6]), tensor([1375,    1,    0,    6,   15]))
(tensor([1375,    1,    0,    6,   15]), tensor([   1,    0,    6,   15, 2450]))
