# Poems Generator using Word Embeddings


First we load all the necessary packages.

In [1]:
import os
import pandas as pd
import numpy as  np
import re
import spacy
import urllib.request
from bs4 import BeautifulSoup
import pandas as pd

## Data collection
We collect Edgar Allan Poe's poems from the **mypoeticside.com** site and save them into a *.csv* file.

In [None]:
# Make the server think we are connecting from a web browser by sending a
# browser-like User-Agent header. An OpenerDirector is used instead of the
# deprecated urllib.request.FancyURLopener; `opener.open(url)` still works.
opener = urllib.request.build_opener()
opener.addheaders = [("User-Agent", "Mozilla/5.0")]
writer = "edgar-allan-poe-poems"
with opener.open('https://mypoeticside.com/poets/' + writer) as response:
    data = response.read().decode()

# Search and save the poem links (anchors without an href are ignored).
soup = BeautifulSoup(data, 'html.parser')
poem_list = soup.find(class_="list-poems")
links = poem_list.find_all('a', href=True)
# The hrefs are concatenated after the "https:" scheme to form full URLs.
results = ["https:" + link.get('href') for link in links]

# Save the title and content of each poem.
titles = []
corpus = []
for page in results:
    # The context manager closes the HTTP response once the page is read.
    with opener.open(page) as response:
        data = response.read().decode()
    soup = BeautifulSoup(data, 'html.parser')
    title = soup.find(class_='title-poem')
    poem = soup.find(class_='poem-entry')
    paragraph = poem.find('p') if poem is not None else None
    if title is None or paragraph is None:
        # One page without the expected elements should not abort the
        # whole scrape; report it and move on.
        print("skipping " + page + ": title or poem text not found")
        continue
    titles.append(title.getText())
    print(title.getText())
    corpus.append(paragraph.getText())

# Save all the poems to a .csv file.
poems = pd.DataFrame({'title': titles, 'text': corpus})
poems.to_csv('allan_poems.csv')
     

## Data cleaning and preparation
We split the poems into sentences, replace unwanted characters and save everything into a new *.csv* file.

In [9]:
def docs_to_sentences(file, split=r"\n"):
    """Split every document of a CSV file into cleaned, lower-cased sentences.

    The input file is read from the current working directory and must have
    a ``text`` column. Each text is cleaned with a fixed list of character
    replacements, lower-cased and split with the regular expression
    ``split``. The non-empty pieces are written to ``"sentences_" + file``
    with the columns ``doc_id`` (row position of the source document) and
    ``sentence``.

    Args:
        file: Name of the CSV file, relative to the current working directory.
        split: Regular expression used to split a text into sentences.

    Returns:
        None. The result is written to disk.
    """
    df_docs = pd.read_csv(os.path.join(os.getcwd(), file))

    # Replacements are applied in this order; the order matters because
    # '.' -> ',' can create the ',,,' runs that are collapsed afterwards.
    replacements = (
        ('?«', '«'),
        ('(', ''),
        (')', ''),
        (':', ','),
        ('.', ','),
        (',,,', ','),
        ('"', ''),
    )

    frames = []
    for doc_id, text in enumerate(df_docs.text):
        # An empty cell is read back by pandas as NaN (a float), which has
        # no .replace(); such documents contribute no sentences.
        if not isinstance(text, str):
            continue
        for old, new in replacements:
            text = text.replace(old, new)
        sentences = re.split(split, text.lower())
        # Save each sentence together with the id of its document.
        frames.append(pd.DataFrame({
            'doc_id': [doc_id] * len(sentences),
            'sentence': sentences,
        }))

    # DataFrame.append was removed in pandas 2.0; a single concat also
    # avoids re-copying the accumulated frame on every iteration.
    if frames:
        df_sentences = pd.concat(frames, ignore_index=True)
    else:
        df_sentences = pd.DataFrame(columns=['doc_id', 'sentence'])

    # Drop empty sentences and renumber the rows.
    df_sentences = df_sentences[df_sentences.sentence != '']
    df_sentences = df_sentences.reset_index(drop=True)
    # Save the clean sentences to a .csv file.
    df_sentences.to_csv("sentences_" + file)
    
# Build sentences_allan_poems.csv from the scraped poems.
docs_to_sentences(file='allan_poems.csv')

## Poem generator function
We define the generator function which takes 3 arguments: the sentences file, the initial word and the number of verses for the new poem.

In [2]:
def poem_generator(file, word, n_sents=4, n_candidates=30, model=None):
    """Build a poem by chaining sentences that are similar to each other.

    For every verse, ``n_candidates`` sentences are drawn at random (with
    replacement) from the sentences file and the one most similar to the
    current reference text is kept. The reference starts as ``word`` and is
    then replaced by the verse just chosen. Whenever possible, a verse is
    not taken from the same document as the previous verse.

    Args:
        file: CSV file in the current working directory with ``doc_id`` and
            ``sentence`` columns.
        word: Initial word (or text) the first verse should be similar to.
        n_sents: Number of verses to generate.
        n_candidates: Number of random sentences compared for each verse.
        model: Name of the spaCy model to load. When ``None``, the ``"en"``
            shortcut is tried first and ``"en_core_web_sm"`` is used if the
            shortcut cannot be loaded.

    Returns:
        The verses joined by line breaks, as a single string.

    Raises:
        ValueError: If ``n_candidates`` is smaller than 1 or the file
            contains no usable sentences.
    """
    if n_candidates < 1:
        raise ValueError("n_candidates must be at least 1")

    # Load the English model from spaCy.
    if model is not None:
        nlp = spacy.load(model)
    else:
        try:
            nlp = spacy.load("en")
        except OSError:
            # Shortcut links such as "en" no longer exist in spaCy 3;
            # fall back to the full package name.
            nlp = spacy.load("en_core_web_sm")

    sentences = pd.read_csv(os.path.join(os.getcwd(), file))
    # Rows whose sentence was read back as missing (NaN) cannot be
    # processed by spaCy, so they are dropped.
    sentences = sentences.dropna(subset=['sentence']).reset_index(drop=True)
    sup_index = sentences.shape[0]
    if sup_index == 0:
        raise ValueError("no sentences found in " + str(file))

    init_str = nlp(word)
    # None means "no previous verse": nothing is excluded for the first
    # verse. (Starting at 0 would wrongly exclude document 0.)
    poem_id = None
    poem = []
    # Generate the sentences.
    for _ in range(n_sents):
        rand_sent_index = np.random.randint(0, sup_index, size=n_candidates)
        candidates = sentences.iloc[rand_sent_index]
        # Transform the candidate sentences into spaCy Doc objects and
        # compute the similarity of each one to the reference text.
        docs = nlp.pipe(list(candidates.sentence))
        sim_list = [init_str.similarity(doc) for doc in docs]
        df_1 = pd.DataFrame(
            {
                'similarity': sim_list,
                'doc_id': candidates.doc_id.to_numpy(),
            },
            index=rand_sent_index,
        )
        if poem_id is not None:
            other_docs = df_1[df_1.doc_id != poem_id]
            # If every candidate comes from the previous verse's document,
            # keep them all instead of failing on an empty selection.
            if not other_docs.empty:
                df_1 = other_docs
        best = int(df_1.similarity.to_numpy().argmax())
        # The frame's index holds row positions in `sentences`.
        sent = sentences.sentence.iloc[df_1.index[best]]
        # Erase line jumps and carriage returns.
        sent = sent.replace('\n', '').replace('\r', '')
        poem.append(sent)
        poem_id = df_1.doc_id.iloc[best]
        init_str = nlp(sent)
    # Join the sentences with a line break.
    return "\n".join(poem)

## Poem formatting function
Finally we define a function to uppercase the first letter and add a period at the end.

In [3]:
def format_poem(text):
    """Capitalise the first letter of a poem and end it with a full stop.

    Trailing whitespace and dangling separators (``,``, ``;``, ``:`` and
    dashes) are removed before the final punctuation is decided, so no
    letter of the last word is lost. Leading whitespace is preserved and
    the first non-blank character is upper-cased. A poem that already ends
    with ``.``, ``!`` or ``?`` keeps that ending.

    Args:
        text: The poem as a single string (verses separated by line breaks).

    Returns:
        The formatted poem, or an empty string if nothing is left after
        removing trailing whitespace and separators.
    """
    body = text.rstrip(" \t\r\n,;:-\u2013\u2014")
    if not body:
        return ""
    # Upper-case the first visible character rather than position 0, which
    # may be indentation.
    lead = len(body) - len(body.lstrip())
    body = body[:lead] + body[lead:lead + 1].upper() + body[lead + 1:]
    if not body.endswith((".", "!", "?")):
        body += "."
    return body

## Results

In [10]:
# Example 1: generate a four-verse poem seeded with the word "love",
# format it and print the result.
poem = format_poem(
    poem_generator(file='sentences_allan_poems.csv', word='love')
)
print(poem)

          and true love caresses-
           and i rest so contentedly,
     methought, my sweet one, then i ceased to soar
     i would not call thee fool, old man.
