In [1]:
import gutenbergpy.textget
import gutenbergpy.gutenbergcachesettings
import pandas as pd 

In [2]:
gutenbergpy.gutenbergcachesettings.GutenbergCacheSettings.TEXT_FILES_CACHE_FOLDER = '../data/texts'

def download_text(id):
    print("downloading ID", id)
    try:
        text = gutenbergpy.textget.get_text_by_id(id)
        text = gutenbergpy.textget.strip_headers(text)
        text = text.decode('utf-8')
        return text
    except Exception as e:
        print("error downloading ID", id)
        raise e

In [3]:
df_metadata = pd.read_csv('../data/gutenberg.csv')
#filter type text
df_metadata = df_metadata[df_metadata['Type'] == 'Text']
#filter langauge english
df_metadata = df_metadata[df_metadata['Language'] == 'en']
#get only Text Authors and Title
df_metadata = df_metadata[['Text#','Authors','Title']]
df_metadata = df_metadata.dropna().reset_index(drop=True)
#remove years
df_metadata["Authors"] = df_metadata["Authors"].str.split(',').str[:-1].str.join(',').str.strip()
df_metadata["Authors"]


  df_metadata = pd.read_csv('../data/gutenberg.csv')


0                         Jefferson, Thomas
1                                          
2        Kennedy, John F. (John Fitzgerald)
3                          Lincoln, Abraham
4                                          
                        ...                
55596                   Schiller, Friedrich
55597                        Huneker, James
55598              Flandrau, Charles Macomb
55599         Cornell, Frederick Carruthers
55600                                      
Name: Authors, Length: 55601, dtype: object

Seleccionamos los autores que están en el top 100
https://www.gutenberg.org/browse/scores/top

In [4]:
authors = pd.read_csv('../data/authors.csv')
#find last '(' and remove everything after
authors['Author'] = authors['Author'].str.split('(').str[:-1].str.join('(').str.strip()

authors


Unnamed: 0,Author
0,"Christie, Agatha"
1,"Doyle, Arthur Conan"
2,"Alcott, Louisa May"
3,"Austen, Jane"
4,"Shakespeare, William"
5,"Verne, Jules"


In [5]:
_df_metadata_filtered = df_metadata[df_metadata['Authors'].isin(authors['Author'])]

_df_metadata_filtered.groupby('Authors').count().sort_values(by='Text#', ascending=False)


Unnamed: 0_level_0,Text#,Title
Authors,Unnamed: 1_level_1,Unnamed: 2_level_1
"Shakespeare, William",158,158
"Doyle, Arthur Conan",80,80
"Alcott, Louisa May",40,40
"Verne, Jules",30,30
"Christie, Agatha",11,11
"Austen, Jane",8,8


In [6]:
#get books from shakespear with title like 'Hamlet'
_df_metadata_filtered[_df_metadata_filtered['Authors'] == 'Alcott, Louisa May'].to_csv('alcot.csv', index=False)


In [7]:
books = {
    'Austen, Jane': [1342, 158, 161],
    'Verne, Jules': [83, 103, 163],
    'Alcott, Louisa May': [514, 8677, 2726],
    'Christie, Agatha': [61262, 66446, 70114],
    'Shakespeare, William': [1122,1112, 1129],
    'Doyle, Arthur Conan': [244,2852,5148],
    
}

#array of all books
books_array = []
for author, ids in books.items():
    for id in ids:
        books_array.append(id)

In [8]:
df_metadata_filtered = df_metadata[df_metadata['Text#'].isin(books_array)]

In [9]:
#pick a random book per author with seed 13
#df_metadata_filtered = _df_metadata_filtered.groupby('Authors').sample(n=3, random_state=13).reset_index(drop=True)


In [10]:
len(df_metadata_filtered)

18

In [11]:
# _authors_df = pd.read_csv('../data/authors.csv')
# _authors_df['Author'] = _authors_df['Author'].str.split('(').str[:-1].str.join('(').str.strip()

# #filter only books with authors in the authors.csv
# df_metadata_filtered = df_metadata_filtered[df_metadata_filtered['Authors'].isin(_authors_df['Author'])]

In [12]:
code_ids = df_metadata_filtered['Text#'].tolist()
books = [download_text(id) for id in code_ids]

downloading ID 83
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
downloading ID 103
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
downloading ID 158
downloading ID 161
downloading ID 163
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
downloading ID 244
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
downloading ID 514
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
downloading ID 1112
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
downloading ID 1122
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
downloading ID 1129
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
downloading ID 1342
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
downloading ID 2726
http://www.mirrorservice.org/sites/ftp.ibiblio.org/pub/docs/books/gutenberg
download

In [13]:
#add books column to dataframe
df_metadata_filtered['Books'] = books

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_metadata_filtered['Books'] = books


In [14]:
df_metadata_filtered

Unnamed: 0,Text#,Authors,Title,Books
76,83,"Verne, Jules","From the Earth to the Moon; and, Round the Moon",\n[Illustration]\n\n\n\n\nFrom the Earth to th...
96,103,"Verne, Jules",Around the World in Eighty Days,\n[Illustration]\n\n\n\n\nAround the World in ...
147,158,"Austen, Jane",Emma,\n\n\n\nEmma\n\nby Jane Austen\n\n\nContents\n...
150,161,"Austen, Jane",Sense and Sensibility,\n[Illustration]\n\n\n\n\nSense and Sensibilit...
152,163,"Alcott, Louisa May",Flower Fables,\n[Illustration]\n\n\n\n\nFlower Fables\n\nby ...
208,244,"Doyle, Arthur Conan",A Study in Scarlet,\n\n\n\nA STUDY IN SCARLET\n\nBy A. Conan Doyl...
474,514,"Alcott, Louisa May",Little Women,\n\n\n\nLittle Women\n\nby Louisa May Alcott\n...
1037,1112,"Shakespeare, William",The Tragedy of Romeo and Juliet,\n\n\n\n\n\n\n\n\n\n\n\n\n*Project Gutenberg i...
1047,1122,"Shakespeare, William","The Tragedy of Hamlet, Prince of Denmark","\n\n\n\n\n1604\n\n\nTHE TRAGEDY OF HAMLET, PRI..."
1054,1129,"Shakespeare, William",The Tragedy of Macbeth,\n\n\n\n\n\n\n1606\n\nTHE TRAGEDY OF MACBETH\n...


In [15]:
#save dataframe to csv
df_metadata_filtered.to_csv('../data/gutenberg_with_text.csv', index=False)