In [1]:
# Importing relevant packages
import numpy as np
import pandas as pd
import os, sys
import shutil
import pickle

from collections import Counter
import matplotlib.pyplot as plt

%matplotlib inline

%load_ext autoreload
%autoreload 2

## path to the downloaded gutenberg corpus
## (relative path: two directory levels above the current working directory)
path_gutenberg = os.path.join(os.pardir,os.pardir,'gutenberg')

## import internal helper functions
## (the project's src folder must be on sys.path before data_io can be imported)
src_dir = os.path.join(os.pardir,'src')
sys.path.append(src_dir)
from data_io import get_book

import re
import random


# Accessing the metadata
# The corpus checkout's own src folder provides the metaquery module.
sys.path.append(os.path.join(path_gutenberg,'src'))
from metaquery import meta_query
# Shared query object over metadata.csv; later cells filter it in place via mq.df.
# NOTE(review): filter_exist=False presumably keeps metadata rows for books that
# are not available locally -- confirm against metaquery.
mq = meta_query(path=os.path.join(path_gutenberg,'metadata','metadata.csv'), filter_exist=False)

### Creating subcorpora

The names of the functions and output files end in `_new` because we made a major change to how the corpora are generated.

In [3]:
# New corpora (author, subject and time tasks)
def create_big_corpus(task, chosen_subject=None, time_period=None):
    '''
    Groups the ids of the filtered books by author, subject or time period.

    The metadata is first restricted to English books with at least 20
    downloads, within the 1800-2050 year filter, with at least one subject
    listed and with both author birth and death years present. The optional
    subject / time-period filters are applied on top of that.

    Parameters
    ----------
    task : string
        Either 'author', 'subject' or 'time'. Decides what the books are
        grouped by.

    OPTIONAL:
    chosen_subject : string
        If given, only books matching this subject are kept.

    time_period : list
        If given, only books matching this period are kept.
        Has the form [ start_year , end_year ].

    Returns
    -------
     : dictionary
        Maps each group to the ids of its books. Keys are author names for
        'author', subject names for 'subject' (only subjects counted at least
        20 times) and ( start_year, end_year ) tuples covering 20-year steps
        from 1800 to 2000 for 'time'. Groups without books are left out for
        'subject' and 'time'.

    Raises
    ------
    ValueError
        If task is not one of the three supported values.
    '''
    # Fail fast with a clear message; previously an unknown task only surfaced
    # at the return statement as an unbound local variable.
    if task not in ('author', 'subject', 'time'):
        raise ValueError(
            "task must be 'author', 'subject' or 'time', got {!r}".format(task)
        )

    path_gutenberg = os.path.join(os.pardir,os.pardir,'gutenberg')
    src_dir = os.path.join(os.pardir,'src')
    # Guarded so repeated calls do not keep growing sys.path.
    if src_dir not in sys.path:
        sys.path.append(src_dir)
    mq = meta_query(path=os.path.join(path_gutenberg,'metadata','metadata.csv'), filter_exist=False)

    # Perform necessary filtering
    mq.reset()
    mq.filter_lang('en',how='only')
    # Only select books with at least 20 downloads
    df = mq.get_df()
    mq.df = df[df['downloads'] >= 20]
    # 1800 onwards
    mq.filter_year([1800, 2050])
    # Filter out data with no subject listed (stored as the string 'set()')
    df = mq.get_df()
    mq.df = df[df['subjects'] != 'set()']
    # Filter out entries that don't have author birth or death year
    df = mq.get_df()
    mq.df = df[df[['authoryearofbirth', 'authoryearofdeath']].notnull().all(axis=1)]

    if chosen_subject is not None:
        mq.filter_subject(chosen_subject, how='any')
    if time_period is not None:
        mq.filter_year(time_period)

    # Snapshot of the fully filtered metadata; the per-group filters below
    # always restart from it.
    full_df = mq.get_df()

    if task == 'author':
        id_dict = dict(full_df.groupby('author')['id'].apply(list))
    elif task == 'subject':
        counts = mq.get_subjects_counts()
        filtered_subjects = [i for i in counts if counts[i] >= 20]
        id_dict = {}
        for subject in filtered_subjects:
            mq.df = full_df
            mq.filter_subject(subject, how='any')
            ids = mq.get_ids()
            if len(ids) > 0:
                id_dict[subject] = ids
    else:  # task == 'time'
        periods = [[i,i+20] for i in range(1800, 1981, 20)]
        id_dict = {}
        for period in periods:
            mq.df = full_df
            mq.filter_year(period)
            ids = mq.get_ids()
            if len(ids) > 0:
                # Lists are unhashable, so the period is stored as a tuple key.
                id_dict[tuple(period)] = ids

    return id_dict

def _books_available(*book_ids):
    '''Returns True when get_book succeeds for every given id, False otherwise.'''
    try:
        for book_id in book_ids:
            get_book(book_id)
    except Exception:
        # Only ordinary errors mean "book not in our data"; KeyboardInterrupt and
        # SystemExit must propagate so the sampling loops can be stopped.
        return False
    return True


def _time_gap_ok(group1, group2, min_gap=60):
    '''Returns True when the earlier period ends at least min_gap years before the later one starts.'''
    if group1[0] == group2[0]:
        return True
    if group1[0] < group2[0]:
        earlier, later = group1, group2
    else:
        earlier, later = group2, group1
    return earlier[1] + min_gap <= later[0]


def create_corpus_new(task, seed, chosen_subject=None, time_period=None, n_pairs=1000):
    '''
    Creates a new corpus, with the option to filter by subject or time period.

    Groups are drawn with probability proportional to their number of books.
    Pairs whose books cannot be loaded with get_book are redrawn.

    Parameters
    ----------
    task : string
        Either 'author', 'subject' or 'time'.

    seed : integer
        Random seed used to determine which books are randomly selected.
        Note that this seeds the global `random` module state.

    OPTIONAL:
    chosen_subject : string
        Desired subject (if wanting to filter by subject).
        Ignored when task is 'subject'.

    time_period : list
        Desired time period (if wanting to filter by time period).
        Has the form [ start_year , end_year ].
        Ignored when task is 'time'.

    n_pairs : integer
        Number of 'same' pairs and of 'different' pairs to draw (default 1000).

    Returns
    -------
     : dictionary
        Dictionary with the following structure:
        {
            'same' : [ list of book id pairs ( id1, id2 ) from one group ],
            'different' : [ list of book id pairs ( id1, id2 ) from two groups ]
        }
        For the 'time' task, the two periods of a 'different' pair are at
        least 60 years apart.

    Raises
    ------
    ValueError
        If task is unknown, or if the filtered corpus cannot yield the
        requested pairs at all (no group with two books, or no two groups that
        may be paired). Without this check the sampling would never finish.
    '''

    if task == 'author':
        corpus = create_big_corpus(task, chosen_subject, time_period)
    elif task == 'subject':
        corpus = create_big_corpus(task, time_period=time_period)
    elif task == 'time':
        corpus = create_big_corpus(task, chosen_subject=chosen_subject)
    else:
        raise ValueError(
            "task must be 'author', 'subject' or 'time', got {!r}".format(task)
        )

    random.seed(seed)

    all_keys = list(corpus.keys())
    # Each key appears once per book, so bigger groups are drawn more often.
    weighted_keys = []
    for key in all_keys:
        weighted_keys += [key]*len(corpus[key])

    # Feasibility checks; they consume no random numbers, so results for a
    # given seed are unchanged.
    if n_pairs > 0:
        if not any(len(corpus[key]) >= 2 for key in all_keys):
            raise ValueError("Cannot draw 'same' pairs: no group contains at least two books.")
        if task == 'time':
            pairable = any(
                _time_gap_ok(key1, key2)
                for index, key1 in enumerate(all_keys)
                for key2 in all_keys[index + 1:]
            )
        else:
            pairable = len(all_keys) >= 2
        if not pairable:
            raise ValueError("Cannot draw 'different' pairs: no two groups can be paired.")

    same = []
    for _ in range(n_pairs):
        while True:
            group = random.sample(weighted_keys,1)[0]
            if len(corpus[group]) < 2:
                continue

            ids = random.sample(corpus[group],2)
            # Check if books are actually in our data
            if _books_available(*ids):
                break

        same.append(tuple(ids))

    different = []
    for _ in range(n_pairs):
        while True:
            # The weighted list holds repeated keys, so the two draws can coincide.
            group1, group2 = random.sample(weighted_keys,2)
            if group1 == group2:
                continue

            # Check that time periods are at least 60 years apart
            if task == 'time' and not _time_gap_ok(group1, group2):
                continue

            id1 = random.sample(corpus[group1],1)[0]
            id2 = random.sample(corpus[group2],1)[0]

            # The same book can belong to more than one group.
            if id1 == id2:
                continue

            # Check if books are actually in our data
            if _books_available(id1, id2):
                break

        different.append((id1,id2))

    return {'same':same,'different':different}

### Creating uncontrolled corpora (i.e. no filtering by author, subject or time)

In [7]:
# One uncontrolled author corpus per fixed seed, so every draw can be reproduced
seeds = [48879,14413,41829,88434,4950,75725,48049,1470,93532,50150]
author_corpora = [create_corpus_new('author', seed) for seed in seeds]

In [8]:
# One uncontrolled subject corpus per fixed seed, so every draw can be reproduced
seeds = [2505,99585,33607,20448,65477,13008,68250,82957,86969,52851]
subject_corpora = [create_corpus_new('subject', seed) for seed in seeds]

In [9]:
# One uncontrolled time corpus per fixed seed, so every draw can be reproduced
seeds = [54291,6042,40198,69432,48405,63623,21104,21522,44701,94938]
time_corpora = [create_corpus_new('time', seed) for seed in seeds]

### Creating controlled corpora

In [4]:
# Author corpora controlled for subject and time period:
# one ( seed, subject, [ start_year , end_year ] ) entry per corpus
author_controlled_specs = [
    (68664, 'Science fiction', [1950,2000]),
    (33982, 'Detective and mystery stories', [1900,1950]),
    (55232, 'Western stories', [1925,1975]),
    (15128, 'Historical fiction', [1825,1875]),
    (95222, 'Sea stories', [1850,1900]),
    (91059, 'Fairy tales', [1850,1900]),
    (47274, 'Love stories', [1875,1925]),
    (41424, 'Adventure and adventurers -- Juvenile fiction', [1875,1925]),
    (81736, 'Adventure stories', [1900,1950]),
    (54735, 'England -- Fiction', [1850,1900]),
]

author_corpora_controlled = [
    create_corpus_new('author', seed, chosen_subject = subject, time_period = period)
    for seed, subject, period in author_controlled_specs
]

In [5]:
# Subject corpora controlled for time period:
# one ( seed, [ start_year , end_year ] ) entry per corpus
subject_controlled_specs = [
    (34042, [1800,1850]),
    (7102, [1825,1875]),
    (83270, [1850,1900]),
    (36586, [1850,1900]),
    (37779, [1875,1925]),
    (93047, [1875,1925]),
    (39314, [1900,1950]),
    (28939, [1900,1950]),
    (82848, [1925,1975]),
    (28169, [1950,2000]),
]

subject_corpora_controlled = [
    create_corpus_new('subject', seed, time_period = period)
    for seed, period in subject_controlled_specs
]

In [6]:
# Time corpora controlled for subject:
# one ( seed, subject ) entry per corpus
time_controlled_specs = [
    (31250, 'Love stories'),
    (57562, 'Historical fiction'),
    (53025, 'Adventure stories'),
    (8031, 'Psychological fiction'),
    (92688, 'Science fiction'),
    (27926, 'Fantasy fiction'),
    (18753, 'England -- Fiction'),
    (78701, 'War stories'),
    (56320, 'Detective and mystery stories'),
    (97837, 'Bildungsromans'),
]

time_corpora_controlled = [
    create_corpus_new('time', seed, chosen_subject = subject)
    for seed, subject in time_controlled_specs
]

### Store these corpora in pickle files

In [10]:
# Pickle files: each list of corpora is written to its own file

pickle_targets = [
    # Uncontrolled
    ('../output_files/author_corpora_new.pickle', author_corpora),
    ('../output_files/subject_corpora_new.pickle', subject_corpora),
    ('../output_files/time_corpora_new.pickle', time_corpora),
    # Controlled
    ('../output_files/author_corpora_new_controlled.pickle', author_corpora_controlled),
    ('../output_files/subject_corpora_new_controlled.pickle', subject_corpora_controlled),
    ('../output_files/time_corpora_new_controlled.pickle', time_corpora_controlled),
]

for output_file_path, corpora in pickle_targets:
    with open(output_file_path, 'wb') as f:
        pickle.dump(corpora, f)

### Code for adding all texts in our subcorpora to a folder to reduce memory usage on the servers

This allowed us to upload only the books in our subcorpora to the servers, rather than
every book in the Project Gutenberg (PG) database.

In [None]:
# Apply the same filters used when building the corpora, starting from a reset query
mq.reset()
mq.filter_lang('en',how='only')

# Keep books with at least 20 downloads
df = mq.get_df()
mq.df = df.loc[df['downloads'] >= 20]

# 1800 onwards
mq.filter_year([1800, 2050])

# Drop books with no subject listed (stored as the string 'set()')
df = mq.get_df()
mq.df = df.loc[df['subjects'] != 'set()']

# Drop entries that lack the author's birth or death year
df = mq.get_df()
has_author_years = df[['authoryearofbirth', 'authoryearofdeath']].notnull().all(axis=1)
mq.df = df.loc[has_author_years]

# Ids of every book that survives the filters
all_uniq_ids = mq.get_ids()
print(len(all_uniq_ids))

In [21]:
# Copy the counts file of every filtered book into its own folder
source_filepath = path_gutenberg + '/data/counts'
dest_filepath = path_gutenberg + '/data/counts_final2'

# shutil.copy treats a missing destination as a file name, so without the
# directory every copy would overwrite a single file called 'counts_final2'.
os.makedirs(dest_filepath, exist_ok=True)

# Set membership avoids scanning the whole id list for every file.
wanted_ids = set(all_uniq_ids)

for filename in os.listdir(source_filepath):
    current_filepath = source_filepath + '/' + filename
    # The book id is taken to be the part of the file name before the first '_'
    book_id = filename.split('_')[0]
    if book_id in wanted_ids:
        shutil.copy(current_filepath, dest_filepath)

In [22]:
# Copy the tokens file of every filtered book into its own folder
source_filepath = path_gutenberg + '/data/tokens'
dest_filepath = path_gutenberg + '/data/tokens_final2'

# shutil.copy treats a missing destination as a file name, so without the
# directory every copy would overwrite a single file called 'tokens_final2'.
os.makedirs(dest_filepath, exist_ok=True)

# Set membership avoids scanning the whole id list for every file.
wanted_ids = set(all_uniq_ids)

for filename in os.listdir(source_filepath):
    current_filepath = source_filepath + '/' + filename
    # The book id is taken to be the part of the file name before the first '_'
    book_id = filename.split('_')[0]
    if book_id in wanted_ids:
        shutil.copy(current_filepath, dest_filepath)

In [8]:
# Work out which filtered books are not yet in the uploaded-files listing
all_tokens_filepath = '../output_files/all_tokens.txt'
# Context manager so the file handle is closed once the lines are read
with open(all_tokens_filepath, "r") as f:
    files = f.readlines()
# One listed file name per line; the id is the part before the first '_'
uploaded_ids = [file.split('_')[0] for file in files]
# Set lookup avoids rescanning the uploaded list for every candidate id
uploaded_lookup = set(uploaded_ids)
remaining_ids = [id1 for id1 in all_uniq_ids if id1 not in uploaded_lookup]
# Sanity check: the last two numbers printed should match
print(len(uploaded_ids), len(remaining_ids), len(uploaded_ids + remaining_ids), len(all_uniq_ids))

10083 3840 13923 13923


In [10]:
# Copy the tokens files of the books that still need uploading into their own folder
source_filepath = path_gutenberg + '/data/tokens'
dest_filepath = path_gutenberg + '/data/tokens_remaining'

# shutil.copy treats a missing destination as a file name, so without the
# directory every copy would overwrite a single file called 'tokens_remaining'.
os.makedirs(dest_filepath, exist_ok=True)

# Set membership avoids scanning the whole id list for every file.
remaining_lookup = set(remaining_ids)

for filename in os.listdir(source_filepath):
    current_filepath = source_filepath + '/' + filename
    # The book id is taken to be the part of the file name before the first '_'
    book_id = filename.split('_')[0]
    if book_id in remaining_lookup:
        shutil.copy(current_filepath, dest_filepath)