In [17]:
#important stuff

import pandas
import numpy

### LOADING DATA

Here, I'm loading the top ~5% most common authors in each dataset and saving them as a pandas dataframe. From that dataframe, I create a second dataframe with four relevant columns: last_name, first_name, title, count.

This process repeats for each dataset. First, Early English Books Online...

In [18]:
# EEBO: read the first 40 rows of the export and keep only the four
# columns the rest of the notebook works with.

eebo_df = (
    pandas.read_csv('../eebo/eebo_dataset_final_3.csv', nrows=40)
    [['last_name', 'first_name', 'title', 'count']]
)
eebo_df.head(5)

Unnamed: 0,last_name,first_name,title,count
0,not_applicable,Anonymous,not_applicable,27862
1,England and Wales,Parliament,not_applicable,1119
2,Sovereign,Charles I,not_applicable,1017
3,not_applicable,England and Wales,not_applicable,940
4,not_applicable,Church of England,not_applicable,902


Next, English Short Title Catalogue...

In [19]:
# ESTC: read the first 159 rows of the export and keep only the four
# columns the rest of the notebook works with.

estc_df = (
    pandas.read_csv('../estc/estc_dataset_final_3.csv', nrows=159)
    [['last_name', 'first_name', 'title', 'count']]
)
estc_df.head(5)

Unnamed: 0,last_name,first_name,title,count
0,Swift,Jonathan,not_applicable,310
1,Smollett,Tobias,not_applicable,160
2,Johnson,Samuel,not_applicable,155
3,Griffiths,Ralph,not_applicable,144
4,Great Britain,Parliament,not_applicable,110


Open Syllabus Project...

In [20]:
# OPEN SYLLABUS: read the first 250 rows of the export and keep only the
# four columns the rest of the notebook works with.

open_syllabus_df = (
    pandas.read_csv('../open-syllabus/english-lit/open-syllabus_dataset_final_4.csv', nrows=250)
    [['last_name', 'first_name', 'title', 'count']]
)
open_syllabus_df.head(5)

Unnamed: 0,last_name,first_name,title,count
0,Shakespeare,William,not_applicable,33180
1,Hacker,Diana,not_applicable,18974
2,Chaucer,Geoffrey,not_applicable,10448
3,Wordsworth,William,not_applicable,9977
4,Poe,Edgar Allen,not_applicable,9904


The Oxford Text Archive...

In [21]:
# OTA: read the first 30 rows of the export and keep only the four
# columns the rest of the notebook works with.

ota_df = (
    pandas.read_csv('../ota-2/ota-2_dataset_final.csv', nrows=30)
    [['last_name', 'first_name', 'title', 'count']]
)
ota_df.head(5)

Unnamed: 0,last_name,first_name,title,count
0,Defoe,Daniel,not_applicable,121
1,More,Hannah,not_applicable,69
2,Pope,Alexander,not_applicable,62
3,Goldsmith,Oliver,not_applicable,51
4,not_applicable,Unknown,not_applicable,47


And finally, Project Gutenberg...

In [22]:
# PROJECT GUTENBERG: read the first 1057 rows of the export and keep only
# the four columns the rest of the notebook works with.

project_gutenberg_df = (
    pandas.read_csv('../project-gutenberg-2/project-gutenberg_dataset_final_3.csv', nrows=1057)
    [['last_name', 'first_name', 'title', 'count']]
)
project_gutenberg_df.head(5)

Unnamed: 0,last_name,first_name,title,count
0,not_applicable,Various,not_applicable,3469
1,not_applicable,Anonymous,not_applicable,755
2,Shakespeare,William,not_applicable,327
3,Twain,Mark,not_applicable,225
4,Lytton,Edward Bulwer,not_applicable,221


Now, I'll mash together the dataframes into one big dataframe with all authors from all datasets.

In [23]:
# Stack the five per-dataset frames row-wise (the default axis). Each frame
# keeps its own row labels, so labels repeat in the combined frame.
concat_all = pandas.concat(
    [
        eebo_df,
        estc_df,
        open_syllabus_df,
        ota_df,
        project_gutenberg_df,
    ]
)
concat_all.head(5)

Unnamed: 0,last_name,first_name,title,count
0,not_applicable,Anonymous,not_applicable,27862
1,England and Wales,Parliament,not_applicable,1119
2,Sovereign,Charles I,not_applicable,1017
3,not_applicable,England and Wales,not_applicable,940
4,not_applicable,Church of England,not_applicable,902


### PROCESSING LOADED DATA

Now that all the datasets are loaded as dataframes with the relevant columns, I can start processing the loaded data.

First, I'll extract all the records WITHOUT duplicates and put them into a new dataframe.

In [24]:
# Keep the rows whose (last_name, first_name, title) key occurs exactly once.
# keep=False marks every member of a repeated key, so negating the mask
# leaves only the keys that are never repeated.
no_duplicate_concat = concat_all.loc[
    ~concat_all.duplicated(['last_name', 'first_name', 'title'], keep=False),
    ['last_name', 'first_name', 'title', 'count'],
]
no_duplicate_concat.head(5)

Unnamed: 0,last_name,first_name,title,count
1,England and Wales,Parliament,not_applicable,1119
2,Sovereign,Charles I,not_applicable,1017
3,not_applicable,England and Wales,not_applicable,940
4,not_applicable,Church of England,not_applicable,902
5,Sovereign,Charles II,not_applicable,683


Now I'll put together a dataframe of all the records WITH duplicates

In [25]:
# Keep every row whose (last_name, first_name, title) key occurs more than
# once; keep=False flags all occurrences rather than all-but-one.
duplicate_concat = concat_all.loc[
    concat_all.duplicated(['last_name', 'first_name', 'title'], keep=False),
    ['last_name', 'first_name', 'title', 'count'],
]
duplicate_concat.head(20)

Unnamed: 0,last_name,first_name,title,count
0,not_applicable,Anonymous,not_applicable,27862
13,Burnet,Gilbert,not_applicable,214
14,Dryden,John,not_applicable,191
17,Penn,William,not_applicable,186
27,Bacon,Francis,not_applicable,123
31,Bunyan,John,not_applicable,116
33,Shakespeare,William,not_applicable,111
35,Tillotson,John,not_applicable,109
0,Swift,Jonathan,not_applicable,310
2,Johnson,Samuel,not_applicable,155


I'll combine the duplicated records and add their counts together.

In [26]:
# Collapse each repeated (last_name, first_name, title) key to a single row
# by summing its remaining columns. as_index=False keeps the key fields as
# ordinary columns instead of moving them into the index.
duplicate_concat_sum = duplicate_concat.groupby(
    by=['last_name', 'first_name', 'title'],
    as_index=False,
).sum()
duplicate_concat_sum.head(5)

Unnamed: 0,last_name,first_name,title,count
0,Addison,Joseph,not_applicable,86
1,Austen,Jane,not_applicable,6265
2,Bacon,Francis,not_applicable,145
3,Baldwin,James,not_applicable,2189
4,Beaumont,Francis,not_applicable,36


In [27]:
# Rows of no_duplicate_concat followed by rows of duplicate_concat_sum.
# No sorting is applied here, so the two parts stay in their existing order.
concat_final = pandas.concat(
    objs=[
        no_duplicate_concat,
        duplicate_concat_sum,
    ]
)
concat_final.head(10)

Unnamed: 0,last_name,first_name,title,count
1,England and Wales,Parliament,not_applicable,1119
2,Sovereign,Charles I,not_applicable,1017
3,not_applicable,England and Wales,not_applicable,940
4,not_applicable,Church of England,not_applicable,902
5,Sovereign,Charles II,not_applicable,683
6,Scotland,Privy Council,not_applicable,548
7,Sovereign,James I,not_applicable,470
8,Sovereign,Elizabeth I,not_applicable,429
9,Sternhold,Thomas,not_applicable,403
10,England and Wales,Parliament House of Commons,not_applicable,282


In [28]:
# Denominator that turns each raw count into a fraction.
# NOTE(review): presumably the total number of records across all five
# datasets; confirm where 750394 comes from and derive it from the data if
# possible.
TOTAL_COUNT = 750394

# Fraction per row; kept as its own name because it is assigned at cell level.
divided_values = concat_final['count'] / TOTAL_COUNT

# Replace the 'count' column with the fraction and rank rows from largest to
# smallest. (The earlier column-subset assignment to new_concat_final was
# overwritten before being read, so it has been removed.)
new_concat_final = (
    concat_final
    .assign(count=divided_values)
    .sort_values(by=['count'], ascending=False)
)
new_concat_final.head(20)

Unnamed: 0,last_name,first_name,title,count
67,Shakespeare,William,not_applicable,0.044986
85,not_applicable,Anonymous,not_applicable,0.038136
1,Hacker,Diana,not_applicable,0.025285
15,Chaucer,Geoffrey,not_applicable,0.013941
84,Wordsworth,William,not_applicable,0.013316
4,Poe,Edgar Allen,not_applicable,0.013198
5,Eliot,T S,not_applicable,0.011191
6,Faulkner,William,not_applicable,0.010805
7,Woolf,Virginia,not_applicable,0.010225
39,Hawthorne,Nathaniel,not_applicable,0.009484


In [29]:
# Spot check: look up every row in the combined frame whose first_name is
# exactly "Hannah" and print at most the first five matches.
is_author = concat_all['first_name'].eq("Hannah")
find_author = concat_all.loc[is_author]
print(find_author.head(n=5))

  last_name first_name           title  count
1      More     Hannah  not_applicable     69
