In [42]:
# Library imports

import pandas
import numpy

### LOADING DATA

Here, I'm loading the top ~5% most common authors in the dataset into a pandas dataframe. From that dataframe, I create a second dataframe with five relevant columns: last_name, first_name, title, fraction_total, dataset.

This process repeats for each dataset. First, Early English Books Online...

In [43]:
# EEBO (Early English Books Online)
# Read only the first 40 rows of the CSV, then narrow the frame to the five
# columns listed below (in this order).
eebo_df = (
    pandas.read_csv('../eebo/eebo_dataset_final_3.csv', nrows=40)
    .loc[:, ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']]
)
eebo_df.head(5)

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,not_applicable,Anonymous,not_applicable,0.29159,eebo
1,England and Wales,Parliament,not_applicable,0.011711,eebo
2,Sovereign,Charles I,not_applicable,0.010643,eebo
3,not_applicable,England and Wales,not_applicable,0.009838,eebo
4,not_applicable,Church of England,not_applicable,0.00944,eebo


Next, English Short Title Catalogue...

In [44]:
# ESTC (English Short Title Catalogue)
# Read only the first 159 rows of the CSV, then narrow the frame to the five
# columns listed below (in this order).
estc_df = (
    pandas.read_csv('../estc/estc_dataset_final_3.csv', nrows=159)
    .loc[:, ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']]
)
estc_df.head(5)

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,Swift,Jonathan,not_applicable,0.032832,estc
1,Smollett,Tobias,not_applicable,0.016946,estc
2,Johnson,Samuel,not_applicable,0.016416,estc
3,Griffiths,Ralph,not_applicable,0.015251,estc
4,Great Britain,Parliament,not_applicable,0.01165,estc


Open Syllabus Project...

In [45]:
# OPEN SYLLABUS (Open Syllabus Project, English literature subset)
# Read only the first 250 rows of the CSV, then narrow the frame to the five
# columns listed below (in this order).
open_syllabus_df = (
    pandas.read_csv('../open-syllabus/english-lit/open-syllabus_dataset_final_4.csv', nrows=250)
    .loc[:, ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']]
)
open_syllabus_df.head(5)

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,Shakespeare,William,not_applicable,0.022192,open_syllabus
1,Hacker,Diana,not_applicable,0.01269,open_syllabus
2,Chaucer,Geoffrey,not_applicable,0.006988,open_syllabus
3,Wordsworth,William,not_applicable,0.006673,open_syllabus
4,Poe,Edgar Allen,not_applicable,0.006624,open_syllabus


The Oxford Text Archive...

In [46]:
# OTA (Oxford Text Archive)
# Read only the first 30 rows of the CSV, then narrow the frame to the five
# columns listed below (in this order).
ota_df = (
    pandas.read_csv('../ota/ota_dataset_final.csv', nrows=30)
    .loc[:, ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']]
)
ota_df.head(5)

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,Defoe,Daniel,not_applicable,0.043809,ota
1,More,Hannah,not_applicable,0.024982,ota
2,Pope,Alexander,not_applicable,0.022448,ota
3,Goldsmith,Oliver,not_applicable,0.018465,ota
4,not_applicable,Unknown,not_applicable,0.017017,ota


And finally, Project Gutenberg...

In [47]:
# PROJECT GUTENBERG
# Read only the first 1057 rows of the CSV, then narrow the frame to the five
# columns listed below (in this order).
project_gutenberg_df = (
    pandas.read_csv('../project-gutenberg/project-gutenberg_dataset_final_3.csv', nrows=1057)
    .loc[:, ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']]
)
project_gutenberg_df.head(5)

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,not_applicable,Various,not_applicable,0.057384,project_gutenberg
1,not_applicable,Anonymous,not_applicable,0.012489,project_gutenberg
2,Shakespeare,William,not_applicable,0.005409,project_gutenberg
3,Twain,Mark,not_applicable,0.003722,project_gutenberg
4,Lytton,Edward Bulwer,not_applicable,0.003656,project_gutenberg


Now, I'll mash together the dataframes into one big dataframe with all authors from all datasets.

In [48]:
# Stack the five per-dataset frames on top of each other (row-wise). Each
# frame keeps its own row labels, so index values repeat in the result.
per_dataset_frames = [
    eebo_df,
    estc_df,
    open_syllabus_df,
    ota_df,
    project_gutenberg_df,
]
concat_all = pandas.concat(per_dataset_frames, axis=0)
concat_all.head(5)

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,not_applicable,Anonymous,not_applicable,0.29159,eebo
1,England and Wales,Parliament,not_applicable,0.011711,eebo
2,Sovereign,Charles I,not_applicable,0.010643,eebo
3,not_applicable,England and Wales,not_applicable,0.009838,eebo
4,not_applicable,Church of England,not_applicable,0.00944,eebo


### PROCESSING LOADED DATA

Now that all the datasets are loaded as dataframes with the relevant columns, I can start processing the loaded data.

First, I'll extract all the records WITHOUT duplicates and put them into a new dataframe.

In [49]:
# Keep only the rows whose (last_name, first_name, title) combination occurs
# exactly once in concat_all. keep=False flags every member of a repeated
# group, so negating the mask leaves just the unique rows. The `dataset`
# column is not carried over.
occurs_more_than_once = concat_all.duplicated(
    subset=['last_name', 'first_name', 'title'], keep=False
)
no_duplicate_concat = concat_all[~occurs_more_than_once]
no_duplicate_concat = no_duplicate_concat[['last_name', 'first_name', 'title', 'fraction_total']]
no_duplicate_concat.head(5)

Unnamed: 0,last_name,first_name,title,fraction_total
1,England and Wales,Parliament,not_applicable,0.011711
2,Sovereign,Charles I,not_applicable,0.010643
3,not_applicable,England and Wales,not_applicable,0.009838
4,not_applicable,Church of England,not_applicable,0.00944
5,Sovereign,Charles II,not_applicable,0.007148


Now I'll put together a dataframe of all the records WITH duplicates, i.e. every row whose last_name, first_name, and title also appear in at least one other row.

In [50]:
# Select every row whose (last_name, first_name, title) combination shows up
# more than once in concat_all. keep=False marks all occurrences, not just
# the later ones, so each repeated group is kept in full.
repeated_key_mask = concat_all.duplicated(
    subset=['last_name', 'first_name', 'title'],
    keep=False,
)
duplicate_concat = concat_all[repeated_key_mask]
duplicate_concat.head(5)

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,not_applicable,Anonymous,not_applicable,0.29159,eebo
13,Burnet,Gilbert,not_applicable,0.00224,eebo
14,Dryden,John,not_applicable,0.001999,eebo
17,Penn,William,not_applicable,0.001947,eebo
27,Bacon,Francis,not_applicable,0.001287,eebo


I'll combine the duplicated records and add their fraction_totals together

In [51]:
# Collapse each repeated (last_name, first_name, title) key into a single row
# whose fraction_total is the sum over all of its occurrences.
# fraction_total is selected explicitly so that the string `dataset` column is
# never aggregated: pandas >= 2.0 no longer drops non-numeric columns in
# groupby reductions and would concatenate the dataset names instead, adding
# an unwanted column to the result.
duplicate_concat_sum = (
    duplicate_concat
    .groupby(['last_name', 'first_name', 'title'], as_index=False)[['fraction_total']]
    .sum()
)
duplicate_concat_sum.head(n=5)

Unnamed: 0,last_name,first_name,title,fraction_total
0,Addison,Joseph,not_applicable,0.015285
1,Austen,Jane,not_applicable,0.004666
2,Bacon,Francis,not_applicable,0.002724
3,Baldwin,James,not_applicable,0.001639
4,Beaumont,Francis,not_applicable,0.001757


In [52]:
# Put the single-occurrence rows and the summed repeated rows back into one
# frame, then order it from the largest fraction_total to the smallest.
recombined_records = pandas.concat([no_duplicate_concat, duplicate_concat_sum])
concat_final = recombined_records.sort_values(by='fraction_total', ascending=False)
concat_final.head(10)

Unnamed: 0,last_name,first_name,title,fraction_total
85,not_applicable,Anonymous,not_applicable,0.304079
0,not_applicable,Various,not_applicable,0.057384
24,Defoe,Daniel,not_applicable,0.053889
67,Shakespeare,William,not_applicable,0.052962
74,Swift,Jonathan,not_applicable,0.037042
64,Pope,Alexander,not_applicable,0.030472
1,More,Hannah,not_applicable,0.024982
37,Goldsmith,Oliver,not_applicable,0.019739
88,not_applicable,Unknown,not_applicable,0.019349
52,Kipling,Rudyard,not_applicable,0.018891


In [53]:
# Rescale every fraction_total by the number of source datasets. The divisor
# matches the five frames (eebo, estc, open_syllabus, ota, project_gutenberg)
# concatenated into concat_all; presumably this turns the summed per-dataset
# fractions into an average across datasets -- confirm intent.
N_DATASETS = 5
divided_values = concat_final['fraction_total'] / N_DATASETS

# assign() returns a new frame with fraction_total replaced; concat_final
# itself is left unchanged. (The previous intermediate column selection was
# overwritten before use and has been removed.)
new_concat_final = concat_final.assign(fraction_total=divided_values)
new_concat_final.head(20)

Unnamed: 0,last_name,first_name,title,fraction_total
85,not_applicable,Anonymous,not_applicable,0.060816
0,not_applicable,Various,not_applicable,0.011477
24,Defoe,Daniel,not_applicable,0.010778
67,Shakespeare,William,not_applicable,0.010592
74,Swift,Jonathan,not_applicable,0.007408
64,Pope,Alexander,not_applicable,0.006094
1,More,Hannah,not_applicable,0.004996
37,Goldsmith,Oliver,not_applicable,0.003948
88,not_applicable,Unknown,not_applicable,0.00387
52,Kipling,Rudyard,not_applicable,0.003778


In [54]:
# Write the combined table to final_dataset.csv in the working directory,
# with a header row and without the row index.
# to_csv() returns None when given a path, so its result is neither stored
# nor displayed.
new_concat_final.to_csv('final_dataset.csv', index=False, header=True)