In [114]:
# general process outline

# take dataset files
# read top x number of rows into pandas dataframe (according to percentage_breakdowns_v2.xlsx)
# save new dataframe with only last_name, first_name, title, fraction_total, dataset
# mash together 5 dataframes - where records are identical across first 3 columns, add together fraction total
# take mashed-together dataset records and divide all fraction_total values by 5
# reorganize final author dataset by fraction_total (highest to lowest)


In [115]:
import pandas
import numpy

### LOADING DATA

Here, I'm loading the top ~5% most common authors in the dataset and saving them as a pandas dataframe. From that dataframe, I create a second dataframe with the five relevant columns: last_name, first_name, title, fraction_total, dataset

This process repeats for each dataset. First, Early English Books Online...

In [116]:
# EEBO (Early English Books Online)

# Load only the first 40 data rows of the EEBO author file. Per the outline at
# the top of the notebook, the row count for each dataset is taken from
# percentage_breakdowns_v2.xlsx.
eebo_df = pandas.read_csv(
    filepath_or_buffer='../eebo/eebo_dataset_final_3.csv',
    nrows=40,
)

In [117]:
# Narrow the EEBO frame to the author key (last_name, first_name, title),
# the author's share (fraction_total) and the source label (dataset).
eebo_keep_columns = ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']
eebo_df2 = eebo_df.loc[:, eebo_keep_columns]

Next, English Short Title Catalog...

In [118]:
# ESTC (English Short Title Catalog)

# Load only the first 159 data rows of the ESTC author file.
estc_df = pandas.read_csv(
    filepath_or_buffer='../estc/estc_dataset_final_3.csv',
    nrows=159,
)

In [119]:
# Narrow the ESTC frame to the author key, the share and the source label.
estc_keep_columns = ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']
estc_df2 = estc_df.loc[:, estc_keep_columns]

Open Syllabus Project...

In [120]:
# OPEN SYLLABUS (Open Syllabus Project)

# Load only the first 250 data rows of the Open Syllabus author file.
open_syllabus_df = pandas.read_csv(
    filepath_or_buffer='../open-syllabus/english-lit/open-syllabus_dataset_final_3.csv',
    nrows=250,
)

In [121]:
# Narrow the Open Syllabus frame to the author key, the share and the source label.
open_syllabus_keep_columns = ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']
open_syllabus_df2 = open_syllabus_df.loc[:, open_syllabus_keep_columns]

Oxford Text Archive...

In [122]:
# OTA (Oxford Text Archive)

# Load only the first 31 data rows of the OTA author file.
ota_df = pandas.read_csv(
    filepath_or_buffer='../ota/ota_dataset_final_3.csv',
    nrows=31,
)

In [123]:
# Narrow the OTA frame to the author key, the share and the source label.
ota_keep_columns = ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']
ota_df2 = ota_df.loc[:, ota_keep_columns]

And finally, Project Gutenberg...

In [145]:
# PROJECT GUTENBERG

# Load only the first 1057 data rows of the Project Gutenberg author file.
project_gutenberg_df = pandas.read_csv(
    filepath_or_buffer='../project-gutenberg-2/project-gutenberg_dataset_final_3.csv',
    nrows=1057,
)

In [146]:
# Narrow the Project Gutenberg frame to the author key, the share and the source label.
project_gutenberg_keep_columns = ['last_name', 'first_name', 'title', 'fraction_total', 'dataset']
project_gutenberg_df2 = project_gutenberg_df.loc[:, project_gutenberg_keep_columns]

### PROCESSING LOADED DATA

Now that all the datasets are loaded as dataframes with the relevant columns, I can start processing the loaded data.

This is where I got stuck. I'm trying to merge records that match on the first three columns (first_name, last_name, title) and, in merging, compute an average fraction_total. For example, let's say the records in the dataframe below are taken from 2 different datasets. Because they're identical across the first three columns, they should be merged into a single record whose fraction_total is the two values added together and divided by two.

In [126]:
# Toy example: one author appearing in two different datasets. Both rows agree
# on last_name, first_name and title, so the desired result is a single record
# whose fraction_total is the average of 0.1 and 0.2.
# pandas comes from the notebook's import cell, so this cell does not
# import it again.
sample_records1 = {
    'last_name': ['Shakespeare', 'Shakespeare'],
    'first_name': ['William', 'William'],
    'title': ['', ''],
    'fraction_total': [.1, .2],
}

df_sample1 = pandas.DataFrame(sample_records1)

df_sample1

Unnamed: 0,last_name,first_name,title,fraction_total
0,Shakespeare,William,,0.1
1,Shakespeare,William,,0.2


But in this next dataframe, only the first and second records should be merged. The third differs in the 'title' column and should be read as a different record.

In [127]:
# Toy example: three rows share last_name and first_name, but the third has a
# different title ('Mrs'). Only the first two rows match on all three key
# columns, so only those two should be merged; the third stays a separate record.
# pandas comes from the notebook's import cell, so this cell does not
# import it again.
sample_records2 = {
    'last_name': ['Hopper', 'Hopper', 'Hopper'],
    'first_name': ['William', 'William', 'William'],
    'title': ['', '', 'Mrs'],
    'fraction_total': [.1, .2, .1],
}

df_sample2 = pandas.DataFrame(sample_records2)

df_sample2

Unnamed: 0,last_name,first_name,title,fraction_total
0,Hopper,William,,0.1
1,Hopper,William,,0.2
2,Hopper,William,Mrs,0.1


Ideally I could mash together all 5 datasets at once so the math is more straightforward, but merging dataset-by-dataset shouldn't be too much of a problem. 

Having a bit of trouble determining whether the code bit below actually works or not. I'm pretty sure it doesn't so I'm still fiddling. Currently doing some pandas research to see what methods might work for me.

Also considering:
- abandoning Pandas and using the Jupyter Notebook to manipulate the actual files themselves (i.e. create new CSVs instead of dataframes and read those in)
- changing the structure of my datasets to suit pandas better / create a more meaningful final dataset (as of now there's no way of determining which dataset(s) the authors come from in the final dataset, but I think that would be pretty cool to see)

In [128]:
# Stack the five trimmed frames on top of each other (row-wise, the default).
source_frames = (eebo_df2, estc_df2, open_syllabus_df2, ota_df2, project_gutenberg_df2)
merged_df = pandas.concat(objs=source_frames)
merged_df
# Earlier attempt, left disabled:
#  merged_df.groupby(merged_df.index).mean()
# NOTE(review): concat keeps each source frame's own row labels here (no
# ignore_index), so grouping by merged_df.index would pool rows that merely
# share a row number, not rows that describe the same author. Grouping on
# ['last_name', 'first_name', 'title'] and aggregating fraction_total looks
# closer to the goal described in the text -- TODO confirm.


Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,not_applicable,Anonymous,not_applicable,0.291590,eebo
1,England and Wales,Parliament,not_applicable,0.011711,eebo
2,Sovereign,Charles I,not_applicable,0.010643,eebo
3,not_applicable,England and Wales,not_applicable,0.009838,eebo
4,not_applicable,Church of England,not_applicable,0.009440,eebo
...,...,...,...,...,...
1052,Lu,Xun,not_applicable,0.000149,project_gutenberg
1053,Ovid,not_applicable,not_applicable,0.000149,project_gutenberg
1054,Jean Paul,not_applicable,not_applicable,0.000149,project_gutenberg
1055,Saki,not_applicable,not_applicable,0.000149,project_gutenberg


### let's muck around with multiindexes, shall we?

In [129]:
# eebo_df2
# estc_df2
# open_syllabus_df2
# ota_df2
# project_gutenberg_df2

In [130]:
# Turn every column of the trimmed EEBO frame into one level of a MultiIndex,
# so each row becomes a single tuple of its column values.
eebo_multi = pandas.MultiIndex.from_frame(df=eebo_df2)
eebo_multi

MultiIndex([(   'not_applicable',                   'Anonymous', ...),
            ('England and Wales',                  'Parliament', ...),
            (        'Sovereign',                   'Charles I', ...),
            (   'not_applicable',           'England and Wales', ...),
            (   'not_applicable',           'Church of England', ...),
            (        'Sovereign',                  'Charles II', ...),
            (         'Scotland',               'Privy Council', ...),
            (        'Sovereign',                     'James I', ...),
            (        'Sovereign',                 'Elizabeth I', ...),
            (        'Sternhold',                      'Thomas', ...),
            ('England and Wales', 'Parliament House of Commons', ...),
            (           'Baxter',                     'Richard', ...),
            (              'Fox',                      'George', ...),
            (           'Burnet',                     'Gilbert', ...),
      

In [131]:
# Position 3 in each tuple corresponds to the fourth selected column,
# fraction_total; inspect its value and type for the first EEBO row.
first_row_fraction = eebo_multi[0][3]
print(first_row_fraction)
print(type(first_row_fraction))

0.29158992
<class 'numpy.float64'>


#### maybe pandas.DataFrame.duplicated is the way to go?

In [149]:
# Stack all five trimmed frames row-wise into one long frame.
# Row labels are carried over from each source frame, so they are not unique
# in the result.
frames_to_stack = [eebo_df2, estc_df2, open_syllabus_df2, ota_df2, project_gutenberg_df2]
concat_all = pandas.concat(frames_to_stack, axis='index')
concat_all

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,not_applicable,Anonymous,not_applicable,0.291590,eebo
1,England and Wales,Parliament,not_applicable,0.011711,eebo
2,Sovereign,Charles I,not_applicable,0.010643,eebo
3,not_applicable,England and Wales,not_applicable,0.009838,eebo
4,not_applicable,Church of England,not_applicable,0.009440,eebo
...,...,...,...,...,...
1052,Lu,Xun,not_applicable,0.000149,project_gutenberg
1053,Ovid,not_applicable,not_applicable,0.000149,project_gutenberg
1054,Jean Paul,not_applicable,not_applicable,0.000149,project_gutenberg
1055,Saki,not_applicable,not_applicable,0.000149,project_gutenberg


In [175]:
# Boolean mask over concat_all: True where last_name is exactly "Wood".
# NOTE: despite the name, this does not test the title column for 'Mrs'.
is_mrs = concat_all['last_name'].eq("Wood")
print(is_mrs.head())

0    False
1    False
2    False
3    False
4    False
Name: last_name, dtype: bool


In [176]:
# Keep only the rows flagged by the mask, then report (rows, columns).
concat_all_mrs = concat_all.loc[is_mrs]
print(concat_all_mrs.shape)

(2, 5)


In [177]:
# Show up to the first five of the selected rows as plain text.
print(concat_all_mrs.head(n=5))

    last_name first_name           title  fraction_total            dataset
147      Wood      Henry             Mrs        0.000612  project_gutenberg
700      Wood    William  not_applicable        0.000215  project_gutenberg


In [185]:
# Flag every row whose (last_name, first_name, title) combination occurs more
# than once in concat_all. keep=False marks all occurrences, not just the
# repeats after the first.
author_key_columns = ['last_name', 'first_name', 'title']
has_repeated_key = concat_all.duplicated(subset=author_key_columns, keep=False)
duplicated_concat2 = concat_all[has_repeated_key]

duplicated_concat2

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
0,not_applicable,Anonymous,not_applicable,0.291590,eebo
13,Burnet,Gilbert,not_applicable,0.002240,eebo
14,Dryden,John,not_applicable,0.001999,eebo
17,Penn,William,not_applicable,0.001947,eebo
27,Bacon,Francis,not_applicable,0.001287,eebo
...,...,...,...,...,...
851,Radcliffe,Ann Ward,not_applicable,0.000165,project_gutenberg
874,Bacon,Francis,not_applicable,0.000165,project_gutenberg
903,Milton,John,not_applicable,0.000165,project_gutenberg
942,Huxley,Aldous,not_applicable,0.000149,project_gutenberg


In [186]:
# duplicated_concat2.head(n=40)

In [189]:
# Order the repeated-author rows by first_name, Z to A, and preview the
# first 50 of them.
duplicated_concat2_sorted = duplicated_concat2.sort_values(
    by='first_name',
    ascending=False,
)

duplicated_concat2_sorted.head(50)

Unnamed: 0,last_name,first_name,title,fraction_total,dataset
569,Wordsworth,William,not_applicable,0.000248,project_gutenberg
24,Hayley,William,not_applicable,0.003495,estc
4,Shakespeare,William,not_applicable,0.005409,project_gutenberg
17,Penn,William,not_applicable,0.001947,eebo
20,Gilpin,William,not_applicable,0.009417,ota
12,Hayley,William,not_applicable,0.01159,ota
33,Shakespeare,William,not_applicable,0.001162,eebo
0,Shakespeare,William,not_applicable,0.091391,open_syllabus
9,Shakespeare,William,not_applicable,0.013401,ota
3,Wordsworth,William,not_applicable,0.027481,open_syllabus
