In [1]:
import pandas as pd
import numpy as np
import gutenbergpy.textget
import re
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from plotnine import *

In [2]:
# Create training and testing datasets
book_authors_train = {
    'Charles Dickens': [46, 98, 1400, 730, 766, 1023],
    'Herman Melville': [2701, 11231, 15859, 21816, 34970, 10712],
    'Jane Austen': [1342, 158, 161, 105, 121, 141]
}
book_authors_test = {
    'Charles Dickens': [786, 580, 883],
    'Herman Melville': [4045, 8118, 2694, 13720, 53861],
    'Jane Austen': [946, 1212]
}
books_train =  [book_id for id_list in book_authors_train.values() for book_id in id_list]
books_test =  [book_id for id_list in book_authors_test.values() for book_id in id_list]
book_authors = {}
for key in book_authors_train.keys():
    book_authors[key] = book_authors_train[key] + book_authors_test[key]
    

In [3]:
# Load in the text of the books from Project Gutenberg
book_contents = {}
for book_id in [book_id for id_list in book_authors.values() for book_id in id_list]:
    # Load in the book
    raw_book = gutenbergpy.textget.get_text_by_id(book_id)
    clean_book = gutenbergpy.textget.strip_headers(raw_book)
    # Convert to string
    book = clean_book.decode('UTF-8')
    # Save the book contents
    book_contents[book_id] = book

In [4]:
# Tokenize each book into words and sentences
books_wtoks = {}
books_stoks = {}
for book_id, book in book_contents.items():
    # Using RegexpTokenizer with \w+ keeps only alphabetic words
    # Using word_tokenize() keeps both alphabetic words and also counts puncutation as words
    tokenizer = RegexpTokenizer(r'\w+')
    #books_wtoks[book_id] = tokenizer.tokenize(book)
    books_wtoks[book_id] = word_tokenize(book)
    books_stoks[book_id] = sent_tokenize(book)

In [5]:
# Get 100 samples per book of 1000 words each
book_samples = {}
for book_id, words in books_wtoks.items():
    np.random.seed(42)
    book_len = len(words)
    for i in range(0, 100):
        start_idx = int(np.random.random(1)[0] * (book_len - 1000))
        print(start_idx)
        sample = words[start_idx:start_idx+1000]
        print(sample[0:10])
        book_samples[f'{book_id}_{i}'] = sample
samples_df = pd.DataFrame(book_samples)

12977
['Mrs.', 'Fezziwig', '.', 'Top', 'couple', ',', 'too', ';', 'with', 'a']
32941
['a', 'feather', ',', 'I', 'am', 'as', 'happy', 'as', 'an', 'angel']
25362
['might', 'be', 'a', 'claw', ',', 'for', 'the', 'flesh', 'there', 'is']
20742
['hear', 'the', 'Insect', 'on', 'the', 'leaf', 'pronouncing', 'on', 'the', 'too']
5405
['and', 'his', 'coat-skirts', ',', 'and', 'the', 'hair', 'upon', 'his', 'head']
5405
['and', 'his', 'coat-skirts', ',', 'and', 'the', 'hair', 'upon', 'his', 'head']
2012
['sir', ',', "''", 'he', 'added', ',', 'turning', 'to', 'his', 'nephew']
30012
['he', 'felt', 'ashamed', ',', 'and', 'which', 'he', 'struggled', 'to', 'repress']
20828
[',', 'reddening', '.', '``', 'I', 'wish', 'I', 'had', 'him', 'here']
24534
["'s", 'nephew', 'had', 'to', 'think', 'of', 'something', ',', 'and', 'the']
713
['thaw', 'it', 'one', 'degree', 'at', 'Christmas', '.', 'External', 'heat', 'and']
33606
['and', 'buy', 'it', ',', 'and', 'tell', "'em", 'to', 'bring', 'it']
28843
['I', 'dare', 's

In [10]:
def get_data_df(book_samples, book_authors):
    ref_grams = {}
    ref_grams[1] = {
        'cd': [('t',), ('don',), ('boy',), ('until',), ('stopped',), ('hair',), ('d',), ('streets',), ('shook',), ('shaking',)],
        'ja':[('her',), ('she',), ('She',), ('Mrs.',), ('herself',), ('sister',), ('father',), ('Lady',), ('wish',), ('Sir',)],
        'hm':[('sea',), ('strange',), ('THE',), ('Nor',), ('board',), ('ye',), ('ere',), ('peculiar',), ('concerning',), ('original',)]
    }
    ref_grams[2] = {
        'cd':[('’', 't'), ('don', '’'), (',', 'Mr.'), ('said', 'the'), ('his', 'head'), ('the', 'fire'), (',', 'looking'), ('I', 'said'), ('s', 'a'), ('“', 'Now')],
        'ja':[('.', 'She'), (',', 'she'), ('of', 'her'), ('she', 'had'), ('could', 'not'), ('to', 'her'), ('she', 'was'), ('that', 'she'), ('do', 'not'), ('she', 'could')],
        'hm':[(',', 'then'), (',', 'yet'), (';', 'in'), ('.', 'Nor'), ('so', 'that'), ('when', ','), ('.', 'Some'), ('though', ','), (';', 'while'), ('.', 'Upon')]
    }
    ref_grams[3] = {
        'cd': [('don', '’', 't'), ('!', '”', 'said'), ('?', '”', 'said'), ('’', 's', 'a'), ('.', '“', 'Now'), ('.', 'I', 'had'), ('as', 'if', 'he'), ('“', 'Now', ','), ('he', 'said', ','), ('.', '“', 'Yes')],
        'ja': [(',', 'however', ','), ('I', 'am', 'sure'), ('I', 'do', 'not'), (',', 'and', 'she'), ('.', 'She', 'was'), ('she', 'could', 'not'), ('.', 'She', 'had'), (',', 'she', 'was'), (';', 'and', 'she'), ('“', 'Oh', '!')],
        'hm': [(',', 'then', ','), (',', 'who', ','), ('.', 'But', 'the'), ('“', 'I', 'would'), (',', 'like', 'the'), ('that', ',', 'in'), (',', 'that', 'in'), ('answer', '.', '“'), ('out', 'of', 'sight'), (',', 'in', 'some')]
    }
    data_dict = {}
    for sample_id, words in book_samples.items():
        #sample_id = '883_4'
        #words = book_samples[sample_id]
        #print(sample_id)
        #print(words)
        sample_row = {}
        get_ngrams = lambda words, gram_length: pd.Series(sorted(ngrams(words, gram_length))).value_counts()
        top_grams = {}
        # Calculate 1 to 3-grarms
        for gram_length in range(1, 4):
            top_grams[gram_length] = get_ngrams(words, gram_length)
        # Find the number of reference ngrams by author in each sample
        for author in ref_grams[1].keys():
            for gram_length in range(1, 4):
                top_grams_count = top_grams[gram_length]
                author_ref_grams = ref_grams[gram_length][author]
                author_grams_count = top_grams_count.reindex(author_ref_grams)
                #print(author_grams_count)
                sample_row[f'{author}_{gram_length}grams'] = author_grams_count.sum()
        data_dict[sample_id] = sample_row
    data_df = pd.DataFrame(data_dict).T
    data_df = (
        data_df
        .reset_index()
        .rename(columns={'index':'sample_id'})
    )
    return data_df
    #base_df = pd.DataFrame(pd.Series(book_samples)).reset_index()
    #base_df.columns = ['sample','words']
orig_data_df = get_data_df(book_samples, book_authors)
orig_data_df

Unnamed: 0,sample_id,cd_1grams,cd_2grams,cd_3grams,ja_1grams,ja_2grams,ja_3grams,hm_1grams,hm_2grams,hm_3grams
0,46_0,3.0,1.0,0.0,10.0,3.0,1.0,0.0,1.0,0.0
1,46_1,11.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,46_2,3.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0
3,46_3,2.0,3.0,1.0,10.0,2.0,1.0,0.0,1.0,0.0
4,46_4,4.0,3.0,1.0,0.0,1.0,0.0,0.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...
2795,1212_95,0.0,1.0,0.0,4.0,1.0,4.0,1.0,1.0,1.0
2796,1212_96,0.0,0.0,0.0,25.0,6.0,1.0,0.0,1.0,0.0
2797,1212_97,1.0,0.0,0.0,39.0,16.0,2.0,1.0,0.0,0.0
2798,1212_98,1.0,1.0,0.0,14.0,2.0,2.0,0.0,0.0,0.0


In [21]:
data_df = (
    orig_data_df
    .assign(book_id=lambda x: x.sample_id.str.split("_").apply(lambda y: y[0]).astype(float))
    .assign(sample_num=lambda x: x.sample_id.str.split("_").apply(lambda y: y[1]).astype(float))
    .drop('sample_id', axis=1)
)
book_authors_df = pd.melt(pd.DataFrame.from_dict({k:pd.Series(v) for k, v in book_authors.items()}))
book_authors_df.columns = ['author_name', 'book_id']
print(book_authors_df)
data_df = data_df.merge(book_authors_df, on='book_id', how='left')
data_df = data_df.drop(['book_id','sample_num'], axis=1)
target_1hot = pd.get_dummies(data_df.author_name)
data_df = 
data_df

        author_name  book_id
0   Charles Dickens     46.0
1   Charles Dickens     98.0
2   Charles Dickens   1400.0
3   Charles Dickens    730.0
4   Charles Dickens    766.0
5   Charles Dickens   1023.0
6   Charles Dickens    786.0
7   Charles Dickens    580.0
8   Charles Dickens    883.0
9   Charles Dickens      NaN
10  Charles Dickens      NaN
11  Herman Melville   2701.0
12  Herman Melville  11231.0
13  Herman Melville  15859.0
14  Herman Melville  21816.0
15  Herman Melville  34970.0
16  Herman Melville  10712.0
17  Herman Melville   4045.0
18  Herman Melville   8118.0
19  Herman Melville   2694.0
20  Herman Melville  13720.0
21  Herman Melville  53861.0
22      Jane Austen   1342.0
23      Jane Austen    158.0
24      Jane Austen    161.0
25      Jane Austen    105.0
26      Jane Austen    121.0
27      Jane Austen    141.0
28      Jane Austen    946.0
29      Jane Austen   1212.0
30      Jane Austen      NaN
31      Jane Austen      NaN
32      Jane Austen      NaN


Unnamed: 0,cd_1grams,cd_2grams,cd_3grams,ja_1grams,ja_2grams,ja_3grams,hm_1grams,hm_2grams,hm_3grams,author_name
0,3.0,1.0,0.0,10.0,3.0,1.0,0.0,1.0,0.0,Charles Dickens
1,11.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Charles Dickens
2,3.0,3.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,Charles Dickens
3,2.0,3.0,1.0,10.0,2.0,1.0,0.0,1.0,0.0,Charles Dickens
4,4.0,3.0,1.0,0.0,1.0,0.0,0.0,2.0,1.0,Charles Dickens
...,...,...,...,...,...,...,...,...,...,...
2795,0.0,1.0,0.0,4.0,1.0,4.0,1.0,1.0,1.0,Jane Austen
2796,0.0,0.0,0.0,25.0,6.0,1.0,0.0,1.0,0.0,Jane Austen
2797,1.0,0.0,0.0,39.0,16.0,2.0,1.0,0.0,0.0,Jane Austen
2798,1.0,1.0,0.0,14.0,2.0,2.0,0.0,0.0,0.0,Jane Austen


In [22]:
pd.get_dummies(data_df.author_name)

Unnamed: 0,Charles Dickens,Herman Melville,Jane Austen
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
...,...,...,...
2795,0,0,1
2796,0,0,1
2797,0,0,1
2798,0,0,1
