In [1]:
import pandas as pd
import numpy as np
import gutenbergpy.textget
import re
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from plotnine import *
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import utils

In [2]:
# Load in datasets
book_authors_all = utils.get_book_authors_all()
book_contents = utils.load_book_contents(book_authors_all)

In [3]:
# Tokenize the data
books_wtoks = utils.wtok_books(book_contents)
books_stoks = utils.stok_books(book_contents)

In [6]:
# Get 100 samples per book of 1000 words each
from importlib import reload
reload(utils)
book_samples = utils.get_samples(books_wtoks, 100, 1000, random_seed=42)
samples_df = pd.DataFrame(book_samples)

In [11]:
len(book_samples['46_0'])

1000

In [14]:
# Do feature engineering
# Use ngram frequency as features
# cd_1grams is the frequency of 1-grams associated with Charles Dickens, for example
def get_data_df(book_samples, book_authors):
    ref_grams = {}
    ref_grams[1] = {
        'cd': [('t',), ('don',), ('boy',), ('until',), ('stopped',), ('hair',), ('d',), ('streets',), ('shook',), ('shaking',)],
        'ja':[('her',), ('she',), ('She',), ('Mrs.',), ('herself',), ('sister',), ('father',), ('Lady',), ('wish',), ('Sir',)],
        'hm':[('sea',), ('strange',), ('THE',), ('Nor',), ('board',), ('ye',), ('ere',), ('peculiar',), ('concerning',), ('original',)]
    }
    ref_grams[2] = {
        'cd':[('’', 't'), ('don', '’'), (',', 'Mr.'), ('said', 'the'), ('his', 'head'), ('the', 'fire'), (',', 'looking'), ('I', 'said'), ('s', 'a'), ('“', 'Now')],
        'ja':[('.', 'She'), (',', 'she'), ('of', 'her'), ('she', 'had'), ('could', 'not'), ('to', 'her'), ('she', 'was'), ('that', 'she'), ('do', 'not'), ('she', 'could')],
        'hm':[(',', 'then'), (',', 'yet'), (';', 'in'), ('.', 'Nor'), ('so', 'that'), ('when', ','), ('.', 'Some'), ('though', ','), (';', 'while'), ('.', 'Upon')]
    }
    ref_grams[3] = {
        'cd': [('don', '’', 't'), ('!', '”', 'said'), ('?', '”', 'said'), ('’', 's', 'a'), ('.', '“', 'Now'), ('.', 'I', 'had'), ('as', 'if', 'he'), ('“', 'Now', ','), ('he', 'said', ','), ('.', '“', 'Yes')],
        'ja': [(',', 'however', ','), ('I', 'am', 'sure'), ('I', 'do', 'not'), (',', 'and', 'she'), ('.', 'She', 'was'), ('she', 'could', 'not'), ('.', 'She', 'had'), (',', 'she', 'was'), (';', 'and', 'she'), ('“', 'Oh', '!')],
        'hm': [(',', 'then', ','), (',', 'who', ','), ('.', 'But', 'the'), ('“', 'I', 'would'), (',', 'like', 'the'), ('that', ',', 'in'), (',', 'that', 'in'), ('answer', '.', '“'), ('out', 'of', 'sight'), (',', 'in', 'some')]
    }
    data_dict = {}
    for sample_id, words in book_samples.items():
        sample_row = {}
        get_ngrams = lambda words, gram_length: pd.Series(sorted(ngrams(words, gram_length))).value_counts()
        top_grams = {}
        # Calculate 1 to 3-grarms
        for gram_length in range(1, 4):
            top_grams[gram_length] = get_ngrams(words, gram_length)
        # Find the number of reference ngrams by author in each sample
        for author in ref_grams[1].keys():
            for gram_length in range(1, 4):
                top_grams_count = top_grams[gram_length]
                # Uese only the first 5 ngrams
                author_ref_grams = ref_grams[gram_length][author][0:5]
                author_grams_count = top_grams_count.reindex(author_ref_grams)
                # Normalize it by the length of the text
                sample_row[f'{author}_{gram_length}grams'] = author_grams_count.sum() / len(words)
        data_dict[sample_id] = sample_row
    # Create the initial data frame
    data_df = pd.DataFrame(data_dict).T
    data_df = (
        data_df
        .reset_index()
        .rename(columns={'index':'sample_id'})
    )
    # Clean data, attack to book authors 
    data_df = (
        data_df
        .assign(book_id=lambda x: x.sample_id.str.split("_").apply(lambda y: y[0]).astype(float))
        .assign(sample_num=lambda x: x.sample_id.str.split("_").apply(lambda y: y[1]).astype(float))
        .drop('sample_id', axis=1)
    )
    book_authors_df = pd.melt(pd.DataFrame.from_dict({k:pd.Series(v) for k, v in book_authors.items()}))
    book_authors_df.columns = ['author_name', 'book_id']
    data_df = data_df.merge(book_authors_df, on='book_id', how='left')
    data_df = data_df.drop(['book_id','sample_num'], axis=1)
    return data_df
    #base_df = pd.DataFrame(pd.Series(book_samples)).reset_index()
    #base_df.columns = ['sample','words']
data_df = get_data_df(book_samples, book_authors_all)
data_df

Unnamed: 0,cd_1grams,cd_2grams,cd_3grams,ja_1grams,ja_2grams,ja_3grams,hm_1grams,hm_2grams,hm_3grams,author_name
0,0.002,0.001,0.000,0.009,0.001,0.000,0.000,0.001,0.000,Charles Dickens
1,0.011,0.001,0.000,0.000,0.000,0.000,0.000,0.000,0.000,Charles Dickens
2,0.003,0.002,0.000,0.000,0.000,0.000,0.003,0.000,0.000,Charles Dickens
3,0.001,0.001,0.000,0.009,0.002,0.001,0.000,0.001,0.000,Charles Dickens
4,0.001,0.002,0.000,0.000,0.001,0.000,0.000,0.002,0.001,Charles Dickens
...,...,...,...,...,...,...,...,...,...,...
2795,0.000,0.000,0.000,0.000,0.000,0.001,0.000,0.001,0.001,Jane Austen
2796,0.000,0.000,0.000,0.025,0.004,0.001,0.000,0.001,0.000,Jane Austen
2797,0.001,0.000,0.000,0.036,0.011,0.001,0.000,0.000,0.000,Jane Austen
2798,0.001,0.001,0.000,0.009,0.001,0.000,0.000,0.000,0.000,Jane Austen


In [15]:
tgt_cols = data_df.columns
tgt_cols = ['author_name']
X = data_df.drop(tgt_cols,axis=1)
y = data_df.filter(tgt_cols).to_numpy().ravel()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    random_state = 42)

In [16]:
gb_model =  GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
conf_matrix = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
conf_matrix.index = set(y_train)
conf_matrix.columns = conf_matrix.index
display(conf_matrix)
print(metrics.classification_report(y_test, y_pred))
conf_matrix

Accuracy:  0.8928571428571429


Unnamed: 0,Jane Austen,Herman Melville,Charles Dickens
Jane Austen,149,17,12
Herman Melville,15,208,3
Charles Dickens,7,6,143


                 precision    recall  f1-score   support

Charles Dickens       0.87      0.84      0.85       178
Herman Melville       0.90      0.92      0.91       226
    Jane Austen       0.91      0.92      0.91       156

       accuracy                           0.89       560
      macro avg       0.89      0.89      0.89       560
   weighted avg       0.89      0.89      0.89       560



Unnamed: 0,Jane Austen,Herman Melville,Charles Dickens
Jane Austen,149,17,12
Herman Melville,15,208,3
Charles Dickens,7,6,143


In [17]:
cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5)
cv_scores

array([0.890625  , 0.82366071, 0.83705357, 0.86830357, 0.875     ])

In [18]:
gb_model.feature_importances_

array([0.12752857, 0.21378603, 0.00882246, 0.18125047, 0.17196394,
       0.06115826, 0.12355298, 0.06663789, 0.0452994 ])

In [20]:
X_train.sort_values

TypeError: DataFrame.sort_values() missing 1 required positional argument: 'by'