In [1]:
import pandas as pd
import numpy as np
import gutenbergpy.textget
import re
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from plotnine import *
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import utils

In [2]:
# Load in datasets)
book_contents_train = utils.load_book_contents(utils.book_authors_train)
book_contents_test = utils.load_book_contents(utils.book_authors_test)

In [3]:
# Tokenize the data
books_train_wtoks = utils.wtok_books(book_contents_train)
books_test_wtoks = utils.wtok_books(book_contents_test)

In [4]:
# Get 100 samples per book of around 1000 words each
from importlib import reload
reload(utils)
book_samples_train = utils.get_samples(books_train_wtoks, 100, [900, 1100], random_seed=42)
book_samples_test = utils.get_samples(books_test_wtoks, 100, [900, 1100], random_seed=42)

In [5]:
# Do feature engineering
# Use ngram frequency as features
# cd_1grams is the frequency of 1-grams associated with Charles Dickens, for example
data_df_train = utils.get_data_df(book_samples_train, utils.book_authors_train)
data_df_test = utils.get_data_df(book_samples_test, utils.book_authors_test)

In [6]:
tgt_cols = data_df_test.columns
tgt_cols = ['author_name']
X_train = data_df_train.drop(tgt_cols,axis=1)
y_train = data_df_train.filter(tgt_cols).to_numpy().ravel()
X_test = data_df_test.drop(tgt_cols,axis=1)
y_test = data_df_test.filter(tgt_cols).to_numpy().ravel()

In [7]:
print(X_train.shape)

(1800, 9)


In [8]:
gb_model =  GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
conf_matrix = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
conf_matrix.index = set(y_train)
conf_matrix.columns = conf_matrix.index
display(conf_matrix)
print(metrics.classification_report(y_test, y_pred))
conf_matrix

Accuracy:  0.811


Unnamed: 0,Charles Dickens,Herman Melville,Jane Austen
Charles Dickens,244,35,21
Herman Melville,82,398,20
Jane Austen,7,24,169


                 precision    recall  f1-score   support

Charles Dickens       0.73      0.81      0.77       300
Herman Melville       0.87      0.80      0.83       500
    Jane Austen       0.80      0.84      0.82       200

       accuracy                           0.81      1000
      macro avg       0.80      0.82      0.81      1000
   weighted avg       0.82      0.81      0.81      1000



Unnamed: 0,Charles Dickens,Herman Melville,Jane Austen
Charles Dickens,244,35,21
Herman Melville,82,398,20
Jane Austen,7,24,169


In [9]:
cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5)
cv_scores

array([0.80555556, 0.88333333, 0.91111111, 0.89444444, 0.94444444])

In [None]:
gb_model.feature_importances_