In [1]:
import pandas as pd
import numpy as np
import gutenbergpy.textget
import re
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from plotnine import *
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import utils

In [2]:
# Load in datasets
book_contents_train = utils.load_book_contents(utils.book_authors_train)
book_contents_test = utils.load_book_contents(utils.book_authors_test)

In [3]:
# Tokenize the data
books_train_wtoks = utils.wtok_books(book_contents_train)
books_test_wtoks = utils.wtok_books(book_contents_test)

In [4]:
# Get 100 samples per book of around 1000 words each
from importlib import reload
reload(utils)
book_samples_train = utils.get_samples(books_train_wtoks, 100, [10, 1000], random_seed=42)
book_samples_test = utils.get_samples(books_test_wtoks, 100, [10, 1000], random_seed=42)

## Tabular model

In [5]:
# Do feature engineering
# Use ngram frequency as features
# cd_1grams is the frequency of 1-grams associated with Charles Dickens, for example
data_df_train = utils.get_data_df(book_samples_train, utils.book_authors_train)
data_df_test = utils.get_data_df(book_samples_test, utils.book_authors_test)

      cd_1grams  cd_2grams  cd_3grams  ja_1grams  ja_2grams  ja_3grams  \
0      0.000000   0.000000   0.000000   0.008929   0.008929   0.000000   
1      0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
2      0.002817   0.004225   0.001408   0.011268   0.002817   0.001408   
3      0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
4      0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
...         ...        ...        ...        ...        ...        ...   
1795   0.000000   0.000000   0.000000   0.045016   0.017685   0.003215   
1796   0.000000   0.000000   0.000000   0.022222   0.000000   0.011111   
1797   0.000000   0.000000   0.000000   0.000000   0.000000   0.000000   
1798   0.000000   0.000000   0.000000   0.034783   0.017391   0.006957   
1799   0.000000   0.000000   0.000000   0.055034   0.030872   0.004027   

      hm_1grams  hm_2grams  hm_3grams  book_id  sample_num  
0      0.000000   0.000000        0.0     46.0    

In [6]:
tgt_cols = data_df_test.columns
tgt_cols = ['author_name']
X_train = data_df_train.drop(tgt_cols,axis=1)
y_train = data_df_train.filter(tgt_cols).to_numpy().ravel()
X_test = data_df_test.drop(tgt_cols,axis=1)
y_test = data_df_test.filter(tgt_cols).to_numpy().ravel()

In [7]:
print(X_train.shape)

(1800, 9)


In [8]:
X_test.columns.tolist()

['cd_1grams',
 'cd_2grams',
 'cd_3grams',
 'ja_1grams',
 'ja_2grams',
 'ja_3grams',
 'hm_1grams',
 'hm_2grams',
 'hm_3grams']

In [9]:
gb_model =  GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
conf_matrix = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
conf_matrix.index = set(y_train)
conf_matrix.columns = conf_matrix.index
display(conf_matrix)
print(metrics.classification_report(y_test, y_pred))
conf_matrix

Accuracy:  0.742


Unnamed: 0,Charles Dickens,Jane Austen,Herman Melville
Charles Dickens,218,47,35
Jane Austen,97,371,32
Herman Melville,13,34,153


                 precision    recall  f1-score   support

Charles Dickens       0.66      0.73      0.69       300
Herman Melville       0.82      0.74      0.78       500
    Jane Austen       0.70      0.77      0.73       200

       accuracy                           0.74      1000
      macro avg       0.73      0.74      0.73      1000
   weighted avg       0.75      0.74      0.74      1000



Unnamed: 0,Charles Dickens,Jane Austen,Herman Melville
Charles Dickens,218,47,35
Jane Austen,97,371,32
Herman Melville,13,34,153


In [10]:
cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5)
cv_scores

array([0.72222222, 0.76388889, 0.81388889, 0.78055556, 0.79722222])

In [11]:
feature_importances = pd.DataFrame([X_train.columns, gb_model.feature_importances_]).T
feature_importances.columns = ['Feature','Importance']
feature_importances.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
3,ja_1grams,0.302603
0,cd_1grams,0.204245
4,ja_2grams,0.110881
5,ja_3grams,0.110865
6,hm_1grams,0.087898
1,cd_2grams,0.086048
7,hm_2grams,0.053378
2,cd_3grams,0.022582
8,hm_3grams,0.021499


In [12]:
pd.DataFrame(dict([(k, pd.Series(v)) for k, v in utils.book_authors_test.items()]))

Unnamed: 0,Charles Dickens,Herman Melville,Jane Austen
0,786.0,4045,946.0
1,580.0,8118,1212.0
2,883.0,2694,
3,,13720,
4,,53861,


## NN model

In [13]:
def get_author_num(data, sample_id):
    sample_id = int(str(sample_id).split("_")[0])

    authors_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data.items()]))
    author_name = pd.melt(authors_df).query("value == @sample_id").variable.tolist()[0]
    return ['Charles Dickens', 'Jane Austen','Herman Melville'].index(author_name)
def stretch(arr, min_len, pad='<PAD>'):
    return arr + (min_len-len(arr))* [pad]
#train_data = [(stretch(words, 1000, '<PAD>'), get_author_num(utils.book_authors_train, sample_id)) for sample_id, words in book_samples_train.items()]
#test_data = [(stretch(words, 1000, '<PAD>'), get_author_num(utils.book_authors_test, sample_id)) for sample_id, words in book_samples_test.items()]
train_data = [(words, get_author_num(utils.book_authors_train, sample_id)) for sample_id, words in book_samples_train.items()]
test_data = [(words, get_author_num(utils.book_authors_test, sample_id)) for sample_id, words in book_samples_test.items()]


In [14]:
train_data[0][0][-10:-1]

['pencil-case', ',', 'a', 'pair', 'of', 'sleeve-buttons', ',', 'and', 'a']

In [15]:
from collections import Counter

all_words = [word for words, _ in train_data for word in words]
word_counts = Counter(all_words)
vocab = {word: idx + 1 for idx, (word, count) in enumerate(word_counts.most_common(1000))}

vocab['<PAD>'] = 0

In [16]:
vocab_df = (
    pd.DataFrame({k: pd.Series(v) for k, v in vocab.items()})
    .T.reset_index()
)
vocab_df.columns = ['name', 'num']
vocab_df.to_csv("../data/vocab.csv", index=False)

In [17]:
vocab

{',': 1,
 'the': 2,
 '.': 3,
 'and': 4,
 'of': 5,
 'to': 6,
 'a': 7,
 'I': 8,
 'in': 9,
 ';': 10,
 'was': 11,
 'that': 12,
 '“': 13,
 '”': 14,
 'it': 15,
 'his': 16,
 'he': 17,
 '’': 18,
 'her': 19,
 'not': 20,
 'with': 21,
 'as': 22,
 'had': 23,
 'you': 24,
 'for': 25,
 'be': 26,
 'at': 27,
 'is': 28,
 '!': 29,
 'have': 30,
 'him': 31,
 'my': 32,
 'on': 33,
 'she': 34,
 '?': 35,
 'but': 36,
 'by': 37,
 'all': 38,
 's': 39,
 'me': 40,
 'so': 41,
 'from': 42,
 'which': 43,
 'this': 44,
 'said': 45,
 'were': 46,
 'would': 47,
 'been': 48,
 'or': 49,
 'one': 50,
 "''": 51,
 'no': 52,
 'they': 53,
 'an': 54,
 'very': 55,
 'The': 56,
 'could': 57,
 'Mr.': 58,
 '``': 59,
 'there': 60,
 'them': 61,
 'are': 62,
 'when': 63,
 'their': 64,
 'what': 65,
 'if': 66,
 'more': 67,
 'do': 68,
 'But': 69,
 '--': 70,
 'any': 71,
 'will': 72,
 'some': 73,
 'out': 74,
 'now': 75,
 'upon': 76,
 'who': 77,
 'your': 78,
 'It': 79,
 'than': 80,
 'into': 81,
 'He': 82,
 'such': 83,
 'man': 84,
 'little': 85,
 

In [18]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        words, label = self.data[idx]
        word_indices = [self.vocab.get(word, 0) for word in words]
        return torch.tensor(word_indices), torch.tensor(int(label))  # Assuming labels are integers

In [19]:
import torch.nn as nn

class AuthorClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(AuthorClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])  # Take the last LSTM output for each sequence
        return output

In [20]:
len(train_data[2][0])

710

In [31]:
from torch.utils.data import DataLoader
import torch

torch.manual_seed(42)

# Hyperparameters
embedding_dim = 64
hidden_size = 128
num_classes = len(set(label for _, label in train_data))
vocab_size = len(vocab)

# Model, loss, optimizer
model = AuthorClassifier(vocab_size, embedding_dim, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_dataset = TextDataset(train_data, vocab)
test_dataset = TextDataset(test_data, vocab)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [39]:
# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    print(epoch)
    model.train()
    loss_vals = []
    total_correct = 0
    total_samples = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        loss_vals.append(loss.item())
        _, pred_labels = torch.max(outputs, 1)
        total_correct += (pred_labels == labels).sum().item()
        total_samples += labels.size(0)
        optimizer.step()
    print("Loss:", np.mean(loss_vals))
    print("Accuracy: ", 100 * total_correct / total_samples)

0
Loss: 0.030522226322427237
Accuracy:  99.0
1
Loss: 0.027134019190254276
Accuracy:  99.11111111111111
2
Loss: 0.03455847775734697
Accuracy:  99.22222222222223
3
Loss: 0.004879653746814358
Accuracy:  99.94444444444444
4
Loss: 0.021396659104736607
Accuracy:  99.27777777777777


In [40]:
torch.save(model.state_dict, "../data/weights.pth")

In [41]:
model.load_state_dict(torch.load("../data/weights.pth")())

<All keys matched successfully>

In [42]:
# Evaluation
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total_samples += labels.size(0)
        total_correct += (predicted == labels).sum().item()

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 78.40%
