In [38]:
import pandas as pd
import numpy as np
import gutenbergpy.textget
import re
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from plotnine import *
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import utils

In [39]:
# Load in datasets
book_contents_train = utils.load_book_contents(utils.book_authors_train)
book_contents_test = utils.load_book_contents(utils.book_authors_test)

In [40]:
# Tokenize the data
books_train_wtoks = utils.wtok_books(book_contents_train)
books_test_wtoks = utils.wtok_books(book_contents_test)

In [64]:
# Get 100 samples per book of around 1000 words each
from importlib import reload
reload(utils)
book_samples_train = utils.get_samples(books_train_wtoks, 100, [10, 1000], random_seed=42)
book_samples_test = utils.get_samples(books_test_wtoks, 100, [10, 1000], random_seed=42)

## Tabular model

In [91]:
# Do feature engineering
# Use ngram frequency as features
# cd_1grams is the frequency of 1-grams associated with Charles Dickens, for example
data_df_train = utils.get_data_df(book_samples_train, utils.book_authors_train)
data_df_test = utils.get_data_df(book_samples_test, utils.book_authors_test)

      cd_1grams  cd_2grams  cd_3grams  ja_1grams  ja_2grams  ja_3grams  \
0         0.003      0.001      0.000      0.010      0.003      0.001   
1         0.011      0.001      0.000      0.001      0.000      0.000   
2         0.003      0.003      0.000      0.000      0.000      0.000   
3         0.002      0.003      0.001      0.010      0.002      0.001   
4         0.004      0.003      0.001      0.000      0.001      0.000   
...         ...        ...        ...        ...        ...        ...   
1795      0.000      0.000      0.000      0.014      0.006      0.002   
1796      0.001      0.001      0.000      0.027      0.012      0.005   
1797      0.000      0.000      0.000      0.048      0.025      0.006   
1798      0.000      0.000      0.001      0.070      0.012      0.002   
1799      0.000      0.000      0.000      0.014      0.003      0.004   

      hm_1grams  hm_2grams  hm_3grams  book_id  sample_num  
0         0.000      0.001      0.000     46.0    

In [92]:
tgt_cols = data_df_test.columns
tgt_cols = ['author_name']
X_train = data_df_train.drop(tgt_cols,axis=1)
y_train = data_df_train.filter(tgt_cols).to_numpy().ravel()
X_test = data_df_test.drop(tgt_cols,axis=1)
y_test = data_df_test.filter(tgt_cols).to_numpy().ravel()

In [17]:
print(X_train.shape)

(1800, 9)


In [86]:
X_test.columns.tolist()

['cd_1grams',
 'cd_2grams',
 'cd_3grams',
 'ja_1grams',
 'ja_2grams',
 'ja_3grams',
 'hm_1grams',
 'hm_2grams',
 'hm_3grams']

In [18]:
gb_model =  GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)
conf_matrix = pd.DataFrame(metrics.confusion_matrix(y_test, y_pred))
conf_matrix.index = set(y_train)
conf_matrix.columns = conf_matrix.index
display(conf_matrix)
print(metrics.classification_report(y_test, y_pred))
conf_matrix

Accuracy:  0.837


Unnamed: 0,Jane Austen,Charles Dickens,Herman Melville
Jane Austen,255,24,21
Charles Dickens,81,404,15
Herman Melville,4,18,178


                 precision    recall  f1-score   support

Charles Dickens       0.75      0.85      0.80       300
Herman Melville       0.91      0.81      0.85       500
    Jane Austen       0.83      0.89      0.86       200

       accuracy                           0.84      1000
      macro avg       0.83      0.85      0.84      1000
   weighted avg       0.84      0.84      0.84      1000



Unnamed: 0,Jane Austen,Charles Dickens,Herman Melville
Jane Austen,255,24,21
Charles Dickens,81,404,15
Herman Melville,4,18,178


In [19]:
cv_scores = cross_val_score(gb_model, X_train, y_train, cv=5)
cv_scores

array([0.81111111, 0.91666667, 0.89444444, 0.89444444, 0.91944444])

In [20]:
feature_importances = pd.DataFrame([X_train.columns, gb_model.feature_importances_]).T
feature_importances.columns = ['Feature','Importance']
feature_importances.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
5,ja_3grams,0.24418
1,cd_2grams,0.161869
0,cd_1grams,0.157717
7,hm_2grams,0.133258
6,hm_1grams,0.130017
3,ja_1grams,0.071564
4,ja_2grams,0.067519
8,hm_3grams,0.028722
2,cd_3grams,0.005154


In [21]:
pd.DataFrame(dict([(k, pd.Series(v)) for k, v in utils.book_authors_test.items()]))

Unnamed: 0,Charles Dickens,Herman Melville,Jane Austen
0,786.0,4045,946.0
1,580.0,8118,1212.0
2,883.0,2694,
3,,13720,
4,,53861,


## NN model

In [65]:
def get_author_num(data, sample_id):
    sample_id = int(str(sample_id).split("_")[0])

    authors_df = pd.DataFrame(dict([(k, pd.Series(v)) for k, v in data.items()]))
    author_name = pd.melt(authors_df).query("value == @sample_id").variable.tolist()[0]
    return ['Charles Dickens', 'Jane Austen','Herman Melville'].index(author_name)
def stretch(arr, min_len, pad='<PAD>'):
    return arr + (min_len-len(arr))* [pad]
#train_data = [(stretch(words, 1000, '<PAD>'), get_author_num(utils.book_authors_train, sample_id)) for sample_id, words in book_samples_train.items()]
#test_data = [(stretch(words, 1000, '<PAD>'), get_author_num(utils.book_authors_test, sample_id)) for sample_id, words in book_samples_test.items()]
train_data = [(words, get_author_num(utils.book_authors_train, sample_id)) for sample_id, words in book_samples_train.items()]
test_data = [(words, get_author_num(utils.book_authors_test, sample_id)) for sample_id, words in book_samples_test.items()]


In [66]:
train_data[0][0][-10:-1]

['pencil-case', ',', 'a', 'pair', 'of', 'sleeve-buttons', ',', 'and', 'a']

In [67]:
from collections import Counter

all_words = [word for words, _ in train_data for word in words]
word_counts = Counter(all_words)
vocab = {word: idx + 1 for idx, (word, count) in enumerate(word_counts.most_common(1000))}

vocab['<PAD>'] = 0

In [68]:
vocab_df = (
    pd.DataFrame({k: pd.Series(v) for k, v in vocab.items()})
    .T.reset_index()
)
vocab_df.columns = ['name', 'num']
vocab_df.to_csv("../data/vocab.csv", index=False)

In [69]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        words, label = self.data[idx]
        word_indices = [self.vocab.get(word, 0) for word in words]
        return torch.tensor(word_indices), torch.tensor(int(label))  # Assuming labels are integers

In [70]:
import torch.nn as nn

class AuthorClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super(AuthorClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        output = self.fc(lstm_out[:, -1, :])  # Take the last LSTM output for each sequence
        return output

In [71]:
len(train_data[2][0])

710

In [72]:
from torch.utils.data import DataLoader
import torch

torch.manual_seed(42)

# Hyperparameters
embedding_dim = 64
hidden_size = 128
num_classes = len(set(label for _, label in train_data))
vocab_size = len(vocab)

# Model, loss, optimizer
model = AuthorClassifier(vocab_size, embedding_dim, hidden_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

train_dataset = TextDataset(train_data, vocab)
test_dataset = TextDataset(test_data, vocab)

train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [89]:
# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    print(epoch)
    model.train()
    loss_vals = []
    total_correct = 0
    total_samples = 0
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        loss_vals.append(loss.item())
        _, pred_labels = torch.max(outputs, 1)
        total_correct += (pred_labels == labels).sum().item()
        total_samples += labels.size(0)
        optimizer.step()
    print("Loss:", np.mean(loss_vals))
    print("Accuracy: ", 100 * total_correct / total_samples)

0
Loss: 0.00010006673417337926
Accuracy:  100.0
1
Loss: 4.369890660254012e-05
Accuracy:  100.0


In [None]:
torch.save(model.state_dict, "../data/weights_new.pth")

In [None]:
model.load_state_dict(torch.load("../data/nn.pth")())

In [90]:
# Evaluation
model.eval()
total_correct = 0
total_samples = 0

with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        _, predicted = torch.max(outputs, 1)
        total_samples += labels.size(0)
        total_correct += (predicted == labels).sum().item()

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 79.40%
