In [1]:
import pandas as pd
import numpy as np
import gutenbergpy.textget
import re
import string
from nltk import ngrams
from nltk.tokenize import word_tokenize, sent_tokenize, RegexpTokenizer
from plotnine import *
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics
import utils

In [2]:
# Load the raw book contents for the train and test author/book mappings.
book_contents_train, book_contents_test = (
    utils.load_book_contents(book_authors)
    for book_authors in (utils.book_authors_train, utils.book_authors_test)
)

In [3]:
# Word-tokenize both splits with the same helper (train first, then test).
books_train_wtoks, books_test_wtoks = map(
    utils.wtok_books, (book_contents_train, book_contents_test)
)

In [4]:
# Get 100 samples per book of around 1000 words each
from importlib import reload

# Re-import utils so edits made to the module since kernel start are picked up.
reload(utils)

# Same sampling arguments and seed for both splits; [1000, 1000] is passed
# through unchanged to utils.get_samples (presumably a length range — confirm there).
book_samples_train, book_samples_test = (
    utils.get_samples(wtoks, 100, [1000, 1000], random_seed=42)
    for wtoks in (books_train_wtoks, books_test_wtoks)
)

In [5]:
# Do feature engineering
# Use ngram frequency as features
# cd_1grams is the frequency of 1-grams associated with Charles Dickens, for example
data_df_train, data_df_test = (
    utils.get_data_df(samples, book_authors)
    for samples, book_authors in (
        (book_samples_train, utils.book_authors_train),
        (book_samples_test, utils.book_authors_test),
    )
)

In [6]:
# Split each feature table into predictors (X) and the author label (y).
# The earlier `tgt_cols = data_df_test.columns` assignment was dead code: it was
# overwritten on the very next line, so it is dropped here.
tgt_cols = ['author_name']

# Select the target with [] rather than .filter(): .filter() silently returns an
# empty frame when the column is missing, whereas [] raises a KeyError.
X_train = data_df_train.drop(columns=tgt_cols)
y_train = data_df_train[tgt_cols].to_numpy().ravel()
X_test = data_df_test.drop(columns=tgt_cols)
y_test = data_df_test[tgt_cols].to_numpy().ravel()

In [7]:
# Sanity check: (rows, columns) of the training feature table.
print(X_train.shape)

(1800, 9)


In [8]:
# Fit a gradient-boosted tree classifier on the n-gram features and score it on
# the held-out books. The seed is fixed so a re-run reproduces the same model.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc)

# Label the confusion matrix with the classifier's own class order and pass that
# same order to confusion_matrix. The previous code used set(y_train), whose
# iteration order is arbitrary and can differ from the sorted order sklearn
# uses, silently mislabelling rows/columns.
# Rows are true authors, columns are predicted authors.
class_labels = gb_model.classes_
conf_matrix = pd.DataFrame(
    metrics.confusion_matrix(y_test, y_pred, labels=class_labels),
    index=class_labels,
    columns=class_labels,
)
display(conf_matrix)
print(metrics.classification_report(y_test, y_pred))
conf_matrix

Accuracy:  0.834


Unnamed: 0,Charles Dickens,Herman Melville,Jane Austen
Charles Dickens,254,26,20
Herman Melville,83,402,15
Jane Austen,4,18,178


                 precision    recall  f1-score   support

Charles Dickens       0.74      0.85      0.79       300
Herman Melville       0.90      0.80      0.85       500
    Jane Austen       0.84      0.89      0.86       200

       accuracy                           0.83      1000
      macro avg       0.83      0.85      0.83      1000
   weighted avg       0.84      0.83      0.84      1000



Unnamed: 0,Charles Dickens,Herman Melville,Jane Austen
Charles Dickens,254,26,20
Herman Melville,83,402,15
Jane Austen,4,18,178


In [9]:
# 5-fold cross-validation of the boosted model on the training features only.
cv_scores = cross_val_score(estimator=gb_model, X=X_train, y=y_train, cv=5)
cv_scores

array([0.82222222, 0.91111111, 0.89444444, 0.88888889, 0.92222222])

In [36]:
# Pair each feature name with its importance in the fitted boosted model.
# Building the frame column-by-column keeps 'Importance' as a float column;
# stacking names and floats into one 2-row frame and transposing (the previous
# approach) left both columns with object dtype.
feature_importances = pd.DataFrame({
    'Feature': X_train.columns.to_numpy(),
    'Importance': gb_model.feature_importances_,
})
feature_importances.sort_values('Importance', ascending=False)

Unnamed: 0,Feature,Importance
5,ja_3grams,0.246483
1,cd_2grams,0.167665
0,cd_1grams,0.151983
7,hm_2grams,0.133599
6,hm_1grams,0.12914
3,ja_1grams,0.072968
4,ja_2grams,0.064822
8,hm_3grams,0.028246
2,cd_3grams,0.005094


In [11]:
# Show the test book ids per author; wrapping each list in a Series lets
# authors with different numbers of books share one table (gaps become NaN).
pd.DataFrame({
    author: pd.Series(book_ids)
    for author, book_ids in utils.book_authors_test.items()
})

Unnamed: 0,Charles Dickens,Herman Melville,Jane Austen
0,786.0,4045,946.0
1,580.0,8118,1212.0
2,883.0,2694,
3,,13720,
4,,53861,


In [12]:
def get_author_num(data, sample_id,
                   author_order=('Charles Dickens', 'Jane Austen', 'Herman Melville')):
    """Return the integer class label of the author whose book a sample came from.

    Parameters
    ----------
    data : dict
        Maps author name -> iterable of book ids written by that author.
    sample_id : str or int
        Sample key. The text before the first "_" is parsed as the integer
        book id (a plain integer id is accepted as well).
    author_order : sequence of str, optional
        The position of an author name in this sequence is the label that is
        returned. The default keeps the original hard-coded ordering.

    Returns
    -------
    int
        Index of the matching author within ``author_order``.

    Raises
    ------
    IndexError
        If no author in ``data`` lists the book id.
    ValueError
        If the matching author is not present in ``author_order``.
    """
    book_id = int(str(sample_id).split("_")[0])

    # Scan the mapping directly instead of rebuilding and melting a DataFrame on
    # every call; authors are checked in dict order, so the first match wins.
    for author_name, book_ids in data.items():
        if any(candidate == book_id for candidate in book_ids):
            return list(author_order).index(author_name)

    raise IndexError(f"book id {book_id} is not listed under any author")
# Pair every sample's word list with its integer author label.
# Iterating a dict yields keys in the same order as .values(), so the labels
# line up with the word lists.
train_labels = [get_author_num(utils.book_authors_train, sample_id) for sample_id in book_samples_train]
train_data = list(zip(book_samples_train.values(), train_labels))

test_labels = [get_author_num(utils.book_authors_test, sample_id) for sample_id in book_samples_test]
test_data = list(zip(book_samples_test.values(), test_labels))

In [13]:
from collections import Counter

# Flatten every training sample into one long list of words and count them.
all_words = []
for words, _ in train_data:
    all_words.extend(words)
word_counts = Counter(all_words)

# Number the 1000 most frequent words from 1 upwards, most frequent first.
top_words = [word for word, _count in word_counts.most_common(1000)]
vocab = dict(zip(top_words, range(1, len(top_words) + 1)))

# Index 0 is reserved for the padding token.
vocab['<PAD>'] = 0

In [14]:
import torch
from torch.utils.data import Dataset

class TextDataset(Dataset):
    """Dataset of (word list, label) pairs served as index tensors.

    Parameters
    ----------
    data : sequence of (words, label)
        ``words`` is an iterable of tokens; ``label`` must be convertible
        with ``int()``.
    vocab : dict
        Maps a word to its integer index. Words missing from ``vocab`` are
        encoded as 0, i.e. they share the index used for '<PAD>' in this
        notebook.
    """

    def __init__(self, data, vocab):
        self.data = data
        self.vocab = vocab

    def __len__(self):
        """Number of (words, label) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(word_indices, label)`` as two int64 tensors for item ``idx``."""
        words, label = self.data[idx]
        word_indices = [self.vocab.get(word, 0) for word in words]
        # Pin the dtype: torch.tensor([]) would otherwise be float32 for an
        # empty sample, which nn.Embedding rejects.
        return (
            torch.tensor(word_indices, dtype=torch.long),
            torch.tensor(int(label), dtype=torch.long),
        )

In [15]:
import torch.nn as nn

class AuthorClassifier(nn.Module):
    """Embedding -> LSTM -> linear head that scores each author class.

    The layers are created in the order embedding, lstm, fc, and keep those
    attribute names so saved state-dict keys stay the same.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=num_classes)

    def forward(self, x):
        """Map a batch of index sequences to one row of class scores per sequence."""
        sequence_states, _ = self.lstm(self.embedding(x))
        # batch_first=True, so axis 1 is the time axis: keep only the last step.
        final_step = sequence_states[:, -1, :]
        return self.fc(final_step)

In [16]:
# Sanity check: number of words in the third training sample.
len(train_data[2][0])

1000

In [27]:
from torch.utils.data import DataLoader
import torch

# Seed torch's global RNG before the model is built so weight initialisation
# is repeatable.
torch.manual_seed(42)

# Hyperparameters
embedding_dim = 64
hidden_size = 128
num_classes = len({label for _, label in train_data})
vocab_size = len(vocab)

# Model, loss, optimizer
model = AuthorClassifier(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    hidden_size=hidden_size,
    num_classes=num_classes,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Both splits are encoded with the vocabulary built from the training words.
train_dataset = TextDataset(data=train_data, vocab=vocab)
test_dataset = TextDataset(data=test_data, vocab=vocab)

# Shuffle only the training batches; keep the test order fixed.
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [None]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    print(epoch)
    model.train()
    loss_vals = []
    for inputs, labels in train_loader:
        # Clear gradients left over from the previous batch.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # loss.item() is the value computed before the step, so recording it
        # after the update gives the same number.
        loss_vals.append(loss.item())
    # Mean batch loss for this epoch.
    print(np.mean(loss_vals))

In [36]:
torch.save(model.state_dict, "weights.pth")

In [30]:
model.load_state_dict(torch.load("weights.pth")())

<All keys matched successfully>

In [31]:
# Evaluation
model.eval()
total_correct = 0
total_samples = 0

# Gradients are not needed for scoring.
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs)
        # Predicted class = position of the largest score in each row.
        predicted = torch.max(outputs, 1).indices
        total_correct += (predicted == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 81.00%
