## Libraries

In [None]:
import pandas as pd
import numpy as np
import ast
import re
import spacy

from collections import Counter

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.utils import resample
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import make_pipeline

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

from scipy.sparse import csr_matrix
from scipy.stats import uniform

## Data Cleaning and Transformation

In [None]:
# Load the GoodReads dataset and discard every row that has a missing value.
path = 'https://raw.githubusercontent.com/brad-paton/GoodReads/refs/heads/main/goodreads_data.csv'
unused_columns = ['Unnamed: 0', 'URL', 'Num_Ratings', 'Avg_Rating', 'Author']

df = (
    pd.read_csv(path)
    .dropna()
    # Force both text columns to plain strings before any parsing/cleaning.
    .astype({'Genres': str, 'Description': str})
    # Drop columns not used in the model.
    .drop(columns=unused_columns)
)
df.head(5)

Unnamed: 0,Book,Description,Genres
0,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ..."
1,Harry Potter and the Philosopher’s Stone (Harr...,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',..."
2,Pride and Prejudice,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical..."
3,The Diary of a Young Girl,Discovered in the attic in which she spent the...,"['Classics', 'Nonfiction', 'History', 'Biograp..."
4,Animal Farm,Librarian's note: There is an Alternate Cover ...,"['Classics', 'Fiction', 'Dystopia', 'Fantasy',..."


In [None]:
#Row filter used to remove descriptions containing characters from other scripts (e.g. Arabic, Cyrillic)
def remove_other_languages(text):
    """Return True when every character of ``text`` is ASCII (code point < 128).

    Despite its name this function removes nothing: it is a predicate meant
    to be used as a boolean row mask. An empty string yields True.
    """
    return text.isascii()

# Keep only the rows whose description passes the ASCII-only check.
ascii_only = df['Description'].apply(remove_other_languages)
df = df[ascii_only]

#Number of books left after filtering
print("Number of books left:", len(df))
df.head(5)

Number of books left: 5197


Unnamed: 0,Book,Description,Genres
0,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...,"['Classics', 'Fiction', 'Historical Fiction', ..."
1,Harry Potter and the Philosopher’s Stone (Harr...,Harry Potter thinks he is an ordinary boy - un...,"['Fantasy', 'Fiction', 'Young Adult', 'Magic',..."
2,Pride and Prejudice,"Since its immediate success in 1813, Pride and...","['Classics', 'Fiction', 'Romance', 'Historical..."
5,The Little Prince,A pilot stranded in the desert awakes one morn...,"['Classics', 'Fiction', 'Fantasy', 'Childrens'..."
6,1984,The new novel by George Orwell is the major wo...,"['Classics', 'Fiction', 'Science Fiction', 'Dy..."


In [None]:
#Get unique categories
# Each 'Genres' cell holds the string form of a Python list, so parse it with
# ast.literal_eval and flatten every parsed list into one sequence.
cats = [
    genre
    for raw_genres in df['Genres']
    for genre in ast.literal_eval(raw_genres)
]

# De-duplicate; set iteration order is arbitrary, so the list order is too.
categories = list(set(cats))

#Category count
print("Number of Categories:", len(categories))
print(categories)


Number of Categories: 548
['Unfinished', 'M M F', 'Shojo', 'Military Romance', 'Comix', 'Culinary', 'Biology', 'Homeschool', 'Physics', 'Music', 'Journalism', 'Skepticism', 'Americana', 'Classical Music', 'Productivity', 'Writing', 'Egypt', 'Islam', 'Denmark', 'Aliens', 'Essays', 'Marathi', 'Chick Lit', 'Popular Science', 'New Age', 'Vampires', 'Engineering', 'Christian', 'Iran', 'Journal', 'Psychiatry', 'Suspense', 'Clean Romance', 'Satanism', 'Science Fiction Fantasy', 'Psychological Thriller', 'Wildlife', 'Genetics', 'Entrepreneurship', 'Scotland', 'Superheroes', 'Birds', 'Erotica', 'Space Opera', 'Hockey', 'Memoir', 'Maritime', 'Gender', 'Judaism', 'Fantasy', 'Asexual', 'Southern', 'New Adult', 'New Weird', 'European Literature', 'Nigeria', 'Ancient History', 'International', 'Wicca', 'South Africa', 'Dogs', 'Japan', 'Witches', 'Greece', 'Werewolves', 'Drawing', 'American History', 'Polish Literature', 'Historical', 'Finnish Literature', 'Geology', 'Anthologies', 'Go', 'Fairies', '

In [None]:
#Count categories
def count_cats(cats):
  """Tally how many times each entry occurs in ``cats`` and return the Counter."""
  tally = Counter()
  tally.update(cats)
  return tally

# Turn the stringified genre lists into real lists, then tally each row.
df['Genres'] = df['Genres'].apply(ast.literal_eval)
df['counts'] = df['Genres'].apply(count_cats)

#Aggregate counts
totals = Counter()
for row_tally in df['counts']:
  totals.update(row_tally)

#Create dataframe from counts
# Keep_top is the number of most frequent categories retained.
Keep_top = 20
top_pairs = totals.most_common(Keep_top)
df_counts = pd.DataFrame(top_pairs, columns=['Category', 'Count'])
df_counts



Unnamed: 0,Category,Count
0,Fiction,2988
1,Classics,1275
2,Nonfiction,1207
3,Fantasy,1137
4,Historical Fiction,781
5,Young Adult,738
6,Mystery,696
7,Romance,660
8,Literature,630
9,Novels,609


In [None]:
#Keep only the genres that are among the top categories
top_cats = set(df_counts['Category'])

def filter_top_cats(cats):
  """Return the entries of ``cats`` that belong to ``top_cats``, order preserved."""
  return [cat for cat in cats if cat in top_cats]

# Build a new frame at each step instead of mutating in place: `df` may be a
# filtered slice of an earlier frame, and an in-place drop on such a slice
# triggers pandas' SettingWithCopyWarning.
df = df.assign(Genres=df['Genres'].apply(filter_top_cats))

#Remove rows with no genre left, and the helper 'counts' column
has_genre = df['Genres'].apply(len) > 0
df = df[has_genre].drop(columns=['counts'])

#Number of books left after filtering
print("Number of books left:", len(df))

df.head(10)

Number of books left: 4528


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=['counts'], inplace=True)


Unnamed: 0,Book,Description,Genres
0,To Kill a Mockingbird,The unforgettable novel of a childhood in a sl...,"[Classics, Fiction, Historical Fiction, Litera..."
1,Harry Potter and the Philosopher’s Stone (Harr...,Harry Potter thinks he is an ordinary boy - un...,"[Fantasy, Fiction, Young Adult, Childrens, Cla..."
2,Pride and Prejudice,"Since its immediate success in 1813, Pride and...","[Classics, Fiction, Romance, Historical Fictio..."
5,The Little Prince,A pilot stranded in the desert awakes one morn...,"[Classics, Fiction, Fantasy, Childrens, Philos..."
6,1984,The new novel by George Orwell is the major wo...,"[Classics, Fiction, Science Fiction, Literatur..."
7,The Great Gatsby,Alternate Cover Edition ISBN: 0743273567 (ISBN...,"[Classics, Fiction, Historical Fiction, Litera..."
9,The Lord of the Rings,"One Ring to rule them all, One Ring to find th...","[Fantasy, Classics, Fiction, Adventure]"
10,The Book Thief,Librarian's note: An alternate cover edition c...,"[Historical Fiction, Fiction, Young Adult, His..."
15,Harry Potter and the Deathly Hallows (Harry Po...,"Harry has been burdened with a dark, dangerous...","[Fantasy, Young Adult, Fiction, Childrens, Adv..."
16,The Kite Runner,1970s Afghanistan: Twelve-year-old Amir is des...,"[Fiction, Historical Fiction, Classics, Contem..."


In [None]:
#make data into single label data
X = df['Description']
Y = df['Genres']
print("X shape: ", X.shape)
print("Y shape: ", Y.shape)

# Emit one (description, genre) pair per genre of each book, so a book with
# k genres contributes k rows that all share the same text.
pairs = [(text, genre) for text, genres in zip(X, Y) for genre in genres]

X_temp = np.array([text for text, _ in pairs])
Y_temp = np.array([genre for _, genre in pairs])

print("X shape", X_temp.shape)
print("Y shape", Y_temp.shape)

df_new = pd.DataFrame({'text': X_temp, 'label': Y_temp})
df_new.head(n = 5)

X shape:  (4528,)
Y shape:  (4528,)
X shape (15275,)
Y shape (15275,)


Unnamed: 0,text,label
0,The unforgettable novel of a childhood in a sl...,Classics
1,The unforgettable novel of a childhood in a sl...,Fiction
2,The unforgettable novel of a childhood in a sl...,Historical Fiction
3,The unforgettable novel of a childhood in a sl...,Literature
4,The unforgettable novel of a childhood in a sl...,Young Adult


## Text Processing/Sampling/Splitting

In [None]:
nlp = spacy.load('en_core_web_sm')

df_clean = df_new.copy()

#text preprocessing
def text_preprocess(text):
    """Normalise one description into a space-separated string of lemmas.

    The text is lower-cased, every character other than ASCII letters and
    whitespace is removed, and the remainder is parsed with spaCy. Stop
    words, punctuation tokens, whitespace tokens and the '-PRON-' lemma are
    discarded; the lemmas of the remaining tokens are joined with single
    spaces.
    """
    # Lowercase and remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # Parse text with spaCy
    doc = nlp(text)

    # Lemmatize; skip stopwords, punctuation and whitespace-only tokens.
    # The regex above keeps newlines and runs of spaces, which spaCy reports
    # as separate whitespace tokens; without the is_space check their
    # "lemmas" would be joined into the output.
    tokens = [token.lemma_ for token in doc
              if not token.is_stop
              and not token.is_punct
              and not token.is_space
              and token.lemma_ != '-PRON-']

    return ' '.join(tokens)

# df_new repeats each description once per genre, so preprocess every distinct
# text only once and map the results back onto the rows.
cleaned_by_text = {text: text_preprocess(text) for text in df_clean['text'].unique()}
df_clean['cleaned_text'] = df_clean['text'].map(cleaned_by_text)

df_clean.head(5)

Unnamed: 0,text,label,cleaned_text
0,The unforgettable novel of a childhood in a sl...,Classics,unforgettable novel childhood sleepy southern ...
1,The unforgettable novel of a childhood in a sl...,Fiction,unforgettable novel childhood sleepy southern ...
2,The unforgettable novel of a childhood in a sl...,Historical Fiction,unforgettable novel childhood sleepy southern ...
3,The unforgettable novel of a childhood in a sl...,Literature,unforgettable novel childhood sleepy southern ...
4,The unforgettable novel of a childhood in a sl...,Young Adult,unforgettable novel childhood sleepy southern ...


In [None]:
X = df_clean['cleaned_text'].values
classes = df_clean['label']

#convert labels into numerical values
le = LabelEncoder()
y = le.fit_transform(classes)

#random over/under sampling -
# The 1-D array of texts is reshaped to a single column for resampling and
# flattened back to 1-D afterwards.
ros = RandomOverSampler(random_state = 42)
X_over, y_over = ros.fit_resample(X.reshape(-1, 1), y)
X_over = X_over.flatten()

# Undersample with the RandomUnderSampler itself; resampling with `ros` here
# would make the "under" set a second copy of the oversampled data.
uos = RandomUnderSampler(random_state = 42)
X_under, y_under = uos.fit_resample(X.reshape(-1,1), y)
X_under = X_under.flatten()

In [None]:
#split -
# NOTE(review): X_over / X_under are resampled before this split, so copies of
# the same row can presumably land in both the train and the test partition.
split_kwargs = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)
X_o_train, X_o_test, y_o_train, y_o_test = train_test_split(X_over, y_over, **split_kwargs)
X_u_train, X_u_test, y_u_train, y_u_test = train_test_split(X_under, y_under, **split_kwargs)

def _fit_then_transform(vectorizer, train_texts, test_texts):
    """Fit ``vectorizer`` on the train texts and transform both partitions.

    The vectorizer is refit on every call, so after the call it holds the
    vocabulary of the most recent train set only.
    """
    train_matrix = vectorizer.fit_transform(train_texts)
    return train_matrix, vectorizer.transform(test_texts)

#Vectorization -
#CountVectorizer()
c_vec = CountVectorizer()
X_tr_c, X_te_c = _fit_then_transform(c_vec, X_train, X_test)          #no sampling
X_o_tr_c, X_o_te_c = _fit_then_transform(c_vec, X_o_train, X_o_test)  #over
X_u_tr_c, X_u_te_c = _fit_then_transform(c_vec, X_u_train, X_u_test)  #under

#TF-IDF
tfid = TfidfVectorizer(max_features=5000)
X_tr_tf, X_te_tf = _fit_then_transform(tfid, X_train, X_test)          #no sampling
X_o_tr_tf, X_o_te_tf = _fit_then_transform(tfid, X_o_train, X_o_test)  #over
X_u_tr_tf, X_u_te_tf = _fit_then_transform(tfid, X_u_train, X_u_test)  #under


In [None]:
#Reduce the features -
# Keep the 5000 features with the highest chi-squared score against the labels.
selector = SelectKBest(chi2, k = 5000)

def _reduce(train_matrix, train_labels, test_matrix):
    """Refit ``selector`` on the train data and reduce both partitions.

    ``selector`` is shared and refit on every call, so afterwards it only
    reflects the most recent train set.
    """
    reduced_train = selector.fit_transform(train_matrix, train_labels)
    return reduced_train, selector.transform(test_matrix)

#count vectors: no sampling, over, under
X_tr_c_reduced, X_te_c_reduced = _reduce(X_tr_c, y_train, X_te_c)
X_o_tr_c_reduced, X_o_te_c_reduced = _reduce(X_o_tr_c, y_o_train, X_o_te_c)
X_u_tr_c_reduced, X_u_te_c_reduced = _reduce(X_u_tr_c, y_u_train, X_u_te_c)

#tfidf vectors: no sampling, over, under
X_tr_tf_reduced, X_te_tf_reduced = _reduce(X_tr_tf, y_train, X_te_tf)
X_o_tr_tf_reduced, X_o_te_tf_reduced = _reduce(X_o_tr_tf, y_o_train, X_o_te_tf)
X_u_tr_tf_reduced, X_u_te_tf_reduced = _reduce(X_u_tr_tf, y_u_train, X_u_te_tf)

## Random Forest

In [None]:
#Basic Random Forest model - This was tested for all of the above reduced models for general accuracy
# (A stray "++" after the n_jobs argument made this cell a SyntaxError; removed.)
rfc = RandomForestClassifier(n_estimators = 100,
                             max_features = 'sqrt',
                             max_depth = 25,
                             min_samples_split = 10,
                             n_jobs = -1,
                             random_state = 42,
                             class_weight = 'balanced')

#model that was most accurate at the basic level: oversampled, TF-IDF, reduced features
rfc.fit(X_o_tr_tf_reduced, y_o_train)

In [None]:
# Score the basic forest on the test partition of the oversampled TF-IDF data.
rf_predict = rfc.predict(X_o_te_tf_reduced)
rf_accuracy = accuracy_score(y_o_test, rf_predict)
print("Accuracy: ", rf_accuracy)

# Per-class precision/recall/F1, labelled with the original genre names.
rf_report = classification_report(y_o_test, rf_predict, target_names = le.classes_)
print(rf_report)

Accuracy:  0.3862115127175368
                    precision    recall  f1-score   support

         Adventure       0.37      0.58      0.45       573
         Audiobook       0.50      0.37      0.42       595
         Childrens       0.33      0.70      0.45       593
          Classics       0.19      0.07      0.10       538
      Contemporary       0.40      0.38      0.39       581
             Crime       0.40      0.76      0.52       606
           Fantasy       0.35      0.11      0.16       649
           Fiction       0.00      0.00      0.00       593
        Historical       0.33      0.48      0.39       570
Historical Fiction       0.25      0.15      0.19       590
           History       0.55      0.64      0.59       595
        Literature       0.32      0.39      0.35       626
           Mystery       0.29      0.12      0.17       602
        Nonfiction       0.67      0.35      0.46       591
            Novels       0.30      0.31      0.30       571
        P

In [None]:
#Determine the best model to use - this was run for all of the reduced models
# Define the hyperparameter space to sample from
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Create the RandomizedSearchCV object (default n_iter: 10 sampled candidates,
# each scored with 3-fold CV). random_state fixes which candidates are drawn
# so that a re-run evaluates the same ones.
random_search = RandomizedSearchCV(estimator=rfc, param_distributions=param_grid, cv=3,
                                   scoring='accuracy', n_jobs=-1, verbose = 2,
                                   random_state=42)

# Fit the randomized search to the data
random_search.fit(X_u_tr_tf_reduced, y_u_train)

# Get the best hyperparameters
print("Best hyperparameters:", random_search.best_params_)

# Get the best model
best_rf_model = random_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits


KeyboardInterrupt: 

In [None]:
# Score the tuned forest (best_rf_model) on the undersampled TF-IDF test partition.
rftuned_y_pred = best_rf_model.predict(X_u_te_tf_reduced)
rftuned_accuracy = accuracy_score(y_u_test, rftuned_y_pred)
print("Accuracy:", rftuned_accuracy)

# Per-class precision/recall/F1, labelled with the original genre names.
rftuned_report = classification_report(y_u_test, rftuned_y_pred, target_names=le.classes_)
print(rftuned_report)

Best model: rfc = RandomForestClassifier(n_estimators = 50,
                             max_features = 'log2',
                             max_depth = 30,
                             min_samples_split = 2,
                             min_samples_leaf = 1,
                             n_jobs = -1,
                             random_state = 42,
                            class_weight = 'balanced')

## Naive Bayes

In [None]:
# Baseline Multinomial Naive Bayes, trained on the oversampled count vectors
# (full vocabulary, no feature selection).
nb = MultinomialNB()
nb.fit(X_o_tr_c, y_o_train)

nb_pred = nb.predict(X_o_te_c)
nb_accuracy = accuracy_score(y_o_test, nb_pred)
print("Accuracy:", nb_accuracy)

# Per-class precision/recall/F1, labelled with the original genre names.
nb_report = classification_report(y_o_test, nb_pred, target_names=le.classes_)
print(nb_report)

Accuracy: 0.38847054886211513
                    precision    recall  f1-score   support

         Adventure       0.37      0.52      0.43       573
         Audiobook       0.53      0.28      0.37       595
         Childrens       0.41      0.78      0.53       593
          Classics       0.22      0.10      0.14       538
      Contemporary       0.38      0.43      0.40       581
             Crime       0.41      0.62      0.49       606
           Fantasy       0.30      0.14      0.19       649
           Fiction       0.18      0.01      0.02       593
        Historical       0.33      0.50      0.39       570
Historical Fiction       0.24      0.19      0.21       590
           History       0.60      0.56      0.58       595
        Literature       0.34      0.42      0.38       626
           Mystery       0.25      0.15      0.19       602
        Nonfiction       0.66      0.37      0.48       591
            Novels       0.28      0.28      0.28       571
        P

In [None]:
#best model checks
# Candidate smoothing strengths, and whether to learn class priors from the data
param_dist = {'alpha': [0.001, 0.01, 0.1, 0.5, 1.0, 1.5, 2.0, 5.0, 10.0],
              'fit_prior': [True, False]}

# Create the RandomizedSearchCV object (default n_iter: 10 sampled candidates,
# each scored with 3-fold CV). random_state fixes which candidates are drawn
# so that a re-run evaluates the same ones.
random_search = RandomizedSearchCV(estimator=nb, param_distributions=param_dist, cv=3,
                                   scoring='accuracy', n_jobs=-1, verbose = 2,
                                   random_state=42)

# Fit the randomized search to the data
random_search.fit(X_o_tr_c_reduced, y_o_train)

# Get the best hyperparameters
print("Best hyperparameters:", random_search.best_params_)

# Get the best model
best_nb_model = random_search.best_estimator_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
Best hyperparameters: {'fit_prior': True, 'alpha': 0.001}


In [None]:
# Score the tuned Naive Bayes model on the reduced, oversampled count test partition.
nb_tuned_y_pred = best_nb_model.predict(X_o_te_c_reduced)
nb_tuned_accuracy = accuracy_score(y_o_test, nb_tuned_y_pred)
print("Accuracy:", nb_tuned_accuracy)

# Per-class precision/recall/F1, labelled with the original genre names.
nb_tuned_report = classification_report(y_o_test, nb_tuned_y_pred, target_names=le.classes_)
print(nb_tuned_report)

Accuracy: 0.34872824631860777
                    precision    recall  f1-score   support

         Adventure       0.34      0.46      0.39       573
         Audiobook       0.48      0.18      0.27       595
         Childrens       0.39      0.67      0.50       593
          Classics       0.17      0.11      0.13       538
      Contemporary       0.31      0.35      0.33       581
             Crime       0.39      0.57      0.46       606
           Fantasy       0.25      0.16      0.19       649
           Fiction       0.07      0.01      0.02       593
        Historical       0.30      0.41      0.35       570
Historical Fiction       0.22      0.20      0.21       590
           History       0.57      0.52      0.54       595
        Literature       0.31      0.43      0.36       626
           Mystery       0.22      0.15      0.18       602
        Nonfiction       0.52      0.35      0.42       591
            Novels       0.23      0.25      0.24       571
        P

Best model: alpha = 0.01 and fit_prior = True

## SVM

In [None]:
#basic linear kernel
svm_linear = SVC(kernel = 'linear', random_state = 42, class_weight = 'balanced')
svm_linear.fit(X_o_tr_c_reduced, y_o_train)
y_pred_lin = svm_linear.predict(X_o_te_c_reduced)

print("Linear Accuracy: ", accuracy_score(y_o_test, y_pred_lin))
print(classification_report(y_o_test, y_pred_lin, target_names=le.classes_))

Linear Accuracy:  0.39081325301204817
                    precision    recall  f1-score   support

         Adventure       0.36      0.56      0.44       573
         Audiobook       0.42      0.47      0.44       595
         Childrens       0.43      0.65      0.52       593
          Classics       0.18      0.07      0.10       538
      Contemporary       0.40      0.39      0.40       581
             Crime       0.40      0.69      0.51       606
           Fantasy       0.32      0.12      0.18       649
           Fiction       0.11      0.00      0.01       593
        Historical       0.32      0.49      0.39       570
Historical Fiction       0.23      0.17      0.19       590
           History       0.55      0.66      0.60       595
        Literature       0.34      0.39      0.36       626
           Mystery       0.26      0.15      0.19       602
        Nonfiction       0.76      0.35      0.48       591
            Novels       0.29      0.28      0.28       571
 

In [None]:
#basic rbf kernel
# Trained and scored on the reduced, undersampled count vectors.
svm_rbf = SVC(kernel = 'rbf', random_state = 42, class_weight = 'balanced')
svm_rbf.fit(X_u_tr_c_reduced, y_u_train)
y_pred_rbf = svm_rbf.predict(X_u_te_c_reduced)

# Label the score as RBF: this cell evaluates the RBF model, not the linear one.
print("RBF Accuracy: ", accuracy_score(y_u_test, y_pred_rbf))
print(classification_report(y_u_test, y_pred_rbf, target_names=le.classes_))

In [None]:
#best model checks
# Candidate kernel widths for the RBF SVM
param_dist_g = {'gamma': [0.0001, 0.0005, 0.001, 0.005]}

# Search around the RBF estimator `svm_rbf` (the previous `sv_rbf` was an
# undefined name). The space only has four candidates, so n_iter is capped at
# its size instead of the default 10; random_state keeps the run reproducible.
random_search = RandomizedSearchCV(estimator=svm_rbf, param_distributions=param_dist_g,
                                   n_iter=len(param_dist_g['gamma']), cv=3,
                                   scoring='accuracy', n_jobs=-1, verbose = 2,
                                   random_state=42)

# Fit the randomized search to the data
random_search.fit(X_u_tr_c_reduced, y_u_train)

# Get the best hyperparameters
print("Best hyperparameters:", random_search.best_params_)

# Get the best model
best_rbf_model = random_search.best_estimator_

rbf_tuned_y_pred = best_rbf_model.predict(X_u_te_c_reduced)

print("Accuracy:", accuracy_score(y_u_test, rbf_tuned_y_pred))
print(classification_report(y_u_test, rbf_tuned_y_pred, target_names=le.classes_))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [None]:
# Candidate regularisation strengths for the linear SVM
param_dist_c = {'C': [1, 5, 10, 50]}

# The space only has four candidates, so n_iter is capped at its size instead
# of the default 10; random_state keeps the run reproducible.
random_search = RandomizedSearchCV(estimator=svm_linear, param_distributions=param_dist_c,
                                   n_iter=len(param_dist_c['C']), cv=3,
                                   scoring='accuracy', n_jobs=-1, verbose = 2,
                                   random_state=42)

# Fit the randomized search to the data
random_search.fit(X_u_tr_c_reduced, y_u_train)

# Get the best hyperparameters
print("Best hyperparameters:", random_search.best_params_)

# Get the best model. Stored under linear-specific names so the tuned RBF
# model and its predictions are not overwritten by the linear results.
best_linear_model = random_search.best_estimator_

linear_tuned_y_pred = best_linear_model.predict(X_u_te_c_reduced)

print("Accuracy:", accuracy_score(y_u_test, linear_tuned_y_pred))
print(classification_report(y_u_test, linear_tuned_y_pred, target_names=le.classes_))

Best Model: linear kernel with C = 1