In [27]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense, Flatten, Concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder,StandardScaler

In [61]:
# read in full (cleaned) book list
full_book_list= pd.read_csv("/Users/caranix/Documents/GitHub/RomantasyRec/cleaned_romantasy_data.csv")

# read in favorite book list 
read_book_list= pd.read_csv("user_read_list_fantasy_romance_titles_authors.csv")
like_indices = read_book_list.index[read_book_list['Rating_x'] > 3].tolist() 


In [62]:
read_book_list= read_book_list.reset_index()

In [63]:
read_book_list['liked'] = read_book_list['index'].apply(lambda x: 1 if x in like_indices else 0)


In [64]:
read_book_list.columns

Index(['index', 'Title_x', 'Author', 'Rating_x', 'cleaned_title_x',
       'cleaned_author_x', 'api', 'Title_y', 'cleaned_title_y',
       'cleaned_author_y', 'Rating_y', 'Count of Ratings',
       'cleaned_description', 'cleaned_publisher', 'cleaned_page_count_v1',
       'cleaned_count_ratings_v1', 'published_year', 'mature', 'liked'],
      dtype='object')

In [65]:
read_book_list.head()

Unnamed: 0,index,Title_x,Author,Rating_x,cleaned_title_x,cleaned_author_x,api,Title_y,cleaned_title_y,cleaned_author_y,Rating_y,Count of Ratings,cleaned_description,cleaned_publisher,cleaned_page_count_v1,cleaned_count_ratings_v1,published_year,mature,liked
0,0,"Hush, Hush",Becca Fitzpatrick,,"Hush, Hush",Becca Fitzpatrick,https://www.googleapis.com/books/v1/volumes?q=...,"Hush, Hush",hush hush,becca fitzpatrick,3.92,725940,enter realm fallen angel rising passion boxed ...,other,450+,1.0,2013.0,0.0,0
1,1,Electric Idol,Katee Robert,5.0,Electric Idol,Katee Robert,https://www.googleapis.com/books/v1/volumes?q=...,Electric Idol,electric idol,katee robert,3.95,111306,instant new york time usa today bestseller bea...,other,<300,2.0,2022.0,1.0,1
2,2,Ruin and Rising,Leigh Bardugo,,Ruin and Rising,Leigh Bardugo,https://www.googleapis.com/books/v1/volumes?q=...,Ruin and Rising,ruin rising,leigh bardugo,3.98,559718,grishaverse coming netflix soon shadow bone or...,other,450+,1.0,2014.0,0.0,0
3,3,The Ex Hex,Erin Sterling,5.0,The Ex Hex,Erin Sterling,https://www.googleapis.com/books/v1/volumes?q=...,The Ex Hex,ex hex,erin sterling,3.48,248969,new york time bestseller erin sterling cast de...,harper collins,<300,1.0,2021.0,1.0,1
4,4,A Game of Fate,Scarlett St. Clair,5.0,A Game of Fate,Scarlett St Clair,https://www.googleapis.com/books/v1/volumes?q=...,A Game of Fate,game fate,scarlett st clair,4.0,84809,discover enthralling fantasy world god mortal ...,other,300-450,2.0,2021.0,1.0,1


In [66]:

df= read_book_list

# 1️⃣ Process Text Features
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['cleaned_description'])
desc_sequences = tokenizer.texts_to_sequences(df['cleaned_description'])
padded_descriptions = pad_sequences(desc_sequences, maxlen=256, padding='post')

tokenizer.fit_on_texts(df['cleaned_title_y'])
title_sequences = tokenizer.texts_to_sequences(df['cleaned_title_y'])
padded_titles = pad_sequences(title_sequences, maxlen=8, padding='post')

tokenizer.fit_on_texts(df['cleaned_author_y'])
author_sequences = tokenizer.texts_to_sequences(df['cleaned_author_y'])
padded_authors = pad_sequences(author_sequences, maxlen=3, padding='post')

# 2️⃣ Process Categorical Features (Label Encoding)
cat_features = ['cleaned_publisher', 'cleaned_page_count_v1', 'cleaned_count_ratings_v1']
encoders = {col: LabelEncoder().fit(df[col]) for col in cat_features}
encoded_cats = {col: encoders[col].transform(df[col]) for col in cat_features}

# 3️⃣ Process Numerical & Binary Features
scaler = StandardScaler()
scaled_years = scaler.fit_transform(df[['published_year']])

mature_labels = np.array(df['mature'])
liked_labels = np.array(df['liked'])

# 4️⃣ Define Model Inputs
title_input = Input(shape=(8,), name='title_input')
author_input = Input(shape=(3,), name='author_input')
description_input = Input(shape=(256,), name='description_input')

publisher_input = Input(shape=(1,), name='publisher_input')
page_count_input = Input(shape=(1,), name='page_count_input')
count_ratings_input = Input(shape=(1,), name='count_ratings_input')

year_input = Input(shape=(1,), name='year_input')
mature_input = Input(shape=(1,), name='mature_input')

# 5️⃣ Text Branches (CNN for title, author, description)
title_embedding = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=16)(title_input)
title_cnn = Conv1D(filters=64, kernel_size=3, activation='relu')(title_embedding)
title_pooling = GlobalMaxPooling1D()(title_cnn)

author_embedding = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=8)(author_input)
author_cnn = Conv1D(filters=32, kernel_size=3, activation='relu')(author_embedding)
author_pooling = GlobalMaxPooling1D()(author_cnn)

desc_embedding = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=50)(description_input)
desc_cnn = Conv1D(filters=128, kernel_size=3, activation='relu')(desc_embedding)
desc_pooling = GlobalMaxPooling1D()(desc_cnn)

# 6️⃣ Embedding for Categorical Features
publisher_embedding = Embedding(input_dim=len(encoders['cleaned_publisher'].classes_), output_dim=10)(publisher_input)
publisher_flat = Flatten()(publisher_embedding)

page_count_embedding = Embedding(input_dim=len(encoders['cleaned_page_count_v1'].classes_), output_dim=10)(page_count_input)
page_count_flat = Flatten()(page_count_embedding)

count_ratings_embedding = Embedding(input_dim=len(encoders['cleaned_count_ratings_v1'].classes_), output_dim=10)(count_ratings_input)
count_ratings_flat = Flatten()(count_ratings_embedding)

# 7️⃣ Merge All Features
merged = Concatenate()([
    title_pooling, author_pooling, desc_pooling,
    publisher_flat, page_count_flat, count_ratings_flat,
    year_input, mature_input
])

dense = Dense(64, activation='relu')(merged)
output = Dense(1, activation='sigmoid')(dense)

# 8️⃣ Build and Compile Model
model = Model(
    inputs=[title_input, author_input, description_input, 
            publisher_input, page_count_input, count_ratings_input,
            year_input, mature_input],
    outputs=output
)

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# 9️⃣ Summary
model.summary()

# 🔟 Train the Model
model.fit(
    x={
        'title_input': padded_titles, 
        'author_input': padded_authors, 
        'description_input': padded_descriptions,
        'publisher_input': encoded_cats['cleaned_publisher'],
        'page_count_input': encoded_cats['cleaned_page_count_v1'],
        'count_ratings_input': encoded_cats['cleaned_count_ratings_v1'],
        'year_input': scaled_years, 
        'mature_input': mature_labels
    },
    y=liked_labels,
    epochs=5,
    batch_size=2
)


Epoch 1/5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6381 - loss: 0.6543 
Epoch 2/5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6417 - loss: 0.6138 
Epoch 3/5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7292 - loss: 0.4731 
Epoch 4/5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7266 - loss: 0.3918 
Epoch 5/5
[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9348 - loss: 0.3241 


<keras.src.callbacks.history.History at 0x15f997e00>

In [50]:
# now that it's trained, we want to then predict what books we think the user would like! 

In [67]:
full_book_list.head()

Unnamed: 0,api,Title,cleaned_title,cleaned_author,Rating,Count of Ratings,cleaned_description,cleaned_publisher,cleaned_page_count_v1,cleaned_count_ratings_v1,published_year,mature
0,https://www.googleapis.com/books/v1/volumes?q=...,A Court of Thorns and Roses,court thorn rose,sarah j maas,4.18,3459145,first instalment global phenomenon tiktok sens...,bloomsbury,450+,1,2020.0,0
1,https://www.googleapis.com/books/v1/volumes?q=...,A Court of Mist and Fury,court mist fury,sarah j maas,4.65,2625974,new york time bestselling sequel sarah j maass...,bloomsbury,450+,1,2017.0,0
2,https://www.googleapis.com/books/v1/volumes?q=...,Fourth Wing,fourth wing,rebecca yarros,4.58,2508672,war grows deadly violet sorrengail join elite ...,entangled,<300,1,2023.0,0
3,https://www.googleapis.com/books/v1/volumes?q=...,A Court of Wings and Ruin,court wing ruin,sarah j maas,4.47,2194491,feyre must decide trust among high lord turn a...,bloomsbury,450+,1,2017.0,0
4,https://www.googleapis.com/books/v1/volumes?q=...,From Blood and Ash,blood ash,jennifer l armentrout,4.22,708676,captivating actionpacked blood ash sexy addict...,other,300-450,1,2020.0,0


In [53]:
# Process New Book Data
new_desc_sequences = tokenizer.texts_to_sequences(full_book_list['cleaned_description'])
new_padded_descriptions = pad_sequences(new_desc_sequences, maxlen=256, padding='post')

new_title_sequences = tokenizer.texts_to_sequences(full_book_list['cleaned_title'])
new_padded_titles = pad_sequences(new_title_sequences, maxlen=8, padding='post')

new_author_sequences = tokenizer.texts_to_sequences(full_book_list['cleaned_author'])
new_padded_authors = pad_sequences(new_author_sequences, maxlen=3, padding='post')


def encode_categorical_feature(encoder, values):
    try:
        return encoder.transform(values)
    except ValueError:  # Handle unseen labels
        known_labels = set(encoder.classes_)
        return [encoder.transform([val])[0] if val in known_labels else -1 for val in values]

new_encoded_cats = {
    col: np.array(encode_categorical_feature(encoders[col], full_book_list[col])).reshape(-1, 1)
    for col in cat_features
}

# Scale numerical features
new_scaled_years = scaler.transform(full_book_list[['published_year']])

# Convert binary feature
new_mature_labels = np.array(full_book_list['mature'])

# Make Predictions
predictions = model.predict({
    'title_input': new_padded_titles, 
    'author_input': new_padded_authors, 
    'description_input': new_padded_descriptions,
    'publisher_input':  new_encoded_cats['cleaned_publisher'],
    'page_count_input': new_encoded_cats['cleaned_page_count_v1'],
    'count_ratings_input':  new_encoded_cats['cleaned_count_ratings_v1'],
    'year_input': new_scaled_years, 
    'mature_input': new_mature_labels
})

[1m34/34[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [54]:
# Add Predictions to DataFrame
full_book_list['predicted_likelihood'] = predictions

# Sort Books by Highest Likelihood of User Liking Them
recommended_books = full_book_list.sort_values(by='predicted_likelihood', ascending=False)


In [59]:
print(recommended_books[['cleaned_title', 'cleaned_author', 'predicted_likelihood']][1:10])

             cleaned_title    cleaned_author  predicted_likelihood
25            empire storm      sarah j maas              0.842561
35              tower dawn      sarah j maas              0.841901
0         court thorn rose      sarah j maas              0.834072
210                 ex hex     erin sterling              0.833448
17        house sky breath      sarah j maas              0.832623
31             kingdom ash      sarah j maas              0.824689
283     priory orange tree  samantha shannon              0.822677
433              war storm  victoria aveyard              0.822160
6    court frost starlight      sarah j maas              0.815847


In [69]:
read_titles = set(read_book_list['cleaned_title_y'])  # Convert to a set for fast lookup
filtered_recommendations = recommended_books[~recommended_books['cleaned_title'].isin(read_titles)]

In [None]:
print(filtered_recommendations[['cleaned_title', 'cleaned_author', 'predicted_likelihood']][1:10])

          cleaned_title    cleaned_author  predicted_likelihood
283  priory orange tree  samantha shannon              0.822677
433           war storm  victoria aveyard              0.822160
3       court wing ruin      sarah j maas              0.809850
840           poppy war          rf kuang              0.807312
336             destroy      tahereh mafi              0.799199
515            fracture      tahereh mafi              0.798401
376           king cage  victoria aveyard              0.796013
44         moon hatched      sarah parker              0.793970
834          soul witch     harley laroux              0.791993
