In [1]:
import pandas as pd

In [19]:
dataset=pd.read_csv('dataset.csv')

In [20]:
# Keep the original title and goodreads_book_id columns
book_titles = dataset[['title', 'goodreads_book_id']]

In [21]:
# Convert average_rating to float
dataset['average_rating'] = dataset['average_rating'].str.strip('[]').astype(float)

# Convert genres and authors from string to list
import ast
dataset['genres'] = dataset['genres'].apply(ast.literal_eval)
dataset['authors'] = dataset['authors'].apply(ast.literal_eval)

In [75]:
# Encode genres and authors using MultiLabelBinarizer
from sklearn.preprocessing import MultiLabelBinarizer
mlb_genres = MultiLabelBinarizer()
genres_encoded = mlb_genres.fit_transform(dataset['genres'])
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)

mlb_authors = MultiLabelBinarizer()
authors_encoded = mlb_authors.fit_transform(dataset['authors'])
authors_df = pd.DataFrame(authors_encoded, columns=mlb_authors.classes_)

# Combine the encoded genres, authors, and average_rating
dataset_combined = pd.concat([dataset[['goodreads_book_id','title','average_rating']],genres_df, authors_df], axis=1)

In [76]:
print(dataset_combined['average_rating'])

0       4.54
1       4.46
2       4.44
3       4.53
4       4.53
        ... 
9995    4.21
9996    4.10
9997    4.62
9998    4.34
9999    4.22
Name: average_rating, Length: 10000, dtype: float64


In [45]:
# Standardize average_rating
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
dataset_combined['average_rating'] = scaler.fit_transform(dataset_combined[['average_rating']])

In [47]:
from sklearn.neighbors import NearestNeighbors

# Initialize the Nearest Neighbors model
n_neighbors = 5  # Number of similar books to recommend
nn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')

# Fit the model on the combined dataset (excluding title and goodreads_book_id)
df=dataset_combined.drop(columns=['goodreads_book_id','title','average_rating'])
nn.fit(df)

# Find the nearest neighbors for a specific book (e.g., first book in the dataset)
distances, indices = nn.kneighbors(df.iloc[30].values.reshape(1, -1))



In [48]:
# Retrieve recommended books
recommended_books_indices = indices[0]
recommended_books = book_titles.iloc[recommended_books_indices]

# Print the recommended books with titles and goodreads_book_id
print(recommended_books[['title', 'goodreads_book_id']])

                               title  goodreads_book_id
30            I am Charlotte Simmons                231
6632                         One Day            6280118
169   A Home at the End of the World               2137
3475            Call Me by Your Name              98687
702                        Crow Lake               8646


In [59]:
# Example of user input (top 5 books for each user)
group_favorite_book_ids = [117, 249, 424, 597, 629]

In [60]:
# Extract the feature rows for these book IDs
group_favorite_books = dataset_combined[dataset_combined['goodreads_book_id'].isin(group_favorite_book_ids)]

In [61]:
print(group_favorite_books)

    goodreads_book_id                                              title  \
27                117              Heretics of Dune (Dune Chronicles #5)   
31                249                                   Tropic of Cancer   
52                424                        Slouching Towards Bethlehem   
61                597      Killing Yourself to Live: 85% of a True Story   
64                629  Zen and the Art of Motorcycle Maintenance: An ...   

    average_rating  action  adult-fiction  adventure  alternative-history  \
27       -0.676812       0              1          1                    0   
31       -1.148483       0              1          0                    0   
52        0.934730       0              0          0                    0   
61       -0.598200       0              0          0                    0   
64       -0.951953       0              0          1                    0   

    anthology  biographical-fiction  biography  ...  \
27          0            

In [62]:
# Aggregate the features by averaging
features_to_average = group_favorite_books.drop(columns=['goodreads_book_id','title'])
average_features = features_to_average.mean(axis=0).values.reshape(1, -1)  # Average feature vector


In [73]:
dataset_combined['average_rating'].apply(type)

Series([], Name: average_rating, dtype: float64)

In [77]:
dataset_combined = dataset_combined[dataset_combined['average_rating'] >= 4.0]

In [78]:
print(dataset_combined)

      goodreads_book_id                                              title  \
0                     1  Harry Potter and the Half-Blood Prince (Harry ...   
1                     2  Harry Potter and the Order of the Phoenix (Har...   
2                     3  Harry Potter and the Sorcerer's Stone (Harry P...   
3                     5  Harry Potter and the Prisoner of Azkaban (Harr...   
4                     6  Harry Potter and the Goblet of Fire (Harry Pot...   
...                 ...                                                ...   
9995           31538647  Hogwarts: An Incomplete and Unreliable Guide (...   
9996           31845516                                       Love Warrior   
9997           32075671                                    The Hate U Give   
9998           32848471                                          Egomaniac   
9999           33288638                                        Wait for It   

      average_rating  action  adult-fiction  adventure  alterna

In [79]:
# Fit the Nearest Neighbors model on the full dataset_combined (which has all features and book IDs)
knn_model = NearestNeighbors(n_neighbors=5, metric='cosine')
knn_model.fit(dataset_combined.drop(columns=['goodreads_book_id','title']))  # Fit without book ID column

In [80]:
# Find nearest neighbors for the aggregated feature vector
distances, indices = knn_model.kneighbors(average_features)



In [81]:
# Retrieve and display recommended books
recommended_books = dataset_combined.iloc[indices[0]]

# Map to original dataset for titles and other details
recommended_books_with_details = dataset[dataset['goodreads_book_id'].isin(recommended_books['goodreads_book_id'])]

In [82]:
print("Recommended books based on the group's preferences:")
print(recommended_books_with_details[['goodreads_book_id', 'title', 'genres', 'authors', 'average_rating']])

Recommended books based on the group's preferences:
      goodreads_book_id                                              title  \
52                  424                        Slouching Towards Bethlehem   
381                4979                            A Man Without a Country   
589                6900                               Tuesdays with Morrie   
656                7745                     Fear and Loathing in Las Vegas   
2149              34760  All I Really Need to Know I Learned in Kinderg...   

                                                 genres  \
52    [non-fiction, memoir, politics, biography, cla...   
381   [non-fiction, memoir, humor, biography, politi...   
589   [non-fiction, memoir, biography, philosophy, c...   
656   [non-fiction, humor, memoir, biography, classi...   
2149  [non-fiction, humor, self-help, philosophy, me...   

                                   authors  average_rating  
52                           [Joan Didion]            4.24  
38

In [83]:
import joblib

In [84]:
# Save the model and dataset
joblib.dump(knn_model, 'nearest_neighbors_model.pkl')
dataset_combined.to_csv('dataset_combined.csv', index=False)