In [1]:
import pandas as pd  
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

data = pd.read_csv('cleaned_data.csv')
data

Unnamed: 0.1,Unnamed: 0,bookID,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_date
0,0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,652,2095690,27591,9/16/2006
1,1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,870,2153167,29221,9/1/2004
2,2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,352,6333,244,11/1/2003
3,3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,435,2339585,36325,5/1/2004
4,4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,2690,41428,164,9/13/2004
...,...,...,...,...,...,...,...,...,...
10532,11121,45630,Whores for Gloria,William T. Vollmann,3.69,160,932,111,2/1/1994
10533,11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,512,156,20,12/21/2004
10534,11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,635,783,56,12/1/1988
10535,11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,415,820,95,8/1/1993


In [2]:
data.columns

Index(['Unnamed: 0', 'bookID', 'title', 'authors', 'average_rating',
       '  num_pages', 'ratings_count', 'text_reviews_count',
       'publication_date'],
      dtype='object')

In [3]:
bins = [-1, 1, 1.99, 2.99, 3.24, 3.49, 3.74, 3.99, 4.24, 4.49, 4.74, 5]
labels = ['0-.99', '1-1.99', '2-2.99','3-3.23', '3.25-3.49','3.5-3.74','3.75-3.99', '4-4.24', '4.25-4.49', '4.5-4.74', '4.75-5']
data['rating_bin'] = pd.cut(data['average_rating'], bins=bins, labels=labels,)
data


Unnamed: 0.1,Unnamed: 0,bookID,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_date,rating_bin
0,0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,652,2095690,27591,9/16/2006,4.5-4.74
1,1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,870,2153167,29221,9/1/2004,4.25-4.49
2,2,4,Harry Potter and the Chamber of Secrets (Harry...,J.K. Rowling,4.42,352,6333,244,11/1/2003,4.25-4.49
3,3,5,Harry Potter and the Prisoner of Azkaban (Harr...,J.K. Rowling/Mary GrandPré,4.56,435,2339585,36325,5/1/2004,4.5-4.74
4,4,8,Harry Potter Boxed Set Books 1-5 (Harry Potte...,J.K. Rowling/Mary GrandPré,4.78,2690,41428,164,9/13/2004,4.75-5
...,...,...,...,...,...,...,...,...,...,...
10532,11121,45630,Whores for Gloria,William T. Vollmann,3.69,160,932,111,2/1/1994,3.5-3.74
10533,11122,45631,Expelled from Eden: A William T. Vollmann Reader,William T. Vollmann/Larry McCaffery/Michael He...,4.06,512,156,20,12/21/2004,4-4.24
10534,11123,45633,You Bright and Risen Angels,William T. Vollmann,4.08,635,783,56,12/1/1988,4-4.24
10535,11124,45634,The Ice-Shirt (Seven Dreams #1),William T. Vollmann,3.96,415,820,95,8/1/1993,3.75-3.99


In [55]:
# Define your feature space
X = data[['average_rating', '  num_pages']].values

# Fit the NearestNeighbors model
nn_model = NearestNeighbors(n_neighbors=5, algorithm='auto', metric='euclidean')
nn_model.fit(X)

# Function to get recommendations
def get_recommendation(input_title):
    input_book = data[data['title'] == input_title]
    if input_book.empty:
        return "Book not found in the dataset. Please try another title."
    
    # Extract features for the input book
    input_features = input_book[['average_rating', '  num_pages']].values
    
    # Find the nearest neighbors
    distances, indices = nn_model.kneighbors(input_features)
    
    # Get the titles of the nearest neighbors
    neighbor_titles = data.iloc[indices[0]]['title']
    
    # Exclude the input book itself
    neighbor_titles = neighbor_titles[neighbor_titles != input_title]
    
    if neighbor_titles.empty:
        return "No recommendations found for this book. Please try another title."
    
    return neighbor_titles.tolist()

# Test the recommendation function with different input titles
input_title = 'City of Glass (The New York Trilogy  #1)'
recommendation = get_recommendation(input_title)
print("Recommendations for '{}' are: {}".format(input_title, recommendation))

input_title = 'To Kill a Mockingbird'
recommendation = get_recommendation(input_title)
print("Recommendations for '{}' are: {}".format(input_title, recommendation))


Recommendations for 'City of Glass (The New York Trilogy  #1)' are: ['On Authorship', 'The Man Who Watched Trains Go By', 'Heart Songs and Other Stories', 'Nonviolence: Twenty-Five Lessons from the History of a Dangerous Idea']
Recommendations for 'To Kill a Mockingbird' are: ['Maison Ikkoku  Volume 12 (Maison Ikkoku  #12)', 'There are No Children Here: The Story of Two Boys Growing Up in the Other America', 'Shadow Game (GhostWalkers  #1)']


In [18]:
# Assuming 'df' is your DataFrame containing identifier columns and data points
# Extracting data points
identifier_columns = ['title', 'authors']
data_points = data[['average_rating', 'text_reviews_count', 'ratings_count', '  num_pages']].values
# Initialize NearestNeighbors model
nbrs = NearestNeighbors(n_neighbors=11, algorithm='ball_tree').fit(data_points)  # Considering 11 neighbors to include the point itself
def find_nearest_neighbors(title, authors):
    matching_entries = data[(data['title'] == title) & (data['authors'] == authors)]
    if matching_entries.empty:
        return "No matching entry found in the dataset."
    
    entry_index = data[(data['title'] == title) & (data['authors'] == authors)].index[0]
    entry_data = data_points[entry_index]
    distances, indices = nbrs.kneighbors([entry_data])
    return indices[0][1:], distances[0][1:]  # Exclude the first entry since it will be the entry itself
# Example: Find nearest neighbors for the entry at index 0
nearest_indices, nearest_distances = find_nearest_neighbors('title', 'authors')
nearest_entries = data.iloc[nearest_indices]
print("Nearest Entries:")
find_nearest_neighbors('Libra', 'Don DeLillo')


ValueError: too many values to unpack (expected 2)

In [23]:
# Extracting data points
identifier_columns = ['title', 'authors']
data_points = data[['average_rating', 'text_reviews_count', 'ratings_count', '  num_pages']].values

# Initialize NearestNeighbors model
nbrs = NearestNeighbors(n_neighbors=11, algorithm='ball_tree').fit(data_points)  # Considering 11 neighbors to include the point itself

def find_nearest_neighbors(title, authors):
    matching_entries = data[(data['title'] == title) & (data['authors'] == authors)]
    if matching_entries.empty:
        return "No matching entry found in the dataset."
    
    entry_index = data[(data['title'] == title) & (data['authors'] == authors)].index[0]
    entry_data = data_points[entry_index]
    distances, indices = nbrs.kneighbors([entry_data])
    return indices[0][1:], distances[0][1:]  # Exclude the first entry since it will be the entry itself

# Example: Find nearest neighbors for the entry at index 0
nearest_indices, nearest_distances = find_nearest_neighbors('Salmon of Doubt: Hitchhiking the Galaxy One Last Time', 'Douglas Adams/Christopher Cerf')
nearest_entries = data.iloc[nearest_indices]
print("Nearest Entries:")
nearest_entries

Nearest Entries:


Unnamed: 0.1,Unnamed: 0,bookID,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_date,rating_bin
471,498,1563,Aristophanes 1: The Acharnians/Peace/Celebrati...,Aristophanes/David R. Slavitt,3.83,336,4,0,1/29/1998,3.75-3.99
3198,3366,12293,Education as My Agenda: Gertrude Williams Rac...,Jo Ann Ooiman Robinson,4.25,336,4,1,10/14/2005,4.25-4.49
6844,7248,27819,Numerical Recipes: Example Book C,William T. Vetterling/Saul A. Teukolsky/Willia...,3.75,336,7,0,11/27/1992,3.75-3.99
7599,8030,30811,Feeding the Future: From Fat to Famine How to...,Andrew Heintzman/Evan Solomon/Eric Schlosser,3.48,336,2,0,9/20/2006,3.25-3.49
5864,6210,23398,The Dig,Alan Dean Foster,3.4,336,2,0,1/1/1998,3.25-3.49
7349,7769,29976,Three Prophetic Science Fiction Novels,H.G. Wells/E.F. Bleiler,3.86,335,1,0,12/31/1975,3.75-3.99
8086,8547,32823,Fanning the Flame: Bible Cross and Mission,Chris Green/Chris Wright/Paul Douglas Gardner,5.0,336,1,1,6/17/2003,4.75-5
9914,10446,42489,After Collapse: The Regeneration of Complex So...,Glenn M. Schwartz,4.0,336,0,0,5/25/2006,4-4.24
4665,4940,17850,The House of Mirth,Edith Wharton/R.W.B. Lewis,3.95,335,10,0,1/1/1977,3.75-3.99
523,558,1804,Who Needs Greek? Contests in the Cultural Hist...,Simon Goldhill,3.4,334,10,1,4/4/2002,3.25-3.49


In [24]:
nearest_indices, nearest_distances = find_nearest_neighbors('Cosmopolis', 'Don DeLillo')
nearest_entries = data.iloc[nearest_indices]
print("Nearest Entries:")
nearest_entries

Nearest Entries:


Unnamed: 0.1,Unnamed: 0,bookID,title,authors,average_rating,num_pages,ratings_count,text_reviews_count,publication_date,rating_bin
5100,5392,19439,My Perfect Life (Confessions of a Teenage Dram...,Dyan Sheldon,3.32,208,460,39,3/3/2005,3.25-3.49
9541,10056,40484,Story of O,Pauline Réage/John Paul Hand,3.32,224,446,52,5/8/1998,3.25-3.49
9709,10227,41463,A Christmas Carol (Great Illustrated Classics),Malvina G. Vogel/Charles Dickens,4.36,238,466,31,1/1/2005,4.25-4.49
3020,3175,11635,The Death and Life of Sylvia Plath,Ronald Hayman,3.67,224,453,24,7/24/2003,3.5-3.74
10324,10883,44511,How Democratic Is the American Constitution?,Robert A. Dahl,3.64,240,466,48,11/10/2003,3.5-3.74
8524,9004,35048,Trial by Fire (Stargate SG-1 #1),Sabine C. Bauer,3.79,237,448,32,6/1/2006,3.75-3.99
10373,10935,44795,The Moon And Sixpence,W. Somerset Maugham,4.13,215,447,53,9/2/1999,4-4.24
10078,10616,43331,He Sees You When You're Sleeping,Mary Higgins Clark/Carol Higgins Clark,3.79,230,477,31,11/1/2002,3.75-3.99
6176,6541,24660,Exploring the Northern Tradition: A Guide to t...,Galina Krasskova/Swain Wódening,3.83,224,443,29,5/15/2005,3.75-3.99
5848,6194,23307,Ulysses S. Grant,Josiah Bunting/Arthur M. Schlesinger Jr.,3.89,208,445,40,9/8/2004,3.75-3.99
