In [295]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [296]:
original_df = pd.read_csv('data_old/books.csv')
columns = original_df.columns.tolist()
columns

['isbn13',
 'isbn10',
 'title',
 'subtitle',
 'authors',
 'genre',
 'thumbnail',
 'description',
 'published_year',
 'average_rating',
 'num_pages',
 'ratings_count']

In [297]:
df = original_df[['title', 'authors', 'genre', 'published_year', 'average_rating']]

print(df.head())  # Display the first few rows of the filtered DataFrame

            title                          authors  \
0          Gilead               Marilynne Robinson   
1    Spider's Web  Charles Osborne;Agatha Christie   
2    The One Tree             Stephen R. Donaldson   
3  Rage of angels                   Sidney Sheldon   
4  The Four Loves              Clive Staples Lewis   

                           genre  published_year  average_rating  
0                        Fiction          2004.0            3.85  
1  Detective and mystery stories          2000.0            3.83  
2               American fiction          1982.0            3.97  
3                        Fiction          1993.0            3.93  
4                 Christian life          2002.0            4.15  


In [298]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6810 entries, 0 to 6809
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6810 non-null   object 
 1   authors         6738 non-null   object 
 2   genre           6711 non-null   object 
 3   published_year  6804 non-null   float64
 4   average_rating  6767 non-null   float64
dtypes: float64(2), object(3)
memory usage: 266.1+ KB


In [299]:
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6599 entries, 0 to 6809
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6599 non-null   object 
 1   authors         6599 non-null   object 
 2   genre           6599 non-null   object 
 3   published_year  6599 non-null   float64
 4   average_rating  6599 non-null   float64
dtypes: float64(2), object(3)
memory usage: 309.3+ KB


In [300]:
df.to_csv('data/cleaned_books.csv')

In [301]:
# Selecting the relevant features for recommendation
selected_features = ['title','authors','genre','published_year','average_rating']
print(selected_features)

['title', 'authors', 'genre', 'published_year', 'average_rating']


In [302]:
# Replacing the null valuess with null string
for feature in selected_features:
    df[feature] = df[feature].fillna('')

In [303]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6599 entries, 0 to 6809
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           6599 non-null   object 
 1   authors         6599 non-null   object 
 2   genre           6599 non-null   object 
 3   published_year  6599 non-null   float64
 4   average_rating  6599 non-null   float64
dtypes: float64(2), object(3)
memory usage: 309.3+ KB


In [304]:
df.sample(10)

Unnamed: 0,title,authors,genre,published_year,average_rating
3100,Riding the Bus with My Sister,Rachel Simon,Biography & Autobiography,2003.0,3.75
1796,First King of Shannara,Terry Brooks,Fiction,1997.0,3.96
3535,Icebound,Dean Koontz,Fiction,2000.0,3.76
4622,Star Wars,David West Reynolds;James Luceno,Juvenile Nonfiction,2006.0,4.34
1597,A Quiver Full of Arrows,Jeffrey Archer,Fiction,2005.0,3.78
3088,Man Crazy,Joyce Carol Oates,Fiction,1998.0,3.39
6665,The Mistress Manual,"Mistress Lorelei;Lorelei Powers, Mistress",Family & Relationships,2000.0,3.76
2875,Season of the Machete,James Patterson,Fiction,1995.0,3.24
6779,Neptune Noir,Rob Thomas;Leah Wilson,Performing Arts,2007.0,3.64
2920,Fear and Loathing,Hunter S. Thompson,Biography & Autobiography,2006.0,4.11


In [305]:
# Split data into features and target
X = df[['genre', 'average_rating']]
y = df['title']

In [306]:
# Encode categorical data (Genre)
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[['genre']]).toarray()
X_encoded

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [307]:
# Set column names for encoded genre features
genre_cols = [f"genre_{i}" for i in range(len(encoder.categories_[0]))]  # Use len(encoder.categories_[0])
X_encoded_df = pd.DataFrame(X_encoded, columns=genre_cols) 

X_combined = pd.concat([X_encoded_df, X[['average_rating']].reset_index(drop=True)], axis=1)
X_combined

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,...,genre_554,genre_555,genre_556,genre_557,genre_558,genre_559,genre_560,genre_561,genre_562,average_rating
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.85
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.83
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.97
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.93
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6594,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.51
6595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.08
6596,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.98
6597,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00


In [308]:
columns = X_combined.columns.tolist()
columns

['genre_0',
 'genre_1',
 'genre_2',
 'genre_3',
 'genre_4',
 'genre_5',
 'genre_6',
 'genre_7',
 'genre_8',
 'genre_9',
 'genre_10',
 'genre_11',
 'genre_12',
 'genre_13',
 'genre_14',
 'genre_15',
 'genre_16',
 'genre_17',
 'genre_18',
 'genre_19',
 'genre_20',
 'genre_21',
 'genre_22',
 'genre_23',
 'genre_24',
 'genre_25',
 'genre_26',
 'genre_27',
 'genre_28',
 'genre_29',
 'genre_30',
 'genre_31',
 'genre_32',
 'genre_33',
 'genre_34',
 'genre_35',
 'genre_36',
 'genre_37',
 'genre_38',
 'genre_39',
 'genre_40',
 'genre_41',
 'genre_42',
 'genre_43',
 'genre_44',
 'genre_45',
 'genre_46',
 'genre_47',
 'genre_48',
 'genre_49',
 'genre_50',
 'genre_51',
 'genre_52',
 'genre_53',
 'genre_54',
 'genre_55',
 'genre_56',
 'genre_57',
 'genre_58',
 'genre_59',
 'genre_60',
 'genre_61',
 'genre_62',
 'genre_63',
 'genre_64',
 'genre_65',
 'genre_66',
 'genre_67',
 'genre_68',
 'genre_69',
 'genre_70',
 'genre_71',
 'genre_72',
 'genre_73',
 'genre_74',
 'genre_75',
 'genre_76',
 'genre_7

In [309]:

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.25, random_state=42)

In [310]:
X_train

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,...,genre_554,genre_555,genre_556,genre_557,genre_558,genre_559,genre_560,genre_561,genre_562,average_rating
5483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.32
1730,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.20
1210,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.96
6183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.52
1095,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.51
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.87
5191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.87
5226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.98
5390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.37


In [311]:
y_train

5647                       The Frequencies
1766           Tales of the Cthulhu Mythos
1237                        The Great Wave
6377                     Angels and Demons
1120                 Rose of No Man's Land
                       ...                
3864                               Cyclops
5339             The Cross from a Distance
5374                         Sweet Revenge
5545    Standard Catalog of Smith & Wesson
878            You Shall Know Our Velocity
Name: title, Length: 4949, dtype: object

In [326]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_neighbors': [5, 10, 20, 50],
    'metric': ['euclidean', 'manhattan'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

model_knn = KNeighborsClassifier()
# Perform grid search
grid_search = GridSearchCV(model_knn, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)



In [327]:
print(grid_search.best_params_)

{'algorithm': 'auto', 'metric': 'euclidean', 'n_neighbors': 10}


In [328]:
# Initialize and train the model
kn_model = KNeighborsClassifier(n_neighbors=10, metric='euclidean', algorithm='auto')
kn_model.fit(X_train, y_train)
kn_model.score(X_test, y_test)

0.0036363636363636364

In [329]:
# Make predictions
y_pred = kn_model.predict(X_test)
y_pred

array(['Dirt Music', 'Every Night Italian',
       "Anton Chekhov's Selected Plays", ..., 'Fear and Trembling',
       'Common Herbs for Natural Health', 'Encore Provence'], dtype=object)

In [332]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.00


In [319]:
# Function to recommend a book based on genre and average rating
def recommend_book(genre, avg_rating):
    genre_encoded = encoder.transform([[genre]]).toarray()
    input_data = pd.concat([pd.DataFrame(genre_encoded), pd.DataFrame([avg_rating])], axis=1)
    
    recommended_book = kn_model.predict(input_data)#[0]
    return recommended_book

# Example: Recommend a book
print(recommend_book('Fiction', 3.71))

['Baby Proof']




In [324]:
def recommend_books(genre, avg_rating, num_recommendations=1):
    genre_encoded = encoder.transform([[genre]]).toarray()
    input_data = pd.concat([pd.DataFrame(genre_encoded), pd.DataFrame([avg_rating])], axis=1)

    # Predict distances to all data points
    distances, indices = kn_model.kneighbors(input_data, n_neighbors=num_recommendations)

    # Get book titles based on indices
    recommended_books = df['title'].iloc[indices[0]]
    return recommended_books.tolist()

In [325]:
recommended_books = recommend_books('Fiction', 3.71, num_recommendations=3)
print(recommended_books)

['Miss Wyoming', 'DREAM & THE UNDERWOR', 'The Zero']




## GPT4o

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Generate synthetic data
def generate_synthetic_data(num_samples=1000):
    import random
    
    genres = ['Fiction', 'Non-Fiction', 'Sci-Fi', 'Fantasy', 'Biography', 'Mystery']
    data = {
        'book_title': [f'Book_{i}' for i in range(num_samples)],
        'genre': [random.choice(genres) for _ in range(num_samples)],
        'average_rating': [round(random.uniform(1, 5), 1) for _ in range(num_samples)],
    }
    
    return pd.DataFrame(data)

# Load or generate data
df = generate_synthetic_data()

# Split data into features and target
X = df[['genre', 'average_rating']]
y = df['book_title']

# Encode categorical data (Genre)
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[['genre']]).toarray()

# Combine encoded genre with average_rating
X_combined = pd.concat([pd.DataFrame(X_encoded), X[['average_rating']].reset_index(drop=True)], axis=1)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

X.columns = X.columns.astype(str)

# Initialize and train the model
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Function to recommend a book based on genre and average rating
def recommend_book(genre, avg_rating):
    genre_encoded = encoder.transform([[genre]]).toarray()
    input_data = pd.concat([pd.DataFrame(genre_encoded), pd.DataFrame([avg_rating])], axis=1)
    recommended_book = model.predict(input_data)[0]
    return recommended_book

# Example: Recommend a book
print(recommend_book('Sci-Fi', 4.5))


## Gemini

In [80]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression  # Using Linear Regression for example

# Generate synthetic data
def generate_synthetic_data(num_samples=1000):
    import random
    
    genres = ['Fiction', 'Non-Fiction', 'Sci-Fi', 'Fantasy', 'Biography', 'Mystery']
    data = {
        'book_title': [f'Book_{i}' for i in range(num_samples)],
        'genre': [random.choice(genres) for _ in range(num_samples)],
        'average_rating': [round(random.uniform(1, 5), 1) for _ in range(num_samples)],
    }
    
    return pd.DataFrame(data)

# Load or generate data
df = generate_synthetic_data()

print(df)



    book_title        genre  average_rating
0       Book_0      Mystery             1.1
1       Book_1      Fiction             3.4
2       Book_2  Non-Fiction             4.0
3       Book_3      Fantasy             3.8
4       Book_4  Non-Fiction             5.0
..         ...          ...             ...
995   Book_995  Non-Fiction             2.2
996   Book_996    Biography             3.6
997   Book_997  Non-Fiction             1.1
998   Book_998    Biography             4.9
999   Book_999       Sci-Fi             1.5

[1000 rows x 3 columns]


In [82]:
# Split data into features and target
X = df[['genre', 'average_rating']]
y = df['average_rating']  # Target is now average rating

# Encode categorical data (Genre)
encoder = OneHotEncoder()
X_encoded = encoder.fit_transform(X[['genre']]).toarray()
X_encoded


array([[0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [91]:
# Set column names for encoded genre features
genre_cols = [f"genre_{i}" for i in range(len(encoder.categories_[0]))]  # Use len(encoder.categories_[0])
X_encoded_df = pd.DataFrame(X_encoded, columns=genre_cols) 

X_combined = pd.concat([X_encoded_df, X[['average_rating']].reset_index(drop=True)], axis=1)
X_combined

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,genre_5,average_rating
0,0.0,0.0,0.0,1.0,0.0,0.0,1.1
1,0.0,0.0,1.0,0.0,0.0,0.0,3.4
2,0.0,0.0,0.0,0.0,1.0,0.0,4.0
3,0.0,1.0,0.0,0.0,0.0,0.0,3.8
4,0.0,0.0,0.0,0.0,1.0,0.0,5.0
...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,0.0,1.0,0.0,2.2
996,1.0,0.0,0.0,0.0,0.0,0.0,3.6
997,0.0,0.0,0.0,0.0,1.0,0.0,1.1
998,1.0,0.0,0.0,0.0,0.0,0.0,4.9


In [92]:

# Initialize and train the model (using Linear Regression for example)
model = LinearRegression()
model.fit(X_train, y_train)

In [95]:
# Make predictions
y_pred = model.predict(X_test)
y_pred

array([4.5, 1.4, 2.3, 3.7, 1. , 1.1, 2. , 1.8, 3.6, 1. , 3. , 4.8, 1.2,
       2.4, 4.5, 3.6, 1.8, 2.1, 1.8, 4.7, 3.6, 4. , 2.8, 1.7, 4.9, 4.2,
       3.8, 4.2, 1.8, 2.4, 4. , 1.2, 3.8, 2.9, 4.9, 2.4, 3.5, 3.7, 4.2,
       1.6, 1.3, 1.9, 2.2, 4.2, 1.8, 4.8, 2.4, 1.7, 3. , 3.9, 2.5, 1.6,
       3.3, 4.5, 5. , 2.1, 1.6, 4.1, 2.5, 2.7, 2.6, 1.1, 3.1, 1. , 3.3,
       3.6, 1.7, 2.4, 4.3, 2.1, 1.7, 1.2, 4.8, 4.9, 2.6, 4.8, 1.8, 1.1,
       3.6, 2. , 3.5, 4.2, 1.5, 4.8, 1.5, 2.3, 4.9, 4.9, 2.6, 1.4, 1.8,
       3.4, 4.4, 2.7, 4.9, 3. , 2.8, 4.7, 2.5, 4.4, 3.8, 4.8, 3.5, 2.2,
       4.3, 1.6, 2.3, 4.1, 4. , 1.1, 3.6, 1.7, 4. , 2.2, 4.1, 1.4, 1. ,
       3.7, 1.6, 2.5, 1.5, 2.3, 2.2, 3.3, 4.7, 1.8, 1.6, 2.3, 4.1, 3.9,
       2.8, 4.6, 4.8, 2.2, 2.5, 1.3, 3.5, 1. , 4.1, 4.9, 4.1, 2.3, 3.8,
       1.6, 3.8, 1.1, 4.4, 2.2, 4.2, 3.4, 1.4, 1.2, 3.5, 3.5, 3. , 4.9,
       3.5, 3.6, 1.4, 1.9, 1.4, 3.3, 1.8, 4.1, 2.5, 3.2, 4.1, 4.2, 2.1,
       3.7, 3.2, 3.2, 4.1, 4.6, 1.5, 4.5, 4.1, 4.2, 1.3, 2.4, 1.

In [98]:
model.score(X_test, y_test)

1.0

In [97]:
 #Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
accuracy
# print(f'Accuracy: {accuracy:.2f}')

ValueError: continuous is not supported

In [None]:
# Function to recommend a book based on genre and average rating
def recommend_book(genre, avg_rating):
    genre_encoded = encoder.transform([[genre]]).toarray()
    input_data = pd.concat([pd.DataFrame(genre_encoded), pd.DataFrame([avg_rating])], axis=1)
    predicted_rating = model.predict(input_data)[0]
    # You can use predicted_rating to suggest books with similar genre and high predicted rating
    print(f"Predicted rating for a book in genre '{genre}' with average rating {avg_rating}: {predicted_rating:.2f}")
    # Implement your logic to suggest actual book titles based on predicted rating

# Example: Recommend a book
print(recommend_book('genre_0',''))

## Test

In [292]:
from sklearn.neighbors import NearestNeighbors
import pandas as pd

# Sample dataset
data = {
    'title': ['Book1', 'Book2', 'Book3', 'Book4'],
    'genre': ['Fiction', 'Non-Fiction', 'Fiction', 'Horror'],
    'average_rating': [4.2, 3.8, 4.0, 4.5]
}
df = pd.DataFrame(data)

# Encode genre
df['genre_encoded'] = df['genre'].astype('category').cat.codes

# Features
X = df[['genre_encoded', 'average_rating']]

# Model
knn = NearestNeighbors(n_neighbors=3)
knn.fit(X)

# Query
input_genre = 'Fiction'
input_rating = 4.1

# Encode input genre
input_genre_encoded = df[df['genre'] == input_genre]['genre_encoded'].values[0]

# Find nearest neighbors
distances, indices = knn.kneighbors([[input_genre_encoded, input_rating]])

# Output recommendations
recommended_books = df.iloc[indices[0]]['title'].values
print(recommended_books)


['Book3' 'Book1' 'Book4']


