### Pre-Processing of GoodReads_100K
* To process GoodReads_100K.csv, first download the dataset from Kaggle: https://www.kaggle.com/datasets/mdhamani/goodreads-books-100k/
* Run this code to make final_data.csv

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the raw Kaggle export into a DataFrame.
# Forward slashes keep the path portable: the previous "dataset\GoodReads_100k.csv"
# relied on the invalid escape sequence "\G" and only resolved on Windows.
data = pd.read_csv("dataset/GoodReads_100k.csv")
# A bare expression in the middle of a cell displays nothing, so print explicitly.
print(f"Loaded {len(data)} rows x {data.shape[1]} columns")

# Remove exact duplicate rows.
data = data.drop_duplicates()

# Handle outliers: keep only rows whose rating lies in the closed range [1, 10]
# (rows with a missing rating are dropped as well, since NaN fails the test).
data = data[data['rating'].between(1, 10)]

# Keep only the columns needed downstream and give them their display names.
# The dict order defines the column order of the output file.
column_names = {
    'isbn': 'ISBN',
    'title': 'Title',
    'author': 'Author',
    'rating': 'Rating',
    'reviews': 'No. of ratings',
    'img': 'Image',
    'desc': 'Desc',
    'genre': 'Genre',
    'pages': 'Pages',
}
data = data[list(column_names)].rename(columns=column_names)

# Report the number of null values per column, then drop every row that is
# missing any of the fields required downstream.
print(data.isnull().sum())
data = data.dropna(subset=['Genre', 'Desc', 'Image', 'ISBN']).reset_index(drop=True)

# Keep only books with at least 50 ratings.
final_data = data[data['No. of ratings'] >= 50].reset_index(drop=True)

# Save the cleaned dataset next to the raw one.
final_data.to_csv('dataset/final_data.csv', index=False)

* Run this code to make `final_data_with_ratings.csv` file from `final_data.csv`

In [None]:
import numpy as np

import os

# Load the cleaned Goodreads dataset from the dataset/ folder, which is where
# the pre-processing step saves it (reading 'final_data.csv' from the working
# directory fails unless the file was moved by hand).
final_data = pd.read_csv('dataset/final_data.csv')

# Simulate user-book ratings.
# Each book (row) receives exactly one simulated rating: a random user drawn
# from 1..num_users and a random score drawn from 1..5.
num_users = 1000
num_ratings = len(final_data)

# Generate random user IDs (upper bound of randint is exclusive).
user_ids = np.random.randint(1, num_users + 1, num_ratings)

# Generate random ratings in 1..5 (upper bound of randint is exclusive).
ratings = np.random.randint(1, 6, num_ratings)

# Add user IDs and ratings to the dataset. The simulated score goes into a new
# lowercase 'rating' column.
final_data['user_id'] = user_ids
final_data['rating'] = ratings

# Prepare the data for NCF.
# Replace user IDs and ISBNs with their integer category codes.
final_data['user_id'] = final_data['user_id'].astype('category').cat.codes
final_data['ISBN'] = final_data['ISBN'].astype('category').cat.codes

# Save the final_data to a CSV file. to_csv does not create missing
# directories, so make sure the target folder exists first.
os.makedirs("model", exist_ok=True)
final_data.to_csv("model/final_data_with_ratings.csv", index=False)