In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import pickle

# ---------------------------------------------------------------------------
# Fit a random-forest model on the book catalogue and pickle it together with
# the label encoders that were fitted on the text columns.
# ---------------------------------------------------------------------------

# Read the catalogue and discard the date/identifier columns not used below
df = pd.read_csv('books.csv').drop(
    columns=['publication_date', 'bookID', 'isbn', 'isbn13']
)

# Report the size and per-column null counts of the frame as loaded
print("Initial DataFrame shape:", df.shape)
print("Missing values in each column:\n", df.isnull().sum())

# Columns fed to the model; a row missing any of them is removed
columns = [
    'average_rating', 'num_pages', 'ratings_count', 'text_reviews_count',
    'authors', 'language_code', 'publisher',
]
df.dropna(subset=columns, inplace=True)
print("DataFrame shape after dropping NaN values:", df.shape)

# One encoder per text column; each is pickled further down under its own name
label_encoder_authors = LabelEncoder()
label_encoder_language_code = LabelEncoder()
label_encoder_publisher = LabelEncoder()
label_encoder_title = LabelEncoder()

# Replace every text column by its integer codes. The encoders are fitted on
# the whole frame, i.e. before the train/test split below.
for text_column, encoder in (
    ('authors', label_encoder_authors),
    ('language_code', label_encoder_language_code),
    ('publisher', label_encoder_publisher),
    ('title', label_encoder_title),
):
    df[text_column] = encoder.fit_transform(df[text_column])

# Null counts again, now that every column is numeric
print("Remaining NaN values in DataFrame:\n", df.isnull().sum())

# Inputs are the columns listed above; the target is the encoded title
X = df[columns]
y = df['title']

print("Features shape:", X.shape)
print("Target shape:", y.shape)

if X.shape[0] == 0:
    print("Error: No samples available for training after preprocessing.")
else:
    # 80/20 split with a fixed seed so the partition is repeatable
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # NOTE(review): `y` holds LabelEncoder codes of `title`, so the regressor is
    # fitted to integer category codes rather than a measured quantity, and
    # X_test / y_test are not used again in this cell -- confirm this is the
    # intended setup before relying on model.pkl.
    model = RandomForestRegressor(n_estimators=10, random_state=42)
    model.fit(X_train, y_train)

    # Persist the model first, then each encoder, one pickle file per object
    artifacts = (
        ('model.pkl', model),
        ('label_encoder_authors.pkl', label_encoder_authors),
        ('label_encoder_language_code.pkl', label_encoder_language_code),
        ('label_encoder_publisher.pkl', label_encoder_publisher),
        ('label_encoder_title.pkl', label_encoder_title),
    )
    for pickle_path, artifact in artifacts:
        with open(pickle_path, 'wb') as pickle_file:
            pickle.dump(artifact, pickle_file)

    print("Model and label encoders saved to 'model.pkl', 'label_encoder_authors.pkl', 'label_encoder_language_code.pkl', 'label_encoder_publisher.pkl', and 'label_encoder_title.pkl'")


Initial DataFrame shape: (11040, 8)
Missing values in each column:
 title                 0
authors               0
average_rating        0
language_code         0
num_pages             0
ratings_count         0
text_reviews_count    0
publisher             0
dtype: int64
DataFrame shape after dropping NaN values: (11040, 8)
Remaining NaN values in DataFrame:
 title                 0
authors               0
average_rating        0
language_code         0
num_pages             0
ratings_count         0
text_reviews_count    0
publisher             0
dtype: int64
Features shape: (11040, 7)
Target shape: (11040,)
Model and label encoders saved to 'model.pkl', 'label_encoder_authors.pkl', 'label_encoder_language_code.pkl', 'label_encoder_publisher.pkl', and 'label_encoder_title.pkl'


In [15]:
import pandas as pd

# Re-read the raw catalogue so this cell does not depend on earlier cells
df = pd.read_csv('books.csv')

# Distinct values found in the language_code column, in order of first appearance
unique_language_codes = pd.unique(df['language_code'])

print("Unique Language Codes:", unique_language_codes)


Unique Language Codes: ['en-GB' 'eng' 'en-US' 'spa' 'fre' 'swe' 'ara' 'grc' 'ale' 'ger' 'lat'
 'gla' 'en-CA' 'por' 'nl' 'mul' 'rus' 'glg' 'jpn' 'enm' 'nor' 'srp' 'msa'
 'ita' 'wel' 'tur']


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.metrics.pairwise import cosine_similarity

# Read the raw catalogue
df = pd.read_csv('books.csv')

# Parse publication_date. Values that cannot be parsed are coerced to NaT and
# those rows are removed; the surviving dates are converted to float timestamps.
parsed_dates = pd.to_datetime(df['publication_date'], errors='coerce')
df['publication_date'] = parsed_dates
df.dropna(subset=['publication_date'], inplace=True)
df['publication_date'] = df['publication_date'].apply(lambda stamp: stamp.timestamp())

# Integer-code the authors column. The codes stay in `df` only: they are not
# part of the feature matrix assembled below.
label_encoder_authors = LabelEncoder().fit(df['authors'])
df['authors'] = label_encoder_authors.transform(df['authors'])

# One-hot encode language_code and publisher as a dense array; categories not
# seen during fitting are ignored at transform time (handle_unknown='ignore').
onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
onehot_encoder.fit(df[['language_code', 'publisher']])
onehot_encoded_features = onehot_encoder.transform(df[['language_code', 'publisher']])

# Column labels generated for the one-hot block
onehot_feature_names = onehot_encoder.get_feature_names_out(['language_code', 'publisher'])

# Standardise the numeric columns, overwriting them in `df`
numerical_features = ['average_rating', 'num_pages', 'ratings_count', 'text_reviews_count']
scaler = StandardScaler()
df[numerical_features] = scaler.fit_transform(df[numerical_features])

# Names of the similarity features: scaled numerics first, then the one-hot block
features = numerical_features + list(onehot_feature_names)

# Feature matrix in that column order, and the same data as a labelled frame.
# df_encoded gets a fresh 0..n-1 index, so its rows line up with `df` by
# position rather than by index label.
X = np.concatenate((df[numerical_features].to_numpy(), onehot_encoded_features), axis=1)
df_encoded = pd.DataFrame(X, columns=features)

# bookID -> title lookup used to turn recommended rows into titles
id_to_title = {book_id: title for book_id, title in zip(df['bookID'], df['title'])}
def recommend_books(input_data, df_encoded, cosine_sim, label_encoder_authors, onehot_encoder, top_n=5):
    """Return the titles of the ``top_n`` catalogue books most similar to ``input_data``.

    The query is projected into the same feature space as ``df_encoded``
    (scaled numeric values followed by the one-hot block) and ranked against
    every catalogue row by cosine similarity.

    Parameters
    ----------
    input_data : sequence
        ``input_data[0]`` is the author, ``input_data[1:5]`` are the four
        numeric values handed to the module-level ``scaler`` (in the order it
        was fitted with), and ``input_data[5:7]`` are the two categorical
        values handed to ``onehot_encoder``.
    df_encoded : pandas.DataFrame or array-like
        Encoded catalogue, one row per book, positionally aligned with the
        module-level ``df``.
    cosine_sim : any
        Unused; kept so existing callers keep working.
    label_encoder_authors : any
        Unused; kept so existing callers keep working. The encoded author is
        not a column of ``df_encoded``, so it cannot take part in the
        similarity (including it made the query one element wider than the
        catalogue rows).
    onehot_encoder : fitted encoder
        Object whose ``transform`` encodes the two categorical values.
    top_n : int, default 5
        Number of titles to return.

    Returns
    -------
    list
        Titles ordered from most to least similar; ties keep catalogue order.

    Raises
    ------
    ValueError
        If the encoded query does not have as many features as ``df_encoded``.

    Notes
    -----
    Reads the module-level names ``scaler``, ``df`` and ``id_to_title``.
    """
    # Hand the transformers DataFrames carrying the column names they were
    # fitted with (when they recorded any), so scikit-learn does not warn
    # about missing feature names.
    numeric_frame = pd.DataFrame(
        [list(input_data[1:5])],
        columns=getattr(scaler, "feature_names_in_", None),
    )
    numeric_scaled = np.asarray(scaler.transform(numeric_frame), dtype=float)

    categorical_frame = pd.DataFrame(
        [list(input_data[5:7])],
        columns=getattr(onehot_encoder, "feature_names_in_", None),
    )
    categorical_encoded = onehot_encoder.transform(categorical_frame)
    if hasattr(categorical_encoded, "toarray"):
        # Encoder configured for sparse output: densify the single row
        categorical_encoded = categorical_encoded.toarray()
    categorical_encoded = np.asarray(categorical_encoded, dtype=float)

    # Same column order as the catalogue: numeric features, then one-hot block
    query_vector = np.hstack((numeric_scaled, categorical_encoded))

    catalogue = np.asarray(df_encoded, dtype=float)
    if catalogue.shape[0] == 0:
        return []
    if query_vector.shape[1] != catalogue.shape[1]:
        raise ValueError(
            "Encoded input has %d features but df_encoded has %d columns"
            % (query_vector.shape[1], catalogue.shape[1])
        )

    # Only the query-vs-catalogue similarities are needed, so compute a single
    # 1 x N row instead of the full (N+1) x (N+1) matrix.
    similarities = cosine_similarity(query_vector, catalogue)[0]

    # Highest similarity first; the stable sort keeps catalogue order on ties
    ranked_positions = np.argsort(-similarities, kind="stable")[:top_n]

    # Map row positions back to book ids, then to titles
    return [id_to_title[book_id] for book_id in df['bookID'].iloc[ranked_positions]]

