### TF-idf Vectorization Model
- TF-idf stands for term frequency-inverse document frequency, and the tf-idf weight is a weight often used in information retrieval and text mining. This weight is a statistical measure used to evaluate how important a word is to a document in a collection or corpus.
- The importance increases proportionally to the number of times a word appears in the document but is offset by the frequency of the word in the corpus.
Cosine Similarity is a measure of similarity between two non-zero vectors of an inner product space that measures the cosine of the angle between them. The cosine of 0° is 1, and it is less than 1 for any other angle.

In [14]:
import pandas as pd
import numpy as np
import pickle, os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the Goodreads dataset
final_data = pd.read_csv('final_data.csv')

# Create a TF-IDF Vectorizer for the 'desc' column
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)

"""To check Output from above code: 
# print(f"Final Data Null Values: {final_data['Desc'].isnull().sum()}")
# print(f"Lenght of Final Data: {len(final_data)}")

# print(f"TfidfVectorizer: {tfidf_vectorizer}")

"""

# Replace NaN values with an empty string
final_data['Desc'] = final_data['Desc'].fillna('')

# Apply the TF-IDF vectorizer to the 'desc' column
tfidf_matrix_desc = tfidf_vectorizer.fit_transform(final_data['Desc'])

# print(f"tfidf_matrix_desc: {tfidf_matrix_desc}") # To check Output from above code
# Convert the data type to float32
tfidf_matrix_desc = tfidf_matrix_desc.astype(np.float32)

# Compute the cosine similarity matrix for book descriptions
cosine_sim_desc = linear_kernel(tfidf_matrix_desc, tfidf_matrix_desc)

# Save the model as a pickle file
if not os.path.exists('cosing_sim_desc.pkl'):
    with open('cosing_sim_desc.pkl', 'wb') as f:
        pickle.dump(cosine_sim_desc, f)



### Neural Collaborative Filtering (NCF) Model for Book Recommendation


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Flatten, Dense, Concatenate, Dropout
from tensorflow.keras.optimizers import Adam

# Load the Goodreads dataset
final_data = pd.read_csv('GoodReads_100k.csv')

### Simulate User-Book Ratings
Since the dataset does not contain user ratings, we simulate user-book interactions by generating random user IDs and ratings.

In [None]:
# Simulate user-book ratings
# Assume 1000 users and assign random ratings between 1 and 5 to each book by users
num_users = 1000
num_ratings = len(final_data)

# Generate random user IDs
user_ids = np.random.randint(1, num_users + 1, num_ratings)

# Generate random ratings
ratings = np.random.randint(1, 6, num_ratings)

# Add user IDs and ratings to the dataset
final_data['user_id'] = user_ids
final_data['rating'] = ratings


### Prepare the Dataset for NCF
We encode the user IDs and ISBNs as categorical variables to prepare the data for the NCF model. We then split the dataset into training and testing sets.

In [None]:
# Prepare the data for NCF
# Encode the user IDs and ISBNs
final_data['user_id'] = final_data['user_id'].astype(
    'category').cat.codes.values
final_data['ISBN'] = final_data['ISBN'].astype('category').cat.codes.values

# Select necessary columns for NCF
user_item_data = final_data[['user_id', 'ISBN', 'rating']]

# Split the data into training and testing sets
train, test = train_test_split(user_item_data, test_size=0.2, random_state=42)

### Define the NCF Model
We define the NCF model architecture, which includes user and item embeddings, concatenation of these embeddings, and a series of dense layers to predict user ratings.

In [None]:
# Define the NCF model
num_users = user_item_data['user_id'].nunique()
num_items = user_item_data['ISBN'].nunique()

# User input and embedding
user_input = Input(shape=(1,), name='user_input')
user_embedding = Embedding(
    input_dim=num_users, output_dim=50, name='user_embedding')(user_input)
user_vec = Flatten(name='user_flatten')(user_embedding)

# Item input and embedding
item_input = Input(shape=(1,), name='item_input')
item_embedding = Embedding(
    input_dim=num_items, output_dim=50, name='item_embedding')(item_input)
item_vec = Flatten(name='item_flatten')(item_embedding)

# Concatenate the embeddings
concat = Concatenate(name='concat')([user_vec, item_vec])
dense = Dense(128, activation='relu', name='dense1')(concat)
dropout = Dropout(0.3, name='dropout')(dense)
output = Dense(1, activation='linear', name='output')(dropout)

# Compile the model
model = Model([user_input, item_input], output)
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='mean_squared_error', metrics=['mae']) # mae = Mean Absolute Error

### Train & Save the NCF Model
We train the NCF model using the training data and evaluate its performance on the testing data.

In [None]:
# Train the model
history = model.fit([train['user_id'], train['ISBN']], train['rating'],
                    validation_data=(
                        [test['user_id'], test['ISBN']], test['rating']),
                    epochs=10, batch_size=64, verbose=1)

# Save the model
model.save('ncf_model.h5')

# Evaluate the model
loss, mae = model.evaluate(
    [test['user_id'], test['ISBN']], test['rating'], verbose=0)
print(f'NCF Model Test MAE: {mae}')

# Save the final_data to a CSV file
final_data.to_csv("final_data_with_ratings.csv", index=False)

# Save the model architecture and weights
model.save('ncf_model.h5')



Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


NCF Model Test MAE: 1.2234209775924683


### CNN Model
#### Notes:

* Data Preprocessing:

We fill any missing descriptions with an empty string.
Tokenize the text descriptions and convert them into sequences of integers.
Pad these sequences to ensure they all have the same length for input into the CNN.

* Simulate User-Book Interactions:

Generate random user IDs and ratings to simulate user interactions with books.

* Split the Data:

Split the data into training and testing sets to evaluate the model's performance.

* Define the CNN Model:

The CNN model includes an embedding layer to convert word indices to dense vectors of fixed size.
A Conv1D layer to apply convolutional operations on the text data.
A GlobalMaxPooling1D layer to reduce the output size and capture the most important features.
Dense layers to learn non-linear combinations of the features, and a dropout layer to prevent overfitting.
Train and Evaluate the Model:

Train the CNN model on the training data and evaluate its performance on the testing data.
Save the trained model for future use. 

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

# Load the Goodreads dataset
final_data = pd.read_csv('GoodReads_100k.csv')

# Preprocess the data
# Fill NaN descriptions with an empty string
final_data['Desc'] = final_data['Desc'].fillna('')

# Tokenize the descriptions
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')
tokenizer.fit_on_texts(final_data['Desc'])
sequences = tokenizer.texts_to_sequences(final_data['Desc'])
word_index = tokenizer.word_index

# Pad the sequences to ensure uniform input size
max_length = 500  # You can choose an appropriate max length based on the data
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Create user-book interactions for the model
num_users = 1000
num_ratings = len(final_data)

# Generate random user IDs
user_ids = np.random.randint(1, num_users + 1, num_ratings)

# Generate random ratings
ratings = np.random.randint(1, 6, num_ratings)

# Add user IDs and ratings to the dataset
final_data['user_id'] = user_ids
final_data['rating'] = ratings

# Encode the user IDs and ISBNs
final_data['user_id'] = final_data['user_id'].astype('category').cat.codes.values
final_data['ISBN'] = final_data['ISBN'].astype('category').cat.codes.values

# Split the data into training and testing sets
train, test = train_test_split(final_data, test_size=0.2, random_state=42)

# Prepare input data for the CNN model
X_train = padded_sequences[train.index]
X_test = padded_sequences[test.index]
y_train = train['rating'].values
y_test = test['rating'].values

# Define the CNN model
embedding_dim = 100

model = Sequential([
    Embedding(input_dim=len(word_index) + 1, output_dim=embedding_dim, input_length=max_length),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='linear')
])

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

# Train the model
history = model.fit(X_train, y_train, 
                    validation_data=(X_test, y_test),
                    epochs=10, batch_size=64, verbose=1)

# Save the model
model.save('cnn_model.h5')

# Evaluate the model
loss, mae = model.evaluate(X_test, y_test, verbose=0)
print(f'CNN Model Test MAE: {mae}')



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


  saving_api.save_model(


CNN Model Test MAE: 1.3479958772659302


### Transformer Based Model

1. Data Preprocessing:

   * Fill any missing descriptions with an empty string.
   * Tokenize the text descriptions using DistilBertTokenizer.

2. Simulate User-Book Interactions:

   * Generate random user IDs and ratings to simulate user interactions with books.

3. Split the Data:

   * Split the data into training and testing sets to evaluate the model's performance.

4. Prepare the Tokenizer and Encode the Text:

   * Tokenize the descriptions and create TensorFlow datasets for training and testing.

5. Define the Transformer-based Model:

   * Use a pre-trained DistilBERT model to encode the book descriptions.
   * Define the model architecture with additional dense and dropout layers for prediction.

6. Train and Evaluate the Model:

   * Train the Transformer-based model on the training data and evaluate its performance on the testing data.
   * Save the trained model for future use.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertModel
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout

def train_transformer_model():
    """
    Train a transformer-based model for book recommendation.

    This function loads the Goodreads dataset, preprocesses the data, generates random user-book ratings,
    encodes the user IDs and ISBNs, splits the data into training and testing sets, prepares the tokenizer,
    tokenizes the descriptions, converts the data to TensorFlow datasets, loads a pre-trained DistilBERT model,
    defines a transformer-based model, compiles the model, trains the model, saves the model, and evaluates the model.

    Returns:
    - history: Training history of the model.
    - loss: Loss value of the model on the test dataset.
    - mae: Mean absolute error of the model on the test dataset.
    """

    # Load the Goodreads dataset
    final_data = pd.read_csv('GoodReads_100k.csv')

    # Preprocess the data
    final_data['Desc'] = final_data['Desc'].fillna('')

    # Simulate user-book ratings
    num_users = 1000
    num_ratings = len(final_data)

    # Generate random user IDs
    user_ids = np.random.randint(1, num_users + 1, num_ratings)

    # Generate random ratings
    ratings = np.random.randint(1, 6, num_ratings)

    # Add user IDs and ratings to the dataset
    final_data['user_id'] = user_ids
    final_data['rating'] = ratings

    # Encode the user IDs and ISBNs
    final_data['user_id'] = final_data['user_id'].astype('category').cat.codes.values
    final_data['ISBN'] = final_data['ISBN'].astype('category').cat.codes.values

    # Split the data into training and testing sets
    train, test = train_test_split(final_data, test_size=0.2, random_state=42)

    # Prepare the tokenizer
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    # Tokenize the descriptions
    train_encodings = tokenizer(train['Desc'].tolist(), truncation=True, padding=True, max_length=512)
    test_encodings = tokenizer(test['Desc'].tolist(), truncation=True, padding=True, max_length=512)

    # Convert to TensorFlow datasets
    def create_tf_dataset(encodings, labels):
        dataset = tf.data.Dataset.from_tensor_slices((
            dict(encodings),
            labels
        ))
        return dataset

    train_dataset = create_tf_dataset(train_encodings, train['rating'].values).batch(16)
    test_dataset = create_tf_dataset(test_encodings, test['rating'].values).batch(16)

    # Load the pre-trained DistilBERT model
    transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased')

    # Define the Transformer-based model
    input_ids = Input(shape=(512,), dtype=tf.int32, name='input_ids')
    attention_mask = Input(shape=(512,), dtype=tf.int32, name='attention_mask')

    transformer_output = transformer_model(input_ids, attention_mask=attention_mask)
    hidden_state = transformer_output.last_hidden_state
    pooled_output = hidden_state[:, 0]

    dense = Dense(128, activation='relu')(pooled_output)
    dropout = Dropout(0.3)(dense)
    output = Dense(1, activation='linear')(dropout)

    model = Model(inputs=[input_ids, attention_mask], outputs=output)

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5), loss='mean_squared_error', metrics=['mae'])

    # Train the model
    history = model.fit(train_dataset, validation_data=test_dataset, epochs=1)

    # Save the model
    model.save('transformer_model.h5')

    # Evaluate the model
    loss, mae = model.evaluate(test_dataset, verbose=0)
    print(f'Transformer Model Test MAE: {mae}')

    return history, loss, mae

history, loss, mae = train_transformer_model()






Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.




  24/3682 [..............................] - ETA: 73:04:19 - loss: 3.4622 - mae: 1.5500

### Hybrid Model

In [21]:
import pandas as pd
import numpy as np
import pickle, os
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Load the final data
df = pd.read_csv('final_data_with_ratings.csv')

# load the model
cosine_sim = pickle.load(open('cosing_sim_desc.pkl', 'rb'))
cnn_model = tf.keras.models.load_model('cnn_model.h5')
ncf_model = tf.keras.models.load_model('ncf_model.h5')

def hybrid_recommendation(book_name, top_n=10):
    # Cosine Similarity recommendations
    idx = df[df['Title'].str.contains(book_name, case=False, na=False)].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    cosine_recs = [i[0] for i in sim_scores[1:top_n+1]]
    
    # NCF recommendations (dummy user for demo purposes)
    user_input = np.array([0] * top_n)
    item_input = np.array(cosine_recs)
    ncf_preds = ncf_model.predict([user_input, item_input]).flatten()
    
    # CNN recommendations
    # Tokenization
    tokenizer = Tokenizer(num_words=5000)
    tokenizer.fit_on_texts(df['Title'])
    text_input = tokenizer.texts_to_sequences([book_name] * top_n)
    text_input = pad_sequences(text_input, maxlen=100)
    cnn_preds = cnn_model.predict(text_input).flatten()
    
    # Aggregate and rank recommendations
    combined_scores = ncf_preds + cnn_preds
    recommended_indices = np.argsort(combined_scores)[-top_n:]
    
    recommended_books = df.iloc[recommended_indices]['Title'].values
    return recommended_books

# Example usage
book_name = "Clojure Programming"
recommendations = hybrid_recommendation(book_name)
print(f"Recommendations for '{book_name}':")
for i, book in enumerate(recommendations):
    print(f"{i+1}. {book}")


  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 219ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
Recommendations for 'Clojure Programming':
1. The Human Equation: Building Profits by Putting People First
2. Fashion Sourcebook 1920s
3. All-American Anarchist: Joseph A. Labadie and the Labor Movement
4. Hawaii: An Uncommon History
5. Hungary 56
6. Genuine Happiness: Meditation as the Path to Fulfillment
7. Anarchism And Ecology
8. Anthropological Studies of Religion: An Introductory Text
9. Competitive Advantage Through People: Unleashing the Power of the Work Force
10. Between Two Fires: American Indians in the Civil War
