## Colloborative filtering using Neural Collaborative Filtering [NCF]

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load your datasets
reviews_data = pd.read_csv('/Users/yogesh/Study/Recommender_Systems/Reviews/combined_file.csv', dtype=str)
product_info_data = pd.read_csv('/Users/yogesh/Study/Recommender_Systems/product_info.csv', dtype=str)

# Clean and preprocess data
reviews_data['rating'] = reviews_data['rating'].astype(float)
reviews_data = reviews_data[['author_id', 'product_id', 'rating']]

# Encode user IDs and product IDs
user_encoder = LabelEncoder()
product_encoder = LabelEncoder()

reviews_data['user'] = user_encoder.fit_transform(reviews_data['author_id'])
reviews_data['product'] = product_encoder.fit_transform(reviews_data['product_id'])

# Create training and testing datasets
train_data, test_data = train_test_split(reviews_data, test_size=0.2, random_state=42)

# Prepare user-product interaction matrix
num_users = reviews_data['user'].nunique()
num_products = reviews_data['product'].nunique()


In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models

# Define the model
def create_ncf_model(num_users, num_products):
    user_input = layers.Input(shape=(1,), name='user_input')
    product_input = layers.Input(shape=(1,), name='product_input')
    
    # Embedding layers
    user_embedding = layers.Embedding(input_dim=num_users, output_dim=16)(user_input)
    product_embedding = layers.Embedding(input_dim=num_products, output_dim=16)(product_input)

    # Flatten the embeddings
    user_vecs = layers.Flatten()(user_embedding)
    product_vecs = layers.Flatten()(product_embedding)

    # Concatenate the two embeddings
    concat = layers.Concatenate()([user_vecs, product_vecs])

    # Dense layers
    dense = layers.Dense(64, activation='relu')(concat)
    dense = layers.Dense(32, activation='relu')(dense)
    
    # Output layer
    output = layers.Dense(1)(dense)

    model = models.Model(inputs=[user_input, product_input], outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Create the model
ncf_model = create_ncf_model(num_users, num_products)


In [3]:
# Train the model
X_train = [train_data['user'].values, train_data['product'].values]
y_train = train_data['rating'].values

ncf_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1)


Epoch 1/10
[1m38126/38126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m678s[0m 18ms/step - loss: nan - val_loss: nan
Epoch 2/10
[1m38126/38126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m684s[0m 18ms/step - loss: nan - val_loss: nan
Epoch 3/10
[1m38126/38126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m693s[0m 18ms/step - loss: nan - val_loss: nan
Epoch 4/10
[1m38126/38126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m698s[0m 18ms/step - loss: nan - val_loss: nan
Epoch 5/10
[1m38126/38126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m682s[0m 18ms/step - loss: nan - val_loss: nan
Epoch 6/10
[1m38126/38126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m684s[0m 18ms/step - loss: nan - val_loss: nan
Epoch 7/10
[1m38126/38126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m686s[0m 18ms/step - loss: nan - val_loss: nan
Epoch 8/10
[1m38126/38126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m686s[0m 18ms/step - loss: nan - val_loss: nan
Epoch 9/10
[1m38126/38126[0m 

<keras.src.callbacks.history.History at 0x3650690d0>

In [35]:
def get_recommendations(user_id, model, num_recommendations=5):
    user_idx = user_encoder.transform([user_id])[0]
    product_indices = range(num_products)

    # Create input for prediction
    user_input = tf.constant([user_idx] * num_products)
    product_input = tf.constant(product_indices)

    # Predict ratings
    predicted_ratings = model.predict([user_input, product_input])

    # Get the top N recommendations
    recommended_product_indices = predicted_ratings.flatten().argsort()[-num_recommendations:][::-1]
    recommended_products = product_encoder.inverse_transform(recommended_product_indices)
    
    return recommended_products





In [36]:
# Example usage
user_id = '2190293206'  # replace with a valid author_id
recommendations = get_recommendations(user_id, ncf_model)
print("Recommended products for user:", recommendations)

[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242us/step
Recommended products for user: [nan 'P443358' 'P443830' 'P443829' 'P443812']


In [46]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate model performance on test data
def evaluate_model(model, test_data, threshold=3):
    # Get test user and product inputs
    X_test = [test_data['user'].values, test_data['product'].values]
    
    # Get actual ratings
    y_true = test_data['rating'].values
    
    # Get model predictions
    y_pred = model.predict(X_test).flatten()
    
    # Apply threshold to predictions and true labels (converting to binary labels)
    y_pred_binary = np.where(y_pred >= threshold, 1, 0)
    y_true_binary = np.where(y_true >= threshold, 1, 0)

    # Calculate performance metrics
    accuracy = accuracy_score(y_true_binary, y_pred_binary)
    precision = precision_score(y_true_binary, y_pred_binary, zero_division=1)
    recall = recall_score(y_true_binary, y_pred_binary, zero_division=1)
    f1 = f1_score(y_true_binary, y_pred_binary, zero_division=1)
    
    # Print results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    
    return accuracy, precision, recall, f1

# Evaluate the model on the test data
evaluate_model(ncf_model, test_data)


[1m21182/21182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 225us/step
Accuracy: 0.3433
Precision: 1.0000
Recall: 0.0000
F1 Score: 0.0000


(0.34331522557001093, 1.0, 0.0, 0.0)

## Colloborative filtering using [Neutralised matrix factorisation]

In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Load datasets
reviews_data = pd.read_csv('/Users/yogesh/Study/Recommender_Systems/Reviews/combined_file.csv', dtype=str)
product_info_data = pd.read_csv('/Users/yogesh/Study/Recommender_Systems/product_info.csv', dtype=str)

# Extract relevant columns
data = reviews_data[['author_id', 'product_id', 'rating']]

# Convert rating to numeric
data['rating'] = pd.to_numeric(data['rating'])

# Encode user IDs and product IDs
user_encoder = LabelEncoder()
data['user_id'] = user_encoder.fit_transform(data['author_id'])

product_encoder = LabelEncoder()
data['product_id'] = product_encoder.fit_transform(data['product_id'])

# Create a final dataset with user_id, product_id, and rating
final_data = data[['user_id', 'product_id', 'rating']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['rating'] = pd.to_numeric(data['rating'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['user_id'] = user_encoder.fit_transform(data['author_id'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['product_id'] = product_encoder.fit_transform(data['product_id'])


In [3]:
train_data, test_data = train_test_split(final_data, test_size=0.2, random_state=42)


In [4]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Get number of users and products
num_users = data['user_id'].nunique()
num_products = data['product_id'].nunique()

# Hyperparameters
embedding_size = 8

# Define the NeuMF model
def create_neumf_model(num_users, num_products, embedding_size):
    # Input layers
    user_input = layers.Input(shape=(1,), name='user_input')
    product_input = layers.Input(shape=(1,), name='product_input')
    
    # User and product embeddings
    user_embedding = layers.Embedding(input_dim=num_users, output_dim=embedding_size)(user_input)
    product_embedding = layers.Embedding(input_dim=num_products, output_dim=embedding_size)(product_input)
    
    # Flatten the embeddings
    user_vecs = layers.Flatten()(user_embedding)
    product_vecs = layers.Flatten()(product_embedding)
    
    # Concatenate the vectors
    merged_vecs = layers.Concatenate()([user_vecs, product_vecs])
    
    # MLP layers
    x = layers.Dense(128, activation='relu')(merged_vecs)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dense(32, activation='relu')(x)
    
    # Output layer
    output = layers.Dense(1)(x)
    
    # Create the model
    model = keras.Model(inputs=[user_input, product_input], outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Instantiate the model
neumf_model = create_neumf_model(num_users, num_products, embedding_size)


In [5]:
# Prepare training data
X_train = [train_data['user_id'].values, train_data['product_id'].values]
y_train = train_data['rating'].values

# Prepare test data
X_test = [test_data['user_id'].values, test_data['product_id'].values]
y_test = test_data['rating'].values


In [6]:
# Train the model
neumf_model.fit(X_train, y_train, epochs=1, batch_size=128, validation_data=(X_test, y_test))


[1m21182/21182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 10ms/step - loss: nan - val_loss: nan


<keras.src.callbacks.history.History at 0x38bead4c0>

In [8]:
import numpy as np
import pandas as pd

# Assuming you have your trained NeuMF model as `neumf_model`
# And your dataset has been encoded with user_ids and product_ids

# Define the specific user ID for whom you want recommendations
specific_user_id = 0  # Replace with the actual user ID you want to check

# Get all product IDs (assuming they are encoded)
product_ids = final_data['product_id'].unique()

# Create user-product pairs for the specific user
user_product_pairs = np.array([[specific_user_id, product_id] for product_id in product_ids])

# Predict ratings for the user-product pairs
predicted_ratings = neumf_model.predict([user_product_pairs[:, 0], user_product_pairs[:, 1]])

# Create a DataFrame for the predicted ratings
user_recommendations_df = pd.DataFrame(predicted_ratings, columns=['predicted_rating'])
user_recommendations_df['user_id'] = specific_user_id
user_recommendations_df['product_id'] = user_product_pairs[:, 1]

# Get the top N recommendations for the specific user
top_n = 5  # Specify the number of recommendations
top_recommendations = user_recommendations_df.nlargest(top_n, 'predicted_rating')

# If you want to map back to original product IDs (if encoded)
top_recommendations['product_id'] = product_encoder.inverse_transform(top_recommendations['product_id'].astype(int))

# Display recommendations for the specific user
print(f"Recommendations for User ID {specific_user_id}:")
if top_recommendations.empty:
    print("No recommendations found.")
else:
    for _, row in top_recommendations.iterrows():
        print(f"- Product ID: {row['product_id']}")


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 276us/step
Recommendations for User ID 0:
- Product ID: P443842
- Product ID: P448802
- Product ID: P479841
- Product ID: P500757
- Product ID: P418624


In [13]:
import numpy as np
import pandas as pd

# Assuming you have your trained NeuMF model as `neumf_model`
# And your dataset has been encoded with user_ids and product_ids

# Define the specific user ID for whom you want recommendations
specific_user_id = 0  # Replace with the actual user ID you want to check

# Get all product IDs (assuming they are encoded)
product_ids = final_data['product_id'].unique()

# Create user-product pairs for the specific user
user_product_pairs = np.array([[specific_user_id, product_id] for product_id in product_ids])

# Predict ratings for the user-product pairs
predicted_ratings = neumf_model.predict([user_product_pairs[:, 0], user_product_pairs[:, 1]])

# Create a DataFrame for the predicted ratings
user_recommendations_df = pd.DataFrame(predicted_ratings, columns=['predicted_rating'])
user_recommendations_df['user_id'] = specific_user_id
user_recommendations_df['product_id'] = user_product_pairs[:, 1]

# Get the top N recommendations for the specific user
top_n = 5  # Specify the number of recommendations
top_recommendations = user_recommendations_df.nlargest(top_n, 'predicted_rating')

# If you want to map back to original product IDs (if encoded)
top_recommendations['product_id'] = product_encoder.inverse_transform(top_recommendations['product_id'].astype(int))

# Display recommendations for the specific user
print(f"Recommendations for User ID {specific_user_id}:")
if top_recommendations.empty:
    print("No recommendations found.")
else:
    for _, row in top_recommendations.iterrows():
        print(f"- Product ID: {row['product_id']} with predicted rating: {row['predicted_rating']:.2f}")


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 275us/step
Recommendations for User ID 0:
- Product ID: P443842 with predicted rating: nan
- Product ID: P448802 with predicted rating: nan
- Product ID: P479841 with predicted rating: nan
- Product ID: P500757 with predicted rating: nan
- Product ID: P418624 with predicted rating: nan


In [14]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming you have your trained NeuMF model as `neumf_model`
# And your dataset has been encoded with user_ids and product_ids

# Define the specific user ID for whom you want recommendations
specific_user_id = 10  # Replace with the actual user ID you want to check

# Get all product IDs (assuming they are encoded)
product_ids = final_data['product_id'].unique()

# Create user-product pairs for the specific user
user_product_pairs = np.array([[specific_user_id, product_id] for product_id in product_ids])

# Predict ratings for the user-product pairs
predicted_ratings = neumf_model.predict([user_product_pairs[:, 0], user_product_pairs[:, 1]])

# Create a DataFrame for the predicted ratings
user_recommendations_df = pd.DataFrame(predicted_ratings, columns=['predicted_rating'])
user_recommendations_df['user_id'] = specific_user_id
user_recommendations_df['product_id'] = user_product_pairs[:, 1]

# For evaluation, let's assume you have actual ratings in a DataFrame `actual_ratings_df`
# which contains columns ['user_id', 'product_id', 'rating']
actual_ratings_df = final_data[final_data['user_id'] == specific_user_id]

# Merge predicted ratings with actual ratings
merged_df = user_recommendations_df.merge(actual_ratings_df[['product_id', 'rating']], on='product_id', how='left')

# Define a threshold for positive predictions
threshold = 3  # Define a threshold rating

# Generate binary predictions based on the threshold
merged_df['predicted_positive'] = (merged_df['predicted_rating'] >= threshold).astype(int)
merged_df['actual_positive'] = (merged_df['rating'] >= threshold).astype(int)

# Calculate precision, recall, and F1 score
precision = precision_score(merged_df['actual_positive'], merged_df['predicted_positive'], zero_division=0)
recall = recall_score(merged_df['actual_positive'], merged_df['predicted_positive'], zero_division=0)
f1 = f1_score(merged_df['actual_positive'], merged_df['predicted_positive'], zero_division=0)

# Display the results
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")


[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 301us/step
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000


## COLLOBORATIVE FILTERING USING SVD

In [1]:
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load datasets
reviews_data = pd.read_csv('/Users/yogesh/Study/Recommender_Systems/Reviews/combined_file.csv', dtype=str)

In [2]:
# Convert ratings to numeric
reviews_data['rating'] = pd.to_numeric(reviews_data['rating'], errors='coerce')
reviews_data.dropna(subset=['rating'], inplace=True)

In [3]:
# Prepare the data for Surprise
reader = Reader(rating_scale=(1, 5))  # Assuming rating scale is from 1 to 5
data = Dataset.load_from_df(reviews_data[['author_id', 'product_id', 'rating']], reader)

In [4]:
# Split the dataset into training and testing sets
trainset, testset = train_test_split(data, test_size=0.2)


In [5]:
# Use SVD for collaborative filtering
model = SVD()
model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x345ddc920>

In [6]:
# Make predictions on the test set
predictions = model.test(testset)

In [13]:
product_info_data = pd.read_csv('/Users/yogesh/Study/Recommender_Systems/product_info.csv', dtype=str)



In [14]:
# Function to get product names from product_ids
def get_product_name(product_id):
    product_info_row = product_info_data[product_info_data['product_id'] == product_id]
    if not product_info_row.empty:
        return product_info_row['product_name'].values[0]
    return None

In [15]:
# Function to get product recommendations for a user
def get_recommendations(user_id, model, num_recommendations=5):
    # Get a list of all product_ids
    all_product_ids = reviews_data['product_id'].unique()
    
    # Predict ratings for all products for the given user
    user_recommendations = []
    for product_id in all_product_ids:
        predicted_rating = model.predict(user_id, product_id).est
        user_recommendations.append((product_id, predicted_rating))
    
    # Sort by predicted rating and return the top recommendations
    user_recommendations.sort(key=lambda x: x[1], reverse=True)
    
    # Get product names for the top recommendations
    recommendations_with_names = []
    for product_id, predicted_rating in user_recommendations[:num_recommendations]:
        product_name = get_product_name(product_id)
        recommendations_with_names.append((product_id, product_name, predicted_rating))
    
    return recommendations_with_names

In [18]:
# Example usage
user_id_example = '2190293206' 
recommendations = get_recommendations(user_id_example, model)

# Print each recommendation on a new line
for product_id, product_name, predicted_rating in recommendations:
    print(f"Product ID: {product_id}, Product Name: {product_name}, Predicted Rating: {predicted_rating:.2f}")


Product ID: P439055, Product Name: GENIUS Sleeping Collagen Moisturizer, Predicted Rating: 4.99
Product ID: P471038, Product Name: Glaze Lip Oil, Predicted Rating: 4.97
Product ID: P378852, Product Name: GinZing Ultra-Hydrating Energy-Boosting Cream, Predicted Rating: 4.96
Product ID: P404322, Product Name: ExfoliKate Cleanser Daily Foaming Wash, Predicted Rating: 4.95
Product ID: P379510, Product Name: Advanced Génifique Radiance Boosting Face Serum, Predicted Rating: 4.92


## MODEL PERFROMANCE

In [24]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Function to calculate precision, accuracy, and F1 score
def calculate_metrics(predictions):
    # Get the true ratings and predicted ratings
    y_true = [pred.r_ui for pred in predictions]
    y_pred = [round(pred.est) for pred in predictions]  # Assuming ratings are in integer values

    # Calculate precision, recall, and F1 score using 'macro' average for multiclass
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=0)  # Handle zero division
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=0)
    
    # Calculate RMSE for accuracy
    accuracy_score = accuracy.rmse(predictions)  # Already calculated RMSE
    
    return precision, recall, f1, accuracy_score

# Calculate metrics
precision, recall, f1, accuracy_score = calculate_metrics(predictions)

# Print metrics
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy (RMSE): {accuracy_score:.2f}")


RMSE: 0.6332
Precision: 0.80
Recall: 0.73
F1 Score: 0.74
Accuracy (RMSE): 0.63


In [1]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVDpp, NMF, KNNBaseline
from surprise.model_selection import train_test_split
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score


# Load data
reviews_data = pd.read_csv('/Users/yogesh/Study/Recommender_Systems/Reviews/combined_file.csv', dtype={'author_id': str, 'product_id': str, 'rating': np.float32})
product_info_data = pd.read_csv('/Users/yogesh/Study/Recommender_Systems/product_info.csv', dtype={'product_id': str})

  reviews_data = pd.read_csv('/Users/yogesh/Study/Recommender_Systems/Reviews/combined_file.csv', dtype={'author_id': str, 'product_id': str, 'rating': np.float32})


In [2]:
!pip install tqdm



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m


In [3]:
# Create a Reader object
reader = Reader(rating_scale=(1, 5))

# Load the dataset into surprise
data = Dataset.load_from_df(reviews_data[['author_id', 'product_id', 'rating']], reader)

# Split the dataset into training and test sets
trainset, testset = train_test_split(data, test_size=0.2)

In [4]:
# Base models
base_models = [
    ('svdpp', SVDpp()),
    ('nmf', NMF()),
    ('knn_baseline', KNNBaseline()),
]


In [5]:
# Prepare features for the stacking model
X_train = []
y_train = []

In [7]:
from tqdm import tqdm 

In [None]:
# Train each base model and collect predictions
for name, model in tqdm(base_models, desc="Training base models"):
    model.fit(trainset)
    predictions = model.test(testset)
    X_train.append([pred.est for pred in predictions])
    y_train.append([pred.r_ui for pred in predictions])

Training base models:   0%|                               | 0/3 [00:00<?, ?it/s]