In [None]:
!pip install tensorflow
!pip install numpy
!pip install matplotlib
!pip install pandas
!pip install scikit-learn
!pip install seaborn
!pip install jupyterlab
!pip install pillow
!pip install scipy
!pip install implicit
!pip install sentence_transformers
!pip install pydot graphviz

In [None]:
import pandas as pd
# First look at the raw reviews CSV; info() prints column dtypes and non-null counts.
dummy_df_df = pd.read_csv('GrammarandProductReviews.csv')
dummy_df_df.info()

In [None]:
# Preview the first rows of the raw frame.
dummy_df_df.head()

In [None]:
# Histogram of the 'reviews.rating' column.
dummy_df_df['reviews.rating'].hist()

In [None]:
# Summary statistics of the number of rows per username.
dummy_df_df['reviews.username'].value_counts().describe()

In [None]:
pip install nvidia-cuda-nvcc-cu12

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# tf.config.optimizer.set_jit(False)  # Disable XLA.

In [None]:
# Load the reviews CSV into the working frame `df` that the rest of the notebook mutates.
df = pd.read_csv(r'GrammarandProductReviews.csv')
df

In [None]:
# Fill missing text / user / manufacturer fields so later string formatting and id
# mapping never see NaN.
df['reviews.title'] = df['reviews.title'].fillna('No Title') # fills empty titles
df['reviews.text'] = df['reviews.text'].fillna('No Review')
df['reviews.username'] = df['reviews.username'].fillna('Akshat') # Can also assign it as anonymous user but for sake of info i'll pretend i bought these products

# Series.mode() returns a Series; passing it straight to fillna aligns on the index and
# would only fill the rows whose labels happen to exist in that Series (row 0).
# Use the scalar most frequent manufacturer instead.
manufacturer_modes = df['manufacturer'].mode()
if not manufacturer_modes.empty:  # empty only when every manufacturer is missing
    df['manufacturer'] = df['manufacturer'].fillna(manufacturer_modes.iloc[0]) # most occuring manufacturer

In [None]:
def _dense_index(values):
    """Map each value to its position within ``values``."""
    return {value: position for position, value in enumerate(values)}

# Dense integer ids for usernames and product ids, numbered in order of `unique()`.
user_map = _dense_index(df['reviews.username'].unique())
item_map = _dense_index(df['id'].unique())

In [None]:
# Attach the dense integer ids built from user_map / item_map and store ratings as float32.
df['user_id'] = df['reviews.username'].map(user_map)
df['prod_id'] = df['id'].map(item_map)
df['reviews.rating'] = df['reviews.rating'].astype(np.float32)

In [None]:
"""  Negative-sampling distribution: per-product row counts raised to `alpha_weight`, normalised to sum to 1.
NOTE(review): an earlier note said alpha = 0.75, but the code uses alpha_weight = 1.0 (raw popularity, no smoothing) - confirm which is intended. """
alpha_weight = 1.0
weighted_sum = (df['prod_id'].value_counts())**(alpha_weight)
normalized_prod_id = weighted_sum / weighted_sum.sum()
# Order by prod_id so position i corresponds to product id i.
normalized_prod_id = normalized_prod_id.sort_index()
normalized_prod_id

In [None]:
# normalized_prod_id is ordered by prod_id, so rank the probabilities from most to
# least popular first; otherwise the x-axis would not be a popularity rank.
popularity_desc = np.sort(np.asarray(normalized_prod_id))[::-1]
cdf = np.cumsum(popularity_desc)
plt.figure()
plt.plot(np.arange(1, len(cdf) + 1), cdf)
plt.xlabel("Item rank (sorted by popularity)")
plt.ylabel("Cumulative probability mass")
plt.title("CDF of Item Popularity Distribution")
plt.show()

In [None]:
# ---- 3. Histogram of log-probabilities (disabled: kept below as commented-out code) ----
from scipy.stats import norm
# eps = 1e-12
# log_probs = np.log(normalized_prod_id + eps)

# plt.figure()
# plt.hist(log_probs, bins=50, density=True)
# plt.xlabel("log(Probability)")
# plt.ylabel("Density")
# plt.title("Histogram of Log-Transformed Probabilities")
# plt.show()

# ---- 4. Normal (bell curve) fit on the raw probabilities (not the log-probabilities) ----
# The normal curve uses the sample mean / std of the sampling probabilities themselves.
mu, sigma = normalized_prod_id.mean(), normalized_prod_id.std()
x = np.linspace(normalized_prod_id.min(), normalized_prod_id.max(), 500)
pdf = norm.pdf(x, mu, sigma)

# Overlay the fitted density on a normalised histogram of the probabilities.
plt.figure()
plt.hist(normalized_prod_id,bins = 50, density=True, alpha=0.6, color='g')
plt.plot(x, pdf)
plt.xlabel("Probability")
plt.ylabel("Density")
plt.title("Normal Fit on Probabilities")
plt.show()

In [None]:
def generate_negative_prod_id_and_rating(normalized_prod_id):
    """Draw one negative (product id, rating) pair.

    Parameters
    ----------
    normalized_prod_id : pandas.Series or array-like of float
        Sampling probabilities that sum to 1. For a Series, the index labels are the
        candidate product ids; for a plain array, candidates are ``0 .. len - 1``.

    Returns
    -------
    tuple
        ``(negative_prod_id, negative_rating)`` where the rating is drawn uniformly
        from ``[0, 2)``.
    """
    probabilities = np.asarray(normalized_prod_id, dtype=float)
    # Derive the candidate ids from the distribution itself instead of assuming
    # exactly 600 products.
    if isinstance(normalized_prod_id, pd.Series):
        candidate_ids = normalized_prod_id.index.to_numpy()
    else:
        candidate_ids = np.arange(len(probabilities))

    negative_prod_id = np.random.choice(candidate_ids, p=probabilities)  # popularity-weighted draw
    negative_rating_of_prod_id = np.random.uniform(0, 2)
    return (negative_prod_id, negative_rating_of_prod_id)

In [None]:
# One representative row per product (the first occurrence), addressable by prod_id.
# Note that the row keeps every column of df, including that first review's own fields.
_first_row_per_product = df.drop_duplicates(subset="prod_id")
product_lookup = _first_row_per_product.set_index("prod_id")

In [None]:
""" Inserts k negative samples per record in the dataframe"""
NEGATIVE_SAMPLING_SEED = 42
np.random.seed(NEGATIVE_SAMPLING_SEED)  # make negative_samples.csv reproducible across runs

rows = []
k = 3

# Iterate over the plain columns: itertuples() cannot expose 'reviews.username' under
# its own name, and each negative row must carry the reviewer's own username.
for user_id, curr_prod_id, username in zip(df['user_id'], df['prod_id'], df['reviews.username']):

    for _ in range(k):
        while True:
            neg_prod_id, neg_rating = generate_negative_prod_id_and_rating(normalized_prod_id)
            if neg_prod_id != curr_prod_id: # Continue regenrating until we get a negative sample which is not the same as the positive one
                break

        # copy full product metadata
        prod_row = product_lookup.loc[neg_prod_id].to_dict()

        # product_lookup rows still hold the identity of that product's *first reviewer*.
        # Overwrite both identity fields so user_id and reviews.username agree; later
        # cells group by and look up users through the username.
        # NOTE(review): reviews.title / reviews.text are still the first reviewer's text.
        prod_row.update({
            "user_id": user_id,
            "reviews.username": username,
            "reviews.rating": neg_rating,
            "is_negative": True
        })

        rows.append(prod_row)

negative_samples_df = pd.DataFrame(rows)
negative_samples_df.to_csv("negative_samples.csv", index=False)
negative_samples_df.shape

In [None]:
# Mark the original reviews as positives and append the synthetic negatives.
# NOTE: do not re-run this cell after `df` has been replaced by final_df further down;
# it would relabel every row as a positive.
df = df.copy()
df['is_negative'] = False

final_df = pd.concat(
    [df, negative_samples_df],
    ignore_index=True
)
# Seeded shuffle so the row order (and everything derived from it) is reproducible.
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True) # shuffle the dataframe
# Negative rows were built from product_lookup, where prod_id is the index and therefore
# missing from the copied columns; rebuild it for all rows from the product 'id'.
final_df['prod_id'] = final_df['id'].map(item_map)
final_df.shape

In [None]:
# Persist the combined frame, then make a copy of it the working `df`.
# From here on `df` contains the synthetic negative rows as well.
final_df.to_csv("final_df.csv", index=False)
df = final_df.copy()
df.shape

In [None]:
# Column dtypes and non-null counts of the combined frame.
df.info()

In [None]:
# Show any rows whose username is still missing.
df[df['reviews.username'].isna()]['reviews.username']

In [None]:
# Rows per username in the combined frame.
df['reviews.username'].value_counts()

In [None]:
""" Keep only users with at least 2 rows (the threshold is the `>= 2` below).
NOTE(review): an earlier note said 5 reviews - confirm the intended threshold. """
# Counts are taken on the combined frame, so synthetic negative rows are included.
user_review_counts = df['reviews.username'].value_counts()
active_users = user_review_counts[user_review_counts >= 2].index
# filtered_df is only inspected here; no later cell in view reads it.
filtered_df = df[df['reviews.username'].isin(active_users)]
filtered_df['reviews.username'].value_counts()

In [None]:
# List the distinct usernames.
df['reviews.username'].unique()

In [None]:
# Column names of the combined frame.
df.columns

In [None]:
# from sklearn.preprocessing import LabelEncoder

# l_encode = LabelEncoder()
# df['user_id'] = l_encode.fit_transform(df['reviews.username']) # gives each user a user_id

In [None]:
# Check the dtype of the rating column.
df['reviews.rating'].dtype

In [None]:
# Summary statistics of the ratings, positives and negatives together.
final_df['reviews.rating'].describe()

In [None]:
# Scratch cell: a bare numeric literal, evaluating it just echoes the value.
6.657402e-01

In [None]:
""" This cell and the next one are for experimentation only. """

import pandas as pd

# Create a dictionary of data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, 30, 35, 40],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston']
}

# Toy frame used to try out row assignment in the next cell.
dummy_df = pd.DataFrame(data)
dummy_df


In [None]:
# Overwrites the row labelled 0 of the toy frame with the values from a dict.
dummy_df.loc[0] = {'Name':'Akshat', 'Age' : 21, 'City' : 'Sonipat'} # experimenting for adding new values per row
dummy_df

In [None]:
# Summary statistics of rows per product in the combined frame.
df['prod_id'].value_counts().describe()

In [None]:
# Last rows of the combined frame.
df.tail()

In [None]:
# Total number of rows, positives plus negatives.
len(df)

In [None]:
# Rows per product id.
df['prod_id'].value_counts()

In [None]:
# implicit's ALS treats every stored entry as an observed interaction weighted by its
# value. The synthetic negative rows are not real interactions, so build the
# user x product table from the real reviews only.
observed_reviews = df.loc[~df['is_negative'].astype(bool)]
als_df = (
    observed_reviews
    .groupby(['user_id', 'prod_id'], as_index=False)
    ['reviews.rating']
    .mean()
)
als_df # Used for aggregating ratings of each user as mean , suppose a user 0 bought 5 items out of which 2 are repeat so we aggregate the rating to not confuse ALS

In [None]:
# Aggregated rows for user 0.
als_df[als_df['user_id'] == 0] # how much user 0 liked each product , 

In [None]:
from scipy.sparse import coo_matrix
import numpy as np

# Coordinates (user, product) and values (mean rating) of the sparse interaction matrix.
rows = als_df['user_id'].to_numpy()
cols = als_df['prod_id'].to_numpy()
values = als_df['reviews.rating'].to_numpy()

# Fix the shape from the id maps instead of letting scipy infer it from the largest
# index present, so the factor rows always line up with user_map / item_map ids.
c_mat = coo_matrix((values, (rows, cols)), shape=(len(user_map), len(item_map)))
c_mat = c_mat.tocsr()

In [None]:
# Matrix dimensions and number of stored entries.
print(c_mat.shape)
print(c_mat.nnz)

In [None]:
import implicit

# 64 latent factors: later cells build 64-wide ALS inputs for the Keras towers.
# random_state makes the factorisation, and every feature derived from it, reproducible.
als = implicit.als.AlternatingLeastSquares(factors = 64, random_state = 42)
als.fit(c_mat)

In [None]:
# Ask ALS for items similar to four hand-picked product ids and keep element [0] of the result.
# NOTE(review): presumably [0] is the array of item ids - confirm against the installed implicit version.
als.similar_items([18, 25, 76,100])[0]

In [None]:
# Invert item_map (dense prod_id -> original product 'id') and look up one hand-picked id.
reverse_item_map = {v : k for k,v in item_map.items()}
reverse_item_map[518]

In [None]:
# Inspect the rows of one hard-coded product id.
df[df['id'] == 'AVpidjeH1cnluZ0-Nf75'] # Sample prediction from pure ALS (weak signals)

In [None]:
# Shape of the learned user factor matrix.
als.user_factors.shape

In [None]:
# Shape of the learned item factor matrix.
als.item_factors.shape

In [None]:
# Short aliases for the ALS factor matrices; later cells index them by user_id / prod_id.
user_als = als.user_factors
product_als = als.item_factors

In [None]:
from sentence_transformers import SentenceTransformer

# Load a production-ready BERT model
# NOTE: the name `model` is rebound to the Keras network in a later cell.
model = SentenceTransformer('all-MiniLM-L6-v2') 

# Two sample reviews used only to check the encoder output.
reviews = ["This product is amazing, high quality!", "Terrible experience, broke in a week."]

# Encode the sample reviews into one vector each.
embeddings = model.encode(reviews)

print(embeddings.shape) # (2, 384)

In [None]:
from tqdm import tqdm

def generate_embeddings_for_review_title(review_titles,review_texts,model = 'all-MiniLM-L6-v2'):
    """Encode each review (title + text) into a sentence embedding.

    Parameters
    ----------
    review_titles, review_texts : iterables of str
        Paired element-wise; iteration stops at the shorter one.
    model : str or encoder object
        A SentenceTransformer model name (loaded here), or an already loaded encoder
        exposing ``encode(texts, ...)`` so callers can avoid reloading the weights.

    Returns
    -------
    Whatever ``encode`` returns for the list of concatenated texts.
    """
    encoder = SentenceTransformer(model) if isinstance(model, str) else model
    concat_text = [
        f'Review Title : {review_title} \n Review : {review_text}'
        for review_title, review_text in zip(review_titles, review_texts)
    ]
    # Building the strings is instant; report progress on the encoding, which is the slow part.
    return encoder.encode(concat_text, show_progress_bar=True)

In [None]:
# Column names before the embedding columns are added.
df.columns

In [None]:
# Dtypes and non-null counts before the embedding columns are added.
df.info()

In [None]:
# Snapshot of df at this point; this rebinds `dummy_df`, which previously held the toy frame.
dummy_df = df.copy()

In [None]:
# Encode the title + text of every row of df (synthetic negative rows included).
all_review_embeddings = generate_embeddings_for_review_title(df['reviews.title'],df['reviews.text'])

In [None]:
# Shape of the review embedding output.
all_review_embeddings.shape

In [None]:
# Store one embedding per row; list() splits the result along its first axis.
df['review_embeddings'] = list(all_review_embeddings)

In [None]:
# Scratch: a list holding a 32-zero list followed by a scalar.
l = []
l.append([0] * 32)
l.append(1)
print(l)

In [None]:
# Scratch: `+` on two lists concatenates them.
l = [1,2,3]
l2 = [4,5,6]
l + l2

In [None]:
def running_mean_previous_embeddings(embeddings):
    """Return, for each position, the mean of all embeddings that came before it.

    Parameters
    ----------
    embeddings : iterable of array-like
        Same-shaped vectors, in the order they should be accumulated.

    Returns
    -------
    list of numpy.ndarray
        One array per input. The first is all zeros (no history yet); element ``i``
        is the mean of inputs ``0 .. i-1``. Empty input gives an empty list.
    """
    result = []
    count = 0
    running_sum = None  # allocated from the first embedding, so any dimension works

    for emb in embeddings:
        emb = np.asarray(emb)
        if running_sum is None:
            running_sum = np.zeros(emb.shape, dtype=np.float32)
            result.append(np.zeros_like(emb))
        else:
            # The division allocates a new array, so later in-place updates of
            # running_sum do not alter results already stored.
            result.append(running_sum / count)

        running_sum += emb
        count += 1

    return result

In [None]:
# Per-user history: for each row, the mean embedding of that user's earlier rows (in
# current row order). Group by user_id rather than the username: the two agree on real
# reviews, but only user_id is set explicitly on the synthetic negative rows.
res = df.groupby(by = 'user_id')['review_embeddings'].transform(running_mean_previous_embeddings)
res

In [None]:
# Index labels of the rows whose username is 'Mike', kept for the manual check below.
sample_lists = list(df[df['reviews.username'] == 'Mike']['user_id'].index)

In [None]:
# embs = df.loc[sample_lists, "review_embeddings"].values

# emb0 = embs[0]
# emb1 = embs[1]

# np.allclose((emb0 + emb1) / 2, c, atol=1e-6)

In [None]:
# Store the per-row history means as a user feature column.
df['average_mean_review_embeddings'] = res

In [None]:
# Mean over all rows of the per-row *running-mean* embeddings, not of the raw review
# embeddings; the all-zero vectors of each user's first row are part of this mean.
# NOTE(review): this relies on pandas reducing an object column of arrays - confirm it
# yields a single vector on the installed pandas version.
GLOBAL_BERT_EMBEDDING_AVERAGE = df['average_mean_review_embeddings'].mean() # Average of all BERT reviews globally

In [None]:
# Rows per username.
df['reviews.username'].value_counts()

In [None]:
""" 
Till here we have completed the preprocessing required for User pipeline
"""

In [None]:
def generate_embeddings_for_product_description(names,brands,categories,manufacturers,model = 'all-MiniLM-L6-v2'):
    """Encode each product's metadata into a sentence embedding.

    Parameters
    ----------
    names, brands, categories, manufacturers : iterables
        Paired element-wise; iteration stops at the shortest one.
    model : str or encoder object
        A SentenceTransformer model name (loaded here), or an already loaded encoder
        exposing ``encode(texts, ...)`` so callers can avoid reloading the weights.

    Returns
    -------
    Whatever ``encode`` returns for the list of concatenated descriptions.
    """
    encoder = SentenceTransformer(model) if isinstance(model, str) else model
    concat_text = [
        f'Name of product : {name} \n Brand of product : {brand} \n category of product : {category} \n manufacturer of product : {manufacturer}'
        for name, brand, category, manufacturer in zip(names, brands, categories, manufacturers)
    ]
    # Building the strings is instant; report progress on the encoding, which is the slow part.
    return encoder.encode(concat_text, show_progress_bar=True)

In [None]:
# Example 'categories' value (row labelled 0).
df['categories'].loc[0]

In [None]:
# Encode each product once and broadcast the vector back to its rows, instead of
# re-encoding the same description for every review of that product.
# Assumes name/brand/categories/manufacturer are constant per prod_id; the inference
# cells further down likewise take the first row's embedding per product.
_unique_products = df.drop_duplicates(subset='prod_id')
_unique_prod_embeddings = np.asarray(
    generate_embeddings_for_product_description(
        _unique_products['name'],
        _unique_products['brand'],
        _unique_products['categories'],
        _unique_products['manufacturer'],
    )
)
# Position of each row's product within the de-duplicated frame.
_row_to_unique = pd.Index(_unique_products['prod_id']).get_indexer(df['prod_id'])
embeddings_for_prod = _unique_prod_embeddings[_row_to_unique]
embeddings_for_prod

In [None]:
# Store one product embedding per row; list() splits the array along its first axis.
df['product_embeddings'] = list(embeddings_for_prod)

In [None]:
# Column names after adding the embedding columns.
df.columns

In [None]:
""" Here we conclude pre-processing for the product tower  """

In [None]:
""" Dealing with anonymous users: replace their running-mean review embeddings with the global mean, since a shared anonymous username does not identify a single reviewer. """

In [None]:
# The ten usernames with the most rows; used to spot the shared "anonymous" accounts.
df['reviews.username'].value_counts().head(10)

In [None]:
# Usernames shared by many unidentified customers.
ANONYMOUS_USERNAMES = ('Anonymous', 'An anonymous customer', 'ByAmazon Customer')
# Resolve them to user ids once and test membership on user_id: it is the identity
# field set explicitly on every row, including the synthetic negatives.
_anonymous_user_ids = {user_map[name] for name in ANONYMOUS_USERNAMES if name in user_map}
_is_anonymous = df['user_id'].isin(_anonymous_user_ids)

df['average_mean_review_embeddings'] = pd.Series(
    [
        GLOBAL_BERT_EMBEDDING_AVERAGE if anonymous else embedding
        for anonymous, embedding in zip(_is_anonymous, df['average_mean_review_embeddings'])
    ],
    index=df.index,
    dtype=object,  # keep one array per cell
)

""" Averages the mean review embeddings for anonymous users to global average  """

In [None]:
# Usernames shared by many unidentified customers.
ANONYMOUS_USERNAMES = ('Anonymous', 'An anonymous customer', 'ByAmazon Customer')
_anonymous_user_ids = {user_map[name] for name in ANONYMOUS_USERNAMES if name in user_map}
# Take the width and dtype from the factor matrix instead of hard-coding 64 / float64.
_n_user_factors = user_als.shape[1]

# Index the factors by user_id directly: going through the username would pick the
# wrong user wherever the username column disagrees with user_id.
df['user_ALS_embedding'] = df['user_id'].apply(
    lambda user_id: np.zeros(_n_user_factors, dtype=user_als.dtype)
    if user_id in _anonymous_user_ids
    else user_als[user_id]
) # type: ignore
df['product_ALS_embedding'] = df['prod_id'].apply(lambda prod_id : product_als[prod_id]) # type: ignore

""" Creates ALS embeddings for users and products  with anonymous users having zero embeddings """

In [None]:
# Usernames shared by many unidentified customers.
ANONYMOUS_USERNAMES = ('Anonymous', 'An anonymous customer', 'ByAmazon Customer')
_anonymous_user_ids = {user_map[name] for name in ANONYMOUS_USERNAMES if name in user_map}
# Vectorised 0/1 flag keyed on user_id, the identity field set explicitly on every row.
df['is_anonymous_user'] = df['user_id'].isin(_anonymous_user_ids).astype('int64')
""" Creates a binary feature indicating if user is anonymous """

In [None]:
# Spot-check the features assigned to rows flagged as anonymous.
df[df['is_anonymous_user'] == 1][['reviews.username','is_anonymous_user','user_ALS_embedding','average_mean_review_embeddings']].head(10)

In [None]:
"""Now we will write the deep learning two tower model in tensorflow  """

In [None]:
# Display the per-row user ALS vectors.
df['user_ALS_embedding']

In [None]:
# Display the per-row product ALS vectors.
df['product_ALS_embedding']

In [None]:
# Column names after feature construction.
df.columns

In [None]:
# The two vector-valued user features, viewed side by side.
user_tower_df = df[['user_ALS_embedding','average_mean_review_embeddings']]
user_tower_df

In [None]:
# Shape of one user ALS vector (row labelled 0).
user_tower_df['user_ALS_embedding'][0].shape

In [None]:
# Shape of one user history embedding (row labelled 0).
user_tower_df['average_mean_review_embeddings'][0].shape

In [None]:
# Stack the per-row vectors of each feature column into one 2-D matrix per feature.
user_tower_als_matrix = np.stack(df['user_ALS_embedding'].values)
user_tower_bert_matrix = np.stack(df['average_mean_review_embeddings'].values)
product_tower_als_matrix = np.stack(df['product_ALS_embedding'].values)
product_tower_bert_matrix = np.stack(df['product_embeddings'].values)
# The anonymity flag stays 1-D: one scalar per row.
is_anonymous_user_array = df['is_anonymous_user'].values
# Recorded shapes from a previous run: (71044, 64) for ALS and (71044, 384) for BERT - TODO confirm for the current data.

In [None]:
# Regression target: the rating of each row (synthetic negatives carry sampled low ratings).
labels = df['reviews.rating'] # labels such as ratings (0 - 5 continuous)

In [None]:
# Feature dict; the keys are the same strings used as `name=` on the keras.Input
# layers defined further down.
features = {
    "user_ALS_Embedding": user_tower_als_matrix,
    "user_BERT_Embedding": user_tower_bert_matrix,
    "is_anonymous_user": is_anonymous_user_array,
    "product_ALS_Embedding": product_tower_als_matrix,
    "product_BERT_Embedding": product_tower_bert_matrix
}

# One dataset element per row: (feature dict, rating).
full_dataset = tf.data.Dataset.from_tensor_slices((features, labels)) # Tensorflow dataset

In [None]:
# Print the first dataset element as a sanity check.
for data in full_dataset.take(1):
    print(data) 

In [None]:
BATCH_SIZE = 32
# Shuffle individual examples *before* batching. Shuffling after batching only permutes
# whole batches, so the same 32 rows would always be trained together.
# The buffer covers the same number of examples as the previous 1024-batch buffer.
SHUFFLE_BUFFER = 1024 * BATCH_SIZE

train_dataset = (
    full_dataset
    .shuffle(buffer_size=SHUFFLE_BUFFER)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [None]:
import os
# Sets the XLA_FLAGS environment variable for this process before training starts.
# NOTE(review): presumably a workaround for a missing `ptxas` binary on this machine - confirm it is still needed.
os.environ['XLA_FLAGS'] = '--xla_gpu_unsafe_fallback_to_driver_on_ptxas_not_found'
# Now run your training
# Now run your training

In [None]:
""" Below is the architecture of the User tower """

In [None]:
import keras

# User tower inputs; the `name=` values match the keys of the `features` dict.
user_input_1 = keras.Input(shape=(64,), name='user_ALS_Embedding')
user_input_2 = keras.Input(shape=(384,), name='user_BERT_Embedding')
user_input_3 = keras.Input(shape=(1,), name='is_anonymous_user')
concatenated = keras.layers.Concatenate()([user_input_1, user_input_2, user_input_3]) #449 dims
# SELU stack 256 -> 128 -> 64, with lecun_normal init and AlphaDropout.
user_dense_1 = keras.layers.Dense(256, activation='selu', kernel_initializer='lecun_normal')(concatenated)
user_dense_2 = keras.layers.Dense(128, activation='selu', kernel_initializer='lecun_normal')(user_dense_1)
user_dropout_output = keras.layers.AlphaDropout(0.1, name='user_dropout')(user_dense_2)
# 64-dimensional user representation.
user_tower_output = keras.layers.Dense(64, activation='selu', name='User_Tower_Output', kernel_initializer='lecun_normal')(user_dropout_output)

In [None]:
""" Below is architecture of Product tower """

In [None]:
# Product tower inputs; the `name=` values match the keys of the `features` dict.
product_input_1 = keras.Input(shape=(64,), name='product_ALS_Embedding')
product_input_2 = keras.Input(shape=(384,), name='product_BERT_Embedding')
product_concatenated = keras.layers.Concatenate()([product_input_1, product_input_2]) #448 dims
# Same 256 -> 128 -> 64 SELU stack as the user tower.
product_dense_1 = keras.layers.Dense(256, activation='selu', kernel_initializer='lecun_normal')(product_concatenated)
product_dense_2 = keras.layers.Dense(128, activation='selu', kernel_initializer='lecun_normal')(product_dense_1)
product_dropout_output = keras.layers.AlphaDropout(0.1, name='product_dropout')(product_dense_2)
# 64-dimensional product representation.
product_tower_output = keras.layers.Dense(64, activation='selu', name='Product_Tower_Output', kernel_initializer='lecun_normal')(product_dropout_output)

In [None]:
# Final Combined Layers: concatenate both tower outputs and regress the rating.

combined = keras.layers.Concatenate()([user_tower_output, product_tower_output]) # 128 dims
final_dense_1 = keras.layers.Dense(64, activation='selu', kernel_initializer='lecun_normal')(combined)
final_dropout_1 = keras.layers.AlphaDropout(0.1, name='final_dropout')(final_dense_1)
# This Dense layer used to be named 'final_dropout', which mislabelled it in
# model.summary() / plot_model. NOTE: weight files saved under the old layer names may
# need re-saving.
final_dense_2 = keras.layers.Dense(32, activation='selu', name='final_dense_2', kernel_initializer='lecun_normal')(final_dropout_1)
# Linear output: the predicted rating is unbounded.
final_output = keras.layers.Dense(1, activation='linear', name='Final_Output')(final_dense_2)

In [None]:
# Functional model over both towers; this rebinds `model`, which earlier held the sentence encoder.
model = keras.Model(inputs=[user_input_1, user_input_2,user_input_3, product_input_1, product_input_2], outputs=final_output)

In [None]:
# Layer-by-layer overview of the combined model.
model.summary()

In [None]:
from keras.utils import plot_model

# Render the layer graph with tensor shapes and layer names.
plot_model(
    model,
    show_shapes=True,
    show_layer_names=True,
    expand_nested=True,
    dpi=320
)

In [None]:
# Rating regression: MSE loss, MAE reported as a metric, Nadam optimizer.
model.compile(optimizer='nadam', loss='mean_squared_error', metrics=['mae'])

In [None]:
# labels = df['reviews.rating'].values.astype('float32')
# labels

In [None]:
# Train for 5 epochs on the full dataset; no validation data is passed here.
history = model.fit(train_dataset, epochs=5)

In [None]:
# Save only the weights. This model is built with the functional API, so the full-model
# save in the next cell works as well.
model.save_weights('my_two_tower_model_v6_power.weights.h5')
print("SAFE! Weights saved successfully.")

In [None]:
# Save the complete model in the .keras format.
model.save('two_tower_recsys_v6_power.keras')

In [None]:
# Display the frame with all feature columns.
df

In [None]:
""" Make 2 seperate dictionaries for each product id mapping to its bert and als embeddings  """
# Take the first row of every product in one pass instead of re-filtering the whole
# frame once per product; the result is the same first-row embedding per prod_id.
_first_row_per_product = df.drop_duplicates(subset='prod_id')
bert_product_embedding_dict = dict(
    zip(_first_row_per_product['prod_id'], _first_row_per_product['product_embeddings'])
)
als_product_embedding_dict = dict(
    zip(_first_row_per_product['prod_id'], _first_row_per_product['product_ALS_embedding'])
)

In [None]:
""" A dictionary mapping each product id to its corresponding forward pass through the product tower """
# One row per product: the first occurrence, as in the per-product dictionaries above.
_first_row_per_product = df.drop_duplicates(subset='prod_id')
_tower_product_ids = _first_row_per_product['prod_id'].to_numpy()

# Batch all product inputs so the tower runs once instead of once per product.
product_als_input = np.stack(_first_row_per_product['product_ALS_embedding'].to_numpy()).astype(np.float32)
product_bert_input = np.stack(_first_row_per_product['product_embeddings'].to_numpy()).astype(np.float32)

# Sub-model sharing the trained product-tower layers; built once, not inside a loop.
product_tower_model = keras.Model(
    inputs=[product_input_1, product_input_2],
    outputs=product_tower_output
)

_tower_outputs = product_tower_model.predict(
    [product_als_input, product_bert_input],
    batch_size=1024,
)
product_tower_forward_pass_embedding_dict = {
    prod_id: tower_embedding
    for prod_id, tower_embedding in zip(_tower_product_ids, _tower_outputs)
}

In [None]:
# Display the precomputed product-tower outputs.
product_tower_forward_pass_embedding_dict #precomputed product tower embeddings for each product id

In [None]:
""" Now we will take a list of user_ids and product_ids and then recommend top k products for each user based on concatenating precomputed product tower embeddings and user tower embeddings and then passing through final dense layers to get the predicted rating and then recommending top k products with highest predicted ratings for each user  """
""" Also since ALS embeddings are same for each user BERT embeddings differ , so we will use the last BERT embedding of the user as user embedding and then concatenate with product tower embedding and then pass through final dense layers to get predicted rating for that product and then recommend top k products with highest predicted ratings for each user  """

import numpy as np

def predict_for_users_batched(user_ids, top_k=5, product_ids=None):
    """Score candidate products for each user and return the highest-rated ones.

    For every user, the features of that user's last row in ``df`` are tiled against
    all candidate products and passed through the full two-tower model in one
    ``predict`` call.

    Parameters
    ----------
    user_ids : iterable
        Values of ``df['user_id']`` to produce recommendations for.
    top_k : int, default 5
        Number of products to return per user; 0 or a negative value returns none.
    product_ids : sequence, optional
        Candidate product ids (keys of the product embedding dictionaries).
        Defaults to every product id present in ``df``.

    Returns
    -------
    dict
        ``{user_id: [(product_id, predicted_rating), ...]}``, best first.

    Raises
    ------
    IndexError
        If a user id has no rows in ``df``.

    Notes
    -----
    Candidates are not filtered, so products the user has already reviewed can be
    recommended.
    """
    if product_ids is None:
        product_ids = df['prod_id'].unique()
    # Positions from argsort are used to index product_ids; an ndarray guarantees
    # positional indexing whatever container the caller passed.
    product_ids = np.asarray(product_ids)
    num_products = len(product_ids)
    top_k = max(int(top_k), 0)

    if num_products == 0 or top_k == 0:
        return {user_id: [] for user_id in user_ids}

    recommendations = {}

    model_to_use = keras.Model(
        inputs=[user_input_1, user_input_2, user_input_3, product_input_1, product_input_2],
        outputs=final_output
    )

    # Pre-bundle Product Embeddings
    p_als_matrix = np.array([als_product_embedding_dict[p] for p in product_ids])
    p_bert_matrix = np.array([bert_product_embedding_dict[p] for p in product_ids])

    for user_id in user_ids:
        # Get the last review row for this user
        user_rows = df[df['user_id'] == user_id]
        if user_rows.empty:
            raise IndexError(f"user_id {user_id!r} has no rows in df; cannot build user features")
        user_data = user_rows.iloc[-1]

        # Tile User Data
        u_als_batch = np.tile(user_data['user_ALS_embedding'], (num_products, 1))
        u_bert_batch = np.tile(user_data['average_mean_review_embeddings'], (num_products, 1))
        u_anon_batch = np.tile([user_data['is_anonymous_user']], (num_products, 1))

        # Single Batch Prediction
        predicted_ratings = model_to_use.predict(
            [u_als_batch, u_bert_batch, u_anon_batch, p_als_matrix, p_bert_matrix],
            batch_size=1024, # Higher batch size is usually faster for inference
            verbose=0, # one progress bar per user would flood the output
        )

        scores = predicted_ratings.flatten()
        # Reverse first, then slice: `[-top_k:]` would return everything for top_k == 0.
        top_indices = np.argsort(scores)[::-1][:top_k]

        # Map indices back to product IDs
        recommendations[user_id] = [(product_ids[i], scores[i]) for i in top_indices]

    return recommendations

In [None]:
# Top-3 recommendations for the first six user ids.
predict_for_users_batched([0,1,2,3,4,5], top_k=3)

In [None]:
# The ten product ids with the most rows (synthetic negatives included).
df['prod_id'].value_counts().head(10)

In [None]:
# Summary statistics of the training target.
df['reviews.rating'].describe()