In [1]:
import pandas as pd
import numpy as np
import ast
import random

In [2]:
# Load interaction data (user-item interactions with implicit feedback signals)
interactions = pd.read_csv('interaction_table_2.csv')
# Load user metadata (user country and supported languages)
users_df = pd.read_csv('user_table_2.csv')
# Load item metadata (item country and language)
posts_df = pd.read_csv('post_table_2.csv')

In [3]:
interactions.head(5)

Unnamed: 0,user_id,post_id,likes,views,saves
0,10112,2649,0,1,0
1,10112,2655,0,2,0
2,10112,2702,0,1,0
3,10112,2707,0,1,0
4,10112,2827,0,1,0


In [4]:
users_df.head(5)

Unnamed: 0,user_id,country,supported_languages
0,10114,US,"[""en"", ""es"", ""haw"", ""fr""]"
1,10000000686,CA,"[""en"", ""fr"", ""iu""]"
2,10000000942,CA,"[""en"", ""fr"", ""iu""]"
3,10000001007,MV,"[""dv"", ""en""]"
4,10000001189,CA,"[""en"", ""fr"", ""iu""]"


In [5]:
posts_df.head(5)

Unnamed: 0,post_id,post_owner_id,effective_country,lang
0,10,10000000942,CA,en
1,17,10000000942,CA,en
2,18,100000337699,BM,en
3,19,100000337699,BM,en
4,22,10000014174,CA,en


In [6]:
print("Interactions shape:", interactions.shape)

Interactions shape: (31189, 5)


In [7]:
print("Users Table shape:",users_df.shape)

Users Table shape: (436, 3)


In [8]:
print("Post Table shape:",posts_df.shape)

Post Table shape: (1870, 4)


In [9]:
# We will use 'country' and 'primary_lang' as user features in the model.

users_df['primary_lang'] = users_df['supported_languages'].apply(
    lambda x: ast.literal_eval(x)[0] if pd.notnull(x) else None
)

In [10]:
users_df.head(5)

Unnamed: 0,user_id,country,supported_languages,primary_lang
0,10114,US,"[""en"", ""es"", ""haw"", ""fr""]",en
1,10000000686,CA,"[""en"", ""fr"", ""iu""]",en
2,10000000942,CA,"[""en"", ""fr"", ""iu""]",en
3,10000001007,MV,"[""dv"", ""en""]",dv
4,10000001189,CA,"[""en"", ""fr"", ""iu""]",en


In [11]:
# We will use 'effective_country' as item country and 'lang' as item language features.

posts_df.rename(columns={'effective_country': 'item_country', 'lang': 'item_lang'}, inplace=True)

In [12]:
# We will use 'effective_country' as item country and 'lang' as item language features.

posts_df.rename(columns={'effective_country': 'item_country', 'lang': 'item_lang'}, inplace=True)

In [13]:
print("Unique user countries:", users_df['country'].unique())
print("Unique user primary languages:", users_df['primary_lang'].unique())
print("Unique item countries:", posts_df['item_country'].unique())
print("Unique item languages:", posts_df['item_lang'].unique())

Unique user countries: ['US' 'CA' 'MV' 'CN' 'BM' 'IN' 'ES' 'BY' 'PL' 'DZ' 'AS' 'SA' 'GB' 'VN'
 'AE' 'PA' 'GY' 'IQ' 'TW' 'QA' 'UA' 'ZW']
Unique user primary languages: ['en' 'dv' 'zh' 'es' 'be' 'pl' 'ar' 'vi' 'uk']
Unique item countries: ['CA' 'BM' 'IN' 'US' 'BY' 'DE' 'AE' 'PL' 'FR' 'ES' 'GB' 'RO' 'NZ']
Unique item languages: ['en' 'ar' 'hi' 'es' 'fr' 'pl' 'de']


In [14]:
user_to_country = pd.Series(users_df.country.values, index=users_df.user_id).to_dict()
user_to_lang    = pd.Series(users_df.primary_lang.values, index=users_df.user_id).to_dict()
item_to_country = pd.Series(posts_df.item_country.values, index=posts_df.post_id).to_dict()
item_to_lang    = pd.Series(posts_df.item_lang.values, index=posts_df.post_id).to_dict()

In [15]:
user_to_country

{10114: 'US',
 10000000686: 'CA',
 10000000942: 'CA',
 10000001007: 'MV',
 10000001189: 'CA',
 10000001551: 'CA',
 10000007905: 'US',
 10000009679: 'CA',
 10000009919: 'CA',
 10000010321: 'CA',
 10000014174: 'CA',
 10000014265: 'CA',
 10000033459: 'CA',
 100000337582: 'CN',
 100000337699: 'BM',
 100000337707: 'CA',
 100000337715: 'CA',
 100000337756: 'CA',
 100000337764: 'CA',
 100000337780: 'US',
 100000337798: 'CN',
 100000338069: 'IN',
 100000338143: 'US',
 100000338226: 'US',
 100000338242: 'IN',
 100000338267: 'IN',
 100000338317: 'IN',
 100000338382: 'IN',
 100000338606: 'US',
 100000338630: 'US',
 100000338945: 'US',
 100000339034: 'US',
 100000339042: 'US',
 100000339059: 'IN',
 100000339083: 'CA',
 100000339109: 'CA',
 100000339141: 'IN',
 100000339158: 'IN',
 100000339281: 'US',
 100000339299: 'ES',
 100000339323: 'US',
 100000339331: 'IN',
 100000339349: 'IN',
 100000339364: 'US',
 100000339372: 'US',
 100000339380: 'IN',
 100000339448: 'BY',
 100000339455: 'US',
 1000003394

In [16]:
user_to_lang

{10114: 'en',
 10000000686: 'en',
 10000000942: 'en',
 10000001007: 'dv',
 10000001189: 'en',
 10000001551: 'en',
 10000007905: 'en',
 10000009679: 'en',
 10000009919: 'en',
 10000010321: 'en',
 10000014174: 'en',
 10000014265: 'en',
 10000033459: 'en',
 100000337582: 'zh',
 100000337699: 'en',
 100000337707: 'en',
 100000337715: 'en',
 100000337756: 'en',
 100000337764: 'en',
 100000337780: 'en',
 100000337798: 'zh',
 100000338069: 'en',
 100000338143: 'en',
 100000338226: 'en',
 100000338242: 'en',
 100000338267: 'en',
 100000338317: 'en',
 100000338382: 'en',
 100000338606: 'en',
 100000338630: 'en',
 100000338945: 'en',
 100000339034: 'en',
 100000339042: 'en',
 100000339059: 'en',
 100000339083: 'en',
 100000339109: 'en',
 100000339141: 'en',
 100000339158: 'en',
 100000339281: 'en',
 100000339299: 'es',
 100000339323: 'en',
 100000339331: 'en',
 100000339349: 'en',
 100000339364: 'en',
 100000339372: 'en',
 100000339380: 'en',
 100000339448: 'be',
 100000339455: 'en',
 1000003394

In [17]:
item_to_country

{10: 'CA',
 17: 'CA',
 18: 'BM',
 19: 'BM',
 22: 'CA',
 37: 'BM',
 38: 'BM',
 39: 'BM',
 51: 'CA',
 53: 'CA',
 56: 'CA',
 66: 'CA',
 67: 'BM',
 68: 'CA',
 76: 'IN',
 92: 'CA',
 94: 'CA',
 114: 'CA',
 116: 'CA',
 133: 'CA',
 137: 'US',
 138: 'US',
 155: 'CA',
 156: 'US',
 192: 'CA',
 193: 'CA',
 196: 'CA',
 197: 'CA',
 216: 'CA',
 232: 'US',
 250: 'CA',
 254: 'US',
 262: 'IN',
 266: 'IN',
 268: 'CA',
 272: 'CA',
 283: 'US',
 331: 'CA',
 335: 'US',
 372: 'CA',
 548: 'IN',
 549: 'IN',
 552: 'IN',
 556: 'IN',
 557: 'IN',
 564: 'IN',
 565: 'IN',
 566: 'IN',
 567: 'IN',
 577: 'IN',
 579: 'IN',
 580: 'IN',
 584: 'IN',
 585: 'IN',
 587: 'IN',
 589: 'IN',
 591: 'IN',
 592: 'IN',
 593: 'IN',
 594: 'IN',
 596: 'IN',
 597: 'IN',
 598: 'IN',
 599: 'IN',
 600: 'CA',
 601: 'CA',
 604: 'IN',
 613: 'CA',
 614: 'CA',
 615: 'CA',
 616: 'IN',
 617: 'IN',
 618: 'IN',
 625: 'CA',
 626: 'CA',
 627: 'CA',
 628: 'CA',
 629: 'CA',
 630: 'CA',
 632: 'CA',
 636: 'CA',
 638: 'IN',
 639: 'CA',
 640: 'CA',
 644: 'CA

In [18]:
item_to_lang

{10: 'en',
 17: 'en',
 18: 'en',
 19: 'en',
 22: 'en',
 37: 'en',
 38: 'en',
 39: 'en',
 51: 'en',
 53: 'en',
 56: 'en',
 66: 'en',
 67: 'en',
 68: 'en',
 76: 'en',
 92: 'en',
 94: 'en',
 114: 'en',
 116: 'en',
 133: 'en',
 137: 'en',
 138: 'en',
 155: 'en',
 156: 'en',
 192: 'en',
 193: 'en',
 196: 'en',
 197: 'en',
 216: 'en',
 232: 'en',
 250: 'en',
 254: 'en',
 262: 'en',
 266: 'en',
 268: 'en',
 272: 'en',
 283: 'en',
 331: 'en',
 335: 'en',
 372: 'en',
 548: 'en',
 549: 'en',
 552: 'en',
 556: 'en',
 557: 'en',
 564: 'en',
 565: 'en',
 566: 'en',
 567: 'en',
 577: 'en',
 579: 'en',
 580: 'en',
 584: 'en',
 585: 'en',
 587: 'en',
 589: 'en',
 591: 'en',
 592: 'en',
 593: 'en',
 594: 'en',
 596: 'en',
 597: 'en',
 598: 'en',
 599: 'en',
 600: 'en',
 601: 'en',
 604: 'en',
 613: 'en',
 614: 'en',
 615: 'en',
 616: 'en',
 617: 'en',
 618: 'en',
 625: 'en',
 626: 'en',
 627: 'en',
 628: 'en',
 629: 'en',
 630: 'en',
 632: 'en',
 636: 'en',
 638: 'en',
 639: 'en',
 640: 'en',
 644: 'en

In [19]:
# In implicit feedback, any form of engagement (like, save, view) can be considered a positive signal.
# Here, we assume each row in interactions is a positive instance (user interacted with item).
# We will create positive samples (label=1) from these interactions.

# Filter interactions to only include users present in users_df
interactions_filtered = interactions[interactions['user_id'].isin(users_df['user_id'])]

# We will now consider interactions with 'likes == 1' as positive samples.
positive_interactions = interactions_filtered[interactions_filtered['likes'] == 1]

pos_user_ids = positive_interactions['user_id'].tolist()
pos_item_ids = positive_interactions['post_id'].tolist()
pos_labels   = [1] * len(pos_user_ids)  # all positives labeled 1

In [20]:
pos_user_ids[:5]

[10114, 10114, 10114, 10114, 10114]

In [21]:
pos_item_ids[:5]

[866, 2577, 2654, 2692, 2694]

In [22]:
pos_labels[:5]

[1, 1, 1, 1, 1]

In [23]:
# # We also need negative samples (user did NOT interact with item) for training.
# # We'll do negative sampling by pairing each user with some items they have not interacted with.
# # For each positive interaction, we sample `neg_ratio` negative examples.

# user_pos_set = interactions.groupby('user_id')['post_id'].apply(set).to_dict()  # set of items each user interacted with
# all_items = posts_df['post_id'].unique().tolist()
# neg_ratio = 4  # number of negative samples per positive sample

In [24]:
# neg_user_ids = []
# neg_item_ids = []
# neg_labels   = []

In [25]:
# random.seed(42)  # for reproducibility
# for user, pos_items in user_pos_set.items():
#     for pos_item in pos_items:
#         # Generate `neg_ratio` negatives for this positive interaction
#         for _ in range(neg_ratio):
#             neg_item = random.choice(all_items)
#             # Ensure the sampled item is not one the user has interacted with
#             while neg_item in pos_items:
#                 neg_item = random.choice(all_items)
#             neg_user_ids.append(user)
#             neg_item_ids.append(neg_item)
#             neg_labels.append(0)  # negative label 0


In [26]:
negative_interactions = interactions_filtered[interactions_filtered['likes'] == 0]

neg_user_ids = negative_interactions['user_id'].tolist()
neg_item_ids = negative_interactions['post_id'].tolist()
neg_labels   = [0] * len(neg_user_ids)  # negative label 0

In [27]:
print(f"Actual {len(pos_user_ids)} positive samples and {len(neg_user_ids)} negative samples.")

Actual 9704 positive samples and 21480 negative samples.


In [28]:
# Combine positive and negative samples
all_user_ids = pos_user_ids + neg_user_ids
all_item_ids = pos_item_ids + neg_item_ids
all_labels   = pos_labels   + neg_labels

In [29]:
# Encode categorical IDs and features as numeric indices for model input.
# We will create index mappings for user IDs, item IDs, user countries, user languages, item countries, item languages.
user_ids_unique         = sorted(users_df['user_id'].unique().tolist())
item_ids_unique         = sorted(posts_df['post_id'].unique().tolist())
user_countries_unique   = sorted(users_df['country'].unique().tolist())
user_langs_unique       = sorted(users_df['primary_lang'].unique().tolist())
item_countries_unique   = sorted(posts_df['item_country'].unique().tolist())
item_langs_unique       = sorted(posts_df['item_lang'].unique().tolist())

user_id_to_index        = {uid: idx for idx, uid in enumerate(user_ids_unique)}
item_id_to_index        = {pid: idx for idx, pid in enumerate(item_ids_unique)}
user_country_to_index   = {c: idx for idx, c in enumerate(user_countries_unique)}
user_lang_to_index      = {l: idx for idx, l in enumerate(user_langs_unique)}
item_country_to_index   = {c: idx for idx, c in enumerate(item_countries_unique)}
item_lang_to_index      = {l: idx for idx, l in enumerate(item_langs_unique)}

In [30]:
# Map all user and item features to their indices
user_index_data        = [user_id_to_index[u] for u in all_user_ids]
user_country_index_data= [user_country_to_index[user_to_country[u]] for u in all_user_ids]
user_lang_index_data   = [user_lang_to_index[user_to_lang[u]] for u in all_user_ids]
item_index_data        = [item_id_to_index[i] for i in all_item_ids]
item_country_index_data= [item_country_to_index[item_to_country[i]] for i in all_item_ids]
item_lang_index_data   = [item_lang_to_index[item_to_lang[i]] for i in all_item_ids]
labels_data            = all_labels

In [31]:
# Convert to NumPy arrays for model training

user_index_data        = np.array(user_index_data, dtype='int32')
user_country_index_data= np.array(user_country_index_data, dtype='int32')
user_lang_index_data   = np.array(user_lang_index_data, dtype='int32')
item_index_data        = np.array(item_index_data, dtype='int32')
item_country_index_data= np.array(item_country_index_data, dtype='int32')
item_lang_index_data   = np.array(item_lang_index_data, dtype='int32')
labels_data            = np.array(labels_data, dtype='float32')

In [32]:
print("Sample encoded data:",
      user_index_data[0], user_country_index_data[0], user_lang_index_data[0],
      item_index_data[0], item_country_index_data[0], item_lang_index_data[0],
      "label", labels_data[0])


Sample encoded data: 0 19 3 162 8 2 label 1.0


In [33]:
import tensorflow as tf
from tensorflow.keras import layers, Model

In [34]:
# Define input layers for the model (each input is an integer index for a feature)
user_id_input       = layers.Input(shape=(1,), dtype='int32', name='user_id')
user_country_input  = layers.Input(shape=(1,), dtype='int32', name='user_country')
user_lang_input     = layers.Input(shape=(1,), dtype='int32', name='user_lang')
item_id_input       = layers.Input(shape=(1,), dtype='int32', name='item_id')
item_country_input  = layers.Input(shape=(1,), dtype='int32', name='item_country')
item_lang_input     = layers.Input(shape=(1,), dtype='int32', name='item_lang')

In [35]:
# Embedding layers for each categorical feature.
# These layers convert the integer indices into dense vectors (embeddings).
num_users          = len(user_ids_unique)
num_items          = len(item_ids_unique)
num_user_countries = len(user_countries_unique)
num_user_langs     = len(user_langs_unique)
num_item_countries = len(item_countries_unique)
num_item_langs     = len(item_langs_unique)

In [36]:
# User tower embeddings
user_id_emb = layers.Embedding(input_dim=num_users, output_dim=32, name='user_id_emb')(user_id_input)
user_country_emb = layers.Embedding(input_dim=num_user_countries, output_dim=8, name='user_country_emb')(user_country_input)
user_lang_emb = layers.Embedding(input_dim=num_user_langs, output_dim=8, name='user_lang_emb')(user_lang_input)
# The embedding outputs have shape (batch_size, 1, embedding_dim). Flatten them to shape (batch_size, embedding_dim).
user_id_vec = layers.Flatten()(user_id_emb)
user_country_vec = layers.Flatten()(user_country_emb)
user_lang_vec = layers.Flatten()(user_lang_emb)
# Concatenate user feature vectors into a single vector for the user tower.
user_features = layers.concatenate([user_id_vec, user_country_vec, user_lang_vec], name='user_features')


In [37]:
# User tower: a small neural network to process user features.
# We use a Dense layer to learn interactions between user embeddings.
user_hidden = layers.Dense(32, activation='relu')(user_features)
# The output of the user tower is a user embedding vector.
user_vector = layers.Dense(32, activation=None, name='user_vector')(user_hidden)

In [38]:
# Item tower embeddings
item_id_emb = layers.Embedding(input_dim=num_items, output_dim=32, name='item_id_emb')(item_id_input)
item_country_emb = layers.Embedding(input_dim=num_item_countries, output_dim=8, name='item_country_emb')(item_country_input)
item_lang_emb = layers.Embedding(input_dim=num_item_langs, output_dim=8, name='item_lang_emb')(item_lang_input)
item_id_vec = layers.Flatten()(item_id_emb)
item_country_vec = layers.Flatten()(item_country_emb)
item_lang_vec = layers.Flatten()(item_lang_emb)
# Concatenate item feature vectors for the item tower.
item_features = layers.concatenate([item_id_vec, item_country_vec, item_lang_vec], name='item_features')

In [39]:
# Item tower: a Dense layer to learn interactions between item features.
item_hidden = layers.Dense(32, activation='relu')(item_features)
# The output of the item tower is an item embedding vector.
item_vector = layers.Dense(32, activation=None, name='item_vector')(item_hidden)

In [40]:
# Compute similarity between user and item embeddings using dot product.
# This outputs a single score for each user-item pair.
dot_similarity = layers.Dot(axes=1, normalize=False)([user_vector, item_vector])

In [41]:
# For implicit feedback, we use a sigmoid activation to get a probability of interaction.
pred_score = layers.Activation('sigmoid', name='prediction')(dot_similarity)

In [42]:
# Define the full model that takes all inputs and produces the predicted interaction score.
model = Model(
    inputs=[user_id_input, user_country_input, user_lang_input,
            item_id_input, item_country_input, item_lang_input],
    outputs=pred_score
)

In [43]:
# Compile the model with binary crossentropy loss (for 0/1 labels) and an optimizer.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()  # Print model architecture

In [44]:
# Training the Model
# ====================================

# Train the model on the prepared dataset (user-item pairs with labels).
# We use a validation split to monitor performance on unseen data during training.
history = model.fit(
    [user_index_data, user_country_index_data, user_lang_index_data,
     item_index_data, item_country_index_data, item_lang_index_data],
    labels_data,
    batch_size=256,
    epochs=5,
    validation_split=0.1,
    verbose=1
)

# After training, we can inspect the training and validation performance from history.
print("Final training accuracy:", history.history['accuracy'][-1])
print("Final validation accuracy:", history.history['val_accuracy'][-1])

Epoch 1/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 13ms/step - accuracy: 0.7204 - loss: 0.5423 - val_accuracy: 0.4030 - val_loss: 1.8196
Epoch 2/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8615 - loss: 0.2934 - val_accuracy: 0.2930 - val_loss: 2.0124
Epoch 3/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.9069 - loss: 0.2165 - val_accuracy: 0.2135 - val_loss: 3.1730
Epoch 4/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.9142 - loss: 0.1960 - val_accuracy: 0.2071 - val_loss: 3.7633
Epoch 5/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9157 - loss: 0.1876 - val_accuracy: 0.1911 - val_loss: 4.1877
Final training accuracy: 0.9119900465011597
Final validation accuracy: 0.19108688831329346


In [45]:
# Generating Recommendations (Inference)
# ====================================

# Example: Generate top-N recommendations for a specific user.
test_user_id = users_df['user_id'].iloc[0]  # pick an example user (first user in the user table)
print(f"\nGenerating recommendations for User ID {test_user_id}...")

# Prepare the user features for the model (using the same encoding as training).
user_idx = user_id_to_index[test_user_id]
user_country_idx = user_country_to_index[user_to_country[test_user_id]]
user_lang_idx = user_lang_to_index[user_to_lang[test_user_id]]

# Repeat the user feature values for all candidate items.
num_items = len(item_ids_unique)
user_idx_array       = np.full(shape=(num_items,), fill_value=user_idx, dtype='int32')
user_country_array   = np.full(shape=(num_items,), fill_value=user_country_idx, dtype='int32')
user_lang_array      = np.full(shape=(num_items,), fill_value=user_lang_idx, dtype='int32')

# Prepare item feature arrays for all items (0 to num_items-1 index for each feature).
# We can use the index mapping directly: item index 0 corresponds to item_ids_unique[0], etc.
item_idx_array       = np.arange(num_items, dtype='int32')
item_country_array   = np.array([ item_country_to_index[item_to_country[item]] for item in item_ids_unique ], dtype='int32')
item_lang_array      = np.array([ item_lang_to_index[item_to_lang[item]] for item in item_ids_unique ], dtype='int32')

# Use the model to predict scores for all item candidates for this user.
pred_scores = model.predict([user_idx_array, user_country_array, user_lang_array,
                              item_idx_array, item_country_array, item_lang_array],
                             verbose=0)
pred_scores = pred_scores.flatten()

# Rank the items by predicted score in descending order.
topN = 5
top_indices = np.argsort(-pred_scores)[:topN]  # indices of the top N scores
top_item_ids = [ item_ids_unique[i] for i in top_indices ]
top_scores = pred_scores[top_indices]

print(f"Top {topN} recommended items for user {test_user_id}:")
for rank, (item, score) in enumerate(zip(top_item_ids, top_scores), start=1):
    print(f"{rank}. Item ID {item} (predicted score={score:.4f})")


Generating recommendations for User ID 10114...
Top 5 recommended items for user 10114:
1. Item ID 1750 (predicted score=0.6657)
2. Item ID 896 (predicted score=0.6045)
3. Item ID 1753 (predicted score=0.5721)
4. Item ID 216 (predicted score=0.5313)
5. Item ID 880 (predicted score=0.5248)


In [46]:
import pandas as pd
import numpy as np

# Set N = number of recommendations per user
N = 3

# Store final recommendations here
recommendations = []

for user_id in users_df['user_id']:
    try:
        # Map user features to indices
        user_idx = user_id_to_index[user_id]
        user_country_idx = user_country_to_index[user_to_country[user_id]]
        user_lang_idx = user_lang_to_index[user_to_lang[user_id]]

        # Prepare user feature arrays (same value repeated for each item)
        num_items = len(item_ids_unique)
        user_idx_array = np.full((num_items,), user_idx, dtype='int32')
        user_country_array = np.full((num_items,), user_country_idx, dtype='int32')
        user_lang_array = np.full((num_items,), user_lang_idx, dtype='int32')

        # Prepare item feature arrays
        item_idx_array = np.arange(num_items, dtype='int32')
        item_country_array = np.array([item_country_to_index[item_to_country[item]] for item in item_ids_unique], dtype='int32')
        item_lang_array = np.array([item_lang_to_index[item_to_lang[item]] for item in item_ids_unique], dtype='int32')

        # Predict scores for this user across all items
        pred_scores = model.predict([
            user_idx_array, user_country_array, user_lang_array,
            item_idx_array, item_country_array, item_lang_array
        ], verbose=0).flatten()

        # Select Top-N items by score
        top_indices = np.argsort(-pred_scores)[:N]
        top_item_ids = [item_ids_unique[i] for i in top_indices]
        top_scores = pred_scores[top_indices]

        # Store in final list
        for rank, (item_id, score) in enumerate(zip(top_item_ids, top_scores), start=1):
            recommendations.append({
                'user_id': user_id,
                'recommended_post_id': item_id,
                'score': score,
                'rank': rank
            })

    except KeyError as e:
        print(f"Skipping user {user_id} due to missing mapping: {e}")
        continue

# Convert to DataFrame
recommendations_df = pd.DataFrame(recommendations)

# Preview
print(recommendations_df.head())


       user_id  recommended_post_id     score  rank
0        10114                 1750  0.665658     1
1        10114                  896  0.604486     2
2        10114                 1753  0.572134     3
3  10000000686                 1750  0.673979     1
4  10000000686                  896  0.617543     2


In [47]:
recommendations_df.head(10)

Unnamed: 0,user_id,recommended_post_id,score,rank
0,10114,1750,0.665658,1
1,10114,896,0.604486,2
2,10114,1753,0.572134,3
3,10000000686,1750,0.673979,1
4,10000000686,896,0.617543,2
5,10000000686,1753,0.590562,3
6,10000000942,1750,0.773569,1
7,10000000942,1753,0.766459,2
8,10000000942,705,0.760899,3
9,10000001007,1750,0.578619,1


In [48]:
# Training the Model
# ====================================

# Train the model on the prepared dataset (user-item pairs with labels).
# We use a validation split to monitor performance on unseen data during training.
history = model.fit(
    [user_index_data, user_country_index_data, user_lang_index_data,
     item_index_data, item_country_index_data, item_lang_index_data],
    labels_data,
    batch_size=256,
    epochs=5,
    validation_split=0.1,
    verbose=1
)

Epoch 1/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9161 - loss: 0.1850 - val_accuracy: 0.2100 - val_loss: 4.8157
Epoch 2/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9181 - loss: 0.1778 - val_accuracy: 0.1760 - val_loss: 5.2866
Epoch 3/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9192 - loss: 0.1747 - val_accuracy: 0.1799 - val_loss: 5.3019
Epoch 4/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9185 - loss: 0.1732 - val_accuracy: 0.2039 - val_loss: 5.4782
Epoch 5/5
[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9223 - loss: 0.1655 - val_accuracy: 0.1824 - val_loss: 5.5626


In [49]:
# After training, we can inspect the training and validation performance from history.
print("Final training accuracy:", history.history['accuracy'][-1] * 100, "%")
print("Final validation accuracy:", history.history['val_accuracy'][-1] * 100, "%")

Final training accuracy: 92.13967323303223 %
Final validation accuracy: 18.243026733398438 %
