In [1]:
import pandas as pd
import numpy as np
from lightfm import LightFM
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from scipy.sparse import csr_matrix
import os
import pickle
from tqdm import tqdm

# Load cleaned datasets
customers = pd.read_csv('../data/customers_cleaned.csv')
transactions_cleaned = pd.read_csv('../data/transactions_cleaned.csv')
articles_cleaned = pd.read_csv('../data/articles_cleaned.csv')

# Display first few rows of each dataset
display(customers.head(), transactions_cleaned.head(), articles_cleaned.head())


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0.0,0.0,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0.0,0.0,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0.0,0.0,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0.0,0.0,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [None]:
import sys
print(sys.executable)


In [2]:
from lightfm import LightFM
from lightfm.data import Dataset
from tqdm import tqdm  # Progress bar for iteration

# Create a LightFM Dataset object
dataset = Dataset()

# Print message to indicate fitting of dataset
print("Fitting the dataset with unique customer and article IDs...")

# Fit the dataset with unique customer and article IDs
dataset.fit(transactions_cleaned['customer_id'], transactions_cleaned['article_id'])

print("Building the interactions matrix...")

# Build the interactions matrix from the transactions data using tqdm to show progress
interactions_data = [
    (x['customer_id'], x['article_id'], x['price']) for _, x in tqdm(transactions_cleaned.iterrows(), total=len(transactions_cleaned))
]

# Build interactions matrix
(interactions, _) = dataset.build_interactions(interactions_data)

# Print the shape of the interactions matrix to confirm
print(f"Interactions matrix built. Shape: {interactions.shape}")

# # Initialize the LightFM model with WARP loss function
# model = LightFM(loss='warp')

# # Training the LightFM model and showing progress for each epoch
# print("Training the LightFM model...")

# # Fit the model with progress bar for epochs
# for epoch in range(10):
#     model.fit_partial(interactions, epochs=1, num_threads=4)
#     print(f"Completed Epoch {epoch + 1}/10")

# print("Model training completed!")

Fitting the dataset with unique customer and article IDs...
Building the interactions matrix...


100%|██████████| 31788324/31788324 [07:36<00:00, 69624.21it/s]


Interactions matrix built. Shape: (1362281, 104547)


In [7]:
# Save the trained model for future use
with open('../output/lightfm_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [3]:
# Define the path to the pickle file where the model is saved
model_file_path = '../output/lightfm_model.pkl'

# Load the LightFM model from the pickle file
with open(model_file_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Confirm that the model is loaded
print("LightFM model loaded successfully.")

LightFM model loaded successfully.


In [4]:
# Define the path to the folder where the images are stored
image_folder = '../data/images/'

# Load the pre-trained ResNet50 model (excluding the top layers)
resnet_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')

# Function to preprocess an image for ResNet50
def preprocess_image(img_path):
    img = image.load_img(img_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)  # Add batch dimension
    img_array = preprocess_input(img_array)
    return img_array

# Function to extract features from an image
def extract_features(article_id):
    img_path = os.path.join(image_folder, f'{article_id}.jpg')
    if not os.path.exists(img_path):
        return np.zeros((2048,))  # Return a zero vector if the image doesn't exist
    
    img_array = preprocess_image(img_path)
    features = resnet_model.predict(img_array)
    return features.flatten()  # Flatten the features to a 1D vector

# Extract visual features for each article
articles_cleaned['visual_features'] = articles_cleaned['article_id'].apply(lambda x: extract_features(x))

# Save articles with visual features for future use
articles_cleaned.to_pickle('../output/articles_with_visual_features.pkl')


2024-10-13 22:02:50.144053: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2 Pro
2024-10-13 22:02:50.144302: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2024-10-13 22:02:50.144320: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2024-10-13 22:02:50.144830: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-10-13 22:02:50.144961: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [5]:
# Function to calculate similarity between articles based on visual features
def calculate_similarity(article_id, articles_df, top_n=10):
    # Check if the article exists in the articles_df and has visual features
    article_row = articles_df.loc[articles_df['article_id'] == article_id]
    if article_row.empty:
        print(f"Article {article_id} not found in the articles DataFrame.")
        return []
    
    # Get the visual features of the specified article
    target_features = article_row['visual_features'].values
    if len(target_features) == 0:
        print(f"No visual features found for article {article_id}.")
        return []
    
    # Extract the visual features of the target article (shape should be (1, 2048))
    target_features = np.array([target_features[0]])

    # Get all visual features in the dataframe and reshape them into a 2D array
    all_features = np.vstack(articles_df['visual_features'].values)

    # Calculate cosine similarity between the target article and all other articles
    similarities = cosine_similarity(target_features, all_features)[0]

    # Get the indices of the top N most similar articles
    top_similar_indices = np.argsort(similarities)[::-1][:top_n + 1]  # +1 to include the target article

    # Return the top N most similar article_ids, excluding the first one (which is the article itself)
    similar_articles = articles_df.iloc[top_similar_indices[1:]]['article_id'].values
    return similar_articles


# Example: Get visually similar articles for a given article_id
similar_articles = calculate_similarity(articles_cleaned['article_id'].iloc[0], articles_cleaned)
print("Visually Similar Articles:", similar_articles)


Visually Similar Articles: [646585001 646547001 646551001 646551006 646552001 646562001 646564001
 646564002 646564004 646580001]


In [6]:
# Function to generate recommendations using collaborative filtering
def generate_recommendations(model, customer_id, dataset, num_recommendations=10):
    # Get the LightFM mapping for customer and article IDs
    user_id_map, _, _, _ = dataset.mapping()

    # Check if the customer exists in the dataset
    if customer_id not in user_id_map:
        print(f"Customer {customer_id} not found in the dataset.")
        return []

    # Get the internal LightFM ID for the customer
    customer_internal_id = user_id_map[customer_id]

    # Predict scores for all articles (from 0 to the number of articles)
    scores = model.predict(customer_internal_id, np.arange(dataset.interactions_shape()[1]))

    # Get top N articles with the highest scores
    top_items = np.argsort(-scores)[:num_recommendations]

    # Retrieve article ID map and reverse map
    _, _, article_id_map, reverse_article_id_map = dataset.mapping()

    # Filter out any internal IDs that are not in the reverse_article_id_map
    recommended_articles = [
        reverse_article_id_map[item] for item in top_items if item in reverse_article_id_map
    ]

    return recommended_articles


In [7]:
# Hybrid recommendation system
def hybrid_recommendation(customer_id, article_id, model, dataset, articles_df, top_n=10):
    # Get collaborative filtering recommendations
    collab_recs = generate_recommendations(model, customer_id, dataset, num_recommendations=top_n)
    
    # If collaborative filtering cannot recommend enough items, use visual similarity as a fallback
    if len(collab_recs) < top_n:
        visual_recs = calculate_similarity(article_id, articles_df, top_n=top_n-len(collab_recs))
        collab_recs = np.concatenate([collab_recs, visual_recs])

    return collab_recs[:top_n]



In [None]:
# List the first few customer IDs from transactions_cleaned
print("Sample customer IDs from transactions_cleaned:")
print(transactions_cleaned['customer_id'].head())


In [None]:
# List the first few article IDs from articles_cleaned
print("Sample article IDs from articles_cleaned:")
print(articles_cleaned['article_id'].head())


In [8]:
# Get recommendations using collaborative filtering
customer_id = transactions_cleaned['customer_id'].iloc[0] # Replace with actual customer ID
article_id = articles_cleaned['article_id'].iloc[0] # Replace with actual article ID

# Get collaborative filtering recommendations
collab_recs = generate_recommendations(model, customer_id, dataset)
print(f"Collaborative Filtering Recommendations for Customer {customer_id}: {collab_recs}")

# Get hybrid recommendations (collaborative + visual similarity)
hybrid_recs = hybrid_recommendation(customer_id, article_id, model, dataset, articles_cleaned)
print(f"Hybrid Recommendations for Customer {customer_id}: {hybrid_recs}")


Collaborative Filtering Recommendations for Customer 000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318: []
Hybrid Recommendations for Customer 000058a12d5b43e67d225668fa1f8d618c13dc232df0cad8ffe7ad4a1091e318: [6.46585001e+08 6.46547001e+08 6.46551001e+08 6.46551006e+08
 6.46552001e+08 6.46562001e+08 6.46564001e+08 6.46564002e+08
 6.46564004e+08 6.46580001e+08]


In [9]:
from lightfm.evaluation import precision_at_k
from scipy.sparse import lil_matrix, csr_matrix
import numpy as np

# Split the interactions into training and test sets
def split_interactions(interactions, test_percentage=0.2):
    """
    Splits the LightFM interaction matrix into training and test sets.
    
    Args:
    interactions: Sparse matrix of interactions (in COO format).
    test_percentage: Percentage of interactions to include in the test set.

    Returns:
    train_interactions: Training interactions sparse matrix.
    test_interactions: Test interactions sparse matrix.
    """
    print("Starting to split the interaction matrix...")

    # Convert the interaction matrix to LIL format for easy manipulation
    print("Converting interactions to lil_matrix format for manipulation...")
    train_interactions = lil_matrix(interactions.copy())
    test_interactions = lil_matrix(interactions.copy())

    # Get non-zero indices (the actual interactions)
    non_zero_indices = interactions.nonzero()
    num_interactions = len(non_zero_indices[0])
    print(f"Found {num_interactions} non-zero interactions in the matrix.")

    # Create a mask for test interactions
    test_size = int(test_percentage * num_interactions)
    print(f"Selecting {test_size} interactions for the test set.")
    test_indices = np.random.choice(np.arange(num_interactions), size=test_size, replace=False)

    # Remove test interactions from the training set
    print("Removing test interactions from the training set...")
    train_interactions[non_zero_indices[0][test_indices], non_zero_indices[1][test_indices]] = 0
    
    # Remove training interactions from the test set
    print("Removing training interactions from the test set...")
    test_interactions[non_zero_indices[0][~test_indices], non_zero_indices[1][~test_indices]] = 0

    # Convert the matrices back to CSR format
    print("Converting the lil_matrix back to csr_matrix format for model evaluation...")
    return train_interactions.tocsr(), test_interactions.tocsr()

# Split the interactions matrix into training and test sets
print("Splitting the interactions matrix into training and test sets...")
train_interactions, test_interactions = split_interactions(interactions, test_percentage=0.2)
print("Successfully split the interaction matrix.")

# Evaluate the model using precision@12
print("Evaluating model on the training set using precision@12...")
train_precision = precision_at_k(model, train_interactions, k=12).mean()
print(f"Train Precision@12: {train_precision}")

print("Evaluating model on the test set using precision@12...")
test_precision = precision_at_k(model, test_interactions, k=12).mean()
print(f"Test Precision@12: {test_precision}")

print("Model evaluation complete.")


Splitting the interactions matrix into training and test sets...
Starting to split the interaction matrix...
Converting interactions to lil_matrix format for manipulation...
Found 31788324 non-zero interactions in the matrix.
Selecting 6357664 interactions for the test set.
Removing test interactions from the training set...
Removing training interactions from the test set...
Converting the lil_matrix back to csr_matrix format for model evaluation...
Successfully split the interaction matrix.
Evaluating model on the training set using precision@12...
Train Precision@12: 0.02203015796840191
Evaluating model on the test set using precision@12...
Test Precision@12: 0.022001944482326508
Model evaluation complete.
