In [8]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re


# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [9]:
# Import kdd_processor.py
from kdd_processor import load_all_data


# Memory management, None to load full dataset
slice_size = 50000


# Load data
products_train, sessions_train, sessions_test = load_all_data(slice_size)

In [10]:
products_train.head()

Unnamed: 0,id,locale,title,price,brand,color,size,model,material,author,desc
0,B005ZSSN10,DE,red dragon amberjack 3 steel tip 22 gramm wolf...,30.95,red dragon,unknown,unknown,rdd0089,unknown,unknown,amberjacks steel dartpfeile sind verf gbar in ...
1,B08PRYN6LD,DE,simply keto lower carb schokodrops ohne zucker...,17.9,simply keto,unknown,750 g 1er pack,unknown,unknown,unknown,nat rliche s sse durch erythrit wir stellen oh...
2,B09MBZJ48V,DE,sennheiser 508377 pc 52 chat stilvolles multip...,68.89,sennheiser,multicolour,one size,508377,kunstleder,unknown,35 mm buchse kann problemlos an ger te mit sta...
3,B08ZN6F26S,DE,amybenton auto ab 1 2 3 ahre baby aufziehbares...,18.99,amy benton,animal car,unknown,2008b,aufziehauto 1 jahr,unknown,auto aufziehbar dr cken sie einfach leicht auf...
4,B094DGRV7D,DE,playmobil 70522 cavaliere mit grauem pony,7.17,playmobil,nicht zutreffend,onesize,70522,polypropylen,unknown,inhalt 1 st ck


In [11]:
sessions_train.head()

Unnamed: 0,prev_items,next_item,locale
0,['B09W9FND7K' 'B09JSPLN1M'],B09M7GY217,DE
1,['B076THCGSG' 'B007MO8IME' 'B08MF65MLV' 'B001B...,B001B4THSA,DE
2,['B0B1LGXWDS' 'B00AZYORS2' 'B0B1LGXWDS' 'B00AZ...,B0767DTG2Q,DE
3,['B09XMTWDVT' 'B0B4MZZ8MB' 'B0B7HZ2GWX' 'B09XM...,B0B4R9NN4B,DE
4,['B09Y5CSL3T' 'B09Y5DPTXN' 'B09FKD61R8'],B0BGVBKWGZ,DE


In [12]:
sessions_test.head()

Unnamed: 0,prev_items,locale
0,['B08V12CT4C' 'B08V1KXBQD' 'B01BVG1XJS' 'B09VC...,DE
1,['B00R9R5ND6' 'B00R9RZ9ZS' 'B00R9RZ9ZS'],DE
2,['B07YSRXJD3' 'B07G7Q5N6G' 'B08C9Q7QVK' 'B07G7...,DE
3,['B08KQBYV43' '3955350843' '3955350843' '39553...,DE
4,['B09FPTCWMC' 'B09FPTQP68' 'B08HMRY8NG' 'B08TB...,DE


### Recommendations using co-occurrence matrix
This recommendation system is based on product co-occurrence matrix. The matrix is created by analyzing the previous items purchased by customers in their sessions. For each session, pairs of products that were purchased together are identified, and a score is assigned to the pair indicating how frequently the products were purchased together. The matrix is created from all of the scores for all of the product pairs.

Once the matrix is created, the function recommend_products takes a product ID as input and returns the top N recommendations for that product. The function looks for all pairs in the matrix that contain the given product ID, and then identifies the related product in each pair. The function sorts the related products based on their scores, and returns the top N products as recommendations for the given product ID.

In [13]:
# Function to create co-occurrence matrix
def cooccurrence_matrix(df):
    sessions = df['prev_items'].apply(lambda x: re.findall(r"'(.*?)'", x)).tolist()
    cooccurrence = {}
    
    for session in sessions:
        for i in range(len(session)):
            for j in range(i + 1, len(session)):
                pair = tuple(sorted((session[i], session[j])))
                if pair in cooccurrence:
                    cooccurrence[pair] += 1
                else:
                    cooccurrence[pair] = 1

    cooccurrence_df = pd.DataFrame(list(cooccurrence.items()), columns=['product_pair', 'score'])
    return cooccurrence_df


# Create co-occurrence matrix
train_cooccurrence = cooccurrence_matrix(sessions_train)


# Function to recommend products
def recommend_products(product_id, cooccurrence_df, top_n=10):
    pairs = cooccurrence_df[cooccurrence_df['product_pair'].apply(lambda x: product_id in x)]
    pairs['related_product'] = pairs['product_pair'].apply(lambda x: x[0] if x[1] == product_id else x[1])
    recommendations = pairs[['related_product', 'score']].sort_values(by='score', ascending=False).head(top_n)
    return recommendations

In [14]:
# Get recommendations
prod_id = 'B094R3R9XH'                  # Product ID to get recommendations for
prod_rec = 100                          # Number of recommendations for product_id
recommendations = recommend_products(prod_id, train_cooccurrence, top_n=prod_rec)
print(f'Recommendations for {prod_id}:')
recommendations

Recommendations for B094R3R9XH:


Unnamed: 0,related_product,score
97776,B09W5988V1,7
289869,B0BJPNPCYG,4
150199,B08CRV3XXV,3
97759,B07GPT8HPY,2
150211,B0911ZGLT2,2
363814,B004605SE8,2
325011,B0B74RK7V5,2
289871,B07JG9TFSB,2
289865,B0B7S7LBMB,2
279730,B07JDSHD4Z,2
