In [4]:
# Import libraries
import os
import glob
import time
import re

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import Word2Vec

import sklearn
import tensorflow as tf
import torch

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Attention, Concatenate


# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [781]:
# Function to load and clean data
def load_and_clean_data(file_path, is_products=False, processed_file=None, slice_size=None):
    if is_products and processed_file and os.path.exists(processed_file):
        df = pd.read_csv(processed_file, nrows=slice_size) if slice_size else pd.read_csv(processed_file)
    else:
        df = pd.read_csv(file_path, nrows=slice_size) if slice_size else pd.read_csv(file_path)

        if is_products:
            # Fill missing values
            df.fillna('unknown', inplace=True)

            # Remove special characters
            pattern = [
                (r'<.*?>', ''),                             # Remove HTML tags
                (r'[^\x00-\x7F]+', ' '),                    # Remove non-ASCII characters
                (r'[^a-zA-Z0-9\s]', ''),                    # Remove special characters
                (r'\s+', ' '),                              # Remove extra spaces
                (r'^\s+|\s+?$', ''),                        # Remove leading and trailing spaces
            ]

            # Clean text
            for col in df.select_dtypes('object').columns:
                if col not in non_text_cols:
                    text = df[col].str.lower()
                    for p, r in pattern:
                        text = text.str.replace(p, r, regex=True)
                    df[col] = text

            if processed_file and not os.path.exists(processed_file):
                df.to_csv(processed_file, index=False)

    return df


# Set up environment
task = 'task1'
train_path = '../../data/train/'
test_path = '../../data/test/sessions_test_' + task + '.csv'
PREDS_PER_SESSION = 100
slice_size = 50000                                           # Memory management, None for no slicing
non_text_cols = ['id', 'locale', 'price']


# Load and clean data
processed_file = train_path + '/products_train_processed.csv'
products_train = load_and_clean_data(train_path + '/products_train.csv', is_products=True, processed_file=processed_file, slice_size=slice_size)
sessions_train = load_and_clean_data(train_path + '/sessions_train.csv', slice_size=slice_size)
sessions_test = load_and_clean_data(test_path, slice_size=slice_size)

In [785]:
# prod_count = {prod: sum(1 for sess in sessions_train['prev_items'] if prod in re.findall(r"'(.*?)'", sess)) for prod in products_train['id']}

# # sort prod_count by value in descending order
# prod_count = {k: v for k, v in sorted(prod_count.items(), key=lambda item: item[1], reverse=True)}
# print(prod_count)

### Data Exploration

In [786]:
def cooccurrence_matrix(df):
    sessions = df['prev_items'].apply(lambda x: re.findall(r"'(.*?)'", x)).tolist()
    cooccurrence = {}

    for session in sessions:
        for i in range(len(session)):
            for j in range(i + 1, len(session)):
                pair = tuple(sorted((session[i], session[j])))
                if pair in cooccurrence:
                    cooccurrence[pair] += 1
                else:
                    cooccurrence[pair] = 1

    cooccurrence_df = pd.DataFrame(list(cooccurrence.items()), columns=['product_pair', 'count'])
    return cooccurrence_df

train_cooccurrence = cooccurrence_matrix(sessions_train)

def recommend_products(product_id, cooccurrence_df, top_n=10):
    pairs = cooccurrence_df[cooccurrence_df['product_pair'].apply(lambda x: product_id in x)]
    pairs['related_product'] = pairs['product_pair'].apply(lambda x: x[0] if x[1] == product_id else x[1])
    recommendations = pairs[['related_product', 'count']].sort_values(by='count', ascending=False).head(top_n)
    return recommendations

product_id = 'B07JG9TFSB'  # Replace this with the product ID you want recommendations for
prod_to_rec = 100
recommendations = recommend_products(product_id, train_cooccurrence, top_n=prod_to_rec)
print(recommendations)

       related_product  count
14622       B07JDSHD4Z     43
34692       B07JG9QZ2B     17
157274      B01BVG1XJS     14
14623       B07JG9TFSB     12
157269      B08QYYBTMC     11
...                ...    ...
324995      B091YCWH9S      1
324996      B08CRV3XXV      1
193123      B09YD42KBD      1
324998      B09PH91HX2      1
423064      B09TKMQKJS      1

[98 rows x 2 columns]


Rough

In [768]:
def process_sessions(df):
    # using r"'(.*?)'" to extract the product id from the string
    sessions = df['prev_items'].apply(lambda x: re.findall(r"'(.*?)'", x)).tolist()
    return sessions

train_sessions = process_sessions(sessions_train)

def cooccurrence_matrix(sessions):
    # Create an empty dictionary to store cooccurrence counts
    cooccurrence = {}

    # Iterate through the sessions
    for session in sessions:
        # Iterate through all possible product pairs in the session
        for i in range(len(session)):
            for j in range(i + 1, len(session)):
                # Sort the product pair to ensure a consistent key
                pair = tuple(sorted((session[i], session[j])))

                # Increment the count for this product pair
                if pair in cooccurrence:
                    cooccurrence[pair] += 1
                else:
                    cooccurrence[pair] = 1

    # Convert the cooccurrence dictionary to a DataFrame
    cooccurrence_df = pd.DataFrame(list(cooccurrence.items()), columns=['product_pair', 'count'])

    return cooccurrence_df

train_cooccurrence = cooccurrence_matrix(train_sessions)

In [769]:
def recommend_products(product_id, cooccurrence_df, top_n=10):
    # Find all pairs that include the target product
    pairs = cooccurrence_df[cooccurrence_df['product_pair'].apply(lambda x: product_id in x)]

    # Calculate the score for each related product
    pairs['related_product'] = pairs['product_pair'].apply(lambda x: x[0] if x[1] == product_id else x[1])
    pairs['score'] = pairs['count']

    # Rank the related products by score and take the top N
    recommendations = pairs[['related_product', 'score']].sort_values(by='score', ascending=False).head(top_n)

    return recommendations

In [772]:
product_id = 'B07JG9TFSB'  # Replace this with the product ID you want recommendations for
prod_to_rec = 10
recommendations = recommend_products(product_id, train_cooccurrence, top_n=prod_to_rec)
print(recommendations)

       related_product  score
14622       B07JDSHD4Z     43
34692       B07JG9QZ2B     17
157274      B01BVG1XJS     14
14623       B07JG9TFSB     12
157269      B08QYYBTMC     11
394226      B08V1KXBQD     10
394225      B08V12CT4C     10
194357      B0BHHZ9LPT      9
157273      B099NS1XPG      8
22569       B07T14HSNQ      7
