In [3]:
import json
import csv
from collections import Counter
from itertools import combinations
from multiprocessing import Pool
from tqdm import tqdm

# Load 'products_bought' data from a JSON file
with open('products_bought.json', 'r') as file:
    products_bought = json.load(file)

# Function to extract product pairs and individual items for a single user
def get_user_pairs_and_individuals(user_products):
    asin_list = [product['asin'] for product in user_products]
    # Generate sorted pairs for consistency
    pairs = [tuple(sorted(pair)) for pair in combinations(asin_list, 2)]
    # Count individual product occurrences
    individuals = asin_list
    return pairs, individuals

# Function to add progress bar to multiprocessing pool
def count_pairs_and_individuals(products_bought):
    # Initialize counters
    pair_count = Counter()
    individual_count = Counter()

    with Pool() as pool:
        # Wrap the map with tqdm for progress tracking
        results = list(tqdm(pool.imap(get_user_pairs_and_individuals, products_bought.values()), 
                            total=len(products_bought), desc="Processing users"))

        # Flatten results with tqdm for progress
        for pairs, individuals in tqdm(results, desc="Aggregating results"):
            pair_count.update(pairs)
            individual_count.update(individuals)
    
    return pair_count, individual_count

# Get pair and individual counts
pair_count, individual_count = count_pairs_and_individuals(products_bought)

# Filtering pairs with the additional condition
threshold_pairs = {
    pair: count for pair, count in pair_count.items()
    if count > 5 and count > 0.4 * max(individual_count[pair[0]], individual_count[pair[1]])
}

# Convert tuple keys to strings for JSON compatibility
threshold_pairs_json = {f"{pair[0]}, {pair[1]}": count for pair, count in threshold_pairs.items()}

# Display top N most frequently bought-together product pairs that meet the threshold
print("Product pairs bought together more than 40% of the times they were bought individually and more than 5 times:")
for (product1, product2), count in threshold_pairs.items():
    print(f"Products {product1} and {product2} bought together {count} times.")

# Save filtered pairs to JSON
with open('threshold_pairs.json', 'w') as threshold_file:
    json.dump(threshold_pairs_json, threshold_file, indent=4)

print("Filtered pairs saved to 'threshold_pairs.json'.")


Processing users: 100%|██████████| 10345123/10345123 [05:56<00:00, 28999.95it/s]
Aggregating results: 100%|██████████| 10345123/10345123 [00:40<00:00, 255349.03it/s]


Product pairs bought together more than 40% of the times they were bought individually and more than 5 times:
Products B092HLKZ74 and B095HJLY6W bought together 11 times.
Products B017T6Y1KW and B017T6YIWS bought together 6 times.
Products B07J1LYVHC and B07RM722DH bought together 8 times.
Products B00D5PYKGW and B00D5Q0W6S bought together 11 times.
Products B09C3ZD4P9 and B09C3ZRGH4 bought together 7 times.
Products B08CVJ64ZV and B08GTWDT5S bought together 10 times.
Products B08CVJ64ZV and B08GTSGGTJ bought together 10 times.
Products B08CVJ64ZV and B09DLFC6F1 bought together 10 times.
Products B08GTSGGTJ and B08GTWDT5S bought together 13 times.
Products B08GTWDT5S and B09DLFC6F1 bought together 10 times.
Products B08GTSGGTJ and B09DLFC6F1 bought together 10 times.
Products B06Y2JLF6N and B07XR8GX4C bought together 6 times.
Products B06Y2JLF6N and B07GWY7NV1 bought together 6 times.
Products B06Y2JLF6N and B07XDFN7DL bought together 6 times.
Products B06Y2JLF6N and B07WNX5K5G bought 

In [4]:
print(len(threshold_pairs_json.keys()))

4911


In [16]:
print(len(individual_count.keys()))

1318152


In [6]:
print(len(products_bought.keys()))

10345123


In [9]:
total_bought = 0

for user in tqdm(products_bought, desc="Processing dataset"):
    total_bought = total_bought + len(products_bought[user])

print(f"Average number of products bought by each user : {total_bought/len(products_bought.keys())}")

Processing dataset: 100%|██████████| 10345123/10345123 [00:05<00:00, 1816256.56it/s]

Average number of products bought by each user : 1.8923046154211989





In [18]:
no_products_bought = 0

for user in tqdm(products_bought, desc="Processing dataset"):
    if len(products_bought[user])>=10:
        no_products_bought = no_products_bought + 1

print(f"Number of user who didn't buy any products : {no_products_bought}")

Processing dataset: 100%|██████████| 10345123/10345123 [00:05<00:00, 1879174.85it/s]

Number of user who didn't buy any products : 141992



