In [1]:
import pandas as pd
import os
from tqdm import tqdm
import ast
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import ProcessPoolExecutor, as_completed
import numpy as np
import torch

### Load product information and co-purchase edges

In [8]:
all_details_df = pd.read_csv('amazon_dataset/clothing_shoes_jewellery/all_products.csv')
all_details_df.set_index('asin', inplace=True)

co_purchase_df = pd.read_csv('bidirectional_links.csv') # co-purchase edges

### Snippet to create the bidirectional co-purchase edges

In [2]:
def process_amazon_data(input_df):
    # Function to safely convert string to list
    def convert_string_to_list(string):
        try:
            return ast.literal_eval(string)
        except (ValueError, SyntaxError):
            return []

    # Step 1: Convert string representation of list to actual list and then expand
    expanded_rows = []
    for index, row in input_df.iterrows():
        root_asin = row['asin']
        also_bought_list = convert_string_to_list(row['also_bought'])
        if also_bought_list:
            for also_bought_asin in also_bought_list:
                expanded_rows.append({'asin': root_asin, 'also_bought': also_bought_asin})

    expanded_df = pd.DataFrame(expanded_rows)

    # Step 2: Create reverse pairs
    reverse_pairs = expanded_df.rename(columns={'asin': 'also_bought', 'also_bought': 'asin'})

    # Step 3: Combine and remove duplicates
    combined_df = pd.concat([expanded_df, reverse_pairs]).drop_duplicates().reset_index(drop=True)

    return combined_df

### Create a 5-core co-purchase graph. Each product in the graph will have atleast 5 edges. 

###### Note that this graph is not cleaned yet

In [6]:
def reduce_to_k_core(co_purchase_df, k=5):
    while True:
        # Count connections for each ASIN
        connection_counts = co_purchase_df['asin'].value_counts().add(co_purchase_df['also_bought'].value_counts(), fill_value=0)

        # Identify nodes with fewer than k connections
        underconnected_nodes = set(connection_counts[connection_counts < k].index)

        if not underconnected_nodes:
            break

        # Filter out rows where ASINs have fewer than k connections
        co_purchase_df = co_purchase_df[~co_purchase_df['asin'].isin(underconnected_nodes) & ~co_purchase_df['also_bought'].isin(underconnected_nodes)]

    return co_purchase_df

k_core_df = reduce_to_k_core(co_purchase_df)
k_core_df.set_index('asin', inplace=True)

In [13]:
def get_image_list(image_folder):
    return set(os.listdir(image_folder))

def has_required_data(product, required_columns):
    if product.empty:
        return False
    return all(isinstance(product[col], str) for col in required_columns)

def is_image_valid(product, image_list):
    if product.empty or product['imUrl'] is None or pd.isna(product['imUrl']):
        return False

    image_name = product['imUrl'].split('/')[-1]
    return image_name in image_list

def is_asin_valid(asin, co_purchase_df, all_details_df, image_list, required_columns):
    product = all_details_df.loc[asin]
    if has_required_data(product, required_columns) and is_image_valid(product, image_list):
        return asin
    else:
        return None

def process_asins(co_purchase_df, all_details_df, image_list, required_columns):
    valid_asins = set()
    asins = co_purchase_df.index.unique()

    for asin in tqdm(asins):
        result = is_asin_valid(asin, co_purchase_df, all_details_df, image_list, required_columns)
        if result:
            valid_asins.add(result)

    return valid_asins

100%|██████████| 407672/407672 [00:21<00:00, 19195.19it/s]


### 5-core graph is cleaned to remove products with missing image, title and brand

In [15]:
# Step 1: Create image list and list of valid ASINs
image_list = get_image_list('/home/arnuv/amazon_dataset/clothing_shoes_jewellery/images')
valid_asins = process_asins(k_core_df, all_details_df, image_list, ['title', 'imUrl', 'brand'])

# Step 2: Create masks for both 'asin' and 'also_bought'
k_core_df = k_core_df.reset_index()
asin_mask = k_core_df['asin'].isin(valid_asins)
also_bought_mask = k_core_df['also_bought'].apply(lambda x: x in valid_asins)
combined_mask = asin_mask & also_bought_mask
filtered_df = k_core_df[combined_mask]
filtered_df.to_csv('kcore5new_brand.csv', index=False)

In [6]:
class AsinIdMap:
    def __init__(self, asin_list):
        self.asin_list = asin_list
        self.asin_to_idx = {}
        self.product_idx_to_asin = {}
        self._build()
    
    def _build(self):
        for idx, asin in enumerate(self.asin_list):
            self.asin_to_idx[asin] = idx
            self.product_idx_to_asin[idx] = asin
    
    def get_idx(self, asin):
        return self.asin_to_idx[asin]
    
    def get_asin(self, product_idx):
        return self.product_idx_to_asin[product_idx]
    
    def get_count(self):
        return len(self.asin_to_idx)

### Load the k-core graph into memory

In [21]:
k_5_csv = pd.read_csv('kcore5new_brand.csv')

asin_list = list(k_5_csv['asin'].unique())
asinIdLookup = AsinIdMap(asin_list)

### Create and store the edge list for the 5-core graph

In [5]:
def build_product_to_product_edge_index(edges_pd, asin_id_map, right_key):
    product_to_product_edge_index = []
    for idx, row in tqdm(edges_pd.iterrows()):
            asin1 = row['asin']
            product1_idx = asin_id_map.get_idx(asin1)
            asin2 = row[right_key]
            product2_idx = asin_id_map.get_idx(asin2)
            if product1_idx == product2_idx:
                continue
            product_to_product_edge_index.append([product1_idx, product2_idx])

    return torch.tensor(product_to_product_edge_index).t().contiguous()

In [23]:
edge_index = build_product_to_product_edge_index(k_5_csv, asinIdLookup, 'also_bought')
print("Edge shape: ", edge_index.shape)

2183it [00:00, 21825.81it/s]

574160it [00:26, 21319.34it/s]


Edge shape:  torch.Size([2, 574160])


In [24]:
np.save("k5_edge_index_brand.npy", edge_index)