In [1]:
import pandas as pd
import os
from tqdm import tqdm
import ast
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from concurrent.futures import ProcessPoolExecutor, as_completed
import numpy as np
import torch


In [2]:
def count_nodes_edges(co_purchase_df):
    """Return ``(unique_nodes, edges)`` for a co-purchase edge table.

    ``unique_nodes`` is the number of distinct values in the ``asin`` column
    (values that appear only in other columns are not counted); ``edges`` is
    the length of that column, i.e. the number of rows.
    """
    asin_column = co_purchase_df['asin']
    return asin_column.nunique(), len(asin_column)

In [3]:
# Load datasets: the product-details table and the co-purchase edge list.
# Both paths are relative, so they resolve against the kernel's working directory.
all_details_df = pd.read_csv('amazon_dataset/clothing_shoes_jewellery/all_products.csv')
co_purchase_df = pd.read_csv('bidirectional_links.csv')

In [4]:

def count_bidirectional_links(df):
    """Count unordered pairs of distinct nodes that are linked in both directions.

    Args:
        df: DataFrame with ``asin`` and ``also_bought`` columns; each row is a
            directed link ``asin -> also_bought``. Duplicate rows are ignored.

    Returns:
        int: number of pairs ``{a, b}`` with ``a != b`` for which both
        ``(a, b)`` and ``(b, a)`` are present. Self-links ``(a, a)`` are never
        counted: they would each contribute only once to the tally and skew
        the halving below.
    """
    # Build the directed link set straight from the two columns (no per-row Series).
    links = set(zip(df['asin'], df['also_bought']))

    # Every reciprocated pair is seen twice, once from each direction.
    reciprocated = sum(1 for a, b in links if a != b and (b, a) in links)

    return reciprocated // 2

In [5]:
def process_amazon_data(input_df):
    """Expand per-product ``also_bought`` lists into a symmetric edge table.

    Args:
        input_df: DataFrame with an ``asin`` column and an ``also_bought``
            column holding the string representation of a list of ASINs
            (e.g. ``"['B001', 'B002']"``). Values that cannot be parsed, or
            that parse to something other than a list/tuple, contribute no
            edges.

    Returns:
        DataFrame with columns ``['asin', 'also_bought']`` containing every
        parsed pair plus its reverse, de-duplicated, with a fresh 0..n-1
        index. The two columns are present even when no edge was produced.
    """
    columns = ['asin', 'also_bought']

    def convert_string_to_list(string):
        """Parse a list literal; return [] for anything unparsable or non-list."""
        try:
            parsed = ast.literal_eval(string)
        except (ValueError, SyntaxError):
            return []
        # A literal such as "5" or "'B001'" parses fine but is not a list of ASINs.
        return list(parsed) if isinstance(parsed, (list, tuple)) else []

    # Step 1: one row per (product, co-purchased product) pair.
    expanded_rows = [
        {'asin': root_asin, 'also_bought': also_bought_asin}
        for root_asin, raw_list in zip(input_df['asin'], input_df['also_bought'])
        for also_bought_asin in convert_string_to_list(raw_list)
    ]
    # Passing `columns` keeps the schema stable when `expanded_rows` is empty.
    expanded_df = pd.DataFrame(expanded_rows, columns=columns)

    # Step 2: create reverse pairs (swap the two columns, keep column order).
    reverse_pairs = expanded_df.rename(
        columns={'asin': 'also_bought', 'also_bought': 'asin'}
    )[columns]

    # Step 3: combine and remove duplicates.
    combined_df = pd.concat([expanded_df, reverse_pairs]).drop_duplicates().reset_index(drop=True)

    return combined_df

In [6]:
def reduce_to_k_core(co_purchase_df, k=5):
    """Iteratively drop edges touching nodes with fewer than ``k`` connections.

    A node's connection count is the number of rows in which it appears in
    ``asin`` plus the number of rows in which it appears in ``also_bought``.
    Rows are removed until every remaining node reaches ``k``.

    NOTE(review): if the table stores each link in both directions, this sum
    counts every neighbour twice -- confirm that is the intended threshold.

    Args:
        co_purchase_df: DataFrame with ``asin`` and ``also_bought`` columns.
        k: minimum connection count a node needs to survive.

    Returns:
        The filtered DataFrame (the input object itself when nothing is pruned).
    """
    while True:
        as_source = co_purchase_df['asin'].value_counts()
        as_target = co_purchase_df['also_bought'].value_counts()
        # fill_value=0 keeps nodes that occur in only one of the two columns.
        degree = as_source.add(as_target, fill_value=0)

        too_sparse = set(degree[degree < k].index)
        if not too_sparse:
            return co_purchase_df

        # Drop every row with at least one under-connected endpoint.
        touches_sparse = (
            co_purchase_df['asin'].isin(too_sparse)
            | co_purchase_df['also_bought'].isin(too_sparse)
        )
        co_purchase_df = co_purchase_df[~touches_sparse]


In [7]:
# Reduce the co-purchase graph to its k-core using the function's default k=5.
# Nothing is written to disk in this cell.
k_core_df = reduce_to_k_core(co_purchase_df)


In [8]:
def get_image_list(image_folder):
    """Return the names of the entries directly inside ``image_folder`` as a set."""
    return {entry_name for entry_name in os.listdir(image_folder)}

def has_required_data(asin, df, required_columns):
    """Tell whether the first row of ``df`` matching ``asin`` has all required fields.

    Looks the product up through the ``asin`` column. Returns False when no row
    matches; otherwise True only if every column in ``required_columns`` is
    non-null in the first matching row.
    """
    matches = df[df['asin'] == asin]
    if matches.empty:
        return False
    for column in required_columns:
        if not matches[column].notna().values[0]:
            return False
    return True

def is_image_valid(asin, df, image_list):
    """Tell whether the image file for ``asin`` is present in ``image_list``.

    The file name is the last ``/``-separated component of the ``imUrl`` value
    in the first row of ``df`` whose ``asin`` column matches. Returns False
    when no row matches or that row's ``imUrl`` is null.
    """
    matches = df[df['asin'] == asin]
    if matches.empty:
        return False
    url_column = matches['imUrl']
    if url_column.isna().values[0]:
        return False
    file_name = url_column.values[0].split('/')[-1]
    return file_name in image_list



In [9]:
def is_asin_valid(asin, co_purchase_df, all_details_df, image_folder, required_columns):
    """Return ``asin`` when it passes every check, otherwise ``None``.

    The ASIN must occur in ``co_purchase_df['asin']``, have all
    ``required_columns`` populated, and have its image available. Note that
    ``image_folder`` is forwarded unchanged as the image collection argument
    of ``is_image_valid``.
    """
    if asin not in co_purchase_df['asin'].values:
        return None
    if not has_required_data(asin, all_details_df, required_columns):
        return None
    if not is_image_valid(asin, all_details_df, image_folder):
        return None
    return asin


In [10]:
# Collect the names of the downloaded image files.
# NOTE(review): this is an absolute, machine-specific path, unlike the relative paths used when loading the CSVs.
image_list = get_image_list('/home/arnuv/amazon_dataset/clothing_shoes_jewellery/images')

In [11]:
# Set 'asin' as the index for both DataFrames.
# Rebinding the names instead of using inplace=True leaves the original objects
# untouched: reduce_to_k_core returns its input unchanged when nothing is pruned,
# so an in-place set_index on k_core_df could silently re-index co_purchase_df too.
k_core_df = k_core_df.set_index('asin')
all_details_df = all_details_df.set_index('asin')

In [12]:
# Preview the first rows of the product-details table (indexed by 'asin' at this point).
all_details_df.head()

Unnamed: 0_level_0,title,price,imUrl,brand,description,categories,category,also_viewed,also_bought,bought_together
asin,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
37214,Purple Sequin Tiny Dancer Tutu Ballet Dance Fa...,6.99,http://ecx.images-amazon.com/images/I/31mCncNu...,Big Dreams,,"[['Clothing, Shoes & Jewelry', 'Girls'], ['Clo...",clothing_shoes_jewellery,['B00JO8II76'],,
31887,Ballet Dress-Up Fairy Tutu,6.79,http://ecx.images-amazon.com/images/I/314qZjYe...,Boutique Cutie,This adorable basic ballerina tutu is perfect ...,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",clothing_shoes_jewellery,"['B00538F5OK', 'B003AVKOP2', 'B008F0SU0Y', 'B0...","['B003AVKOP2', 'B00D103F8U', 'B008F0SU0Y', 'B0...",[]
123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,64.98,http://ecx.images-amazon.com/images/I/413tGhqo...,,Elegance par excellence. Hand-crafted of the f...,"[['Clothing, Shoes & Jewelry', 'Novelty, Costu...",clothing_shoes_jewellery,"['B0006JCGUM', 'B000BMTCK6', 'B00194Q262', 'B0...","['B000BMTCK6', 'B0006JCGUM', 'B00194Q262', 'B0...","['B000BMTCK6', 'B0006JCGUM']"
456844570,RiZ Women's Beautify Crafted &frac12; Rimmed F...,,http://ecx.images-amazon.com/images/I/31QZTHxv...,,,"[['Clothing, Shoes & Jewelry', 'Women', 'Acces...",clothing_shoes_jewellery,"['B004UOPY80', 'B00G2LY6OC', 'B00877BFFM', 'B0...",,
456808574,Lantin White Visor Wrap Around Ski Style Aviat...,,http://ecx.images-amazon.com/images/I/31UsrgT5...,,,"[['Clothing, Shoes & Jewelry', 'Women', 'Acces...",clothing_shoes_jewellery,"['B005WF42SY', 'B003T1CFF0', 'B006AZCMUA', 'B0...",,


In [13]:
def has_required_data(product, required_columns):
    """Tell whether every required field of a product row holds a string.

    ``product`` is a single row (the caller passes the result of
    ``all_details_df.loc[asin]``). Returns False for an empty row or when any
    field in ``required_columns`` is not a ``str`` (e.g. a missing value).
    """
    if product.empty:
        return False
    for column in required_columns:
        if not isinstance(product[column], str):
            return False
    return True

def is_image_valid(product, image_list):
    """Tell whether the product row's image file name is in ``image_list``.

    The file name is the last ``/``-separated component of ``product['imUrl']``.
    Returns False for an empty row or a missing (``None``/NaN) URL.
    """
    if product.empty:
        return False

    url = product['imUrl']
    if url is None or pd.isna(url):
        return False

    return url.split('/')[-1] in image_list

def is_asin_valid(asin, co_purchase_df, all_details_df, image_list, required_columns):
    """Return ``asin`` if its product row passes all checks, else ``None``.

    Args:
        asin: identifier to look up in the index of ``all_details_df``.
        co_purchase_df: unused; kept so existing callers keep working.
        all_details_df: product-details table indexed by ASIN.
        image_list: collection of available image file names.
        required_columns: columns that must hold string values.

    Returns:
        ``asin`` when the product exists, has every required field and its
        image is available; ``None`` otherwise (including when the ASIN is
        absent from ``all_details_df``).
    """
    try:
        product = all_details_df.loc[asin]
    except KeyError:
        # An ASIN with no details row cannot be valid.
        return None

    # A duplicated index label makes .loc return a DataFrame; the helpers
    # below expect a single row, so fall back to the first match.
    if isinstance(product, pd.DataFrame):
        if product.empty:
            return None
        product = product.iloc[0]

    if has_required_data(product, required_columns) and is_image_valid(product, image_list):
        return asin
    return None

def process_asins(co_purchase_df, all_details_df, image_list, required_columns):
    """Collect the ASINs from the index of ``co_purchase_df`` that pass validation.

    Each unique index value is run through ``is_asin_valid`` (with a tqdm
    progress bar); every truthy result is gathered into the returned set.
    """
    candidates = co_purchase_df.index.unique()
    outcomes = (
        is_asin_valid(candidate, co_purchase_df, all_details_df, image_list, required_columns)
        for candidate in tqdm(candidates)
    )
    return {outcome for outcome in outcomes if outcome}

# Run the validation over the k-core graph.
# The image listing is rebuilt here from the same absolute, machine-specific path used earlier.
image_list = get_image_list('/home/arnuv/amazon_dataset/clothing_shoes_jewellery/images')

# An ASIN is kept only if 'title', 'imUrl' and 'brand' are all strings and its image file is present.
valid_asins = process_asins(k_core_df, all_details_df, image_list, ['title', 'imUrl', 'brand'])

100%|██████████| 407672/407672 [00:21<00:00, 19195.19it/s]


In [14]:
# Move 'asin' back from the index into a regular column so both endpoints can be filtered as columns.
k_core_df = k_core_df.reset_index()

In [15]:
# Create boolean masks marking rows whose 'asin' / 'also_bought' endpoint is a valid ASIN.
# Both masks use the vectorised isin() so the two endpoints are tested the same way.
asin_mask = k_core_df['asin'].isin(valid_asins)
also_bought_mask = k_core_df['also_bought'].isin(valid_asins)

In [16]:
# An edge survives only when both of its endpoints are valid.
combined_mask = asin_mask & also_bought_mask

In [17]:
# Keep the edges selected by the combined mask.
# NOTE(review): the k-core reduction is not re-applied after this filter, so node degrees may have dropped -- confirm this is intended.
filtered_df = k_core_df[combined_mask]

In [18]:
# Persist the filtered edge list (without the row index) to the working directory.
filtered_df.to_csv('kcore5new_brand.csv', index=False)

CODE FOR GENERATING EDGE_INDICES

In [19]:
# Reload the filtered edge list written by the earlier to_csv cell.
k_5_csv = pd.read_csv('kcore5new_brand.csv')

In [20]:
class AsinIdMap:
    """Two-way lookup between ASINs and contiguous integer indices.

    Indices are assigned in first-seen order, starting at 0. Repeated ASINs in
    ``asin_list`` are ignored after their first occurrence, so indices always
    cover exactly ``range(get_count())``.
    """

    def __init__(self, asin_list):
        """Build both lookup tables from ``asin_list`` (kept as given on ``self.asin_list``)."""
        self.asin_list = asin_list
        self.asin_to_idx = {}
        self.product_idx_to_asin = {}
        self._build()

    def _build(self):
        """Populate the ASIN -> index and index -> ASIN dictionaries."""
        for asin in self.asin_list:
            if asin in self.asin_to_idx:
                # A duplicate must not consume an index, otherwise indices
                # would exceed get_count() and the two maps would disagree.
                continue
            idx = len(self.asin_to_idx)
            self.asin_to_idx[asin] = idx
            self.product_idx_to_asin[idx] = asin

    def get_idx(self, asin):
        """Return the index assigned to ``asin``; raises KeyError if unknown."""
        return self.asin_to_idx[asin]

    def get_asin(self, product_idx):
        """Return the ASIN stored at ``product_idx``; raises KeyError if unknown."""
        return self.product_idx_to_asin[product_idx]

    def get_count(self):
        """Return the number of distinct ASINs in the map."""
        return len(self.asin_to_idx)

In [21]:
# Node set = the unique values of the 'asin' column only.
# NOTE(review): an ASIN that appears solely in 'also_bought' would be missing from the map and get_idx would raise KeyError -- presumably the edge list is symmetric; confirm.
asin_list = list(k_5_csv['asin'].unique())
asinIdLookup = AsinIdMap(asin_list)

In [22]:
def build_product_to_product_edge_index(edges_pd, asin_id_map, right_key):
    """Convert an ASIN edge table into a ``(2, num_edges)`` index tensor.

    Args:
        edges_pd: DataFrame whose ``asin`` column holds the source ASIN and
            whose ``right_key`` column holds the target ASIN of each edge.
        asin_id_map: object exposing ``get_idx(asin) -> int``.
        right_key: name of the column holding the target ASIN.

    Returns:
        A contiguous ``torch.long`` tensor of shape ``(2, num_edges)``; row 0
        holds source indices and row 1 target indices. Rows whose two
        endpoints map to the same index (self-loops) are skipped. With no
        edges the result has shape ``(2, 0)``.
    """
    product_to_product_edge_index = []
    # Iterate the two columns directly (much cheaper than building a Series
    # per row) and give tqdm the total so it can show a real progress bar.
    endpoint_pairs = zip(edges_pd['asin'], edges_pd[right_key])
    for asin1, asin2 in tqdm(endpoint_pairs, total=len(edges_pd)):
        product1_idx = asin_id_map.get_idx(asin1)
        product2_idx = asin_id_map.get_idx(asin2)
        if product1_idx == product2_idx:
            continue
        product_to_product_edge_index.append([product1_idx, product2_idx])

    if not product_to_product_edge_index:
        # torch.tensor([]) would be a 1-D float tensor; keep shape and dtype stable.
        return torch.empty((2, 0), dtype=torch.long)

    return torch.tensor(product_to_product_edge_index, dtype=torch.long).t().contiguous()

In [23]:
# Build the edge index tensor from the filtered edge list, using 'also_bought' as the target column,
# and print its shape as a sanity check.
edge_index = build_product_to_product_edge_index(k_5_csv, asinIdLookup, 'also_bought')
print("Edge shape: ", edge_index.shape)

2183it [00:00, 21825.81it/s]

574160it [00:26, 21319.34it/s]


Edge shape:  torch.Size([2, 574160])


In [24]:
# Persist the edge index to disk in NumPy .npy format.
# NOTE(review): edge_index is a torch tensor here; np.save presumably converts it to a NumPy array implicitly -- confirm, or pass edge_index.numpy() explicitly.
np.save("k5_edge_index_brand.npy", edge_index)