In [1]:
from datasets import Dataset

In [4]:
import pandas as pd
# Load the training table from the working directory.
# Alternative input path kept for running on Kaggle:
# train = pd.read_csv("/kaggle/input/traincsv/train.csv")
train = pd.read_csv("train.csv")
# Display the first rows as the cell output.
train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


##### Shuffling and getting the subset

In [5]:
# Shuffle the DataFrame (fixed seed) so the subset is a reproducible random sample
shuffled_train = train.sample(frac=1, random_state=42).reset_index(drop=True)

# Target number of rows in the subset
subset_size = 5000

# Guarantee coverage: keep the first shuffled occurrence of every entity_name.
# (Concatenating *all* rows per entity and then taking head() after sorting by
# group_id only ever returned the lowest group ids and guaranteed nothing.)
unique_entity_names = shuffled_train['entity_name'].unique()
coverage_rows = shuffled_train.drop_duplicates(subset='entity_name', keep='first')

# Fill the remaining budget with the next rows in shuffled order, skipping the
# rows that were already picked for coverage.
remaining_budget = max(subset_size - len(coverage_rows), 0)
filler_rows = shuffled_train.drop(index=coverage_rows.index).head(remaining_budget)

# Sort by group_id so rows of the same group are adjacent. A stable sort keeps
# the shuffled order inside each group, which makes the result deterministic.
# NOTE(review): keeping every row of each selected group_id is not compatible
# with a hard row cap; the cap wins here -- confirm that is the intent.
final_subset = (
    pd.concat([coverage_rows, filler_rows])
    .sort_values(by='group_id', kind='mergesort')
    .reset_index(drop=True)
)

# Every entity_name present in the full data must survive into the subset
assert final_subset['entity_name'].nunique(dropna=False) == len(unique_entity_names), \
    "subset does not cover every entity_name"

# Update the train DataFrame to be the final subset and persist it
train = final_subset
train.to_csv("train_subset.csv", index=False)

##### Functions to download images

In [None]:
import os
import multiprocessing
from functools import partial
from tqdm import tqdm
from PIL import Image
from pathlib import Path
import urllib.request
import time

def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to ``image_save_path``.

    The call is best-effort: a failure is reported on stdout instead of being
    raised, so a single bad path cannot abort a bulk download.

    Args:
        image_save_path: Destination path of the placeholder file.

    Returns:
        None.
    """
    try:
        placeholder_image = Image.new('RGB', (100, 100), color='black')
        placeholder_image.save(image_save_path)
    except Exception as e:
        # Surface the problem: a missing placeholder otherwise only shows up
        # later, when the file is opened.
        print(f"Could not create placeholder image at {image_save_path}: {e}")

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into ``save_folder``, retrying on failure.

    The file name is the last path component of ``image_link``. Nothing is
    done when ``image_link`` is not a string or the target file already
    exists. If every attempt fails, a black placeholder image is written in
    place of the download.

    Args:
        image_link: URL of the image.
        save_folder: Existing directory that receives the file.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        The path of the saved file when a download succeeded, otherwise None.
    """
    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    # Download to a per-process temporary name and move it into place only on
    # success; an interrupted transfer must not leave a truncated file that
    # later runs would mistake for a finished download.
    partial_path = f"{image_save_path}.{os.getpid()}.part"

    for attempt in range(retries):
        try:
            print("Downloading image: ", image_link)
            urllib.request.urlretrieve(image_link, partial_path)
            os.replace(partial_path, image_save_path)
            return image_save_path
        except Exception:
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # stop the download instead of being retried.
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError:
                pass
            # No point in waiting after the final attempt
            if attempt < retries - 1:
                time.sleep(delay)

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images

def download_images(image_links, download_folder, allow_multiprocessing=True, num_processes=64):
    """Download every link in ``image_links`` into ``download_folder``.

    Args:
        image_links: Sized collection of image URLs (``len()`` is used for the
            progress bar).
        download_folder: Target directory; created when missing.
        allow_multiprocessing: Use a process pool when True, otherwise
            download sequentially in the current process.
        num_processes: Size of the process pool (only used with
            ``allow_multiprocessing``).

    Returns:
        None. Each link is handled by ``download_image`` with 3 retries and a
        3 second delay.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(download_folder, exist_ok=True)

    if allow_multiprocessing:
        download_image_partial = partial(
            download_image, save_folder=download_folder, retries=3, delay=3)

        with multiprocessing.Pool(num_processes) as pool:
            # list() drains the lazy imap iterator so the progress bar advances
            list(tqdm(pool.imap(download_image_partial, image_links), total=len(image_links)))
            pool.close()
            pool.join()
    else:
        for image_link in tqdm(image_links, total=len(image_links)):
            download_image(image_link, save_folder=download_folder, retries=3, delay=3)

In [None]:
# Absolute path of the folder that receives the downloaded images
cwd = os.getcwd()
images_dir = os.path.join(cwd, 'images')
# exist_ok makes the cell safe to re-run and avoids a check-then-create race
os.makedirs(images_dir, exist_ok=True)


##### Download Images

In [None]:
# Gather the image URL of every row in the subset and fetch them into ./images
image_links = train["image_link"].tolist()
download_images(image_links=image_links, download_folder="images")

##### Create the Python object in one go (might not work due to memory constraints)

In [None]:
# Records of the subset, filled by the loop below
data_dict = dict()

def get_image_path(image_link):
    """Return the path under ``images/`` that matches ``image_link``.

    Only the last path component of the link is kept as the file name.
    """
    file_name = Path(image_link).name
    return "images/" + file_name

# Build one record per row of the subset, keyed by the DataFrame index.
for index, row in train.iterrows():
    # Image.open is lazy and keeps the underlying file open. Load the pixel
    # data and close the file right away so that thousands of rows do not
    # exhaust the process's file-descriptor limit.
    with Image.open(get_image_path(row["image_link"])) as image:
        image.load()

    data_dict[index] = {
        "group_id": row["group_id"],
        "image": image,
        "image_link": row["image_link"],
        "entity_name": row["entity_name"],
        "entity_value": row["entity_value"],
    }

In [None]:
import pyarrow as pa

def is_compatible_with_pyarrow(data_dict):
    """Report whether ``pa.Table.from_pydict`` accepts ``data_dict``.

    The conversion result is discarded; only success or failure matters. On
    failure the error is printed.

    NOTE(review): the notebook passes a dict keyed by row index, while
    ``from_pydict`` presumably expects a column-name -> values mapping --
    confirm this is the check that is actually wanted.

    Returns:
        True when the conversion succeeds, False when it raises.
    """
    try:
        # Trial conversion to a PyArrow Table
        pa.Table.from_pydict(data_dict)
    except Exception as error:
        print(f"Data is not compatible with PyArrow: {error}")
        return False
    else:
        return True

# Check compatibility
print(
    "data_dict is compatible with PyArrow."
    if is_compatible_with_pyarrow(data_dict)
    else "data_dict is not compatible with PyArrow."
)


In [None]:

# Convert the images to byte arrays and ensure all values are strings to make the data compatible with PyArrow
import io

def convert_image_to_bytes(image):
    """Serialize ``image`` as PNG and return the encoded bytes.

    ``image`` only needs a ``save(fp, format=...)`` method that writes to a
    binary file object.
    """
    buffer = io.BytesIO()
    try:
        image.save(buffer, format="PNG")
        png_bytes = buffer.getvalue()
    finally:
        buffer.close()
    return png_bytes

# Normalise every record in a single pass:
#   * "image" becomes PNG bytes,
#   * every other value that is not already bytes/str becomes a string.
# The isinstance guard on the image keeps the cell idempotent: re-running it
# must not try to re-encode bytes that were produced by a previous run.
for record in data_dict.values():
    for key, value in record.items():
        if key == "image":
            if not isinstance(value, bytes):
                record[key] = convert_image_to_bytes(value)
        elif not isinstance(value, (bytes, str)):
            record[key] = str(value)

# Re-check compatibility with PyArrow
if is_compatible_with_pyarrow(data_dict):
    print("data_dict is now compatible with PyArrow.")
else:
    print("data_dict is still not compatible with PyArrow.")



##### Load as HF dataset

In [None]:
# data_dict is row-oriented ({row_index: {column: value}}), whereas
# Dataset.from_dict expects a column-oriented mapping ({column: [values]}).
# Pivot the records into columns before building the dataset.
records = list(data_dict.values())
column_names = list(records[0]) if records else []
dataset = Dataset.from_dict(
    {name: [record[name] for record in records] for name in column_names}
)

##### HF login and pushing the dataset to HF

In [None]:
import getpass
# Ask for the token interactively (never hard-code it in the notebook) and keep
# it in this process's environment under HF_TOKEN.
os.environ["HF_TOKEN"] = getpass.getpass("Enter your Hugging Face token: ")

In [None]:
from huggingface_hub import login

# Authenticate with the token read from the HF_TOKEN environment variable.
login(token=os.environ["HF_TOKEN"])

In [None]:
# Upload the dataset to the Hub repository "amanm10000/amazon-ml-challenge-train".
# NOTE(review): presumably relies on the login above for write access -- confirm.
dataset.push_to_hub("amanm10000/amazon-ml-challenge-train")