# Walmart Amazon Data Generator

This is the data loader notebook for the Walmart-Amazon dataset. This dataset was the easiest to evaluate because of a fairly small dataset (~15k samples overall) with a lot of token overlap.

In [2]:
import pandas as pd
from tqdm import tqdm

In [3]:
train_df = pd.read_csv("data/walmart-amazon/gs_train.csv")
train_df.head()

Unnamed: 0,source_id,target_id,matching
0,624,2862,False
1,1510,16351,False
2,102,9859,False
3,839,13024,False
4,971,5512,False


In [4]:
walmart_df = pd.read_csv("data/walmart-amazon/record_descriptions/1_walmart.csv")
amazon_df = pd.read_csv("data/walmart-amazon/record_descriptions/2_amazon.csv")
walmart_df.head()

Unnamed: 0,subject_id,id,upc,brand,groupname,title,price,shelfdescr,shortdescr,longdescr,imageurl,orig_shelfdescr,orig_shortdescr,orig_longdescr,modelno,shipweight,dimensions
0,1,14249992,64109216245,Draper,Electronics - General,Draper Infrared Remote Transmitter,58.45,Infrared transmitter. 3-button operation for i...,,DR1143Infrared transmitter. 3-button operation...,http://i.walmartimages.com/i/mp/00/64/10/92/16...,Infrared transmitter. 3-button operation for i...,,DR1143Infrared transmitter. 3-button operation...,121066,2.0,
1,2,10928662,1034383417,Epson,Monitors,Epson 1500 Hours 200W UHE Projector Lamp ELPLP12,438.84,,Epson ELPLP12 Replacement Lamp,EPSON ELPLP12 1500HRS 200V REPL LAMP FOR LAMP ...,http://i.walmartimages.com/i/p/00/01/03/43/83/...,,Epson ELPLP12 Replacement Lamp,EPSON ELPLP12 1500HRS 200V REPL LAMP FOR LAMP ...,ELPLP12,0.95,6.75 x 5.75 x 5.5 inches
2,3,11961447,80844700031,Comprehensive,TV Accessories,Comprehensive Two-Piece 75 Precision BNC Jack ...,59.25,Comprehensive s True 75 connectors eliminate i...,,CH1151Comprehensive s True 75 connectors elimi...,http://i.walmartimages.com/i/mp/00/80/84/47/00...,Comprehensive's True 75 connectors eliminate i...,,CH1151Comprehensive's True 75 connectors elimi...,BJ-2C7559,0.05,
3,4,13044637,79006932160,D-Link,Garden - General,D-Link DCS-1100 Network Camera,99.82,Surveillance Network Camera Built-in Omni-dir...,The D-Link DCS-1100 Network Camera comes with ...,Surveillance Network Camera Built-in Omni-dir...,http://i.walmartimages.com/i/p/00/79/00/69/32/...,<UL><LI>Surveillance/Network Camera</LI><LI>Bu...,The D-Link DCS-1100 Network Camera comes with ...,<UL><LI>Surveillance/Network Camera</LI><LI>Bu...,DCS-1100,,
4,5,13214131,6503081691,StarTech,Electronics - General,StarTech.com RKPW247015 24 Outlet Power Strip,59.0,Fits most 42U racks and equipment cabinets 10...,The StarTech.com RKPW247015 24 Outlet Power St...,24 Outlet Power Strip solution for your rackmo...,http://i.walmartimages.com/i/p/00/06/50/30/81/...,<ul><li>Fits most 42U racks and equipment cabi...,The StarTech.com RKPW247015 24 Outlet Power St...,<ul><li>24 Outlet Power Strip solution for you...,RKPW247015,5.25,72.5 x 2.5 x 1.5 inches


In [5]:
import numpy as np

def get_catalog_and_queries_df(judgments_df, amazon_df, walmart_df):

    target_ids = np.unique(walmart_df["subject_id"].values)
    catalog_df = walmart_df[walmart_df["subject_id"].isin(target_ids)]
    catalog_df = catalog_df.rename({"subject_id": "catalog_id"}, axis=1)
    catalog_df = catalog_df.rename({"title": "text"}, axis=1)
    
    train_df = {
        "input_text": [],
        "match_id": [],
        "judgment": [],
    }

    for j, row in tqdm(judgments_df.iterrows(), total=len(judgments_df.index)):
        source_id = row["source_id"]
        target_id = row["target_id"]

        train_df["input_text"].append(amazon_df[amazon_df["subject_id"] == target_id].iloc[0]["title"])
        train_df["match_id"].append(source_id)
        train_df["judgment"].append(row["matching"])

    queries_df = pd.DataFrame(train_df)
    
    return catalog_df, queries_df

In [6]:
train_catalog_df, train_queries_df = get_catalog_and_queries_df(pd.read_csv("data/walmart-amazon/gs_train.csv"), amazon_df, walmart_df)
val_catalog_df, val_queries_df = get_catalog_and_queries_df(pd.read_csv("data/walmart-amazon/gs_val.csv"), amazon_df, walmart_df)
test_catalog_df, test_queries_df = get_catalog_and_queries_df(pd.read_csv("data/walmart-amazon/gs_test.csv"), amazon_df, walmart_df)

100%|██████████| 10905/10905 [00:03<00:00, 3444.40it/s]
100%|██████████| 3131/3131 [00:00<00:00, 3800.73it/s]
100%|██████████| 1543/1543 [00:00<00:00, 3607.37it/s]


In [7]:
train_catalog_df.to_csv("data/walmart-amazon/train_catalog.csv", index=False)
train_queries_df.to_csv("data/walmart-amazon/train_queries.csv", index=False)
val_catalog_df.to_csv("data/walmart-amazon/val_catalog.csv", index=False)
val_queries_df.to_csv("data/walmart-amazon/val_queries.csv", index=False)
test_catalog_df.to_csv("data/walmart-amazon/test_catalog.csv", index=False)
test_queries_df.to_csv("data/walmart-amazon/test_queries.csv", index=False)

In [8]:
train_catalog_df

Unnamed: 0,catalog_id,id,upc,brand,groupname,text,price,shelfdescr,shortdescr,longdescr,imageurl,orig_shelfdescr,orig_shortdescr,orig_longdescr,modelno,shipweight,dimensions
0,1,14249992,64109216245,Draper,Electronics - General,Draper Infrared Remote Transmitter,58.45,Infrared transmitter. 3-button operation for i...,,DR1143Infrared transmitter. 3-button operation...,http://i.walmartimages.com/i/mp/00/64/10/92/16...,Infrared transmitter. 3-button operation for i...,,DR1143Infrared transmitter. 3-button operation...,121066,2.00,
1,2,10928662,1034383417,Epson,Monitors,Epson 1500 Hours 200W UHE Projector Lamp ELPLP12,438.84,,Epson ELPLP12 Replacement Lamp,EPSON ELPLP12 1500HRS 200V REPL LAMP FOR LAMP ...,http://i.walmartimages.com/i/p/00/01/03/43/83/...,,Epson ELPLP12 Replacement Lamp,EPSON ELPLP12 1500HRS 200V REPL LAMP FOR LAMP ...,ELPLP12,0.95,6.75 x 5.75 x 5.5 inches
2,3,11961447,80844700031,Comprehensive,TV Accessories,Comprehensive Two-Piece 75 Precision BNC Jack ...,59.25,Comprehensive s True 75 connectors eliminate i...,,CH1151Comprehensive s True 75 connectors elimi...,http://i.walmartimages.com/i/mp/00/80/84/47/00...,Comprehensive's True 75 connectors eliminate i...,,CH1151Comprehensive's True 75 connectors elimi...,BJ-2C7559,0.05,
3,4,13044637,79006932160,D-Link,Garden - General,D-Link DCS-1100 Network Camera,99.82,Surveillance Network Camera Built-in Omni-dir...,The D-Link DCS-1100 Network Camera comes with ...,Surveillance Network Camera Built-in Omni-dir...,http://i.walmartimages.com/i/p/00/79/00/69/32/...,<UL><LI>Surveillance/Network Camera</LI><LI>Bu...,The D-Link DCS-1100 Network Camera comes with ...,<UL><LI>Surveillance/Network Camera</LI><LI>Bu...,DCS-1100,,
4,5,13214131,6503081691,StarTech,Electronics - General,StarTech.com RKPW247015 24 Outlet Power Strip,59.00,Fits most 42U racks and equipment cabinets 10...,The StarTech.com RKPW247015 24 Outlet Power St...,24 Outlet Power Strip solution for your rackmo...,http://i.walmartimages.com/i/p/00/06/50/30/81/...,<ul><li>Fits most 42U racks and equipment cabi...,The StarTech.com RKPW247015 24 Outlet Power St...,<ul><li>24 Outlet Power Strip solution for you...,RKPW247015,5.25,72.5 x 2.5 x 1.5 inches
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2549,2550,16360703,5114123170,3M,Stationery &amp; Office Machinery,3M Dry-Erase Board Porcelain 6 x4 Aluminum Frame,579.29,3M Aluminum Frame Porcelain Dry-erase Board Dr...,,TM5788 3M Aluminum Frame Porcelain Dry-erase B...,http://i.walmartimages.com/i/mp/00/05/11/41/23...,3M Aluminum Frame Porcelain Dry-erase Board Dr...,,TM5788<b>3M Aluminum Frame Porcelain Dry-erase...,,42.00,
2550,2551,16553062,89979400511,HTC,Electronics - General,HTC MyTouch 3G White Unlocked,199.00,Android OS v1.5 3.15MP camera Video recording ...,The HTC MyTouch 3G features a Gorilla Glass di...,HTC MyTouch 3G Sound and Music Vibration polyp...,http://i.walmartimages.com/i/p/00/89/97/94/00/...,"<li>Android OS, v1.5<li>3.15MP camera<li>Video...",The HTC MyTouch 3G features a Gorilla Glass di...,<p><b>HTC MyTouch 3G:</b><p><b>Sound and Music...,Mytouch 3G,0.85,4.45 x 2.19 x 0.58 inches
2551,2552,16637140,89807400111,Dell,Printers,Dell Series 2 Color Ink,35.97,Color Color For Dell A940 A960 printers,The Dell Series 2 Color Ink provides consisten...,Dell Series 2 Color Ink Color Color For Dell A...,http://i.walmartimages.com/i/p/00/89/80/74/00/...,"<li>Color: Color<li>For Dell A940, A960 printers",The Dell Series 2 Color Ink provides consisten...,<p><b>Dell Series 2 Color Ink:</b><ul><li>Colo...,330-0048,7.00,5.5 x 3.86 x 1.26 inches
2552,2553,9714616,72286814877,Belkin,Electronics - General,Belkin Pro Series High Integrity Monitor Cable...,7.88,The Belkin Pro Series High Integrity Monitor C...,The Belkin Pro Series High Integrity Monitor C...,Technical Information Cable Type Monitor ...,http://i.walmartimages.com/i/p/00/72/28/68/14/...,The Belkin Pro Series High Integrity Monitor C...,The Belkin Pro Series High Integrity Monitor C...,<table><tr><td colspan=&quot;2&quot; height=&q...,F3H982-06,0.60,6.3 x 3.6 x 0.9 inches
