### eBay API product search - iPhone

In [None]:
import requests
import json

CLIENT_ID = "___"       #YOUR CLIENT ID HERE 
CLIENT_SECRET = "___"       #YOUR CLIENT-SECRET HERE 

def get_app_token(client_id, client_secret):
    """Request an eBay application access token (client-credentials grant).

    Args:
        client_id: Application client ID, sent as the HTTP basic-auth user name.
        client_secret: Application client secret, sent as the basic-auth password.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        requests.Timeout: If no response arrives within 30 seconds.
    """
    url = "https://api.ebay.com/identity/v1/oauth2/token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {
        "grant_type": "client_credentials",
        "scope": "https://api.ebay.com/oauth/api_scope"
    }
    # requests applies no timeout by default; without one a stalled
    # connection would block the notebook indefinitely.
    response = requests.post(
        url,
        headers=headers,
        data=data,
        auth=(client_id, client_secret),
        timeout=30,
    )
    response.raise_for_status()
    return response.json()['access_token']

def search_items(access_token, query, total_limit=1000):
    """Page through the eBay Browse ``item_summary/search`` endpoint.

    Args:
        access_token: Bearer token placed in the Authorization header.
        query: Search keywords sent as the ``q`` parameter.
        total_limit: Maximum number of item summaries to return.

    Returns:
        A list of at most ``total_limit`` entries taken from the
        ``itemSummaries`` arrays of the successive responses.

    Raises:
        requests.HTTPError: If any page request returns an error status.
        requests.Timeout: If a page request gets no response within 30 seconds.
    """
    url = "https://api.ebay.com/buy/browse/v1/item_summary/search"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
        "X-EBAY-C-ENDUSERCTX": "contextualLocation=country=US"
    }

    all_items = []
    offset = 0
    page_size = 100
    while offset < total_limit:
        params = {
            "q": query,
            "limit": page_size,
            "offset": offset
        }
        # Explicit timeout: requests would otherwise wait forever on a stalled socket.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        items = response.json().get("itemSummaries", [])
        if not items:
            # An empty page means the result set is exhausted.
            break
        all_items.extend(items)
        offset += page_size
    # Full pages are always requested, so trim the overshoot that occurs when
    # total_limit is not a multiple of page_size.
    return all_items[:total_limit]

if __name__ == "__main__":
    token = get_app_token(CLIENT_ID, CLIENT_SECRET)
    items = search_items(token, "iphone 16", total_limit=1000)  #search term
    print(f"Total items retrieved: {len(items)}\n")

    # Write inside the guard so `items` is guaranteed to exist when the file is
    # saved, and under data/ because the cleaning step loads
    # "data/iphone_results.json".
    with open('data/iphone_results.json', 'w') as f:
        json.dump(items, f, indent=2)


Total items retrieved: 1000



In [10]:
# Preview: pretty-print only the first retrieved listing (prints nothing if
# `items` is empty).
for idx, item in enumerate(items[:1], start=1):
    banner = f"--- Item {idx} ---"
    pretty = json.dumps(item, indent=2)
    print(banner)
    print(pretty)
    print("\n")

--- Item 1 ---
{
  "itemId": "v1|356605678205|625572140283",
  "title": "Apple iPhone 16 128GB Unlocked Very Good Condition",
  "itemGroupHref": "https://api.ebay.com/buy/browse/v1/item/get_items_by_item_group?item_group_id=356605678205",
  "leafCategoryIds": [
    "9355"
  ],
  "categories": [
    {
      "categoryId": "9355",
      "categoryName": "Cell Phones & Smartphones"
    },
    {
      "categoryId": "15032",
      "categoryName": "Cell Phones & Accessories"
    }
  ],
  "image": {
    "imageUrl": "https://i.ebayimg.com/images/g/3bwAAeSwgUtnvQT6/s-l225.jpg"
  },
  "price": {
    "value": "687.50",
    "currency": "USD"
  },
  "itemGroupType": "SELLER_DEFINED_VARIATIONS",
  "itemHref": "https://api.ebay.com/buy/browse/v1/item/v1%7C356605678205%7C625572140283",
  "seller": {
    "username": "directauth",
    "feedbackPercentage": "97.7",
    "feedbackScore": 23260
  },
  "marketingPrice": {
    "originalPrice": {
      "value": "732.99",
      "currency": "USD"
    },
    "disco

#### Clean eBay data - iPhone

In [None]:
import pandas as pd
import json
with open("data/iphone_results.json", "r") as f:
    data = json.load(f)
# Convert to DataFrame (nested keys become dotted column names, e.g. "price.value")
df_iphone = pd.json_normalize(data)
#df_iphone.to_csv("iphone_pre.csv")

#Extract shipping cost
def _first_shipping_cost(options):
    """Return the first shipping option's cost as a float, or None if absent."""
    # Anything that is not a non-empty list (NaN for a missing key, or [])
    # has no first option; indexing [0] on an empty list would raise IndexError.
    if not isinstance(options, list) or not options:
        return None
    first = options[0]
    if "shippingCost" not in first:
        return None
    return float(first["shippingCost"]["value"])

df_iphone["shipping_cost"] = df_iphone["shippingOptions"].apply(_first_shipping_cost)

#Use date to calculate days listed
# utc=True guarantees a tz-aware column, so subtracting it from the tz-aware
# `today` below cannot fail on a naive/aware mismatch.
df_iphone["item_origin_date"] = pd.to_datetime(df_iphone["itemOriginDate"], errors="coerce", utc=True)
today = pd.Timestamp.now(tz='UTC')
df_iphone["days_listed"] = (today - df_iphone["item_origin_date"]).dt.days

#Convert category ID to number (first entry of the list, None when there is none)
df_iphone["leaf_category_id"] = df_iphone["leafCategoryIds"].apply(
    lambda x: int(x[0]) if isinstance(x, list) and len(x) > 0 else None
)

#Extract category
def get_first_category_id(categories):
    """Return the ``categoryId`` of the first category entry, or None.

    The argument is type-checked instead of truth-tested: a missing value
    shows up as NaN, which is a truthy float and has no ``len()``.
    """
    if not isinstance(categories, (list, tuple)) or not categories:
        return None
    first = categories[0]
    return first.get('categoryId') if isinstance(first, dict) else None
def get_first_category_name(categories):
    """Return the ``categoryName`` of the first category entry, or None.

    The argument is type-checked instead of truth-tested: a missing value
    shows up as NaN, which is a truthy float and has no ``len()``.
    """
    if not isinstance(categories, (list, tuple)) or not categories:
        return None
    first = categories[0]
    return first.get('categoryName') if isinstance(first, dict) else None
df_iphone['category_id'] = df_iphone['categories'].apply(get_first_category_id)
df_iphone['category_name'] = df_iphone['categories'].apply(get_first_category_name)

#Find sellers with multiple items (number of listings per seller username)
df_iphone["seller_item_count"] = df_iphone["seller.username"].map(df_iphone["seller.username"].value_counts())

#Extract model, to be filtered
#(?i) makes the match case-insensitive: titles are free text, so "iphone 15"
#and "IPHONE 15" have to be recognised as well as "iPhone 15"
df_iphone['model'] = df_iphone['title'].str.extract(r"(?i)(iPhone\s*\d+)", expand=False)
df_iphone['model_number'] = df_iphone['model'].str.extract(r"(\d+)", expand=False)
df_iphone['model_number'] = pd.to_numeric(df_iphone['model_number'], errors='coerce')

#filter out old models and outlier prices
#(rows whose model number or price could not be parsed are NaN and drop out too)
df_iphone['price.value'] = pd.to_numeric(df_iphone['price.value'], errors='coerce')
df_iphone_filtered = df_iphone[
    (df_iphone['model_number'] > 13) & (df_iphone['price.value'] < 1500)]

#keep only needed columns
df_iphone_filtered = df_iphone_filtered[["itemId", "title", "condition", "conditionId", "price.value", "price.currency", "seller.username", 
                    "seller.feedbackPercentage", "seller.feedbackScore", "itemLocation.country", "marketingPrice.originalPrice.value",
                    "marketingPrice.discountPercentage", "shipping_cost", "days_listed", "category_id", "category_name", "seller_item_count", "model_number"]]

#Export to CSV
df_iphone_filtered.to_csv("data/iphone_cleaned.csv", index=False)

In [None]:
df_iphone_filtered.head()

Unnamed: 0,itemId,title,condition,conditionId,price.value,price.currency,seller.username,seller.feedbackPercentage,seller.feedbackScore,itemLocation.country,marketingPrice.originalPrice.value,marketingPrice.discountPercentage,shipping_cost,days_listed,category_id,category_name,seller_item_count,model_number
0,v1|356605678205|625572140284,Apple iPhone 16 128GB Unlocked Very Good Condi...,Very Good - Refurbished,2020,765.01,USD,directauth,97.7,23254,US,,,0.0,137,9355,Cell Phones & Smartphones,23.0,16.0
1,v1|296890540172|594654890070,Apple iPhone 16 Plus 128GB Unlocked AT&T T-Mob...,Very Good - Refurbished,2020,653.95,USD,supplytronics,98.3,115556,US,899.0,27.0,0.0,204,9355,Cell Phones & Smartphones,45.0,16.0
2,v1|396850329079|0,iPhone 14 Pro,Used,3000,400.0,USD,harmd9,0.0,0,US,,,,0,9355,Cell Phones & Smartphones,1.0,14.0
3,v1|356605688915|625572005827,Apple iPhone 16 128GB Unlocked Excellent Condi...,Excellent - Refurbished,2010,702.49,USD,directauth,97.7,23254,US,,,0.0,137,9355,Cell Phones & Smartphones,23.0,16.0
4,v1|355095070816|624350343872,Apple iPhone 14 128GB Network Unlocked Very Go...,Very Good - Refurbished,2020,326.46,USD,directauth,97.7,23254,US,699.0,53.0,0.0,641,9355,Cell Phones & Smartphones,23.0,14.0


### eBay API Product Search - Soccer jerseys

In [None]:
import requests
import json

# Your eBay app credentials
CLIENT_ID = "___"       #YOUR CLIENT ID HERE 
CLIENT_SECRET = "___"       #YOUR CLIENT-SECRET HERE

def get_app_token(client_id, client_secret):
    """Request an eBay application access token (client-credentials grant).

    Args:
        client_id: Application client ID, sent as the HTTP basic-auth user name.
        client_secret: Application client secret, sent as the basic-auth password.

    Returns:
        The ``access_token`` value from the JSON response.

    Raises:
        requests.HTTPError: If the token endpoint answers with an error status.
        requests.Timeout: If no response arrives within 30 seconds.
    """
    url = "https://api.ebay.com/identity/v1/oauth2/token"
    headers = {"Content-Type": "application/x-www-form-urlencoded"}
    data = {
        "grant_type": "client_credentials",
        "scope": "https://api.ebay.com/oauth/api_scope"
    }
    # requests applies no timeout by default; without one a stalled
    # connection would block the notebook indefinitely.
    response = requests.post(
        url,
        headers=headers,
        data=data,
        auth=(client_id, client_secret),
        timeout=30,
    )
    response.raise_for_status()
    return response.json()['access_token']

def search_items(access_token, query, total_limit=1000):
    """Page through the eBay Browse ``item_summary/search`` endpoint.

    Args:
        access_token: Bearer token placed in the Authorization header.
        query: Search keywords sent as the ``q`` parameter.
        total_limit: Maximum number of item summaries to return.

    Returns:
        A list of at most ``total_limit`` entries taken from the
        ``itemSummaries`` arrays of the successive responses.

    Raises:
        requests.HTTPError: If any page request returns an error status.
        requests.Timeout: If a page request gets no response within 30 seconds.
    """
    url = "https://api.ebay.com/buy/browse/v1/item_summary/search"
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
        "X-EBAY-C-ENDUSERCTX": "contextualLocation=country=US"
    }

    all_items = []
    offset = 0
    page_size = 100  # eBay max per page

    while offset < total_limit:
        params = {
            "q": query,
            "limit": page_size,
            "offset": offset
        }
        # Explicit timeout: requests would otherwise wait forever on a stalled socket.
        response = requests.get(url, headers=headers, params=params, timeout=30)
        response.raise_for_status()
        items = response.json().get("itemSummaries", [])
        if not items:
            # An empty page means the result set is exhausted.
            break
        all_items.extend(items)
        offset += page_size

    # Full pages are always requested, so trim the overshoot that occurs when
    # total_limit is not a multiple of page_size.
    return all_items[:total_limit]

if __name__ == "__main__":
    token = get_app_token(CLIENT_ID, CLIENT_SECRET)
    soccer_items = search_items(token, "soccer jersey", total_limit=1000)
    print(f"Total items retrieved: {len(soccer_items)}\n")

    # Write inside the guard: `soccer_items` only exists when the guarded
    # branch has run, so saving outside it could raise NameError.
    with open('data/soccer_results.json', 'w') as f:
        json.dump(soccer_items, f, indent=2)

Total items retrieved: 1000



#### Clean eBay data - Soccer jerseys

In [None]:
from rapidfuzz import fuzz, process
import re
import pandas as pd
import json
# Load the raw search results and flatten nested keys into dotted columns.
with open("data/soccer_results.json", "r") as f:
    data = json.load(f)
df_soccer = pd.json_normalize(data)

# Candidate names that the fuzzy matcher looks for inside each listing title.
clubs = [
    "Liverpool",
    "Arsenal",
    "Manchester United",
    "Manchester City",
    "Paris Saint Germain",
    "Inter Milan",
    "Juventus",
    "Real Madrid",
    "Barcelona",
    "Bayern Munich",
    "Club America",
    "Inter Miami",
]
countries = [
    "Mexico",
    "Argentina",
    "Brazil",
    "France",
    "Germany",
    "Spain",
    "England",
    "Italy",
    "USA",
    "South Korea",
]

def extract_best_match(text, choices, score_cutoff=87):
    """Return the entry of ``choices`` that best fuzzy-matches ``text``.

    Args:
        text: String to search in; any non-string value yields None.
        choices: Candidate strings, scored with ``fuzz.partial_ratio``.
        score_cutoff: Minimum score a candidate needs to be accepted.

    Returns:
        The best-scoring candidate, or None if none reaches ``score_cutoff``
        or ``choices`` is empty.
    """
    if not isinstance(text, str):
        return None
    # extractOne returns None rather than a tuple when no candidate qualifies
    # (including an empty `choices`), so check before indexing the result.
    best = process.extractOne(
        text, choices, scorer=fuzz.partial_ratio, score_cutoff=score_cutoff
    )
    return best[0] if best is not None else None

# Tag each title with the best-matching club and country name (None if no match).
df_soccer["club"] = df_soccer["title"].apply(extract_best_match, args=(clubs,))
df_soccer["country"] = df_soccer["title"].apply(extract_best_match, args=(countries,))

#Find year within title
def extract_final_year(title):
    """Extract a season/year from a listing title as a 4-digit string.

    A range such as "2023/24" or "25/26" wins and its second half is
    returned; a 2-digit second half is expanded ("00"-"30" -> 20xx, otherwise
    19xx). Without a range, the first standalone 4-digit number is returned.
    Lone 2-digit numbers are ignored.

    Returns:
        The year as a string, or None for non-string input or no match.
    """
    if not isinstance(title, str):
        return None
    # Each side must be exactly 2 or 4 digits and not part of a longer number,
    # so "100-200" or an item code is not mistaken for a season.
    range_match = re.search(r'(?<!\d)(?:\d{2}|\d{4})[-/](\d{2}|\d{4})(?!\d)', title)
    if range_match:
        second = range_match.group(1)
        if len(second) == 2:
            # Expand 2-digit years
            return f"20{second}" if int(second) <= 30 else f"19{second}"
        return second
    # Standalone 4-digit number only; never a slice of a longer digit run.
    single_match = re.search(r'(?<!\d)\d{4}(?!\d)', title)
    return single_match.group(0) if single_match else None
df_soccer["year"] = df_soccer["title"].apply(extract_final_year)

#Extract shipping cost
def _first_shipping_cost(options):
    """Return the first shipping option's cost as a float, or None if absent."""
    # Anything that is not a non-empty list (NaN for a missing key, or [])
    # has no first option; indexing [0] on an empty list would raise IndexError.
    if not isinstance(options, list) or not options:
        return None
    first = options[0]
    if "shippingCost" not in first:
        return None
    return float(first["shippingCost"]["value"])

df_soccer["shipping_cost"] = df_soccer["shippingOptions"].apply(_first_shipping_cost)

#Use date to calculate days listed
# utc=True guarantees a tz-aware column, so subtracting it from the tz-aware
# `today` below cannot fail on a naive/aware mismatch.
df_soccer["item_origin_date"] = pd.to_datetime(df_soccer["itemOriginDate"], errors="coerce", utc=True)
today = pd.Timestamp.now(tz='UTC')
df_soccer["days_listed"] = (today - df_soccer["item_origin_date"]).dt.days

#Extract category
def get_first_category_id(categories):
    """Return the ``categoryId`` of the first category entry, or None.

    The argument is type-checked instead of truth-tested: a missing value
    shows up as NaN, which is a truthy float and has no ``len()``.
    """
    if not isinstance(categories, (list, tuple)) or not categories:
        return None
    first = categories[0]
    return first.get('categoryId') if isinstance(first, dict) else None
def get_first_category_name(categories):
    """Return the ``categoryName`` of the first category entry, or None.

    The argument is type-checked instead of truth-tested: a missing value
    shows up as NaN, which is a truthy float and has no ``len()``.
    """
    if not isinstance(categories, (list, tuple)) or not categories:
        return None
    first = categories[0]
    return first.get('categoryName') if isinstance(first, dict) else None
# First category of every listing, as id and as display name.
soccer_categories = df_soccer['categories']
df_soccer['category_id'] = soccer_categories.apply(get_first_category_id)
df_soccer['category_name'] = soccer_categories.apply(get_first_category_name)

# Number of listings each seller has in this result set.
listings_per_seller = df_soccer["seller.username"].value_counts()
df_soccer["seller_item_count"] = df_soccer["seller.username"].map(listings_per_seller)

# Columns kept in the exported file, in output order.
soccer_columns = [
    "itemId",
    "title",
    "conditionId",
    "condition",
    "price.value",
    "price.currency",
    "seller.username",
    "seller.feedbackPercentage",
    "seller.feedbackScore",
    "itemLocation.country",
    "marketingPrice.originalPrice.value",
    "marketingPrice.discountPercentage",
    "shipping_cost",
    "days_listed",
    "category_id",
    "category_name",
    "seller_item_count",
    "club",
    "country",
    "year",
]
df_soccer = df_soccer[soccer_columns]

# Export to CSV
df_soccer.to_csv("data/soccer_jerseys.csv", index=False)

In [35]:
df_soccer.head()

Unnamed: 0,itemId,title,conditionId,condition,price.value,price.currency,seller.username,seller.feedbackPercentage,seller.feedbackScore,itemLocation.country,marketingPrice.originalPrice.value,marketingPrice.discountPercentage,shipping_cost,days_listed,category_id,category_name,seller_item_count,club,country,year
0,v1|317069403907|615526711901,Liverpool Home Jersey 25/26 ( Player Version),1000,New,45.0,USD,iri4147,97.7,47,US,,,0.0,3,2887,Soccer-International Clubs,25,Liverpool,,2026.0
1,v1|177245638563|476899094411,Tigres UANL Away Jersey 2023/24 Size S-XL,1000,New,18.0,USD,soccerelf,100.0,10,US,,,0.0,7,2887,Soccer-International Clubs,8,,,2024.0
2,v1|317059278518|615510384436,Real Madrid Baseball Style Jersey Limited Edition,1000,New,52.0,USD,iri4147,97.7,47,US,,,0.0,6,2887,Soccer-International Clubs,25,Real Madrid,,
3,v1|388511742986|655644751940,Neymar Jr Santos 2012 Jersey - Retro Jersey - ...,1000,New,65.0,USD,shipezusa,97.1,84,US,,,0.0,122,2887,Soccer-International Clubs,11,,,2012.0
4,v1|177245711199|476913681054,Paris Player Version Jersey 2025/26 Size M-XXL,1000,New,35.0,USD,soccerelf,100.0,10,US,,,0.0,7,2887,Soccer-International Clubs,8,,,2026.0


### Rainforest API for Amazon - Microwaves

In [None]:
import requests
import json
def fetch_page(api_key, search_term, page, domain='amazon.com'):
    """Fetch one page of Rainforest API search results.

    Args:
        api_key: Rainforest API key.
        search_term: Search phrase.
        page: Page number to request.
        domain: Value for the ``amazon_domain`` parameter.

    Returns:
        The ``search_results`` list from the JSON response, or an empty list
        when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within the timeout.
    """
    params = {
        'api_key': api_key,
        'type': 'search',
        'amazon_domain': domain,
        'search_term': search_term,
        'page': page
    }
    # Generous upper bound so a stalled connection cannot hang forever.
    response = requests.get('https://api.rainforestapi.com/request', params=params, timeout=180)
    # Surface API failures (bad key, quota, ...) instead of letting an error
    # body without "search_results" masquerade as an empty page.
    response.raise_for_status()
    data = response.json()
    return data.get("search_results", [])

api_key = "___"    #YOUR API KEY HERE  
search_term = "microwave"  #search term

# Loop through 10 pages, stopping early at the first empty page
all_results = []
for page in range(1, 11):
    results = fetch_page(api_key, search_term, page)
    if results:
        all_results.extend(results)
    else:
        print(f"No results on page {page}")
        break

# Preview from the accumulated list: `results` is empty whenever the loop
# stopped early, so indexing it would raise IndexError.
if all_results:
    print(all_results[0])

with open('data/microwave_results.json', 'w') as f:
    json.dump(all_results, f, indent=2)

#### Clean Amazon Info - Microwaves

In [None]:
import json
import pandas as pd
import re
# Convert to DataFrame
# Load the raw search results into a DataFrame.
with open("data/microwave_results.json", "r") as f:
    data = json.load(f)
#df_microwaves.to_csv("microwaves_pre.csv", index=False)

# Keep the first row per ASIN.
df_microwaves = (
    pd.DataFrame(data)
    .drop_duplicates(subset='asin', keep='first')
    .reset_index(drop=True)
)

# Drop rows whose price is an empty string or missing.
price_present = df_microwaves["price"].ne("") & df_microwaves["price"].notna()
df_microwaves = df_microwaves[price_present].reset_index(drop=True)

#extract size from title
def extract_cu_ft(title):
    """Return the capacity in cubic feet mentioned in a title, or None.

    Recognises forms such as "0.7 Cu. Ft.", "0.7 cu ft", "1.6-Cu. Ft." and
    "1.1 Cubic Feet", case-insensitively. A number written without a leading
    zero (".9 Cu. Ft.") is read as 0.9.

    Returns:
        The capacity as a float, or None for non-string input or no match.
    """
    if not isinstance(title, str):
        return None
    # `\d*\.\d+` is tried first so ".9" is captured whole instead of as "9".
    pattern = r'(\d*\.\d+|\d+)[\s-]*(?:cubic|cu\.?)[\s-]*(?:feet|foot|ft\.?)'
    match = re.search(pattern, title, re.IGNORECASE)
    return float(match.group(1)) if match else None
# Capacity parsed from each title (None where the title states no size).
df_microwaves['cu_ft'] = df_microwaves['title'].map(extract_cu_ft)

#Recent sales
def convert_sales_to_number(text):
    """Convert a sales blurb such as "9K+ bought in past month" to an integer.

    The first number in the text is used. Thousands separators are accepted
    ("1,000"), and a trailing K or M (any case) multiplies by one thousand or
    one million.

    Returns:
        The count as an int, or None for non-string input or text without digits.
    """
    if not isinstance(text, str):
        return None
    # The number must start with a digit, so a stray "." (e.g. "Approx.")
    # is never handed to float(). The suffix counts only when no letter
    # follows it, so "5 kits" is 5, not 5000.
    match = re.search(
        r'((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)(?:\s*([KM])(?![A-Za-z]))?',
        text,
        re.IGNORECASE,
    )
    if not match:
        return None
    number = float(match.group(1).replace(',', ''))
    suffix = (match.group(2) or '').upper()
    number *= {'K': 1_000, 'M': 1_000_000}.get(suffix, 1)
    # round() guards against float artefacts such as 4349.999... after scaling.
    return int(round(number))
def _dict_field(record, key):
    """Return ``record.get(key)`` when ``record`` is a dict, otherwise None."""
    return record.get(key) if isinstance(record, dict) else None

# Numeric version of the "recent_sales" text.
df_microwaves['recent_sales_num'] = df_microwaves['recent_sales'].map(convert_sales_to_number)

# Extract price detail: one flat column per key of the nested "price" dict.
for price_key in ("value", "list_price", "name"):
    df_microwaves[f"price_{price_key}"] = df_microwaves["price"].apply(_dict_field, args=(price_key,))

# Extract availability info
df_microwaves['stock_info'] = df_microwaves['availability'].apply(_dict_field, args=('raw',))

# Keep only needed columns
microwave_columns = [
    "position",
    "title",
    "asin",
    "is_prime",
    "rating",
    "ratings_total",
    "sponsored",
    "is_small_business",
    "coupon",
    "cu_ft",
    "recent_sales_num",
    "price_value",
    "price_list_price",
    "price_name",
    "stock_info",
]
df_microwaves = df_microwaves[microwave_columns]

df_microwaves.to_csv("data/microwaves.csv", index=False)

In [5]:
df_microwaves.head()

Unnamed: 0,position,title,asin,is_prime,rating,ratings_total,sponsored,is_small_business,coupon,cu_ft,recent_sales_num,price_value,price_list_price,price_name,stock_info
0,1,"Chefman Countertop Microwave Oven 0.7 Cu. Ft.,...",B0DY95T5HB,False,4.3,1396.0,True,,,0.7,9000,64.99,$99.99,Limited time deal,
1,2,"Farberware Countertop Microwave 1000 Watts, 1....",B01EIZSF6I,False,4.4,23609.0,True,,,1.1,4000,119.99,$129.99,Primary,
2,3,TOSHIBA EM131A5C-BS Countertop Microwave Ovens...,B071WCB1T6,False,4.4,54371.0,,,,1.2,10000,149.99,,,
3,5,5304522796 Microwave Door Latch Compatible wit...,B0F5WPTLZS,False,5.0,1.0,,,,,50,19.99,$21.97,Primary,
4,7,BLACK+DECKER EM044KB19 Over The Range Microwav...,B09SB8MK37,True,4.4,11371.0,True,,,1.9,400,342.99,,,


### Rainforest API for Amazon - Lego

In [None]:
import requests
import json
def fetch_page(api_key, search_term, page, domain='amazon.com'):
    """Fetch one page of Rainforest API search results.

    Args:
        api_key: Rainforest API key.
        search_term: Search phrase.
        page: Page number to request.
        domain: Value for the ``amazon_domain`` parameter.

    Returns:
        The ``search_results`` list from the JSON response, or an empty list
        when that key is absent.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If no response arrives within the timeout.
    """
    params = {
        'api_key': api_key,
        'type': 'search',
        'amazon_domain': domain,
        'search_term': search_term,
        'page': page
    }
    # Generous upper bound so a stalled connection cannot hang forever.
    response = requests.get('https://api.rainforestapi.com/request', params=params, timeout=180)
    # Surface API failures (bad key, quota, ...) instead of letting an error
    # body without "search_results" masquerade as an empty page.
    response.raise_for_status()
    data = response.json()
    return data.get("search_results", [])

api_key = "___"   #YOUR API KEY HERE   
search_term = "lego"  #search term

# Collect pages 1-7, stopping at the first page that comes back empty.
lego_all_results = []
for page in range(1, 8):
    results = fetch_page(api_key, search_term, page)
    if not results:
        break
    lego_all_results.extend(results)

# Persist the raw results for the cleaning step.
with open('data/lego_results.json', 'w') as f:
    json.dump(lego_all_results, f, indent=2)

#### Clean Amazon data - Lego

In [None]:
import json
import pandas as pd
import re
# Load the raw search results into a DataFrame.
with open("data/lego_results.json", "r") as f:
    data = json.load(f)
#df_lego.to_csv("lego_pre.csv", index=False)

# Keep the first row per ASIN.
df_lego = (
    pd.DataFrame(data)
    .drop_duplicates(subset='asin', keep='first')
    .reset_index(drop=True)
)

# Drop rows whose price is an empty string or missing.
price_present = df_lego["price"].ne("") & df_lego["price"].notna()
df_lego = df_lego[price_present].reset_index(drop=True)

#extract theme from title
def extract_lego_theme(title):
    """Return the first known LEGO theme named in ``title``.

    Themes are tested in list order, case-insensitively and as whole words.

    Returns:
        The matching theme name, "Other" if none matches, or None for
        non-string input.
    """
    if not isinstance(title, str):
        return None
    themes = ["Star Wars","Harry Potter","Disney","Technic","Architecture","Speed Champions", "Bluey",
              "City","Friends","NINJAGO","Marvel","Botanicals","Classic","Creator","Ideas","DUPLO","Art"]
    for theme in themes:
        # Whole-word match: a plain substring test tags "Mario Kart" or
        # "Party" as "Art" and "Velocity" as "City".
        if re.search(rf"\b{re.escape(theme)}\b", title, re.IGNORECASE):
            return theme
    return "Other"
# Theme detected in each title ("Other" when no known theme is named).
df_lego['lego_theme'] = df_lego['title'].map(extract_lego_theme)

#extract age from title
def extract_age(text):
    """Return the (lower) recommended age stated in ``text``, or None.

    Looks for "Age", "Ages" or "Aged" as a whole word (any case), optionally
    followed by a colon, then a 1-2 digit number: "Ages 18+" -> 18,
    "Aged 7-9" -> 7.

    Returns:
        The age as an int, or None for non-string input or no match.
    """
    if not isinstance(text, str):
        return None
    # \b keeps words that merely end in "ages" ("Pages 12", "Packages 10")
    # from being read as an age.
    match = re.search(r'\b(?:Ages?|Aged)\s*:?\s*(\d{1,2})', text, flags=re.IGNORECASE)
    return int(match.group(1)) if match else None
# Recommended age parsed from each title (None where the title states none).
df_lego["age_rec"] = df_lego['title'].map(extract_age)

#Recent sales
def convert_sales_to_number(text):
    """Convert a sales blurb such as "9K+ bought in past month" to an integer.

    The first number in the text is used. Thousands separators are accepted
    ("1,000"), and a trailing K or M (any case) multiplies by one thousand or
    one million.

    Returns:
        The count as an int, or None for non-string input or text without digits.
    """
    if not isinstance(text, str):
        return None
    # The number must start with a digit, so a stray "." (e.g. "Approx.")
    # is never handed to float(). The suffix counts only when no letter
    # follows it, so "5 kits" is 5, not 5000.
    match = re.search(
        r'((?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?)(?:\s*([KM])(?![A-Za-z]))?',
        text,
        re.IGNORECASE,
    )
    if not match:
        return None
    number = float(match.group(1).replace(',', ''))
    suffix = (match.group(2) or '').upper()
    number *= {'K': 1_000, 'M': 1_000_000}.get(suffix, 1)
    # round() guards against float artefacts such as 4349.999... after scaling.
    return int(round(number))
def _dict_field(record, key):
    """Return ``record.get(key)`` when ``record`` is a dict, otherwise None."""
    return record.get(key) if isinstance(record, dict) else None

# Numeric version of the "recent_sales" text.
df_lego['recent_sales_num'] = df_lego['recent_sales'].map(convert_sales_to_number)

# Extract price detail: one flat column per key of the nested "price" dict.
for price_key in ("value", "list_price", "name"):
    df_lego[f"price_{price_key}"] = df_lego["price"].apply(_dict_field, args=(price_key,))

# Extract availability info
df_lego['stock_info'] = df_lego['availability'].apply(_dict_field, args=('raw',))

# Keep only needed columns
lego_columns = [
    "position",
    "title",
    "asin",
    "is_prime",
    "rating",
    "ratings_total",
    "sponsored",
    "coupon",
    "lego_theme",
    "age_rec",
    "recent_sales_num",
    "price_value",
    "price_list_price",
    "price_name",
    "stock_info",
]
df_lego = df_lego[lego_columns]

df_lego.to_csv("data/lego.csv", index=False)

In [20]:
df_lego.head()

Unnamed: 0,position,title,asin,is_prime,rating,ratings_total,sponsored,coupon,lego_theme,age_rec,recent_sales_num,price_value,price_list_price,price_name,stock_info
0,1,LEGO Bluey: Bluey’s Family House - Toddler Toy...,B0DRW8L7HY,False,4.7,51.0,True,,Bluey,,2000.0,69.99,,,
1,2,LEGO ǀ Disney Princess Beauty and The Beast Ca...,B0DRW654DB,False,5.0,2.0,True,,Disney,18.0,,279.99,,,
2,3,LEGO Star Wars: The Phantom Menace Battle Droi...,B0DRW6VY43,False,4.2,11.0,True,,Star Wars,12.0,700.0,139.99,,,
3,4,LEGO Harry Potter Book Nook: Hogwarts Express ...,B0DRW7WRX3,False,4.8,26.0,True,,Harry Potter,,1000.0,99.99,,,
4,5,LEGO Icons Boutique Hotel Model Kits - Bedroom...,B09JKXSQWG,False,4.8,441.0,,,Other,18.0,10000.0,213.96,$229.99,Primary,
