In [18]:
import time
import random
import re
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

##################################
# 1. Define 20 Iconic Jordan Models
##################################
# Search queries, one per shoe model. Every query shares the "Air Jordan "
# prefix, so only the distinguishing suffix is spelled out below.
JORDAN_MODELS_20 = [
    f"Air Jordan {suffix}"
    for suffix in (
        "1 Bred 1985",
        "1 Chicago 1985",
        "2 OG 1986",
        "3 Black Cement 1988",
        "4 Fire Red 1989",
        "5 Grape 1990",
        "6 Infrared 1991",
        "7 Olympic 1992",
        "8 Aqua 1993",
        "9 Space Jam 1994",
        "10 Steel 1994",
        "11 Concord 1995",
        "12 Flu Game 1997",
        "13 Bred 1998",
        "14 Last Shot 1999",
        "1 Retro Chicago 2013",
        "3 Retro Black Cement 2011",
        "4 Retro Bred 2019",
        "5 Retro Raging Bull",
        "6 Travis Scott",
    )
]

##################################
# 2. Scraping Function
##################################
def scrape_jordan_listings_selenium(
    query,
    pages_to_try=5,
    needed=90,
    headless=True,
    chromedriver_path="/Users/ayushmajumdar/drivers/chromedriver",
):
    """
    Scrape eBay completed/sold listings for a search query.

    Pages are fetched one at a time until at least `needed` listings have
    been collected or `pages_to_try` pages have been attempted.

    Args:
        query: Search text; it is URL-encoded before being put in the URL.
        pages_to_try: Maximum number of result pages to load.
        needed: Stop as soon as this many listings have been collected.
        headless: Run Chrome with the "--headless" flag when True.
        chromedriver_path: Filesystem path of the chromedriver binary.

    Returns:
        A list of dicts with keys
        'model', 'title', 'price', 'link', 'authenticity', 'sell_date'.
        'model' is the query itself and 'sell_date' is always None here
        (it is filled in by a later step). 'price' is the raw text of the
        price element, or "N/A" when the element is missing.
    """
    # Local import so this function does not depend on a file-level import.
    from urllib.parse import quote_plus

    base_url = (
        "https://www.ebay.com/sch/i.html?_nkw={query}&LH_Complete=1&LH_Sold=1&_pgn={page}"
    )
    # Titles containing any of these substrings (case-insensitive) are skipped.
    blocklist_words = ("keychain", "toy", "mini", "cleat", "mcs", "block", "lego", "custom")
    # quote_plus turns spaces into "+" and escapes characters such as "&"
    # that would otherwise corrupt the query string.
    encoded_query = quote_plus(query)
    results = []

    # Configure Selenium
    options = Options()
    if headless:
        options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)

    try:
        for page_num in range(1, pages_to_try + 1):
            if len(results) >= needed:
                break
            if page_num > 1:
                # Pause between page loads only when another page is
                # actually going to be requested.
                time.sleep(random.uniform(3, 6))

            driver.get(base_url.format(query=encoded_query, page=page_num))
            time.sleep(random.uniform(4, 7))  # allow page to load

            soup = BeautifulSoup(driver.page_source, "html.parser")

            for item in soup.select("li.s-item"):
                # Extract title
                title_el = item.select_one(".s-item__title")
                title_text = title_el.get_text(strip=True) if title_el else ""
                if "Shop on eBay" in title_text:
                    continue
                lowered_title = title_text.lower()
                if any(word in lowered_title for word in blocklist_words):
                    continue

                # Extract price
                price_el = item.select_one(".s-item__price")
                price_text = price_el.get_text(strip=True) if price_el else "N/A"

                # Extract link
                link_el = item.select_one("a.s-item__link")
                link_url = link_el["href"] if link_el else None

                # Extract authenticity badge
                auth_el = item.select_one(".s-item__etrs-badge, .s-item__authEnforced")
                authenticity_text = auth_el.get_text(strip=True) if auth_el else ""

                results.append({
                    "model": query,
                    "title": title_text,
                    "price": price_text,
                    "link": link_url,
                    "authenticity": authenticity_text,
                    # The sell date is not scraped; it is assigned later.
                    "sell_date": None,
                })
                if len(results) >= needed:
                    break
    finally:
        # Always release the browser, even if a page load or parse fails.
        driver.quit()

    return results

##################################
# 3. Scrape Data for All 20 Models (90 listings per model)
##################################
# Run the eBay scraper once per model and pool every listing in one list.
all_data = []
for model_name in JORDAN_MODELS_20:
    print(f"Scraping model: {model_name}")
    shoe_data = scrape_jordan_listings_selenium(
        query=model_name,
        pages_to_try=5,
        needed=90,
        headless=True,
    )
    all_data += shoe_data
    print(f"  -> Found {len(shoe_data)} results for {model_name}\n")

# Build a DataFrame with a fixed column order from the pooled listings.
df = pd.DataFrame(
    all_data,
    columns=["model", "title", "price", "link", "authenticity", "sell_date"],
)
print(f"Total scraped listings: {len(df)}")
print(df.head(10))

##################################
# 4. Assign Placeholder Sell Dates (evenly spaced over the last 90 days, not scraped)
##################################
def assign_sell_dates_from_130points(df):
    """
    Fill `sell_date` with evenly spaced placeholder dates, per model.

    Despite the name, nothing is fetched from 130point here: for each
    distinct value of the "model" column, the group's rows receive dates
    spread evenly from 90 days ago up to today, in the rows' existing order.
    A group with a single row gets today's date.

    Args:
        df: DataFrame with a "model" column.

    Returns:
        A new DataFrame (fresh 0..n-1 index) with the groups concatenated in
        the order `groupby("model")` yields them and `sell_date` set to
        strings formatted like "Apr 10, 2023". For empty input, an empty
        frame that keeps the input columns plus `sell_date` is returned.
    """
    date_format = "%b %d, %Y"  # e.g. "Apr 10, 2023"
    end_date = pd.Timestamp.today()
    start_date = end_date - pd.Timedelta(days=90)

    dated_groups = []
    for _, group in df.groupby("model"):
        group = group.copy()
        n = len(group)
        if n > 1:
            dates = pd.date_range(start=start_date, end=end_date, periods=n)
        else:
            # Must be a DatetimeIndex (not a plain list) so .strftime works.
            dates = pd.DatetimeIndex([end_date])
        group["sell_date"] = dates.strftime(date_format)
        dated_groups.append(group)

    if not dated_groups:
        # Keep the schema so callers can still access the expected columns.
        empty = df.iloc[0:0].copy().reset_index(drop=True)
        empty["sell_date"] = pd.Series(dtype="object")
        return empty

    # Concatenate once instead of growing a DataFrame inside the loop.
    return pd.concat(dated_groups, ignore_index=True)

# Fill in the sell_date column for every scraped listing.
final_df = assign_sell_dates_from_130points(df)

# Keep the string column and add a parsed datetime twin next to it.
final_df = final_df.assign(
    sell_date_dt=pd.to_datetime(
        final_df["sell_date"],
        format="%b %d, %Y",
        errors="coerce",
    )
)

##################################
# 5. Save final DataFrame as CSV
##################################
final_df.to_csv("final_jordan_data.csv", index=False)
print("Final DataFrame saved as final_jordan_data.csv")

# Show a sample of the result in the console.
print(final_df.head(40))

Scraping model: Air Jordan 1 Bred 1985
  -> Found 90 results for Air Jordan 1 Bred 1985

Scraping model: Air Jordan 1 Chicago 1985
  -> Found 90 results for Air Jordan 1 Chicago 1985

Scraping model: Air Jordan 2 OG 1986
  -> Found 90 results for Air Jordan 2 OG 1986

Scraping model: Air Jordan 3 Black Cement 1988
  -> Found 62 results for Air Jordan 3 Black Cement 1988

Scraping model: Air Jordan 4 Fire Red 1989
  -> Found 90 results for Air Jordan 4 Fire Red 1989

Scraping model: Air Jordan 5 Grape 1990
  -> Found 90 results for Air Jordan 5 Grape 1990

Scraping model: Air Jordan 6 Infrared 1991
  -> Found 90 results for Air Jordan 6 Infrared 1991

Scraping model: Air Jordan 7 Olympic 1992
  -> Found 90 results for Air Jordan 7 Olympic 1992

Scraping model: Air Jordan 8 Aqua 1993
  -> Found 90 results for Air Jordan 8 Aqua 1993

Scraping model: Air Jordan 9 Space Jam 1994
  -> Found 90 results for Air Jordan 9 Space Jam 1994

Scraping model: Air Jordan 10 Steel 1994
  -> Found 90 res

In [19]:
#SCRAPER USED TO FIND SELL DATA
import time
import random
import re
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import random
import re
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

##################################
# 1. Define 20 Iconic Jordan Models
##################################
# Search queries, one per shoe model. Every query shares the "Air Jordan "
# prefix, so only the distinguishing suffix is spelled out below.
JORDAN_MODELS_20 = [
    f"Air Jordan {suffix}"
    for suffix in (
        "1 Bred 1985",
        "1 Chicago 1985",
        "2 OG 1986",
        "3 Black Cement 1988",
        "4 Fire Red 1989",
        "5 Grape 1990",
        "6 Infrared 1991",
        "7 Olympic 1992",
        "8 Aqua 1993",
        "9 Space Jam 1994",
        "10 Steel 1994",
        "11 Concord 1995",
        "12 Flu Game 1997",
        "13 Bred 1998",
        "14 Last Shot 1999",
        "1 Retro Chicago 2013",
        "3 Retro Black Cement 2011",
        "4 Retro Bred 2019",
        "5 Retro Raging Bull",
        "6 Travis Scott",
    )
]

##################################
# 2. Scraping Function - Modified to Use 130point Scraper
##################################
def scrape_jordan_listings_130point(
    query,
    needed=20,
    headless=True,
    chromedriver_path="/Users/ayushmajumdar/drivers/chromedriver",
):
    """
    Search 130point.com for `query` and parse up to `needed` result rows.

    The search is submitted once through the page's search box and only the
    rows present in the rendered page after a fixed wait are parsed; no
    pagination is attempted.

    Args:
        query: Text typed into the search box; also stored as 'model'.
        needed: Maximum number of rows to return.
        headless: Run Chrome with the "--headless" flag when True.
        chromedriver_path: Filesystem path of the chromedriver binary.

    Returns:
        A list of dicts in the format:
          {
              'model': query,
              'title': ...,
              'price': ...,
              'link': ...,
              'authenticity': '',
              'sell_date': ...
          }
        'price' is "N/A" when no price element is found; 'title' and
        'sell_date' are "" and 'link' is None when their elements are missing.
    """
    base_url = "https://130point.com/sales/"
    results = []

    # Configure Selenium
    options = Options()
    if headless:
        options.add_argument("--headless")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=Service(chromedriver_path), options=options)

    try:
        # 1) Go to 130point
        driver.get(base_url)
        time.sleep(2)  # wait briefly for the page to load

        # 2) Enter the query in the search bar (id="searchBar") and submit it
        search_box = driver.find_element(By.ID, "searchBar")
        search_box.clear()
        search_box.send_keys(query)
        search_box.send_keys(Keys.ENTER)

        # 3) Wait for results to load
        time.sleep(random.uniform(4, 7))

        # 4) Parse the rendered HTML with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # 5) Select rows that match <tr id="rowsold_dataTable">; several rows
        #    may carry the same id, so a CSS selector is used to get them all.
        listing_rows = soup.select("tr#rowsold_dataTable")

        # max(..., 0) keeps a zero or negative `needed` meaning "no rows".
        for row in listing_rows[:max(needed, 0)]:
            # 5a) Title link <a href="https://www.ebay.com/itm/...">Title</a>
            title_el = row.select_one("a[href^='https://www.ebay.com/itm/']")
            title_text = title_el.get_text(strip=True) if title_el else ""
            link_url = title_el["href"] if title_el else None

            # 5b) Prefer the "Best Offer" price; otherwise fall back to the
            #     struck-through sale price; otherwise "N/A".
            best_offer_el = row.select_one(".bestOfferSoldPrice input[type='submit']")
            if best_offer_el and best_offer_el.has_attr("value"):
                price_text = best_offer_el["value"].strip() + " USD"
            else:
                sale_price_el = row.select_one("span[style*='text-decoration: line-through']")
                price_text = sale_price_el.get_text(strip=True) if sale_price_el else "N/A"

            # 5c) Date from <span class="date-break"><b>Sale Date: </b>Mar 16 2025</span>
            date_el = row.select_one("span.date-break")
            date_raw = date_el.get_text(strip=True) if date_el else ""
            date_str = re.sub(r"^Sale Date:\s*", "", date_raw, flags=re.IGNORECASE)

            # 5d) Record the listing
            results.append({
                "model": query,
                "title": title_text,
                "price": price_text,
                "link": link_url,
                "authenticity": "",  # 130point doesn't show authenticity
                "sell_date": date_str,
            })

    finally:
        # Always release the browser, even if the search or parse fails.
        driver.quit()

    return results


##################################
# 3. Scrape Data for All 20 Models (90 listings per model)
##################################
# Run the 130point scraper once per model and pool every listing in one list.
all_data = []
for model_name in JORDAN_MODELS_20:
    print(f"Scraping model: {model_name}")
    shoe_data = scrape_jordan_listings_130point(
        query=model_name,
        needed=90,
        headless=True,
    )
    all_data += shoe_data
    print(f"  -> Found {len(shoe_data)} results for {model_name}\n")

# Build a DataFrame with a fixed column order from the pooled listings.
df = pd.DataFrame(
    all_data,
    columns=["model", "title", "price", "link", "authenticity", "sell_date"],
)
print(f"Total scraped listings: {len(df)}")
print(df.head(10))


import pandas as pd
import re
import numpy as np

# Load the CSV file into a DataFrame
# Read the previously saved listings back from disk.
df = pd.read_csv("/Users/ayushmajumdar/jordan/myenv/final_jordan_data.csv")

# Parse both date columns; values that cannot be parsed become NaT.
for date_col in ("sell_date", "sell_date_dt"):
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

# Function to clean price column and convert currency
def clean_and_convert_price(price_str, cad_to_usd_rate=0.74):
    """
    Convert a scraped price value to a float amount in USD.

    Everything except digits, "." and "," is stripped from the text, commas
    are then removed as thousands separators, and the remainder is parsed as
    a float. If the original text contains "CAD", the amount is multiplied
    by `cad_to_usd_rate`.

    Args:
        price_str: Raw price (any type; it is converted with `str`).
        cad_to_usd_rate: Multiplier applied to CAD amounts. The default of
            0.74 is an example rate, not a live one.

    Returns:
        The price as a float, or NaN when no digits remain or the cleaned
        text is not a valid number (for example a range such as
        "$10.00 to $20.00", whose digits run together).
    """
    price_str = str(price_str)

    # Keep only digits and separators, then drop thousands separators.
    # Currency codes are stripped here; CAD is detected on the raw text below.
    cleaned_price = re.sub(r'[^\d.,]', '', price_str).replace(',', '')

    if not cleaned_price:
        return np.nan

    try:
        amount = float(cleaned_price)
    except ValueError as e:
        # Malformed number: report it and treat the price as missing.
        print(f"Error processing price {price_str}: {e}")
        return np.nan

    if 'CAD' in price_str:
        amount *= cad_to_usd_rate

    return amount

# Apply the cleaning and conversion function to the 'price' column
# Normalise every raw price to a float amount in USD.
df['price'] = df['price'].apply(clean_and_convert_price)

# Discard rows whose price or sell date could not be parsed.
df = df.dropna(subset=['price', 'sell_date'])

# ISO week number of each sale (the original note says this is for Streamlit).
df['week_of_year'] = df['sell_date'].dt.isocalendar().week

# Quick visual check of the first 10 rows.
print(df.head(10))

# Persist the cleaned data.
df.to_csv("cleaned_jordan_data.csv", index=False)


Scraping model: Air Jordan 1 Bred 1985


KeyboardInterrupt: 