# Web Scraping 

In [1]:
# Number of search-result pages to scrape; one search URL is built per page.
num_pages = 3

In [2]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# One eBay search URL per results page. The query string searches for
# "uniqlo shirt" with the LH_Sold / LH_Complete flags set; `_pgn` selects the
# page and is numbered starting from 1.
search_urls = [
    'https://www.ebay.com/sch/i.html?_from=R40&_nkw=uniqlo+shirt&_sacat=0&LH_TitleDesc=0&_osacat=0&rt=nc&LH_Sold=1&LH_Complete=1&_ipg=192&_pgn=%s' % page_number
    for page_number in range(1, num_pages + 1)
]

from time import time

def get_item_info_all_sold(soup):
    """Extract date, location and seller score from an ended (sold) item page.

    Used for pages that contain the ``<span id="bb_tlft">`` element, i.e. a
    sold item that is no longer a current listing.

    Args:
        soup: Parsed item page (a BeautifulSoup document).

    Returns:
        A three-element list ``[date, location, seller_score]``. The date is
        passed through ``clean_time`` and the seller score through
        ``clean_seller_score``; the location is the raw element text.
    """
    trans_date = soup.find('span', {'id': 'bb_tlft'}).get_text()
    # The location is the second 'u-flL' div inside the second-to-last
    # 'vi-ht20' table row.
    detail_rows = soup.find_all('tr', 'vi-ht20')
    location = detail_rows[-2].find_all('div', 'u-flL')[1].get_text()
    seller_score = soup.find_all('span', 'mbg-l')[0].get_text()
    return [clean_time(trans_date), location, clean_seller_score(seller_score)]

def get_item_info_still_left(soup):
    """Extract location and seller score from a still-active listing page.

    Used for sold items whose listing is still live (for example a listing
    with multiple quantities). No transaction date is read from this layout,
    so the date slot is always ``None``.

    Args:
        soup: Parsed item page (a BeautifulSoup document).

    Returns:
        A three-element list ``[None, location, seller_score]`` holding the
        raw element text; neither value is post-processed.
    """
    location = soup.find('span', attrs={'itemprop': 'availableAtOrFrom'}).get_text()
    # The score is the text of the link nested inside the 'mbg-l' span.
    seller_score = soup.find('span', 'mbg-l').find('a').get_text()
    return [None, location, seller_score]

def get_item_info_hybrid(soup):
    """Extract location and seller score from a "hybrid" banner-layout page.

    Used for item pages that expose an
    ``app-sellerpresence__feedbackscore`` span. No transaction date is read
    from this layout, so the date slot is always ``None``.

    Args:
        soup: Parsed item page (a BeautifulSoup document).

    Returns:
        A three-element list ``[None, location, seller_score]`` holding the
        raw element text; neither value is post-processed.
    """
    seller_score = soup.find_all('span', 'app-sellerpresence__feedbackscore')[0].get_text()
    # The location is taken from the first 'cc-textblock--block' span.
    location = soup.find_all('span', 'cc-textblock--block')[0].get_text()
    return [None, location, seller_score]

def clean_time(time):
    """Clean a scraped transaction-date string.

    Drops the first five characters, replaces every double newline with a
    single space, and drops the final character.

    Args:
        time: Raw text of the date element, or ``None``. The parameter name
            shadows the module-level ``time`` import inside this function
            only; it is kept for backward compatibility.

    Returns:
        The cleaned string, or ``None`` when ``time`` is ``None``.
    """
    if time is None:
        return None
    return time[5:].replace('\n\n', ' ')[:-1]

def clean_seller_score(score):
    """Clean a scraped seller-score string.

    Removes every space character, then drops the first three and the last
    two characters of what remains.

    Args:
        score: Raw text of the seller-score element.

    Returns:
        The trimmed string (still a ``str``, not converted to a number).
    """
    without_spaces = score.replace(' ', '')
    return without_spaces[3:-2]

def _fetch_item_soup(url, timeout):
    """Download ``url`` and parse the response body with ``html.parser``."""
    item_page = requests.get(url, timeout=timeout)
    return BeautifulSoup(item_page.content, 'html.parser')


def _scrape_item_info(link, timeout):
    """Return ``[date, location, score]`` for a single item link."""
    soup = _fetch_item_soup(link, timeout)

    # Sold page: the listing has ended.
    if soup.find('span', {'id': 'bb_tlft'}) is not None:
        return get_item_info_all_sold(soup)

    # Still-active listing page (multiple quantities).
    if soup.find('span', attrs={'itemprop': 'availableAtOrFrom'}) is not None:
        return get_item_info_still_left(soup)

    # Hybrid transition page (notice banner layout).
    if soup.find('span', 'app-sellerpresence__feedbackscore') is not None:
        return get_item_info_hybrid(soup)

    # Look the fallback link up once and reuse it for both the check and the
    # follow-up request.
    view_link = soup.find('a', 'nodestar-item-card-details__view-link')
    if view_link is None:
        # Exhausted every known page layout: flag the row as bad.
        return ['BAD', 'BAD', 'BAD']

    # The banner links to another page for the item; that page is assumed to
    # use the still-active listing layout.
    return get_item_info_still_left(_fetch_item_soup(view_link['href'], timeout))


def get_single_page_info(listings_df, timeout=30):
    """Fetch each sold item's own page and collect its details.

    Every URL in ``listings_df['link']`` is downloaded and parsed according
    to whichever page layout it matches. Rows whose page matches no known
    layout get the string ``'BAD'`` in all three detail columns.

    Args:
        listings_df: DataFrame with a ``'link'`` column of item-page URLs.
        timeout: Seconds passed to ``requests.get`` for every request.
            Without a timeout a stalled connection would block forever.

    Returns:
        A DataFrame with columns ``link``, ``date``, ``location`` and
        ``score``, one row per input link.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched
            (including when ``timeout`` is exceeded).
    """
    info_rows = [_scrape_item_info(link, timeout) for link in listings_df['link']]
    return pd.DataFrame({
        'link': listings_df['link'],
        'date': [row[0] for row in info_rows],
        'location': [row[1] for row in info_rows],
        'score': [row[2] for row in info_rows],
    })

def _parse_listing(item):
    """Read the search-page fields of one ``<li class="s-item">`` element."""
    title = item.find('h3', 's-item__title').get_text()
    sec_info = item.find('span', 'SECONDARY_INFO').get_text()
    link = item.find('a')['href']
    # True when the item carries a 'STRIKETHROUGH POSITIVE' span.
    sold_at_lower_price = item.find('span', 'STRIKETHROUGH POSITIVE') is not None
    price = item.find('span', 'POSITIVE').get_text()
    shipping = item.find('span', 's-item__shipping s-item__logisticsCost').get_text()
    # Purchase options and bid count are optional on a listing.
    purchase_info = item.find('span', 's-item__purchase-options s-item__purchaseOptions')
    bid_info = item.find('span', 's-item__bids s-item__bidCount')
    return {
        'title': title,
        'secondary_info': sec_info,
        'link': link,
        'sold_at_lower_price': sold_at_lower_price,
        'price': price,
        'shipping': shipping,
        'purchase_type': purchase_info.get_text() if purchase_info is not None else None,
        'num_bids': bid_info.get_text() if bid_info is not None else 0,
    }


def get_listings_df(search_url, timeout=30):
    """Scrape one sold-listings search page into a DataFrame.

    Each row is one item from the search page, combining the fields shown on
    the search page with the details fetched from the item's own page by
    ``get_single_page_info``. The elapsed time is printed when done.

    Args:
        search_url: URL of the search-results page to scrape.
        timeout: Seconds passed to ``requests.get`` for the search-page
            request, so a stalled connection cannot block forever.

    Returns:
        A DataFrame with columns ``link``, ``date``, ``location``, ``score``,
        ``title``, ``secondary_info``, ``sold_at_lower_price``, ``price``,
        ``shipping``, ``purchase_type`` and ``num_bids``.

    Raises:
        requests.exceptions.RequestException: If the search page cannot be
            fetched (including when ``timeout`` is exceeded).
    """
    columns = ['title', 'secondary_info', 'link', 'sold_at_lower_price',
               'price', 'shipping', 'purchase_type', 'num_bids']

    t0 = time()
    search_page = requests.get(search_url, timeout=timeout)
    soup = BeautifulSoup(search_page.content, 'html.parser')
    # The first 's-item' entry on every results page is header content, not
    # a real listing, so it is skipped.
    listings = soup.find_all('li', attrs={'class': 's-item'})[1:]

    records = [_parse_listing(item) for item in listings]
    listings_df = pd.DataFrame(
        {column: [record[column] for record in records] for column in columns}
    )
    # NOTE: rows are matched on 'link'; this assumes each link appears only
    # once on a search page.
    final_listings_df = get_single_page_info(listings_df).merge(listings_df, on='link')

    t1 = time()
    print('1 search page took %s seconds to fetch.' % (t1 - t0))

    return final_listings_df

def get_final_item_df(search_urls):
    """Scrape every search page and stack the results into one DataFrame.

    Args:
        search_urls: Iterable of sold-listings search-page URLs; each one is
            passed to ``get_listings_df``.

    Returns:
        A single DataFrame containing the rows of all pages, re-indexed
        0..n-1 so row labels are unique across pages. An empty DataFrame is
        returned when ``search_urls`` is empty.
    """
    page_frames = [get_listings_df(search_url) for search_url in search_urls]
    if not page_frames:
        # pd.concat raises ValueError when given nothing to concatenate.
        return pd.DataFrame()
    # Each page frame starts its index at 0; ignore_index avoids repeated
    # row labels in the combined frame.
    return pd.concat(page_frames, ignore_index=True)

# Scrape every search page in `search_urls` into one combined DataFrame.
listings_df = get_final_item_df(search_urls)

# Save the scraped rows to a CSV file (relative path).
listings_df.to_csv('scraped_uniqlo_data.csv')

# Signal the end of the run after the per-page timing messages.
print('fetching process is complete')


1 search page took 504.74946308135986 seconds to fetch.
1 search page took 463.47371315956116 seconds to fetch.
1 search page took 508.8712911605835 seconds to fetch.
fetching process is complete
