In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request as urllib
from requests import get
import matplotlib.pyplot as plt
from dateutil import parser
%matplotlib inline

In [41]:
from apscheduler.schedulers.blocking import BlockingScheduler
from datetime import datetime, date
import os, glob

In [4]:
sched = BlockingScheduler()# create a BlockingScheduler instance

Now we will define the job that needs to be run periodically. In this case the job goes through a list of URLs and, for each one, extracts whether the item displayed for sale online is available or not. On every run the scheduled job should open a CSV file and record the availability of each item.

In [5]:
# Build the list of item URLs.
# base_url is a search-result URL that ends with the page-number parameter; the
# function below appends 1, 2, 3, ... to it and extracts item URLs from each page.
base_url = 'https://www.ebay.com/b/Camping-Hiking-Sleeping-Gear/181403/bn_7327730?_pgn='
## this function returns a list of links, each corresponding to an item on sale on eBay
def get_item_links(base_url, uppercut = 800000):
    """Collect item-page links from consecutive search-result pages.

    Pages are requested as ``base_url + '1'``, ``base_url + '2'``, ... and the
    ``href`` of the first anchor inside every
    ``<div class="s-item__wrapper clearfix">`` element is collected.

    Parameters
    ----------
    base_url : str
        Search URL to which the page number is appended.
    uppercut : int, optional
        Maximum number of links to return.

    Returns
    -------
    list of str
        At most ``uppercut`` links, in page order. Fewer are returned when a
        result page contains no item links before the cap is reached.
    """
    links = []
    page = 1
    while len(links) < uppercut:
        url = base_url + str(page)
        response = get(url)
        print(url)
        html_soup = BeautifulSoup(response.text, 'html.parser')
        wrappers = html_soup.find_all('div', class_ = "s-item__wrapper clearfix")
        # Skip wrappers that carry no anchor/href instead of raising on them.
        page_links = [
            wrapper.a.attrs['href']
            for wrapper in wrappers
            if wrapper.a is not None and 'href' in wrapper.a.attrs
        ]
        if not page_links:
            # A page without items means there is nothing more to collect
            # (listing exhausted, markup changed, or request blocked).
            # Without this guard the loop would request pages forever.
            break
        links.extend(page_links)
        page += 1
    return links[:uppercut]
    

#print(response.text[500:1000])

In [6]:
# Call the function to fetch the desired number of links; to keep things simple
# we only fetch 5 links.
item_urls = get_item_links(base_url, uppercut=5)

https://www.ebay.com/b/Camping-Hiking-Sleeping-Gear/181403/bn_7327730?_pgn=1


Here we will define the job. The job (which is most likely a function) may itself depend on some other functions.

In [52]:
def _parse_price(retail_price, discount_price):
    """Return ``(price, is_discounted)`` parsed from the two price tag lists.

    The retail tag's ``content`` attribute is preferred; otherwise the digits
    and decimal points of the discount tag's text are used. ``(nan, False)`` is
    returned when neither yields a number.
    """
    try:
        # Builtin float: the np.float alias is no longer available in NumPy.
        return float(retail_price[0].attrs['content']), False
    except (IndexError, KeyError, ValueError):
        pass
    try:
        text = discount_price[0].text
        return float(''.join(ch for ch in text if ch.isdigit() or ch == '.')), True
    except (IndexError, ValueError):
        return np.nan, False


def _parse_currency(retail_price, discount_price):
    """Return the non-numeric part of the first available price text."""
    for tags in (retail_price, discount_price):
        if tags:
            # Drop digits plus decimal/thousands separators, keep the rest.
            return ''.join(ch for ch in tags[0].text
                           if not (ch.isdigit() or ch in '.,'))
    return 'Not available'


def _parse_quantity(available_quantity):
    """Return the first integer token of the quantity text, or None."""
    if not available_quantity:
        return None
    numbers = [int(token) for token in available_quantity[0].text.split()
               if token.isdecimal()]
    return numbers[0] if numbers else None


def _parse_keyword(item):
    """Return the keyword text from the ``itemprop="name"`` spans, or None.

    The fifth span is used when present, otherwise the second-to-last one.
    """
    if len(item) > 4:
        return item[4].text
    if len(item) > 1:
        return item[-2].text
    return None


def job_get_item_details(url_list):
    """Scrape keyword, seller, price and currency for each item page.

    For every URL the page is fetched and parsed, progress is printed
    (URL, price, currency, available quantity, keyword) and one row is added
    to the result. A field that cannot be found on a page is filled with a
    placeholder (``'Not available'`` for text, ``nan`` for the price) so that a
    single malformed page does not abort the whole run.

    Parameters
    ----------
    url_list : iterable of str
        Item page URLs.

    Returns
    -------
    pandas.DataFrame
        Columns ``'Keyword'``, ``'seller name'``, ``'Price'``, ``'Currency'``,
        one row per URL, in input order.
    """
    keyword = []
    seller_name = []
    price = []
    currency = []
    for url in url_list:
        print(url)
        response = get(url)
        first_soup = BeautifulSoup(response.text, 'html.parser')# get the html soup
        seller = first_soup.find_all('span', class_ = "mbg-nw")#find the seller name
        item = first_soup.find_all('span', itemprop = "name")
        available_quantity = first_soup.find_all('span', id = "qtySubTxt")
        retail_price = first_soup.find_all('span', id = "prcIsum")#find the retail price
        discount_price = first_soup.find_all('span', id = "mm-saleDscPrc")

        seller_name.append(seller[0].text if seller else 'Not available')

        amount, discounted = _parse_price(retail_price, discount_price)
        price.append(amount)
        if np.isnan(amount):
            print('Price not available')
        elif discounted:
            print('The discounted price is {}'.format(amount))
        else:
            print('The price is {}'.format(amount))

        currency_1 = _parse_currency(retail_price, discount_price)
        currency.append(currency_1)
        print('The currency is {}'.format(currency_1))

        # The quantity is only reported, it is not part of the returned frame.
        quantity = _parse_quantity(available_quantity)
        if quantity is not None:
            print('Available quantity is : {}'.format(quantity))

        word = _parse_keyword(item)
        if word is None:
            keyword.append('Not available')
            print('Key word not available')
        else:
            keyword.append(word)
            print(word)

    dfitems = {'Keyword':keyword, 'seller name':seller_name,
               'Price':price, 'Currency':currency}
    return pd.DataFrame(dfitems, columns=dfitems.keys())

In [53]:
# Scrape keyword, seller name, price and currency for the collected item URLs into a DataFrame.
item_availability = job_get_item_details(item_urls)

https://www.ebay.com/itm/Inflatable-Air-Sofa-Bed-Lazy-Sleeping-Camping-Bag-Beach-Hangout-Couch-Windbed/292566375653?epid=715012867&hash=item441e509ce5:m:mSeLLE_HXu8OCpTWY6uyAZA&var=591377547355
The discounted price is 8.99
The currency is US $
Available quantity is : 10
Sleeping Bags
https://www.ebay.com/itm/Double-Self-Inflating-Camping-Roll-Mat-Camp-Bed-Inflatable-Sleeping-Mattress/253747098737?hash=item3b14817471:g:brEAAOSwZrdbRxom
The price is 4.99
The currency is US $
Available quantity is : 10
Mattresses & Pads
https://www.ebay.com/itm/New-Double-Outdoor-Person-Travel-Camping-Hanging-Hammock-Bed-Wi-Mosquito-Net-Set/253639619614?hash=item3b0e19741e:m:mIHN9kGpsMWHPShA5HQi3uw&var=552873885342
The price is 4.99
The currency is US $
Available quantity is : 129
Sporting Goods
https://www.ebay.com/itm/Double-Sideds-Inflatable-Sleep-Pillow-Mat-Cushion-For-Camping-Picnic-Travel-Soft/132279480734?hash=item1ecc78b19e:m:m0v5IfPKuZEtGejjRdB1hww&var=431607141476
The price is 1.49
The currency 

In [60]:
# NOTE(review): the bare expression below is not the last statement of the cell,
# so the notebook does not display it.
item_availability
# Write the seed file (without the index) that job_to_schedule() later reads
# and extends with one availability column per run.
item_availability.to_csv('items_availability',index=False)

In [61]:
def _availability_flag(available_quantity):
    """Return 1 if the first integer in the quantity text is positive, else 0.

    A missing quantity tag or a text without any integer token counts as 0.
    """
    if not available_quantity:
        return 0
    counts = [int(token) for token in available_quantity[0].text.split()
              if token.isdecimal()]
    return 1 if counts and counts[0] > 0 else 0


def job_get_item_availability(url_list):
    """Take one availability snapshot of the given item pages.

    Each page is fetched and the ``<span id="qtySubTxt">`` element is
    inspected; the item is flagged 1 when the first integer found in its text
    is greater than zero and 0 otherwise (including when the element or the
    number is missing).

    Parameters
    ----------
    url_list : iterable of str
        Item page URLs.

    Returns
    -------
    pandas.DataFrame
        A single column whose name is ``str(datetime.now())`` taken at the
        start of the call, holding one 0/1 flag per URL in input order.
    """
    dateandtime = datetime.now()
    item_available = []
    for url in url_list:
        response = get(url)
        first_soup = BeautifulSoup(response.text, 'html.parser')# get the html soup
        available_quantity = first_soup.find_all('span', id = "qtySubTxt")
        # Explicit checks instead of a bare except, so that interrupts and
        # unrelated errors are no longer silently recorded as "unavailable".
        item_available.append(_availability_flag(available_quantity))
    dict_availability = {str(dateandtime):item_available}
    return pd.DataFrame(dict_availability)
        
        

In [62]:
# Take a single availability snapshot to try out the job function.
availability = job_get_item_availability(item_urls)

In [83]:
def job_to_schedule():
    """Append one availability snapshot to the 'items_availability' CSV.

    Reads the existing CSV, scrapes the current availability of the
    module-level ``item_urls``, adds the result as a new column on the right,
    overwrites the CSV (without the index) and prints a completion timestamp.
    Returns nothing.
    """
    csv_path = 'items_availability'
    history = pd.read_csv(csv_path)
    snapshot = job_get_item_availability(item_urls)
    # axis=1: the snapshot becomes an extra column next to the stored ones.
    pd.concat([history, snapshot], axis=1).to_csv(csv_path, index = False)
    print('Done at {}'.format(str(datetime.now())))

In [84]:
# Run the job once by hand before scheduling it.
# NOTE(review): job_to_schedule() has no active return statement (its
# `return df_final` is commented out), so `test` is None after this call.
test = job_to_schedule()

Done at 2018-07-15 22:51:49.889796


In [85]:
test

Unnamed: 0,Keyword,seller name,Price,Currency,2018-07-15 22:34:23.966376,2018-07-15 22:34:54.218792,2018-07-15 22:43:28.839938,2018-07-15 22:43:58.840642,2018-07-15 22:51:41.404031
0,Sleeping Bags,xianghuietchco.ltd,8.99,US $,1,1,1,1,1
1,Mattresses & Pads,klcollins23,4.99,US $,1,1,1,1,1
2,Sporting Goods,2011always-for-you,4.99,US $,1,1,1,1,1
3,Mattresses & Pads,lins-bay,1.49,US $,1,1,1,1,1
4,Cots,rugweavers101,14.99,C $,1,1,1,1,1


In [86]:
# Create a fresh scheduler instance; this rebinds the `sched` name defined earlier.
sched = BlockingScheduler()

In [87]:
# Register job_to_schedule to run every 30 seconds.
sched.add_job(job_to_schedule, 'interval', seconds = 30)
# Start the scheduler. The run recorded below kept executing the job until it
# was stopped with a KeyboardInterrupt.
sched.start()

Done at 2018-07-15 22:52:40.673499
Done at 2018-07-15 22:53:11.722725
Done at 2018-07-15 22:53:44.100281
Done at 2018-07-15 22:54:13.010631


KeyboardInterrupt: 

In [88]:
# Reload the CSV to inspect the columns appended by the scheduled runs.
data = pd.read_csv('items_availability')

In [89]:
data

Unnamed: 0,Keyword,seller name,Price,Currency,2018-07-15 22:34:23.966376,2018-07-15 22:34:54.218792,2018-07-15 22:43:28.839938,2018-07-15 22:43:58.840642,2018-07-15 22:51:41.404031,2018-07-15 22:52:32.903064,2018-07-15 22:53:02.904653,2018-07-15 22:53:32.905110,2018-07-15 22:54:02.904822
0,Sleeping Bags,xianghuietchco.ltd,8.99,US $,1,1,1,1,1,1,1,1,1
1,Mattresses & Pads,klcollins23,4.99,US $,1,1,1,1,1,1,1,1,1
2,Sporting Goods,2011always-for-you,4.99,US $,1,1,1,1,1,1,1,1,1
3,Mattresses & Pads,lins-bay,1.49,US $,1,1,1,1,1,1,1,1,1
4,Cots,rugweavers101,14.99,C $,1,1,1,1,1,1,1,1,1
