## Preliminaries

In [None]:
#!/usr/bin/python
import pandas as pd, numpy as np
# import numpy as np
from bs4 import BeautifulSoup
# from ast import literal_eval
import requests, datetime, time
import json
from listing_parsers import active_listing_parser, closed_listing_parser

In [None]:
# Request headers sent with every page fetch: a custom User-Agent string and
# a contact address in the From header.
USER_AGENT = 'AV User Agent 20201011'
CONTACT_EMAIL = 'anvance@gmail.com'

headers = {'User-Agent': USER_AGENT, 'From': CONTACT_EMAIL}

In [None]:
# Fetch the live-auctions index page. A timeout is passed because requests
# otherwise waits indefinitely on a server that stops responding, which would
# hang the whole scrape.
site = requests.get('https://bringatrailer.com/auctions', headers=headers, timeout=30)
# Last expression of the cell: shows the HTTP status code in the notebook.
site.status_code

## Parse active auctions

In [None]:
# Parse the downloaded auctions page with Python's built-in HTML parser.
soup = BeautifulSoup(markup=site.text, features='html.parser')

In [None]:
# Column names for the listings table, in the same order as the fields of
# each tuple appended to listing_tuples.
labels = [
    'uid', 'listing_title', 'listing_excerpt', 'listing_link', 'latest_bid',
    'year', 'make', 'model', 'category', 'keyword', 'vin',
    'latitude', 'longitude',
]

# One tuple per scraped listing is collected here.
listing_tuples = list()

In [None]:
# Exceptions that mean "this piece of markup is absent or malformed":
#   AttributeError - .find() returned None and an attribute was read from it
#   KeyError       - a tag attribute or a JSON key does not exist
#   IndexError     - find_all() returned an empty list
#   TypeError      - None (or a non-dict JSON value) was subscripted
#   ValueError     - the JSON text could not be decoded
_MISSING_ERRORS = (AttributeError, KeyError, IndexError, TypeError, ValueError)


def _extract(getter, default='not found'):
    """Return ``getter()``, falling back to ``default`` when the markup is missing.

    Args:
        getter: zero-argument callable that reads one value from the page.
        default: value returned when ``getter`` raises one of the
            "missing markup" exceptions listed in ``_MISSING_ERRORS``.

    Returns:
        Whatever ``getter`` returns, or ``default``.
    """
    try:
        return getter()
    except _MISSING_ERRORS:
        return default


# Build one tuple per auction card and append it to listing_tuples.
# The first matching container is skipped, as before.
# NOTE(review): presumably it is not a real listing - confirm.
for item in soup.find_all('div', class_='auctions-item-container')[1:]:
    # Decode the JSON attribute once; an empty dict makes every lookup below
    # fall back to 'not found' when the attribute is missing or invalid.
    searchable = _extract(lambda: json.loads(item['data-searchable']), default={})

    uid = _extract(lambda: item['data-update'])
    listing_title = _extract(lambda: item.find('h3', class_='auctions-item-title').text)
    listing_excerpt = _extract(lambda: item.find('div', class_='auctions-item-excerpt').text)
    listing_link = _extract(lambda: item.find('h3').find('a', attrs={'href': True})['href'])
    latest_bid = _extract(
        lambda: item.find_all('span', attrs={'data-current': True})[0]['data-current']
    )
    year = _extract(lambda: searchable['year'])
    make = _extract(lambda: searchable['make'])
    model = _extract(lambda: searchable['model'])
    category = _extract(lambda: searchable['category'])
    keyword = _extract(lambda: searchable['keyword'])
    vin = _extract(lambda: searchable['vin'])
    latitude = _extract(lambda: item['data-lat'])
    longitude = _extract(lambda: item['data-lon'])

    # Field order must match the column order in labels.
    listing_tuple = (uid,
                     listing_title,
                     listing_excerpt,
                     listing_link,
                     latest_bid,
                     year,
                     make,
                     model,
                     category,
                     keyword,
                     vin,
                     latitude,
                     longitude)
    listing_tuples.append(listing_tuple)

In [None]:
# Assemble the scraped listing tuples into a table, one row per listing.
staging_df = pd.DataFrame(data=listing_tuples, columns=labels)

In [None]:
# Read the clock once so both renderings describe the same instant; two
# separate now() calls could straddle a second boundary and disagree.
scrape_time = datetime.datetime.now()
# Filesystem-friendly form, used in the output CSV file name.
time_string = scrape_time.strftime('%Y-%m-%d_%H-%M-%S')
# Human-readable form, stored in the scrape_ts column.
time_string_2 = scrape_time.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# Stamp every row with the scrape timestamp (the same value for all rows).
staging_df['scrape_ts'] = time_string_2

In [None]:
# Collect the per-listing details returned by active_listing_parser, pausing
# a random interval after each call.
extradeetslist = list()
# Visit each distinct link once. Repeated links (e.g. several 'not found'
# placeholders) would otherwise be parsed again, and because the later merge
# joins on this URL, every repeat would multiply the matching rows.
for url in staging_df['listing_link'].unique():
    dict_ = active_listing_parser(url)
    # Record the source URL so the details can be joined back to the listing.
    dict_['url'] = url
    extradeetslist.append(dict_)
    # Absolute value of a standard-normal draw, times 10, used as seconds.
    random_number = np.abs(np.random.normal())*10
    time.sleep(random_number)

In [None]:
# Turn the collected detail dicts into a table and attach them to the
# listing rows by matching listing_link against url.
extradeetsdf = pd.DataFrame(data=extradeetslist)
staging_df = pd.merge(
    staging_df,
    extradeetsdf,
    how='inner',
    left_on='listing_link',
    right_on='url',
)

In [None]:
# The url column duplicates listing_link after the merge; remove it in place.
staging_df.drop(columns='url', inplace=True)

In [None]:
# Write the listings table to a timestamped CSV; the row index is saved
# under the column name 'fid'.
listings_csv_path = '/home/ubuntu/projects/bat-2020-data/bat_listings_output_v2_%s.csv' % time_string
staging_df.to_csv(listings_csv_path, index_label='fid', index=True)

## Parse auction results

In [None]:
# Fetch the auction-results page. A timeout is passed because requests
# otherwise waits indefinitely on a server that stops responding.
results = requests.get('https://bringatrailer.com/auctions/results', headers=headers, timeout=30)

In [None]:
# Show the HTTP status code of the results request in the notebook output.
results.status_code

In [None]:
# Parse the results page. The parser is named explicitly (the same one used
# for the active-auctions page) so the tree does not depend on which optional
# parsers happen to be installed, and no "no parser specified" warning is
# emitted.
results_soup = BeautifulSoup(results.text, 'html.parser')

In [None]:
columns = ['title', 'result', 'link']
results_list = []

# Each section of the results page uses its own class names and keeps the
# title in a different tag:
# (container div class, tag holding the title, status div class)
result_layouts = (
    ('exceptional-item-extended', 'a', 'exceptional-item-status'),
    ('auctions-item-extended', 'span', 'auctions-item-status'),
)

for container_class, title_tag, status_class in result_layouts:
    for item in results_soup.find_all('div', class_=container_class):
        title = item.find(title_tag).text
        result = item.find('div', class_=status_class).text
        # The link is always the href of the first anchor in the container.
        link = item.find('a')['href']
        results_list.append((title, result, link))

In [None]:
# Assemble the (title, result, link) tuples into a table.
results_df = pd.DataFrame(data=results_list, columns=columns)

In [None]:
# Collect the per-auction details returned by closed_listing_parser, pausing
# a random interval after each call, then join them onto the results table.
extradeetslist = list()
# Visit each distinct link once. If the same link appears in more than one
# row, parsing it again is wasted work, and because the merge below joins on
# this URL, every repeat would multiply the matching rows.
for url in results_df['link'].unique():
    dict_ = closed_listing_parser(url)
    # Record the source URL so the details can be joined back to the result.
    dict_['url'] = url
    extradeetslist.append(dict_)
    # Absolute value of a standard-normal draw, times 10, used as seconds.
    random_number = np.abs(np.random.normal())*10
    time.sleep(random_number)
extradeetsdf = pd.DataFrame(extradeetslist)
results_df = results_df.merge(extradeetsdf, how='inner', left_on='link', right_on='url')
# The url column duplicates link after the merge.
del(results_df['url'])

In [None]:
# Take a fresh timestamp for the results file name, then write the results
# table (row index included) to a timestamped CSV.
time_string = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
results_csv_path = '/home/ubuntu/projects/bat-2020-data/bat_results_output_v2_%s.csv' % time_string
results_df.to_csv(results_csv_path, index=True)

In [None]:
# Progress message printed once both CSV files have been written.
print('Scrape complete - uploading to S3...')

### S3 Upload