## Load Config Data

Load in configuration data, which will dictate the behavior of the scraper.  The login information will be used to log into untappd.  The search_terms is only required if the url_file does not exist.  If it exists, the search scraping will not occur.  

**Sample Config File**

Open "untappd_sample.cfg" for a sample configuration file.  Add a username and password.  If you make a copy and name it untappd.cfg, git will ignore it and your password will not be checked in. 


In [1]:
import os
import json

config_path = 'untappd_AF.cfg'

with open(config_path) as rdr:
    config = json.load(rdr)

### Create web driver using the scraper

In [2]:
import untappd_scraper
from untappd_scraper import ScraperType

browser = untappd_scraper.create_driver(config, headless=True)

### Identify Beer URLs to Scrape

If the url_file in the config exists, we'll use that.  Otherwise we'll use the search terms to begin scraping.

In [None]:
import json
import time
import uuid
import feather

import pandas as pd
from glob import glob

url_file = config['scraping']['url_file']

urls = untappd_scraper.read_pkl(url_file)
    
print('URLs Found:', len(urls))

## Identify all existing urls, and remove them from our url list
df = pd.concat([feather.read_dataframe(file) for file in glob('../data/aslin-beer-info*.feather')])
df.head()

existing_ids = set([int(x) for x in df['id']])

print("Number of URLS before filter:", len(urls))
urls = [url for url in urls if int(url.split('/')[-1]) not in existing_ids]
print("Number of URLS after filter: ", len(urls))

beer_scraper = untappd_scraper.create_scraper(ScraperType.BEER, browser)

beer_results = []
review_results = []

for url in urls:
    
    beers, reviews = beer_scraper.scrape_beer(url)
    beer_results.append(beers)
    review_results.extend(reviews)
    
    print(f"{len(beer_results)}) {url} found {len(reviews)} reviews")
    ## Every 25 beers write out the beer info and reviews
    if len(beer_results) >= 50:
        print("Clearing")
        file_id = str(uuid.uuid4())
        
        # Write beer info
        feather.write_dataframe(untappd_scraper.create_beer_df(beer_results), f'../data/aslin-beer-info_{file_id}.feather')
        with open(f'../data/aslin-beer-info_{file_id}.json', 'w') as wtr:
            json.dump(beer_results, wtr)

        # Write user reviews
        feather.write_dataframe(untappd_scraper.create_reviews_df(review_results), f'../data/aslin-reviews_{file_id}.feather')
        with open(f'../data/aslin-reviews_{file_id}.json', 'w') as wtr:
            json.dump(review_results, wtr)
        
        beer_results = []
        review_results = []
        
        time.sleep(60)
    else:
        time.sleep(10)
        
print("Finishing")
file_id = str(uuid.uuid4())

# Write beer info
feather.write_dataframe(untappd_scraper.create_beer_df(beer_results), f'../data/aslin-beer-info_{file_id}.feather')
with open(f'../data/aslin-beer-info_{file_id}.json', 'w') as wtr:
    json.dump(beer_results, wtr)

# Write user reviews
feather.write_dataframe(untappd_scraper.create_reviews_df(review_results), f'../data/aslin-reviews_{file_id}.feather')
with open(f'../data/aslin-reviews_{file_id}.json', 'w') as wtr:
    json.dump(review_results, wtr)

beer_results = []
review_results = []

In [12]:
print("Finishing")
file_id = str(uuid.uuid4())

# Write beer info
feather.write_dataframe(untappd_scraper.create_beer_df(beer_results), f'../data/aslin-beer-info_{file_id}.feather')
with open(f'../data/aslin-beer-info_{file_id}.json', 'w') as wtr:
    json.dump(beer_results, wtr)

# Write user reviews
feather.write_dataframe(untappd_scraper.create_reviews_df(review_results), f'../data/aslin-reviews_{file_id}.feather')
with open(f'../data/aslin-reviews_{file_id}.json', 'w') as wtr:
    json.dump(review_results, wtr)

beer_results = []
review_results = []

Finishing


In [34]:
browser.quit()

In [35]:
from glob import glob
import json
import feather
import pandas as pd

In [36]:
def deduplicate_json(pattern, file_out):
    
    data = []
    files = glob(pattern)
    for file in files:
        with open(file) as rdr:
            data.extend(json.load(rdr))
    print(len(data)) 
    with open(file_out, 'w') as wtr:
        json.dump(data, wtr)



In [37]:
pattern = '../data/aslin-beer-info*.feather'
df = pd.concat([feather.read_dataframe(file) for file in glob(pattern)], sort=False)
df.head()

Unnamed: 0,abv,brewery,date,description,ibu,id,name,rating,style,num ratings
0,18.4,Aslin Beer Company,2018-01-04,Pinkies Up is an AVIPA that is clocking in at ...,,2460710,Pinkies Up,4.05,IPA - Triple,1333
1,8.5,Aslin Beer Company,2018-06-28,Our collaboration with Southern Grist Brewing!...,,2721897,Predictable Patterns,4.13,IPA - Imperial / Double,1256
2,15.0,Aslin Beer Company,2018-09-07,"Dreams is an Imperial stout with Almond, Cocon...",,2834533,Dreams,4.43,Stout - Imperial / Double,1259
3,15.0,Aslin Beer Company,2016-03-05,Imperial Bisc,,1454711,Buongiorno,4.25,Stout - Imperial / Double,1255
4,4.5,Aslin Beer Company,2017-08-16,Never A Bride is a Petite Saison fermented on ...,,2245866,Never A Bride,3.63,Saison / Farmhouse Ale,1152


In [38]:
df['rating'] = pd.to_numeric(df['rating'])
df['id'] = pd.to_numeric(df['id'])

print(len(df))
df = df.drop_duplicates(['id', 'brewery', 'name'])
len(df)

261


261

In [39]:
file_out= '../data/aslin-beer-info-merged.feather'
feather.write_dataframe(df, file_out)


In [40]:
df.dtypes

abv                   float64
brewery                object
date           datetime64[ns]
description            object
ibu                   float64
id                      int64
name                   object
rating                float64
style                  object
num ratings             int64
dtype: object

In [41]:
deduplicate_json('../data/aslin-beer-info*.json', '../data/aslin-beer-info-merged.json')

261


In [42]:
deduplicate_json('../data/aslin-reviews*.json', '../data/aslin-reviews-merged.json')

65345


In [43]:
pattern = '../data/aslin-reviews*.feather'
df = pd.concat([feather.read_dataframe(file) for file in glob(pattern)], sort=False)
df.head()

df['beer_id'] = pd.to_numeric(df['beer_id'])
df['rating'] = pd.to_numeric(df['rating'])
print(len(df))
df = df.drop_duplicates()
print(len(df))

file_out= '../data/aslin-reviews-merged.feather'

65345
64018


In [44]:
feather.write_dataframe(df, file_out)

In [45]:
df.head()

Unnamed: 0,beer_id,comment,rating,serving,user_id
0,2460710,,,Can,Zuber260
1,2460710,Just a tad old by looking at other check ins.....,,Can,Mikevt89
2,2460710,,350.0,,EastCoastJamie
3,2460710,,375.0,,leaston
4,2460710,,,,mat1622


In [46]:
len(df)

64018

In [48]:
beer_info = feather.read_dataframe('../data/aslin-beer-info-merged.feather')

In [49]:
len(beer_info)

261