## Load Config Data

Load in configuration data, which will dictate the behavior of the scraper.  The login information will be used to log into untappd.  The search_terms is only required if the url_file does not exist.  If it exists, the search scraping will not occur.  

**Sample Config File**

Open "untappd_sample.cfg" for a sample configuration file.  Add a username and password.  If you make a copy and name it untappd.cfg, git will ignore it and your password will not be checked in. 


In [None]:
import os
import json

config_path = 'untappd.cfg'

with open(config_path) as rdr:
    config = json.load(rdr)

### Create web driver using the scraper

In [None]:
import untappd_scraper
from untappd_scraper import ScraperType

browser = untappd_scraper.create_driver(config, headless=True)

### Identify Beer URLs to Scrape

If the url_file in the config exists, we'll use that.  Otherwise we'll use the search terms to begin scraping.

In [None]:
url_file = config['scraping']['url_file']

if not os.path.exists(url_file):
    urls = []
    
    ## Create search term scraper
    search_term_scraper = untappd_scraper.create_scraper(ScraperType.SEARCH, browser)
    
    for search_term in config['scraping']['search_terms']:
        urls.extend(search_term_scraper.scrape_search_term(search_term))
    
    urls = list(set(urls))
    untappd_scraper.write_pkl(url_file, urls)
    
else:
    urls = untappd_scraper.read_pkl(url_file)
    
print('URLs Found:', len(urls))

In [None]:
import json
import time
import uuid
import feather

import pandas as pd
from glob import glob

## Identify all existing urls, and remove them from our url list
df = pd.concat([feather.read_dataframe(file) for file in glob('../data/beer-info*.feather')])
df.head()

existing_ids = set([int(x) for x in df['id']])

print("Number of URLS before filter:", len(urls))
urls = [url for url in urls if int(url.split('/')[-1]) not in existing_ids]
print("Number of URLS after filter: ", len(urls))

beer_scraper = untappd_scraper.create_scraper(ScraperType.BEER, browser)

beer_results = []
review_results = []

for url in urls:
    
    beers, reviews = beer_scraper.scrape_beer(url)
    beer_results.append(beers)
    review_results.extend(reviews)
    
    print(f"{len(beer_results)}) {url} found {len(reviews)} reviews")
    ## Every 25 beers write out the beer info and reviews
    if len(beer_results) >= 50:
        print("Clearing")
        file_id = str(uuid.uuid4())
        
        # Write beer info
        feather.write_dataframe(untappd_scraper.create_beer_df(beer_results), f'../data/beer-info_{file_id}.feather')
        with open(f'../data/beer-info_{file_id}.json', orient='records') as wtr:
            json.dump(beer_results, wtr)

        # Write user reviews
        feather.write_dataframe(untappd_scraper.create_reviews_df(review_results), f'../data/reviews_{file_id}.feather')
        with open(f'../data/reviews_{file_id}.json', 'w') as wtr:
            json.dump(review_results, wtr)
        
        beer_results = []
        review_results = []
        
        time.sleep(60)
    else:
        time.sleep(10)

In [None]:
browser.quit()

In [1]:
from glob import glob
import json
import feather
import pandas as pd

In [7]:
def deduplicate_json(pattern, file_out):
    
    data = []
    files = glob(pattern)
    for file in files:
        with open(file) as rdr:
            data.extend(json.load(rdr))
    print(len(data)) 
    with open(file_out, 'w') as wtr:
        json.dump(data, wtr)



In [None]:
pattern = '../data/beer-info*.feather'
df = pd.concat([feather.read_dataframe(file) for file in glob(pattern)], sort=False)
df.head()

In [None]:
df['rating'] = pd.to_numeric(df['rating'])
df['id'] = pd.to_numeric(df['id'])

print(len(df))
df.drop_duplicates(['id', 'brewery', 'name'])
len(df)

In [None]:
file_out= '../data/beer-info-merged.feather'
feather.write_dataframe(df, file_out)


In [None]:
df.dtypes

In [None]:
deduplicate_json('../data/beer-info*.json', '../data/beer-info-merged.json')

In [8]:
deduplicate_json('../data/reviews*.json', '../data/reviews-merged.json')

731276


In [11]:
pattern = '../data/reviews*.feather'
df = pd.concat([feather.read_dataframe(file) for file in glob(pattern)], sort=False)
df.head()

df['beer_id'] = pd.to_numeric(df['beer_id'])
df['rating'] = pd.to_numeric(df['rating'])
print(len(df))
df = df.drop_duplicates()
print(len(df))

file_out= '../data/reviews-merged.feather'

730912
350122


In [12]:
feather.write_dataframe(df, file_out)

In [13]:
df.head()

Unnamed: 0,beer_id,comment,rating,user_id,serving
0,1300529,,3.75,Vasen_pakki,
1,1300529,,3.5,Dave-Hill,
2,1300529,,3.75,jsapas,
3,1300529,,3.25,vanatyhi1,
4,1300529,BB 30/10/2016. Üpris kentsakas überrüübe,3.75,stennibal,


In [14]:
len(df)

350122