In [1]:
import pandas as pd
import numpy as np
import os
import json
import matplotlib.pyplot as plt

import settings

## Load & pre-process data

In [2]:
# Check-in export, read with `created_at` as the row index.
# NOTE(review): this is an absolute path, while the coordinate cache further
# down is read from '../data/' — confirm which location is intended.
userdata = '/data/userdata.csv'

data = pd.read_csv(userdata, index_col = 'created_at')

# timestamp == index: convert the `created_at` strings to datetimes
data.index = pd.to_datetime(data.index)

# Create beer_id / brewery_id columns from the last path segment of each URL.
# `n` is passed by keyword: pandas 2.0 made every `str.rsplit` argument other
# than `pat` keyword-only, so the positional form `rsplit('/', 1)` raises there.
data['beer_id'] = data['beer_url'].str.rsplit('/', n=1).str[-1]
data['brewery_id'] = data['brewery_url'].str.rsplit('/', n=1).str[-1]
data.head()

Unnamed: 0_level_0,beer_name,brewery_name,beer_type,beer_abv,beer_ibu,comment,venue_name,venue_city,venue_state,venue_country,...,beer_url,brewery_url,brewery_country,brewery_city,brewery_state,flavor_profiles,purchase_venue,serving_type,beer_id,brewery_id
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-09-02 17:55:23,Atom Smasher,Two Brothers Brewing Company,Märzen,7.7,22,,,,,,...,https://untappd.com/beer/13003,https://untappd.com/brewery/1298,United States,Warrenville,IL,,,,13003,1298
2013-09-04 22:35:41,Oktoberfest,Revolution Brewing Company,Märzen,5.7,25,Mmm,Revolution Brewing Brewpub,Chicago,IL,United States,...,https://untappd.com/beer/78774,https://untappd.com/brewery/2605,United States,Chicago,IL,,,,78774,2605
2013-09-05 16:33:30,Prairie Path Ale,Two Brothers Brewing Company,Golden Ale,5.1,25,"Gluten removed? Can't tell! Refreshing, not to...",,,,,...,https://untappd.com/beer/3142,https://untappd.com/brewery/1298,United States,Warrenville,IL,,,,3142,1298
2013-09-13 17:29:52,Danish Style Red Lager,Figueroa Mountain Brewing Co.,Lager - Vienna,5.5,23,,Figueroa Mountain Brewing Company,Santa Barbara,CA,United States,...,https://untappd.com/beer/39441,https://untappd.com/brewery/5800,United States,Buellton,CA,,,,39441,5800
2013-09-13 20:04:59,Robert Masterson & Ryan Reschan/Rip Current/St...,Stone Brewing,IPA - American,7.7,90,,Union Ale,Santa Barbara,CA,United States,...,https://untappd.com/beer/389713,https://untappd.com/brewery/1204,United States,Escondido,CA,,,,389713,1204


In [3]:
# Show the dtype of every column after loading and adding the two id columns.
data.dtypes

beer_name           object
brewery_name        object
beer_type           object
beer_abv           float64
beer_ibu             int64
comment             object
venue_name          object
venue_city          object
venue_state         object
venue_country       object
venue_lat          float64
venue_lng          float64
rating_score       float64
checkin_url         object
beer_url            object
brewery_url         object
brewery_country     object
brewery_city        object
brewery_state       object
flavor_profiles     object
purchase_venue      object
serving_type        object
beer_id             object
brewery_id          object
dtype: object

## Beer Map

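Create a map of breweries in which each marker is sized according to the number of unique beers checked in from that brewery; the marker popup also shows the total number of check-ins.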

In [4]:
# Aggregate the check-ins into one row per brewery (index: brewery_id).
grouped = data.groupby('brewery_id')

# Descriptive brewery fields: take the first value seen in each group.
brewery_data = grouped[['brewery_name', 'brewery_url', 'brewery_country', 'brewery_city',
       'brewery_state']].first()

# count = check-ins per brewery, unique = distinct beers per brewery.
brewery_data[['count','unique']] = grouped['beer_id'].describe()[['count','unique']]

# Keep only the part after the last underscore, e.g. 'brewery_name' -> 'name'
# ('count' and 'unique' have no underscore and are left as they are).
brewery_data = brewery_data.rename(columns={col : col.split('_')[-1] for col in brewery_data.columns})
brewery_data['rating_avg'] = grouped['rating_score'].mean()

# Blank out missing values in the text columns only. A frame-wide fillna('')
# would also put '' into `rating_avg` for breweries without any rating and
# turn that numeric column into an object column.
text_cols = ['name', 'url', 'country', 'city', 'state']
brewery_data[text_cols] = brewery_data[text_cols].fillna('')

# create address col: the non-empty parts of name/city/state/country, comma-separated
brewery_data['address'] = brewery_data[['name','city','state','country']].apply(lambda x: ', '.join([y for y in x if y]), axis=1)

In [5]:
# Fetch coordinates using geopy and Google
from geopy.geocoders import GoogleV3
from time import sleep
# `sleep` is used by `locate` to pause for one second after a failed request
# (original note: "Google API restricts to 1sec").

# The API key comes from the local `settings` module rather than being
# written into the notebook.
geolocator = GoogleV3(api_key=settings.API_KEY)

# JSON cache of already geocoded addresses: {address: [lat, long]}
coords_path = '../data/coords.json'

try:
    with open(coords_path, 'r') as f:
        coords_dict = json.load(f)
    print('Uploaded coords_dict from json')
except (EnvironmentError, ValueError):
    # EnvironmentError: the cache file is missing or unreadable.
    # ValueError: the file exists but is not valid JSON (json.JSONDecodeError
    # is a ValueError subclass), e.g. after an interrupted write. In both
    # cases start over with an empty cache instead of aborting the notebook.
    print('Creating coords_dict')
    coords_dict = {}


def locate(address, timeout=10, cache=None, geocoder=None):
    """Return the coordinates of ``address`` as ``pd.Series([lat, long])``.

    Lookup order:

    1. the cache (no request is made on a hit);
    2. the geocoder with the full address;
    3. the geocoder with the address minus its first comma-separated
       component.

    A successful lookup is stored in the cache under the *original* address.

    Parameters
    ----------
    address : str
        Comma-separated address string.
    timeout : int, optional
        Passed through to ``geocoder.geocode``.
    cache : dict, optional
        Mapping ``address -> [lat, long]``; updated in place. Defaults to the
        notebook-level ``coords_dict``.
    geocoder : object, optional
        Object with a ``geocode(query, timeout=...)`` method whose result
        exposes ``latitude`` / ``longitude`` or is ``None``. Defaults to the
        notebook-level ``geolocator``.

    Returns
    -------
    pd.Series
        ``[lat, long]``, or ``[nan, nan]`` when no attempt produced a result
        (a message is printed in that case).
    """
    # Resolve the defaults at call time so the notebook globals are used
    # unless the caller supplies replacements.
    if cache is None:
        cache = coords_dict
    if geocoder is None:
        geocoder = geolocator

    if address in cache:
        lat = cache[address][0]
        long = cache[address][1]
        return pd.Series([lat, long])

    # Second attempt: the address without its leading component.
    reduced = ', '.join(address.split(', ')[1:])

    for query in (address, reduced):
        if not query:
            # Single-component address: there is nothing left to retry with.
            continue
        try:
            location = geocoder.geocode(query, timeout=timeout)
        except Exception:
            # Request failed; back off for a second before the next attempt.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            sleep(1)
            continue
        if location is None:
            # No match for this query; this is not an error, so no back-off.
            continue

        lat = location.latitude
        long = location.longitude
        cache[address] = [lat, long]
        return pd.Series([lat, long])

    print('No coords for: ', address)
    return pd.Series([np.nan, np.nan])
    
# Geocode every brewery address; `locate` returns a two-element Series per
# address, which fills the new `lat` / `long` columns.
brewery_data[['lat','long']] = brewery_data['address'].apply(locate)

# write coords to json; the `with` block closes the file even if
# serialising or writing fails
with open(coords_path, 'w') as f:
    f.write(json.dumps(coords_dict))

Uploaded coords_dict from json


In [6]:
# List the columns of the per-brewery frame, including the new lat/long.
brewery_data.columns

Index(['name', 'url', 'country', 'city', 'state', 'count', 'unique',
       'rating_avg', 'address', 'lat', 'long'],
      dtype='object')

In [69]:
# Map breweries: one circle marker per brewery that has coordinates
import folium

LOCAL_COORDS = (34.4208, -119.6982)
m = folium.Map(location=LOCAL_COORDS, zoom_start=4, min_zoom=2,
               max_bounds=True, no_wrap=True)

for _brewery_id, row in brewery_data.iterrows():
    location = [row['lat'], row['long']]
    brewery = row['name']

    # "city, state", with the country appended for breweries outside the US
    place_parts = [part for part in row[['city','state']].values if part]
    address = ', '.join(place_parts)
    if row['country'] != 'United States':
        address = '{}, {}'.format(address, row['country'])

    unique = row['unique']
    count = row['count']
    # marker radius: square root of the unique-beer count, scaled by 5
    size = np.sqrt(unique)*5

    # breweries that could not be geocoded have NaN coordinates: skip them
    if np.isnan(location).any():
        continue

    text = '{}\n{}\nUnique: {}, Total: {}'.format(brewery, address, unique, count)
    popup = folium.Popup(text, parse_html=True)
    marker = folium.CircleMarker(
        location=location,
        radius=size,
        popup=popup,
        color='#FFD700',
        fill=True,
        fill_color='#FFD700',
        fill_opacity=0.5,
        weight=1,
    )
    marker.add_to(m)

display(m)

## Checkin Stats

In [72]:
# summary of the check-in data
# Note: len(<column>.unique()) counts a missing value as one distinct entry.
stats = {
    "total": len(data),
    "unique": len(data['beer_id'].unique()),
    "rating avg": data['rating_score'].mean(),
    "styles": len(data['beer_type'].unique()),
    "breweries": len(data['brewery_id'].unique()),
    "countries": len(data['brewery_country'].unique()),
}

def beer_stats(stats):
    """Print a summary of check-in statistics.

    Prints the title ``Checkin Stats`` followed by one ``key: value`` line
    per entry of ``stats``, in the mapping's iteration order.

    Parameters
    ----------
    stats : dict
        Maps a label to a number. Integer values (Python ``int`` as well as
        NumPy integer scalars such as ``np.int64``) are printed as whole
        numbers; every other value is printed with two decimals.
    """
    print("Checkin Stats")
    for key, value in stats.items():
        # isinstance + np.integer: counts coming out of NumPy/pandas are
        # integer scalars but not exactly `int`, and would otherwise be
        # shown as e.g. "788.00".
        if isinstance(value, (int, np.integer)):
            print("%s: %d" % (key, value))
        else:
            print("%s: %0.2f" % (key, value))
            
# Print the summary built in `stats` above.
beer_stats(stats)

Checkin Stats
total: 959
unique: 788
rating avg: 3.95
styles: 108
breweries: 223
countries: 15
