In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = (30, 20)
plt.rcParams['font.family'] = 'sans-serif'

%matplotlib inline

# data from https://www.kaggle.com/nickhould/craft-cans (scraped from CraftCans.com in January 2017)
beers = pd.read_csv("beers.csv")
breweries = pd.read_csv("breweries.csv")

In [5]:
beers.head()

Unnamed: 0.1,Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces
0,0,0.05,,1436,Pub Beer,American Pale Lager,408,12.0
1,1,0.066,,2265,Devil's Cup,American Pale Ale (APA),177,12.0
2,2,0.071,,2264,Rise of the Phoenix,American IPA,177,12.0
3,3,0.09,,2263,Sinister,American Double / Imperial IPA,177,12.0
4,4,0.075,,2262,Sex and Candy,American IPA,177,12.0


In [6]:
breweries.head()

Unnamed: 0.1,Unnamed: 0,name,city,state
0,0,NorthGate Brewing,Minneapolis,MN
1,1,Against the Grain Brewery,Louisville,KY
2,2,Jack's Abby Craft Lagers,Framingham,MA
3,3,Mike Hess Brewing Company,San Diego,CA
4,4,Fort Point Beer Company,San Francisco,CA


In [7]:
# remove redundant column, rename columns for clarity
beers = beers.drop('Unnamed: 0',axis=1)
breweries = breweries.rename(columns = {'Unnamed: 0': 'brewery_id', 'name': 'brewery_name'})

# merge dataframes, remove NaN values, make ABV more readable
data = pd.merge(beers,breweries,on='brewery_id',how='inner')
data = data[np.isfinite(data['ibu'])]
data['abv'] = data['abv']*100

In [8]:
data.head()

Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces,brewery_name,city,state
14,6.1,60.0,1979,Bitter Bitch,American Pale Ale (APA),177,12.0,18th Street Brewery,Gary,IN
21,9.9,92.0,1036,Lower De Boom,American Barleywine,368,8.4,21st Amendment Brewery,San Francisco,CA
22,7.9,45.0,1024,Fireside Chat,Winter Warmer,368,12.0,21st Amendment Brewery,San Francisco,CA
24,4.4,42.0,876,Bitter American,American Pale Ale (APA),368,12.0,21st Amendment Brewery,San Francisco,CA
25,4.9,17.0,802,Hell or High Watermelon Wheat (2009),Fruit / Vegetable Beer,368,12.0,21st Amendment Brewery,San Francisco,CA


In [9]:
data['state'].unique()

array([' IN', ' CA', ' FL', ' MO', ' WA', ' CO', ' LA', ' KY', ' OR',
       ' AK', ' NC', ' MI', ' TX', ' AL', ' MA', ' AZ', ' MN', ' ME',
       ' VA', ' IL', ' TN', ' MT', ' WY', ' NE', ' NY', ' NJ', ' NV',
       ' OK', ' WI', ' OH', ' GA', ' RI', ' IA', ' ID', ' DC', ' KS',
       ' ND', ' VT', ' MD', ' WV', ' CT', ' PA', ' HI', ' NM', ' MS',
       ' AR', ' SC', ' DE', ' UT', ' NH'], dtype=object)

In [10]:
# remove space in front of state names
data['state'] = data['state'].str.slice(1,3)

In [11]:
ca = data[data['state'] == 'CA']
ca.shape

(135, 10)

In [12]:
ca['brewery_name'].value_counts()

21st Amendment Brewery               17
Golden Road Brewing                  14
Anderson Valley Brewing Company      14
Modern Times Beer                     8
TailGate Beer                         7
Mike Hess Brewing Company             6
Sierra Nevada Brewing Company         6
Manzanita Brewing Company             5
Ruhstaller Beer Company               5
Black Market Brewing Company          4
Ballast Point Brewing Company         4
Fort Point Beer Company               4
Central Coast Brewing Company         4
Saint Archer Brewery                  4
Hess Brewing Company                  3
Mission Brewery                       3
Devil's Canyon Brewery                3
The Dudes' Brewing Company            3
Firestone Walker Brewing Company      3
Headlands Brewing Company             3
Mavericks Beer Company                3
Hangar 24 Craft Brewery               2
Figueroa Mountain Brewing Company     2
Butcher's Brewing                     1
Mother Earth Brew Company             1


In [13]:
ca['city'].value_counts()

San Diego              35
San Francisco          22
Boonville              14
Los Angeles            14
Chico                   6
Temecula                5
Santee                  5
Sacramento              5
San Luis Obispo         4
Torrance                3
Paso Robles             3
Half Moon Bay           3
Belmont                 3
Mill Valley             3
Redlands                2
Santa Cruz              2
Buellton                2
Claremont               1
Vista                   1
South San Francisco     1
Carlsbad                1
Name: city, dtype: int64

In [14]:
from geopy.geocoders import Nominatim
geolocator = Nominatim()

# make it easier for geopy to find coordinates
ca['city-state'] = ca['city']+", "+ca['state']
cities = ca['city-state'].unique()

# create dictionaries of latitudes and longitudes
lats = dict(zip(cities, pd.Series(cities).apply(geolocator.geocode).apply(lambda x: x.latitude)))
longs = dict(zip(cities, pd.Series(cities).apply(geolocator.geocode).apply(lambda x: x.longitude)))

ca['latitude'] = ca['city-state'].map(lats)
ca['longitude'] = ca['city-state'].map(longs)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
ca.head()

Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces,brewery_name,city,state,city-state,latitude,longitude
21,9.9,92.0,1036,Lower De Boom,American Barleywine,368,8.4,21st Amendment Brewery,San Francisco,CA,"San Francisco, CA",37.779281,-122.419236
22,7.9,45.0,1024,Fireside Chat,Winter Warmer,368,12.0,21st Amendment Brewery,San Francisco,CA,"San Francisco, CA",37.779281,-122.419236
24,4.4,42.0,876,Bitter American,American Pale Ale (APA),368,12.0,21st Amendment Brewery,San Francisco,CA,"San Francisco, CA",37.779281,-122.419236
25,4.9,17.0,802,Hell or High Watermelon Wheat (2009),Fruit / Vegetable Beer,368,12.0,21st Amendment Brewery,San Francisco,CA,"San Francisco, CA",37.779281,-122.419236
26,4.9,17.0,801,Hell or High Watermelon Wheat (2009),Fruit / Vegetable Beer,368,12.0,21st Amendment Brewery,San Francisco,CA,"San Francisco, CA",37.779281,-122.419236


In [1]:
# Pixiedust experimentation

import pixiedust

Pixiedust database opened successfully


Downloading 'https://raw.githubusercontent.com/benhuds/craftbeer-data/master/ca-data.csv?token=AC-hiLTrSaF8UQGwKUvFCas6KJ3UycCDks5YwcS-wA%3D%3D' from https://raw.githubusercontent.com/benhuds/craftbeer-data/master/ca-data.csv?token=AC-hiLTrSaF8UQGwKUvFCas6KJ3UycCDks5YwcS-wA%3D%3D


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Creating pySpark DataFrame for 'https://raw.githubusercontent.com/benhuds/craftbeer-data/master/ca-data.csv?token=AC-hiLTrSaF8UQGwKUvFCas6KJ3UycCDks5YwcS-wA%3D%3D'. Please wait...
Successfully created pySpark DataFrame for 'https://raw.githubusercontent.com/benhuds/craftbeer-data/master/ca-data.csv?token=AC-hiLTrSaF8UQGwKUvFCas6KJ3UycCDks5YwcS-wA%3D%3D'


In [20]:
sqlContext = SQLContext(sc)

sparkA = sqlContext.createDataFrame(ca)

In [22]:
display(sparkA)

In [25]:
pd.set_option('display.max_rows', 500)
ca[ca['city'] == "South San Francisco"]

Unnamed: 0,abv,ibu,id,name,style,brewery_id,ounces,brewery_name,city,state,city-state,latitude,longitude
136,5.2,10.0,1172,Mo's Gose,Gose,461,16.0,Armstrong Brewing Company,South San Francisco,CA,"South San Francisco, CA",37.654949,-122.408125


In [43]:
import reviews

def test(r):
    return reviews.query_api(r['brewery_name'],r['city-state'])

bs = ca[['brewery_name','city-state','latitude','longitude']]
bs = bs.drop_duplicates()
bs['review'] = bs.apply(test,axis=1)
bs

Unnamed: 0,brewery_name,city-state,latitude,longitude,review
21,21st Amendment Brewery,"San Francisco, CA",37.779281,-122.419236,4.0
105,Anderson Valley Brewing Company,"Boonville, CA",39.009167,-123.366111,4.0
136,Armstrong Brewing Company,"South San Francisco, CA",37.654949,-122.408125,5.0
183,Ballast Point Brewing Company,"San Diego, CA",32.717421,-117.162771,5.0
276,Black Market Brewing Company,"Temecula, CA",33.494635,-117.147366,4.0
487,Butcher's Brewing,"Carlsbad, CA",33.158093,-117.350594,4.5
577,Central Coast Brewing Company,"San Luis Obispo, CA",35.282753,-120.659616,5.0
630,Claremont Craft Ales,"Claremont, CA",34.096676,-117.719778,4.5
722,Devil's Canyon Brewery,"Belmont, CA",37.520215,-122.275801,4.0
792,Figueroa Mountain Brewing Company,"Buellton, CA",34.613473,-120.193669,4.5


In [46]:
bs2 = sqlContext.createDataFrame(bs)
display(bs2)