In [16]:
import os
from glob import glob

import feather
import pandas as pd

In [17]:
def create_df(search_str):
    """Load every JSON file matching a glob pattern into one de-duplicated DataFrame.

    Parameters
    ----------
    search_str : str
        Glob pattern (e.g. ``'data/reviews*.json'``) selecting the files to read.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching files, concatenated in sorted-path order, with
        fully identical rows dropped. The per-file index labels are kept as-is,
        so the resulting index may contain repeated labels.

    Raises
    ------
    ValueError
        If no file matches ``search_str``.
    """
    # glob() returns paths in arbitrary, filesystem-dependent order; sorting
    # makes the row order (and which copy of a duplicate is kept) reproducible.
    paths = sorted(glob(search_str))
    if not paths:
        # pd.concat([]) would also raise ValueError, but without mentioning the
        # pattern; keep the exception type and make the message actionable.
        raise ValueError('No files match pattern: {!r}'.format(search_str))
    return pd.concat([pd.read_json(path) for path in paths]).drop_duplicates()

In [18]:
# Merge all scraped beer-info JSON files into a single frame.
beer_info_pattern = 'data/beer_info*.json'
beer_df = create_df(beer_info_pattern)

In [19]:
# Merge all scraped review JSON files into a single frame.
reviews_pattern = 'data/reviews*.json'
reviews_df = create_df(reviews_pattern)

In [20]:
# Report how many rows each merged frame holds after de-duplication.
print('Beer count:', beer_df.shape[0])
print('Review count:', reviews_df.shape[0])

Beer count: 575
Review count: 65435


In [21]:
# Persist the merged frames as JSON, one object per row (orient='records').
# Create the output directory first so the export also succeeds on a fresh
# checkout where 'data-merged' does not exist yet.
os.makedirs('data-merged', exist_ok=True)
beer_df.to_json('data-merged/beer-info.json', orient='records')
reviews_df.to_json('data-merged/reviews.json', orient='records')

In [22]:
# abv: keep the text before '%', replace 'No ABV' with an empty string, convert.
abv_text = beer_df['abv'].str.split('%').str[0]
beer_df['abv'] = pd.to_numeric(abv_text.str.replace('No ABV', ''))

# ibu: keep the first space-separated token, replace 'No' with an empty string, convert.
ibu_text = beer_df['ibu'].str.split(' ').str[0]
beer_df['ibu'] = pd.to_numeric(ibu_text.str.replace('No', ''))

# rating: remove '(' and ')' characters, then convert.
rating_text = beer_df['rating'].str.replace('(', '')
beer_df['rating'] = pd.to_numeric(rating_text.str.replace(')', ''))

# date: the second space-separated token is what gets parsed as a datetime.
date_text = beer_df['date'].str.split(' ').str[1]
beer_df['date'] = pd.to_datetime(date_text)

# raters -> 'num ratings': first space-separated token with commas removed;
# the original text column is dropped once the numeric one exists.
raters_text = beer_df['raters'].str.split(' ').str[0]
beer_df['num ratings'] = pd.to_numeric(raters_text.str.replace(',', ''))
del beer_df['raters']

# Show the first two cleaned rows as the cell output.
beer_df.head(2)

Unnamed: 0,abv,brewery,date,description,ibu,id,name,rating,style,num ratings
0,9.99,Tanker Brewery,2015-11-05,Tanker and Mean Sardine Brewery from Portugal ...,90.0,1300529,Surf Wax DIPA,3.67,IPA - Imperial / Double,395
1,8.2,Brazos Valley Brewing,2015-08-01,,85.0,1182200,MindFlayer DIPA,3.78,IPA - Imperial / Double,405


In [23]:
# Remove the profile-URL prefix from each user_id value.
profile_urls = reviews_df['user_id']
reviews_df['user_id'] = profile_urls.str.replace('https://untappd.com/user/', '')

In [24]:
# Save the cleaned beer frame in feather format.
beer_feather_path = 'data-merged/beer-info.feather'
feather.write_dataframe(beer_df, beer_feather_path)

In [25]:
# Save the reviews frame in feather format.
reviews_feather_path = 'data-merged/reviews.feather'
feather.write_dataframe(reviews_df, reviews_feather_path)

In [26]:
# Preview the first two review rows (user_id has already had its URL prefix removed above).
reviews_df.head(2)

Unnamed: 0,beer_id,comment,rating,user_id
0,1300529,,3.75,Vasen_pakki
1,1300529,,3.5,Dave-Hill


In [27]:
# Summarise column dtypes and non-null counts of the reviews frame.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65435 entries, 0 to 4362
Data columns (total 4 columns):
beer_id    65435 non-null int64
comment    15917 non-null object
rating     65169 non-null float64
user_id    65435 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 3.7+ MB
