In [1]:
# Libreria de Analisis de Datos
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

In [2]:
# Load the BeerAdvocate ratings CSV produced by the previous step.
# The text columns are read explicitly as strings.
reviews_raw = pd.read_csv(
    './Data/beer_reviews_v2.csv',
    sep=',',
    dtype=dict.fromkeys(
        ['review_profilename', 'beer_name', 'beer_advocates_style', 'brewery_name'],
        str,
    ),
)

In [3]:
# Work on an independent copy holding only the columns used downstream
dfreviews = reviews_raw.loc[:, [
    'review_profilename', 'beer_name', 'brewery_name',
    'beer_advocates_style', 'beer_abv', 'abv_strength',
    'review_overall', 'review_aroma', 'review_appearance', 'review_palate',
    'review_taste', 'review_average',
]].copy()

# Report the size of the working frame
nRow = len(dfreviews.index)
nCol = len(dfreviews.columns)
print('Hay {0} filas y {1} columnas'.format(nRow, nCol))

# First look at the data
dfreviews.head()

Hay 1301173 filas y 12 columnas


Unnamed: 0,review_profilename,beer_name,brewery_name,beer_advocates_style,beer_abv,abv_strength,review_overall,review_aroma,review_appearance,review_palate,review_taste,review_average
0,zyzygy,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3
1,ypsifly,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.5,4.0,3.0,4.0,4.5,4.0
2,woemad,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3
3,wnhay,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.0,4.5,4.0,4.0,4.1
4,williamherbert,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,3.0,3.0,4.5,3.0,3.5,3.4


In [4]:
# Estructura original del fichero
dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1301173 entries, 0 to 1301172
Data columns (total 12 columns):
review_profilename      1301173 non-null object
beer_name               1301173 non-null object
brewery_name            1301173 non-null object
beer_advocates_style    1301173 non-null object
beer_abv                1301173 non-null float64
abv_strength            1301173 non-null int64
review_overall          1301173 non-null float64
review_aroma            1301173 non-null float64
review_appearance       1301173 non-null float64
review_palate           1301173 non-null float64
review_taste            1301173 non-null float64
review_average          1301173 non-null float64
dtypes: float64(7), int64(1), object(4)
memory usage: 119.1+ MB


In [5]:
# String columns are loaded with the pandas 'object' dtype; convert each of
# them to 'category' so the strings are stored as integer codes.
# Each column is converted on its own so it keeps its own set of categories.
object_columns = dfreviews.select_dtypes(['object']).columns
for column_name in object_columns:
    dfreviews[column_name] = dfreviews[column_name].astype('category')

dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1301173 entries, 0 to 1301172
Data columns (total 12 columns):
review_profilename      1301173 non-null category
beer_name               1301173 non-null category
brewery_name            1301173 non-null category
beer_advocates_style    1301173 non-null category
beer_abv                1301173 non-null float64
abv_strength            1301173 non-null int64
review_overall          1301173 non-null float64
review_aroma            1301173 non-null float64
review_appearance       1301173 non-null float64
review_palate           1301173 non-null float64
review_taste            1301173 non-null float64
review_average          1301173 non-null float64
dtypes: category(4), float64(7), int64(1)
memory usage: 90.0 MB


In [6]:
def get_ibu_range_avg(str_ibu_range):
    """Return the midpoint of an IBU range written as text.

    The lower and upper bounds may be separated either by an ASCII hyphen
    ('-') or by an en dash ('–'), e.g. '20-30' or '60–100'.

    Parameters
    ----------
    str_ibu_range : str
        Text of the form '<min><separator><max>'.

    Returns
    -------
    float
        (min + max) / 2.

    Raises
    ------
    ValueError
        If the text contains neither separator, or if a bound is not numeric.
        (Previously a missing separator surfaced as an UnboundLocalError.)
    """
    # Normalise the en dash to a hyphen so one split covers both spellings
    bounds = str_ibu_range.replace('–', '-').split('-')
    if len(bounds) < 2:
        raise ValueError(
            "IBU range {0!r} has no '-' or '–' separator".format(str_ibu_range))

    # Only the first two fields are used, as before
    minIBU = float(bounds[0])
    maxIBU = float(bounds[1])

    return (minIBU + maxIBU) / 2

In [7]:
get_ibu_range_avg('20-30')

25.0

In [8]:
# Load the beer-style reference table; all descriptive columns are read as strings
styles_raw = pd.read_csv(
    './Data/beer_styles_v1.csv',
    sep=',',
    dtype=dict.fromkeys(
        ['style_name', 'family_name', 'abv_range', 'ibu_range', 'style_URL'],
        str,
    ),
)

In [9]:
# Report the size of the styles table
nRowS = len(styles_raw.index)
nColS = len(styles_raw.columns)
print('Hay {0} filas y {1} columnas'.format(nRowS, nColS))

# First look at the data
styles_raw.head()

Hay 111 filas y 7 columnas


Unnamed: 0,abv_range,abv_strength,family_name,ibu_range,ibu_strength,style_URL,style_name
0,6.3-7.6%,3,Bocks,20-30,3,https://www.beeradvocate.com/beer/styles/32/,German Bock
1,6.6-7.9%,3,Bocks,17-27,3,https://www.beeradvocate.com/beer/styles/35/,German Doppelbock
2,7.0-14.0%,4,Bocks,25-35,3,https://www.beeradvocate.com/beer/styles/36/,German Eisbock
3,6.3-8.1%,3,Bocks,20-38,3,https://www.beeradvocate.com/beer/styles/33/,German Maibock
4,7.0-9.5%,3,Bocks,15-35,3,https://www.beeradvocate.com/beer/styles/92/,German Weizenbock


In [10]:
styles_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Data columns (total 7 columns):
abv_range       111 non-null object
abv_strength    111 non-null int64
family_name     111 non-null object
ibu_range       111 non-null object
ibu_strength    111 non-null int64
style_URL       111 non-null object
style_name      111 non-null object
dtypes: int64(2), object(5)
memory usage: 6.1+ KB


In [11]:
# String columns are loaded with the pandas 'object' dtype; convert each of
# them to 'category' so the strings are stored as integer codes.
# Each column is converted on its own so it keeps its own set of categories.
style_object_columns = styles_raw.select_dtypes(['object']).columns
for column_name in style_object_columns:
    styles_raw[column_name] = styles_raw[column_name].astype('category')

styles_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Data columns (total 7 columns):
abv_range       111 non-null category
abv_strength    111 non-null int64
family_name     111 non-null category
ibu_range       111 non-null category
ibu_strength    111 non-null int64
style_URL       111 non-null category
style_name      111 non-null category
dtypes: category(5), int64(2)
memory usage: 21.0 KB


### AÑADIMOS la columna nueva 'beer_style_ibu_avg'

In [12]:
# Smoke test: look up the IBU range of a couple of styles and average it.
bs_testing_list = ['American Barleywine', 'German Bock']
for bs in bs_testing_list:
    # .iloc[0] takes the first matching row by position. A plain [0] is a
    # label lookup on the filtered Series and only succeeds when the matching
    # row happens to carry index label 0.
    ibu_range = styles_raw.loc[styles_raw['style_name'] == bs, 'ibu_range'].iloc[0]
    print('Beer style: "{0}", IBU Range: {1}, Media(IBU Range): {2}'.format(
        bs, ibu_range, get_ibu_range_avg(ibu_range)))


Beer style: "American Barleywine", IBU Range: 60–100, Media(IBU Range): 80.0
Beer style: "German Bock", IBU Range: 20-30, Media(IBU Range): 25.0


In [13]:
# Resolve the average IBU once per distinct style, then broadcast it to every
# review with a dictionary map.
# .iloc[0] takes the first matching styles row by position; a plain [0] is a
# label lookup and fails unless that row carries index label 0.
# A style missing from styles_raw raises IndexError here instead of being
# silently skipped.
ibu_avg_by_style = {
    beer_style: get_ibu_range_avg(
        styles_raw.loc[styles_raw['style_name'] == beer_style, 'ibu_range'].iloc[0])
    for beer_style in dfreviews['beer_advocates_style'].unique()
}
dfreviews['beer_style_ibu_avg'] = (
    dfreviews['beer_advocates_style']
    .astype(object)          # map plain strings, not categorical codes
    .map(ibu_avg_by_style)
    .astype('float64')
)

In [14]:
dfreviews.head()

Unnamed: 0,review_profilename,beer_name,brewery_name,beer_advocates_style,beer_abv,abv_strength,review_overall,review_aroma,review_appearance,review_palate,review_taste,review_average,beer_style_ibu_avg
0,zyzygy,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3,80.0
1,ypsifly,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.5,4.0,3.0,4.0,4.5,4.0,80.0
2,woemad,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3,80.0
3,wnhay,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.0,4.5,4.0,4.0,4.1,80.0
4,williamherbert,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,3.0,3.0,4.5,3.0,3.5,3.4,80.0


In [15]:
dfreviews.tail()

Unnamed: 0,review_profilename,beer_name,brewery_name,beer_advocates_style,beer_abv,abv_strength,review_overall,review_aroma,review_appearance,review_palate,review_taste,review_average,beer_style_ibu_avg
1301168,BuckeyeNation,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,3.5,3.0,4.0,2.5,3.0,3.2,25.0
1301169,Bighuge,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,4.5,4.0,4.0,3.5,4.0,4.0,25.0
1301170,Beerwolf17,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,4.0,3.0,3.5,4.0,3.0,3.5,25.0
1301171,Atron67,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,3.5,2.5,3.0,4.0,4.0,3.4,25.0
1301172,ADR,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,4.0,4.5,4.0,3.0,3.5,3.8,25.0


In [16]:
dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1301173 entries, 0 to 1301172
Data columns (total 13 columns):
review_profilename      1301173 non-null category
beer_name               1301173 non-null category
brewery_name            1301173 non-null category
beer_advocates_style    1301173 non-null category
beer_abv                1301173 non-null float64
abv_strength            1301173 non-null int64
review_overall          1301173 non-null float64
review_aroma            1301173 non-null float64
review_appearance       1301173 non-null float64
review_palate           1301173 non-null float64
review_taste            1301173 non-null float64
review_average          1301173 non-null float64
beer_style_ibu_avg      1301173 non-null float64
dtypes: category(4), float64(8), int64(1)
memory usage: 100.0 MB


### AÑADIREMOS 2 COLUMNAS PARA LOS VALORES DE LAS 2 FORMULAS DE SCORING

#### COLUMNA 'beeradvocate_score'
def beeradvocate_score(ratingsdf):
    """Weighted rating that emphasises taste and aroma.

    ``ratingsdf`` is any object indexable by the column names
    'review_taste', 'review_aroma', 'review_palate', 'review_appearance'
    and 'review_overall'. The weights are 0.40, 0.24, 0.10, 0.06 and 0.20
    respectively (they add up to 1).
    """
    taste_part = ratingsdf['review_taste'] * 0.40
    aroma_part = ratingsdf['review_aroma'] * 0.24
    palate_part = ratingsdf['review_palate'] * 0.10
    appearance_part = ratingsdf['review_appearance'] * 0.06
    overall_part = ratingsdf['review_overall'] * 0.20
    # Summed left to right in this exact order to keep the floating-point result
    return taste_part + aroma_part + palate_part + appearance_part + overall_part

#### COLUMNA 'aroma_appearance_score'
def aroma_appearance_score_2(ratingsdf):
    """Weighted rating that emphasises aroma and appearance.

    ``ratingsdf`` is any object indexable by the column names
    'review_aroma', 'review_appearance', 'review_taste', 'review_palate'
    and 'review_overall'. The weights are 0.40, 0.24, 0.10, 0.06 and 0.20
    respectively (they add up to 1).
    """
    aroma_part = ratingsdf['review_aroma'] * 0.40
    appearance_part = ratingsdf['review_appearance'] * 0.24
    taste_part = ratingsdf['review_taste'] * 0.10
    palate_part = ratingsdf['review_palate'] * 0.06
    overall_part = ratingsdf['review_overall'] * 0.20
    # Summed left to right in this exact order to keep the floating-point result
    return aroma_part + appearance_part + taste_part + palate_part + overall_part

In [17]:
# Taste/aroma-weighted score, computed column-wise.
# Same terms, weights and summation order as the previous row-by-row
# apply(axis=1), without calling a Python lambda once per review.
dfreviews['beeradvocate_score'] = (
    (dfreviews['review_taste'] * 0.40)
    + (dfreviews['review_aroma'] * 0.24)
    + (dfreviews['review_palate'] * 0.10)
    + (dfreviews['review_appearance'] * 0.06)
    + (dfreviews['review_overall'] * 0.20)
)

In [18]:
dfreviews.head()

Unnamed: 0,review_profilename,beer_name,brewery_name,beer_advocates_style,beer_abv,abv_strength,review_overall,review_aroma,review_appearance,review_palate,review_taste,review_average,beer_style_ibu_avg,beeradvocate_score
0,zyzygy,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3,80.0,4.37
1,ypsifly,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.5,4.0,3.0,4.0,4.5,4.0,80.0,4.24
2,woemad,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3,80.0,4.37
3,wnhay,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.0,4.5,4.0,4.0,4.1,80.0,4.03
4,williamherbert,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,3.0,3.0,4.5,3.0,3.5,3.4,80.0,3.29


In [19]:
# Aroma/appearance-weighted score, computed column-wise.
# Same terms, weights and summation order as the previous row-by-row
# apply(axis=1), without calling a Python lambda once per review.
dfreviews['aroma_appearance_score'] = (
    (dfreviews['review_aroma'] * 0.40)
    + (dfreviews['review_appearance'] * 0.24)
    + (dfreviews['review_taste'] * 0.10)
    + (dfreviews['review_palate'] * 0.06)
    + (dfreviews['review_overall'] * 0.20)
)

In [20]:
dfreviews.tail()

Unnamed: 0,review_profilename,beer_name,brewery_name,beer_advocates_style,beer_abv,abv_strength,review_overall,review_aroma,review_appearance,review_palate,review_taste,review_average,beer_style_ibu_avg,beeradvocate_score,aroma_appearance_score
1301168,BuckeyeNation,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,3.5,3.0,4.0,2.5,3.0,3.2,25.0,3.11,3.31
1301169,Bighuge,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,4.5,4.0,4.0,3.5,4.0,4.0,25.0,4.05,4.07
1301170,Beerwolf17,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,4.0,3.0,3.5,4.0,3.0,3.5,25.0,3.33,3.38
1301171,Atron67,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,3.5,2.5,3.0,4.0,4.0,3.4,25.0,3.48,3.06
1301172,ADR,Point Spring Bock,Stevens Point Brewery,German Bock,5.2,2,4.0,4.5,4.0,3.0,3.5,3.8,25.0,3.82,4.09


In [21]:
dfreviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1301173 entries, 0 to 1301172
Data columns (total 15 columns):
review_profilename        1301173 non-null category
beer_name                 1301173 non-null category
brewery_name              1301173 non-null category
beer_advocates_style      1301173 non-null category
beer_abv                  1301173 non-null float64
abv_strength              1301173 non-null int64
review_overall            1301173 non-null float64
review_aroma              1301173 non-null float64
review_appearance         1301173 non-null float64
review_palate             1301173 non-null float64
review_taste              1301173 non-null float64
review_average            1301173 non-null float64
beer_style_ibu_avg        1301173 non-null float64
beeradvocate_score        1301173 non-null float64
aroma_appearance_score    1301173 non-null float64
dtypes: category(4), float64(10), int64(1)
memory usage: 119.8 MB


### CREAREMOS UNA NUEVA COLUMNA 'beer_id'

In [22]:
# TENEMOS 6504 CERVEZAS DISTINTAS
len(dfreviews['beer_name'].unique())

6504

In [23]:
# One row per distinct beer name, in order of first appearance in dfreviews
beerdf = pd.DataFrame({'beer_name': dfreviews['beer_name'].unique()})
beerdf.head()

Unnamed: 0,beer_name
0,Stone Old Guardian Barley Wine Style Ale 2006
1,Samuel Adams Brown Ale
2,Hennepin (Farmhouse Saison)
3,Double Bag
4,Berkshire Russian Imperial Stout


In [24]:
# Expose the positional index as the 'beer_id' column.
# Selecting only 'beer_name' first makes the cell safe to re-run: without it,
# a second execution would add a duplicate 'beer_id' column.
beerdf = beerdf[['beer_name']].reset_index().rename(columns={'index': 'beer_id'})
beerdf.head()

Unnamed: 0,beer_id,beer_name
0,0,Stone Old Guardian Barley Wine Style Ale 2006
1,1,Samuel Adams Brown Ale
2,2,Hennepin (Farmhouse Saison)
3,3,Double Bag
4,4,Berkshire Russian Imperial Stout


'beer_name' también es de tipo 'category' en el DataFrame beerdf, lo que hará más rápida la creación de la nueva columna 'beer_id' en el DataFrame dfreviews

In [101]:
# 6504 CERVEZAS DISTINTAS
beerdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6504 entries, 0 to 6503
Data columns (total 2 columns):
beer_id      6504 non-null int64
beer_name    6504 non-null category
dtypes: category(1), int64(1)
memory usage: 434.4 KB


In [100]:
import warnings

def get_beer_id(beer_name):
    """Return the ``beer_id`` registered for ``beer_name`` in ``beerdf``.

    Parameters
    ----------
    beer_name : str
        Beer name to look up in the 'beer_name' column of ``beerdf``.

    Returns
    -------
    int or str
        The 'beer_id' value of the first matching row, or the empty string
        '' when the name is not present (sentinel kept for compatibility).

    Notes
    -----
    Only the 'beer_name' column is compared. The earlier version compared
    the whole DataFrame, so the integer 'beer_id' column was also tested
    against a string, and its warning filter was installed only after that
    comparison had already run. It also returned the row position, which
    matches 'beer_id' only while ``beerdf`` keeps its original row order.
    """
    # Positions of the rows whose name matches
    match_positions = np.flatnonzero((beerdf['beer_name'] == beer_name).values)
    if match_positions.size == 0:
        return ""
    # Read the id stored in the table rather than assuming id == row position
    return beerdf['beer_id'].values[match_positions[0]]

In [102]:
get_beer_id('Double Bag')

3

In [103]:
beerdf[beerdf['beer_name'] == 'Double Bag']

Unnamed: 0,beer_id,beer_name
3,3,Double Bag


In [104]:
get_beer_id('Samuel Adams Brown Ale')

1

In [105]:
beerdf[beerdf['beer_name'] == 'Samuel Adams Brown Ale']

Unnamed: 0,beer_id,beer_name
1,1,Samuel Adams Brown Ale


In [106]:
get_beer_id('Unknown Beer Style')

''

In [107]:
dfreviewsAux = dfreviews.copy()

In [108]:
# Attach beer_id with one dictionary lookup per review instead of scanning
# beerdf once for every row of dfreviewsAux.
beer_id_by_name = dict(zip(beerdf['beer_name'], beerdf['beer_id']))
dfreviewsAux['beer_id'] = (
    dfreviewsAux['beer_name']
    .astype(object)          # map plain strings, not categorical codes
    .map(beer_id_by_name)
    .astype('int64')         # raises if any beer name had no id (NaN)
)

In [109]:
dfreviewsAux.head()

Unnamed: 0,review_profilename,beer_name,brewery_name,beer_advocates_style,beer_abv,abv_strength,review_overall,review_aroma,review_appearance,review_palate,review_taste,review_average,beer_style_ibu_avg,beeradvocate_score,aroma_appearance_score,beer_id
0,zyzygy,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3,80.0,4.37,4.28,0
1,ypsifly,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.5,4.0,3.0,4.0,4.5,4.0,80.0,4.24,3.91,0
2,woemad,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3,80.0,4.37,4.28,0
3,wnhay,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.0,4.5,4.0,4.0,4.1,80.0,4.03,4.12,0
4,williamherbert,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,3.0,3.0,4.5,3.0,3.5,3.4,80.0,3.29,3.41,0


In [119]:
dfreviewsAux[dfreviewsAux['beer_name'] == 'Double Bag']['beer_id'].unique()[0]

3

In [120]:
dfreviewsAux[dfreviewsAux['beer_name'] == 'Samuel Adams Brown Ale']['beer_id'].unique()[0]

1

'review_profilename' también es de tipo 'category' en el DataFrame userdf, lo que hará más rápida la creación de la nueva columna 'user_id' en el DataFrame dfreviews

In [121]:
dfreviewsAux.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1301173 entries, 0 to 1301172
Data columns (total 16 columns):
review_profilename        1301173 non-null category
beer_name                 1301173 non-null category
brewery_name              1301173 non-null category
beer_advocates_style      1301173 non-null category
beer_abv                  1301173 non-null float64
abv_strength              1301173 non-null int64
review_overall            1301173 non-null float64
review_aroma              1301173 non-null float64
review_appearance         1301173 non-null float64
review_palate             1301173 non-null float64
review_taste              1301173 non-null float64
review_average            1301173 non-null float64
beer_style_ibu_avg        1301173 non-null float64
beeradvocate_score        1301173 non-null float64
aroma_appearance_score    1301173 non-null float64
beer_id                   1301173 non-null int64
dtypes: category(4), float64(10), int64(2)
memory usage: 129.8 MB


In [122]:
dfreviews = dfreviewsAux.copy()

### CREAREMOS Y AÑADIREMOS UNA COLUMNA 'user_id'

In [138]:
# TENEMOS 30810 USUARIOS DISTINTOS
len(dfreviews['review_profilename'].unique())

30810

In [123]:
# One row per distinct reviewer profile name, in order of first appearance
userdf = pd.DataFrame({'review_profilename': dfreviews['review_profilename'].unique()})
userdf.head()

Unnamed: 0,review_profilename
0,zyzygy
1,ypsifly
2,woemad
3,wnhay
4,williamherbert


In [124]:
# Expose the positional index as the 'user_id' column.
# Selecting only 'review_profilename' first makes the cell safe to re-run:
# without it, a second execution would add a duplicate 'user_id' column.
userdf = userdf[['review_profilename']].reset_index().rename(columns={'index': 'user_id'})
userdf.head()

Unnamed: 0,user_id,review_profilename
0,0,zyzygy
1,1,ypsifly
2,2,woemad
3,3,wnhay
4,4,williamherbert


In [125]:
# 30810 USUARIOS DISTINTOS
userdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30810 entries, 0 to 30809
Data columns (total 2 columns):
user_id               30810 non-null int64
review_profilename    30810 non-null category
dtypes: category(1), int64(1)
memory usage: 1.8 MB


In [126]:
import warnings

def get_user_id(user_name):
    """Return the ``user_id`` registered for ``user_name`` in ``userdf``.

    Parameters
    ----------
    user_name : str
        Profile name to look up in the 'review_profilename' column of
        ``userdf``.

    Returns
    -------
    int or str
        The 'user_id' value of the first matching row, or the empty string
        '' when the name is not present (sentinel kept for compatibility).

    Notes
    -----
    Only the 'review_profilename' column is compared. The earlier version
    compared the whole DataFrame, so the integer 'user_id' column was also
    tested against a string, and its warning filter was installed only after
    that comparison had already run. It also returned the row position, which
    matches 'user_id' only while ``userdf`` keeps its original row order.
    """
    # Positions of the rows whose profile name matches
    match_positions = np.flatnonzero(
        (userdf['review_profilename'] == user_name).values)
    if match_positions.size == 0:
        return ""
    # Read the id stored in the table rather than assuming id == row position
    return userdf['user_id'].values[match_positions[0]]

In [128]:
get_user_id('ypsifly')

1

In [129]:
userdf[userdf['review_profilename'] == 'ypsifly']

Unnamed: 0,user_id,review_profilename
1,1,ypsifly


In [130]:
get_user_id('wnhay')

3

In [131]:
userdf[userdf['review_profilename'] == 'wnhay']

Unnamed: 0,user_id,review_profilename
3,3,wnhay


In [132]:
get_user_id('Unknown User Profile Name')

''

In [133]:
dfreviewsAux = dfreviews.copy()

In [134]:
# Attach user_id with one dictionary lookup per review instead of scanning
# userdf once for every row of dfreviewsAux.
user_id_by_name = dict(zip(userdf['review_profilename'], userdf['user_id']))
dfreviewsAux['user_id'] = (
    dfreviewsAux['review_profilename']
    .astype(object)          # map plain strings, not categorical codes
    .map(user_id_by_name)
    .astype('int64')         # raises if any profile name had no id (NaN)
)

In [135]:
dfreviewsAux[dfreviewsAux['review_profilename'] == 'ypsifly']['user_id'].unique()[0]

1

In [136]:
dfreviewsAux[dfreviewsAux['review_profilename'] == 'wnhay']['user_id'].unique()[0]

3

In [137]:
dfreviewsAux.head()

Unnamed: 0,review_profilename,beer_name,brewery_name,beer_advocates_style,beer_abv,abv_strength,review_overall,review_aroma,review_appearance,review_palate,review_taste,review_average,beer_style_ibu_avg,beeradvocate_score,aroma_appearance_score,beer_id,user_id
0,zyzygy,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3,80.0,4.37,4.28,0,0
1,ypsifly,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.5,4.0,3.0,4.0,4.5,4.0,80.0,4.24,3.91,0,1
2,woemad,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.5,4.0,4.5,4.5,4.3,80.0,4.37,4.28,0,2
3,wnhay,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,4.0,4.0,4.5,4.0,4.0,4.1,80.0,4.03,4.12,0,3
4,williamherbert,Stone Old Guardian Barley Wine Style Ale 2006,Stone Brewing Co.,American Barleywine,11.2,4,3.0,3.0,4.5,3.0,3.5,3.4,80.0,3.29,3.41,0,4


In [139]:
dfreviewsAux.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1301173 entries, 0 to 1301172
Data columns (total 17 columns):
review_profilename        1301173 non-null category
beer_name                 1301173 non-null category
brewery_name              1301173 non-null category
beer_advocates_style      1301173 non-null category
beer_abv                  1301173 non-null float64
abv_strength              1301173 non-null int64
review_overall            1301173 non-null float64
review_aroma              1301173 non-null float64
review_appearance         1301173 non-null float64
review_palate             1301173 non-null float64
review_taste              1301173 non-null float64
review_average            1301173 non-null float64
beer_style_ibu_avg        1301173 non-null float64
beeradvocate_score        1301173 non-null float64
aroma_appearance_score    1301173 non-null float64
beer_id                   1301173 non-null int64
user_id                   1301173 non-null int64
dtypes: category(4), f

In [140]:
dfreviews = dfreviewsAux.copy()

### GUARDAREMOS LOS NUEVOS DATASET PARA ENTRADA DEL RECOMENDADOR

In [146]:
# Persist the enriched ratings dataset that feeds the recommenders.
# `columns` fixes both which columns are written and their order.
dfreviews.to_csv(
    './Data/beer_reviews_v3.csv',
    sep=',',
    index=False,
    columns=[
        'user_id', 'review_profilename',
        'beer_name', 'beer_advocates_style', 'beer_id',
        'beer_abv', 'abv_strength', 'beer_style_ibu_avg', 'brewery_name',
        'review_aroma', 'review_appearance', 'review_palate', 'review_taste', 'review_overall',
        'review_average', 'beeradvocate_score', 'aroma_appearance_score',
    ],
)

In [151]:
# Persist the user_id <-> profile name lookup table
userdf.to_csv(
    './Data/users_v1.csv',
    sep=',',
    index=False,
    columns=['user_id', 'review_profilename'],
)

In [165]:
dfreviewsAux = dfreviews.copy()

In [166]:
# Order the reviews by beer and, within each beer, by 'review_overall'
dfreviewsAux = dfreviewsAux.sort_values(
    by=['beer_id', 'review_overall'],
    ascending=[True, True],
)

In [167]:
# Keep a single review row per beer (the first in the current order) so that
# the beer-level attributes can be read from it; all other rows are dropped.
is_repeated_beer = dfreviewsAux.duplicated(subset=['beer_name'], keep='first')
dfreviewsAux = dfreviewsAux[~is_repeated_beer]

In [168]:
# Keep only the attributes that describe the beer itself
dfreviewsAux = dfreviewsAux.loc[:, [
    'beer_id', 'beer_name', 'beer_advocates_style',
    'beer_abv', 'abv_strength', 'beer_style_ibu_avg', 'brewery_name',
]]

In [169]:
dfreviewsAux.head()

Unnamed: 0,beer_id,beer_name,beer_advocates_style,beer_abv,abv_strength,beer_style_ibu_avg,brewery_name
75,0,Stone Old Guardian Barley Wine Style Ale 2006,American Barleywine,11.2,4,80.0,Stone Brewing Co.
176,1,Samuel Adams Brown Ale,English Brown Ale,5.35,2,20.0,Boston Beer Company (Samuel Adams)
1686,2,Hennepin (Farmhouse Saison),Belgian Saison,7.7,3,29.0,Brewery Ommegang
2828,3,Double Bag,German Altbier,7.2,3,37.5,Long Trail Brewing Co.
3209,4,Berkshire Russian Imperial Stout,Russian Imperial Stout,8.5,3,70.0,Berkshire Brewing Company Inc.


In [170]:
dfreviewsAux.tail()

Unnamed: 0,beer_id,beer_name,beer_advocates_style,beer_abv,abv_strength,beer_style_ibu_avg,brewery_name
1301010,6499,MacTarnahan's Sling Shot Extra Pale Ale,American Pale Ale (APA),6.2,3,37.5,Portland Brewing Company
1301051,6500,Saxer JackFrost Winter Doppelbock,German Doppelbock,8.0,3,22.0,Saxer Brewing Company
1301090,6501,Nickel Brook Green Apple Pilsner,Fruit and Field Beer,4.0,2,25.0,Better Bitters Brewing Co.
1301102,6502,Mill Race Mild,English Dark Mild Ale,3.5,1,25.0,Grand River Brewing
1301154,6503,Point Spring Bock,German Bock,5.2,2,25.0,Stevens Point Brewery


In [171]:
dfreviewsAux.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6504 entries, 75 to 1301154
Data columns (total 7 columns):
beer_id                 6504 non-null int64
beer_name               6504 non-null category
beer_advocates_style    6504 non-null category
beer_abv                6504 non-null float64
abv_strength            6504 non-null int64
beer_style_ibu_avg      6504 non-null float64
brewery_name            6504 non-null category
dtypes: category(3), float64(2), int64(2)
memory usage: 757.4 KB


In [172]:
# Persist the beer catalogue (one row per beer)
dfreviewsAux.to_csv(
    './Data/beers_v1.csv',
    sep=',',
    index=False,
    columns=[
        'beer_id', 'beer_name', 'beer_advocates_style',
        'beer_abv', 'abv_strength', 'beer_style_ibu_avg', 'brewery_name',
    ],
)