## Experiment with pre-trained word vectors

In [1]:
from gensim.models import KeyedVectors

In [2]:
# Load the pre-trained fastText English word vectors, stored on disk in the
# word2vec text format (binary=False is the loader's default, spelled out here).
en_model = KeyedVectors.load_word2vec_format(
    fname='../data/raw/wiki.en.vec',
    binary=False,
)

In [27]:
# Query word whose nearest neighbours in the embedding space we want to see
find_similar_to = 'hoppy'

# similar_by_word returns (word, similarity) pairs; no topn is passed, so the
# library default applies (top 10)
for neighbour, similarity in en_model.similar_by_word(find_similar_to):
    print("Word: {0}, Similarity: {1:.2f}".format(neighbour, similarity))

Word: hoppybunny, Similarity: 0.62
Word: drinky, Similarity: 0.58
Word: hoppity, Similarity: 0.57
Word: hoppyland, Similarity: 0.55
Word: woppy, Similarity: 0.55
Word: malty, Similarity: 0.55
Word: cuddlesome, Similarity: 0.54
Word: nutty, Similarity: 0.54
Word: bunny, Similarity: 0.53
Word: chuggy, Similarity: 0.53


In [30]:
# Test words: vectors of `word_add` are added, vectors of `word_sub` are
# subtracted. Leave `word_sub` as None when nothing should be subtracted.
word_add = ['fresh', 'cut', 'grass']
word_sub = None

# Word vector addition and subtraction: list the vocabulary words most similar
# to the combined vector. `word_sub` is now actually passed through (it used to
# be commented out, so setting it had no effect); None is most_similar's
# default for `negative`, so the current result is unchanged.
for resultant_word in en_model.most_similar(
    positive=word_add,
    negative=word_sub,
):
    # each result is a (word, similarity) pair
    word, similarity = resultant_word
    print("Word : {0} , Similarity: {1:.2f}".format(word, similarity))

Word : chopped , Similarity: 0.62
Word : dried , Similarity: 0.62
Word : cutting , Similarity: 0.61
Word : peeled , Similarity: 0.61
Word : dirt/grass , Similarity: 0.60
Word : stubbled , Similarity: 0.59
Word : oniongrass , Similarity: 0.59
Word : saltgrass , Similarity: 0.59
Word : watered , Similarity: 0.59
Word : unsprouted , Similarity: 0.59


## Look at occurrence of descriptors in reviews

In [1]:
import pandas as pd
import sys; sys.path.append('..')
from beer import descriptors

In [2]:
# Load the interim RateBeer reviews table into a DataFrame
ratebeer_csv = '../data/interim/ratebeer.csv'
df = pd.read_csv(ratebeer_csv)

  interactivity=interactivity, compiler=compiler, result=result)


In [67]:
# Text of the first review (by position), used as a sample to inspect
review = df['review/text'].iat[0]

In [68]:
# Descriptors found in the sample review: whitespace-separated tokens of the
# lower-cased text that also occur in the descriptor vocabulary
set(descriptors.all_descriptors).intersection(review.lower().split())

{'caramel', 'cloudy', 'grapefruit', 'light', 'medium', 'orange', 'white'}

In [69]:
# Flag every review with 1 if it mentions at least one known descriptor, else 0.
# The descriptor vocabulary is converted to a set once, up front, instead of
# being rebuilt for every single review inside the lambda.
descriptor_vocab = set(descriptors.all_descriptors)
review_descriptors = df['review/text'].apply(
    # str() lets non-string cells (e.g. missing reviews) pass through .lower()
    lambda rev: 0 if descriptor_vocab.isdisjoint(str(rev).lower().split()) else 1
)
# NOTE(review): tokens come from a plain whitespace split, so a descriptor with
# punctuation attached (e.g. "caramel,") is not matched — confirm this is intended.

In [70]:
# Fraction of reviews containing at least one descriptor:
# number of flagged reviews over the number of non-null flags
n_flagged = review_descriptors.sum()
n_reviews = review_descriptors.count()
n_flagged / n_reviews

0.8050529330957269

## Experiment with descriptor/beer collaborative filtering

In [92]:
# Construct the training data for collaborative filtering: 'users' are queries
# (descriptors), 'items' are beers, and 'ratings' are the average overall
# ratings among all reviews of beer i that contain descriptor j.
#
# Build the descriptor vocabulary once rather than once per review.
descriptor_vocab = set(descriptors.all_descriptors)

# All descriptors found in each review (whitespace tokens of the lower-cased
# text); str() lets non-string cells through.
df['descriptors'] = df['review/text'].apply(
    lambda text: list(set(str(text).lower().split()) & descriptor_vocab)
)

# One row per (review, descriptor) pair. explode() replaces the previous
# row-wise pd.Series apply + stack + join, which produced the same rows much
# more slowly. Reviews with no descriptor keep a single row whose 'descriptor'
# is NaN; the groupby below drops those rows.
df_extended = df.assign(descriptor=df['descriptors']).explode('descriptor')

# 'review/overall' is split on '/' and the first part is used as the integer score
df_extended['score'] = df_extended['review/overall'].apply(
    lambda overall: int(overall.split('/')[0])
)

# Group by beer and descriptor, compute the average score
beer_desc_score = df_extended.groupby(['beer/beerId', 'descriptor'])['score'].mean()

# Persist the data
beer_desc_score.reset_index().to_csv('../data/interim/beer_desc_score.csv', index=False)

In [95]:
# next, train a simple out-of-the-box CF model on the data
from surprise import Dataset, Reader
from surprise import SVD
from surprise import evaluate, print_perf

# Lay the averaged scores out as (user, item, rating) = (descriptor, beer, score)
rating_columns = ['descriptor', 'beer/beerId', 'score']
ratings_df = beer_desc_score.reset_index()[rating_columns]

# Wrap the frame in a surprise Dataset; ratings are declared on a 1-20 scale
reader = Reader(rating_scale=(1, 20))
data = Dataset.load_from_df(ratings_df, reader)
# split into 5 folds for cross-validation
data.split(5)

# Evaluate SVD on the folds, reporting RMSE and MAE, then print the summary
# NOTE(review): evaluate / print_perf / Dataset.split look like surprise's
# legacy API (newer releases offer model_selection.cross_validate) — confirm
# against the installed surprise version before re-running.
algo = SVD()
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)

Evaluating RMSE, MAE of algorithm SVD.

------------
Fold 1
RMSE: 1.5890
MAE:  1.1212
------------
Fold 2
RMSE: 1.5945
MAE:  1.1252
------------
Fold 3
RMSE: 1.5949
MAE:  1.1239
------------
Fold 4
RMSE: 1.5909
MAE:  1.1241
------------
Fold 5
RMSE: 1.5923
MAE:  1.1223
------------
------------
Mean RMSE: 1.5923
Mean MAE : 1.1233
------------
------------
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
MAE     1.1212  1.1252  1.1239  1.1241  1.1223  1.1233  
RMSE    1.5890  1.5945  1.5949  1.5909  1.5923  1.5923  


In [96]:
from surprise import NormalPredictor
# Run NormalPredictor on the same `data` and metrics as the SVD run above,
# presumably as a baseline to compare the SVD errors against
algo = NormalPredictor()
baseline_measures = ['RMSE', 'MAE']
perf = evaluate(algo, data, measures=baseline_measures)
# tabulate per-fold and mean errors
print_perf(perf)

Evaluating RMSE, MAE of algorithm NormalPredictor.

------------
Fold 1
RMSE: 3.6319
MAE:  2.8514
------------
Fold 2
RMSE: 3.6368
MAE:  2.8547
------------
Fold 3
RMSE: 3.6406
MAE:  2.8571
------------
Fold 4
RMSE: 3.6351
MAE:  2.8549
------------
Fold 5
RMSE: 3.6394
MAE:  2.8568
------------
------------
Mean RMSE: 3.6368
Mean MAE : 2.8550
------------
------------
        Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    
MAE     2.8514  2.8547  2.8571  2.8549  2.8568  2.8550  
RMSE    3.6319  3.6368  3.6406  3.6351  3.6394  3.6368  


In [3]:
# Lower-case every review and join them with newlines into one big string;
# str() turns non-string cells into text so .lower() cannot fail
review_texts = df['review/text'].tolist()
bigstr = '\n'.join(str(text).lower() for text in review_texts)

In [4]:
import string
# Translation table that deletes every ASCII punctuation character
# (a None target in a str.translate table removes the character)
translator = str.maketrans(dict.fromkeys(string.punctuation))

In [5]:
# Strip punctuation from the combined review text using the deletion table
bigstr_nopunc = str.translate(bigstr, translator)

In [6]:
# Write the punctuation-free review text to the corpus file, truncating
# whatever the file held before
with open('../data/interim/reviews_raw.txt', 'w+') as corpus_file:
    corpus_file.write(bigstr_nopunc)

In [7]:
import pickle

In [8]:
# Load a second, pickled data set of reviews.
# NOTE: unpickling can execute arbitrary code, so only load this file if it
# comes from a trusted source.
# encoding='ISO-8859-1' controls how 8-bit strings in the pickle are decoded
# (presumably the file was written by Python 2 — confirm).
with open("../data/raw/beerdf.pandas", "rb") as pickled_file:
    other_reviews = pickle.load(pickled_file, encoding='ISO-8859-1')

In [9]:
# Normalise the second review set the same way as the first: lower-case each
# review, join with newlines, strip punctuation
otherbigstr = '\n'.join([str(s).lower() for s in other_reviews['review/text'].tolist()])
otherbigstr_nopunc = otherbigstr.translate(translator)

# Append to the corpus file. The first corpus was written with '\n'.join, which
# adds no trailing newline, so emit a separator first; otherwise the last
# review already in the file and the first review appended here would be
# merged onto one line.
# NOTE: re-running this cell appends the text again; regenerate the file from
# the first write if a clean corpus is needed.
with open('../data/interim/reviews_raw.txt', 'a') as f:
    f.write('\n')
    f.write(otherbigstr_nopunc)