# EDA with our first model

In [1]:
import os
os.chdir('/Users/brettcastellanos/galvanize/craft_beer_ratings')
import pickle
import pandas as pd
import numpy as np
import src.nlp.nlp as nlp
from pprint import pprint
import matplotlib.pyplot as plt

In [2]:
def unpickle(filename: str):
    """Returns the unpickled object from the file.
    """
    with open(filename, 'rb') as p:
        item = pickle.load(p)
    return item

In [None]:
beer_df, reviews_df = nlp.load_data('data/beers.csv', 'data/reviews.csv')

reviews_df['text'] = [nlp.remove_bad_text(text) for text in reviews_df['text'].values]

In [11]:
W = unpickle('models/1-nmf/W.pkl')
NMF = unpickle('models/1-nmf/NMF.pkl')
TFIDF_Vectorizer = unpickle('models/1-nmf/TF-IDF-Vectorizer.pkl')
TFIDF = unpickle('models/1-nmf/TF-IDF.pkl')

## Find the keywords associated with each topic

Let's find the top 10 words associated with our top ten topics.

In [None]:
topic_vectors = NMF.components_
feature_names = TFIDF_Vectorizer.get_feature_names()

In [None]:
def get_top_15_topics_words(topic_vectors, feature_names):
    top_words = []
    for idx in range(15):
        top_words.append(get_top_words(topic_vectors[idx], feature_names))
    return top_words
    
def get_top_words(topic_vector, feature_names):
    top_words_idxs = np.argsort(topic_vector)[-1:-16:-1]
    top_words = [feature_names[idx] for idx in top_words_idxs]
    return top_words

In [None]:
top_15 = get_top_15_topics_words(topic_vectors, feature_names)

In [None]:
for topic in top_15:
    print(topic)

## Find the top 10 reviews associated with each topic

In [None]:
top_reviews_idx = np.argsort(W[:, 0])[-1:-11:-1]

In [None]:
reviews_df.iloc[top_reviews_idx]

In [None]:
def get_all_topics_top_ten(W, reviews_df):
    topics = []
    for idx in range(W.shape[1]):
        topics.append(get_top_ten_reviews(W, idx, reviews_df))
    return topics

def get_top_ten_reviews(W, topic_idx, reviews_df):
    top_reviews_idx = np.argsort(W[:, topic_idx])[-1:-11:-1]
    return reviews_df.iloc[top_reviews_idx]

In [None]:
top_ten_reviews_by_topic = get_all_topics_top_ten(W, reviews_df)

In [None]:
for topic in top_ten_reviews_by_topic:
    print(topic[['beer', 'overall']])
    print()

## Hierarchical Clustering
Here I'll take the mean review vector for each beer and cluster the beers.

The first step is to calculate the topic vector for each beer. To do this we will produce a topic vector for the beer based on a TF-IDF vector for text comprising all the reviews associated with that beer.

In [3]:
beer_df, reviews_df = nlp.load_data('data/raw/beers.csv', 'data/raw/reviews.csv', 50)


In [13]:
two_hearted = beer_df[beer_df['beer'] == 'two hearted ale']['review_id'].iloc[0]

IndexError: single positional indexer is out-of-bounds

In [None]:
two_hearted_reviews = reviews_df[reviews_df['brew_beer']==two_hearted]
two_hearted_reviews.head(2)

In [None]:
two_hearted_texts = two_hearted_reviews['text'].values
two_hearted_texts = nlp.clean_documents(two_hearted_texts)

In [None]:
two_hearted_text = ' '.join(list(two_hearted_texts))

In [None]:
two_hearted_tfidf = TFIDF_Vectorizer.transform([two_hearted_text])

In [None]:
two_hearted_tfidf.toarray().shape

In [None]:
NMF.components_.shape

In [None]:
vector = two_hearted_tfidf.dot(NMF.components_.T)

In [None]:
vector.shape

In [None]:
vector

In [42]:
def get_beer_mega_review(b: str, r: pd.DataFrame):
    """Return a string of the reviews associated with the given beer.
    """
    r2 = r[r['brew_beer']==b]
    print("Hi")
    print(r2)
    texts = r2['cleaned_text'].values
    return ' '.join(list(texts))

def get_all_beer_tf_idf(b: list, r: pd.DataFrame, tfidf_vectorizer):
    """Return the tf_idf matrix for all the beers listed.
    """
    
    r2 = [get_beer_mega_review(b, r) for bb in b]
    print(r[0])
    return tfidf_vectorizer.transform(reviews)

In [38]:
beers = pd.read_csv('data/2-clean/beers_trunc.csv')
reviews = pd.read_csv('data/1-clean/clean_reviews.csv')

In [None]:
reviews[reviews['brew_beer']=='klosterbrauerei andechs andechser bergbock hell'].head(1)

In [43]:
brew_beers = list(beers['brew_beer'].sample(50).values)
brew_beers[0]

"pott's naturpark-brauerei / pott's brau und backhaus pott's landbier"

In [None]:
tf_idf = get_all_beer_tf_idf(beers, reviews, TFIDF_Vectorizer)

In [28]:
tf_idf

<7x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [19]:
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram

In [20]:
distxy = squareform(pdist(tf_idf.todense(), metric='cosine'))

In [21]:
link = linkage(distxy, method='complete')

  if np.all(y >= 0) and np.allclose(y, y.T):


ValueError: The condensed distance matrix must contain only finite values.

In [22]:
labels = beer_df['beer'].values + ' | ' + beer_df['style']

fig =plt.figure(figsize=(40,15))
dendro = dendrogram(
    link, color_threshold=1.5, leaf_font_size=24,
    labels=labels.values, # orientation='right'
)
plt.subplots_adjust(top=.99, bottom=0.5, left=0.05, right=0.99)
plt.savefig('figure3')
plt.show()

NameError: name 'link' is not defined

<Figure size 2880x1080 with 0 Axes>