In [1]:
# The purpose of this notebook is to clean the 'blind_assessment' column in order to create wordclouds
# of the top 25% rated coffees, and top 25% priced coffees.
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib import style
from wordcloud import WordCloud, STOPWORDS


In [2]:
coffee_words = pd.read_csv('clean_coffee.csv', index_col=False)
coffee_words = coffee_words.drop('Unnamed: 0', axis=1)
coffee_words.head()


Unnamed: 0,title,rating,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,roast_level,roaster,roaster_location,dollars_per_ounce
0,Bolivia Manantial Gesha,93,8.0,9,8,9,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",30.00/12 ounces,Medium-Light,Red Rooster Coffee Roaster,"Floyd, Virginia",2.5
1,Ethiopia Gera Genji Challa,94,8.0,9,9,9,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",28.00/12 ounces,Medium-Light,Mostra Coffee,"San Diego, California",2.333333
2,Yirgacheffe Mengesha Natural,94,8.0,9,9,9,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",20.50/12 ounces,Medium-Light,Regent Coffee,"Glendale, California",1.708333
3,Tropical Summer Colombia La Sierra,93,8.0,9,8,9,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",18.99/8 ounces,Medium-Light,Merge Coffee Company,"Harrisonburg, Virginia",2.37375
4,Tinamit Tolimán,93,8.0,9,9,9,"Deeply sweet-tart, chocolate-toned. Dark choco...","San Lucas Tolimán, Lake Atitlán growing region...",16.00/12 ounces,Medium-Light,El Gran Cafe,"Antigua, Guatemala",1.333333


In [3]:
# Let's start by removing punctuation and making everything lowercase.
coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace(',', '').str.lower()
coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace('.', '')
coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace('-', ' ')
coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace('toned', '')
coffee_words.head()


  coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace('.', '')


Unnamed: 0,title,rating,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,roast_level,roaster,roaster_location,dollars_per_ounce
0,Bolivia Manantial Gesha,93,8.0,9,8,9,richly aromatic floral magnolia cocoa nib can...,"Caranavi, Bolivia",30.00/12 ounces,Medium-Light,Red Rooster Coffee Roaster,"Floyd, Virginia",2.5
1,Ethiopia Gera Genji Challa,94,8.0,9,9,9,delicately aromatic complex lilac cocoa nib pi...,"Agaro Gera, Jimma Zone, Oromia State, Ethiopia",28.00/12 ounces,Medium-Light,Mostra Coffee,"San Diego, California",2.333333
2,Yirgacheffe Mengesha Natural,94,8.0,9,9,9,high fruit driven boysenberry pear cocoa nib ...,"Yirgacheffe growing region, southern Ethiopia",20.50/12 ounces,Medium-Light,Regent Coffee,"Glendale, California",1.708333
3,Tropical Summer Colombia La Sierra,93,8.0,9,8,9,fruit driven crisply chocolaty goji berry drie...,"La Sierra, Cauca Department, Colombia",18.99/8 ounces,Medium-Light,Merge Coffee Company,"Harrisonburg, Virginia",2.37375
4,Tinamit Tolimán,93,8.0,9,9,9,deeply sweet tart chocolate dark chocolate po...,"San Lucas Tolimán, Lake Atitlán growing region...",16.00/12 ounces,Medium-Light,El Gran Cafe,"Antigua, Guatemala",1.333333


In [4]:
# I'm going to create two datasets: one for the top 25% rated coffees, and one for the top 25% priced coffees.
# In order to do that, I'll figure out where my 75th percentile is for each of those features.
coffee_words.describe()


Unnamed: 0,rating,aftertaste,aroma,body,flavor,dollars_per_ounce
count,3558.0,3558.0,3558.0,3558.0,3558.0,3558.0
mean,92.470489,8.070826,8.708544,8.439011,8.871557,1.905811
std,1.939513,0.57307,0.539359,0.557683,0.484306,1.424509
min,80.0,5.0,6.0,6.0,6.0,0.280938
25%,92.0,8.0,8.0,8.0,9.0,1.2375
50%,93.0,8.0,9.0,8.0,9.0,1.479167
75%,94.0,8.0,9.0,9.0,9.0,1.833333
max,97.0,10.0,10.0,10.0,10.0,9.9875


In [5]:
# So ratings should be above 94 and prices should be above 1.833333.
top_rated = coffee_words[coffee_words['rating'] > 94]
top_rated.describe()


Unnamed: 0,rating,aftertaste,aroma,body,flavor,dollars_per_ounce
count,390.0,390.0,390.0,390.0,390.0,390.0
mean,95.294872,8.894872,9.117949,8.928205,9.328205,3.286165
std,0.548636,0.338945,0.360571,0.286768,0.470163,2.346476
min,95.0,8.0,8.0,8.0,9.0,0.790833
25%,95.0,9.0,9.0,9.0,9.0,1.5625
50%,95.0,9.0,9.0,9.0,9.0,2.079167
75%,95.0,9.0,9.0,9.0,10.0,4.749219
max,97.0,10.0,10.0,10.0,10.0,9.922334


In [6]:
top_priced = coffee_words[coffee_words['dollars_per_ounce'] > 1.83333]
top_priced.describe()


Unnamed: 0,rating,aftertaste,aroma,body,flavor,dollars_per_ounce
count,911.0,911.0,911.0,911.0,911.0,911.0
mean,93.468716,8.230516,8.913282,8.654226,9.041712,3.607751
std,1.534532,0.521612,0.408897,0.49178,0.375653,1.959754
min,84.0,7.0,7.0,7.0,7.0,1.833333
25%,93.0,8.0,9.0,8.0,9.0,2.082917
50%,94.0,8.0,9.0,9.0,9.0,2.825
75%,94.0,9.0,9.0,9.0,9.0,4.682586
max,97.0,10.0,10.0,10.0,10.0,9.9875


In [7]:
stopwords = list(STOPWORDS)
print(stopwords)


['for', 'be', "he'd", 'just', "we'll", 'your', 'would', 'such', 'the', 'their', 'does', 'at', 'into', "shouldn't", 'what', 'herself', 'i', 'since', 'below', 'same', 'yourself', 'any', 'while', "he's", 'theirs', 'until', 'myself', 'only', 'had', 'shall', 'own', 'r', 'he', "couldn't", 'an', 'off', "he'll", 'by', "we've", 'where', "i'll", "doesn't", "shan't", "that's", 'whom', "you're", 'most', "i've", 'which', "here's", 'himself', 'is', 'here', "she'll", "hasn't", "hadn't", 'also', "they've", 'are', 'has', 'as', "we're", "wasn't", 'not', 'me', 'we', 'they', "why's", "won't", 'been', "we'd", 'my', 'under', 'cannot', 'him', 'should', 'so', 'were', 'on', 'can', "isn't", 'before', 'more', 'her', 'some', 'hence', 'against', 'how', 'ought', 'was', 'if', "what's", "you'll", "haven't", 'when', "can't", 'our', 'did', 'yourselves', "i'd", 'am', 'over', 'have', 'itself', 'both', 'nor', 'again', "aren't", 'all', 'you', 'then', "you'd", 'she', 'those', 'but', 'no', 'ever', "they'd", 'a', "let's", 'ou

In [8]:
top_rated['blind_assessment'] = top_rated['blind_assessment'].str.split(' ')
# top_rated['blind_assessment'].head()
assessments = top_rated['blind_assessment'].tolist()
# text = []
# for title in top_rated['blind_assessment']:
#     text += top_rated['blind_assessment'][title]
assessments = assessments.flat()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_rated['blind_assessment'] = top_rated['blind_assessment'].str.split(' ')


[['intricate',
  'vibrantly',
  'floral',
  'citrusy',
  'lilac',
  'tangerine',
  'almond',
  'brittle',
  'sage',
  'wild',
  'honey',
  'in',
  'aroma',
  'and',
  'cup',
  'high',
  '',
  'harmonious',
  'structure',
  'with',
  'elegantly',
  'juicy',
  'acidity;',
  'full',
  'creamy',
  'mouthfeel',
  'flavor',
  'saturated',
  'very',
  'long',
  'finish',
  ''],
 ['massively',
  'floral',
  'juicy',
  'cocoa',
  '',
  'distinct',
  'cocoa',
  'honeysuckle',
  'tangerine',
  'caramel',
  'cashew',
  'in',
  'aroma',
  'and',
  'cup',
  'deeply',
  'sweet',
  'savory',
  'edged',
  'in',
  'structure',
  'with',
  'deep',
  'vibrant',
  'acidity',
  'full',
  'syrupy',
  'mouthfeel',
  'the',
  'finish',
  'is',
  'sweet',
  'and',
  'rich',
  'carrying',
  'most',
  'notes',
  'from',
  'the',
  'cup',
  'far',
  'into',
  'the',
  'long',
  ''],
 ['richly',
  'sweet',
  'tart',
  'floral',
  'and',
  'fruit',
  'driven',
  'mango',
  'cocoa',
  'nib',
  'ginger',
  'blossom',


In [13]:
text = top_rated['blind_assessment'][:]
stopwords = set(STOPWORDS)
wordcloud = WordCloud(background_color = 'white', max_words = 200, stopwords = stopwords)

wordcloud.generate(text)
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.savefig('wordcloud.png')

plt.show()

TypeError: expected string or bytes-like object