In [1]:
# The purpose of this notebook is to clean the 'blind_assessment' column in order to create wordclouds
# of the top 25% rated coffees, and top 25% priced coffees.
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
from matplotlib import style
from wordcloud import WordCloud, STOPWORDS


In [2]:
coffee_words = pd.read_csv('clean_coffee.csv', index_col=False)
coffee_words = coffee_words.drop('Unnamed: 0', axis=1)
coffee_words.head()


Unnamed: 0,title,rating,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,roast_level,roaster,roaster_location,dollars_per_ounce
0,Bolivia Manantial Gesha,93,8.0,9,8,9,"Richly aromatic, floral-toned. Magnolia, cocoa...","Caranavi, Bolivia",30.00/12 ounces,Medium-Light,Red Rooster Coffee Roaster,"Floyd, Virginia",2.5
1,Ethiopia Gera Genji Challa,94,8.0,9,9,9,"Delicately aromatic, complex. Lilac, cocoa nib...","Agaro Gera, Jimma Zone, Oromia State, Ethiopia",28.00/12 ounces,Medium-Light,Mostra Coffee,"San Diego, California",2.333333
2,Yirgacheffe Mengesha Natural,94,8.0,9,9,9,"High-toned, fruit-driven. Boysenberry, pear, c...","Yirgacheffe growing region, southern Ethiopia",20.50/12 ounces,Medium-Light,Regent Coffee,"Glendale, California",1.708333
3,Tropical Summer Colombia La Sierra,93,8.0,9,8,9,"Fruit-driven, crisply chocolaty. Goji berry, d...","La Sierra, Cauca Department, Colombia",18.99/8 ounces,Medium-Light,Merge Coffee Company,"Harrisonburg, Virginia",2.37375
4,Tinamit Tolimán,93,8.0,9,9,9,"Deeply sweet-tart, chocolate-toned. Dark choco...","San Lucas Tolimán, Lake Atitlán growing region...",16.00/12 ounces,Medium-Light,El Gran Cafe,"Antigua, Guatemala",1.333333


In [3]:
# Let's start by removing punctuation and making everything lowercase. Also, I see the word 'toned' a lot, and it doesn't
# tell us much. We'll remove it.
coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace(',', '').str.lower()
coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace('.', '')
coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace('-', ' ')
coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace('toned', '')
coffee_words.head()


  coffee_words['blind_assessment'] = coffee_words['blind_assessment'].str.replace('.', '')


Unnamed: 0,title,rating,aftertaste,aroma,body,flavor,blind_assessment,coffee_origin,est_price,roast_level,roaster,roaster_location,dollars_per_ounce
0,Bolivia Manantial Gesha,93,8.0,9,8,9,richly aromatic floral magnolia cocoa nib can...,"Caranavi, Bolivia",30.00/12 ounces,Medium-Light,Red Rooster Coffee Roaster,"Floyd, Virginia",2.5
1,Ethiopia Gera Genji Challa,94,8.0,9,9,9,delicately aromatic complex lilac cocoa nib pi...,"Agaro Gera, Jimma Zone, Oromia State, Ethiopia",28.00/12 ounces,Medium-Light,Mostra Coffee,"San Diego, California",2.333333
2,Yirgacheffe Mengesha Natural,94,8.0,9,9,9,high fruit driven boysenberry pear cocoa nib ...,"Yirgacheffe growing region, southern Ethiopia",20.50/12 ounces,Medium-Light,Regent Coffee,"Glendale, California",1.708333
3,Tropical Summer Colombia La Sierra,93,8.0,9,8,9,fruit driven crisply chocolaty goji berry drie...,"La Sierra, Cauca Department, Colombia",18.99/8 ounces,Medium-Light,Merge Coffee Company,"Harrisonburg, Virginia",2.37375
4,Tinamit Tolimán,93,8.0,9,9,9,deeply sweet tart chocolate dark chocolate po...,"San Lucas Tolimán, Lake Atitlán growing region...",16.00/12 ounces,Medium-Light,El Gran Cafe,"Antigua, Guatemala",1.333333


In [4]:
# I'm going to create two datasets: one for the top 25% rated coffees, and one for the top 25% priced coffees.
# In order to do that, I'll figure out where my 75th percentile is for each of those features.
coffee_words.describe()


Unnamed: 0,rating,aftertaste,aroma,body,flavor,dollars_per_ounce
count,3558.0,3558.0,3558.0,3558.0,3558.0,3558.0
mean,92.470489,8.070826,8.708544,8.439011,8.871557,1.905811
std,1.939513,0.57307,0.539359,0.557683,0.484306,1.424509
min,80.0,5.0,6.0,6.0,6.0,0.280938
25%,92.0,8.0,8.0,8.0,9.0,1.2375
50%,93.0,8.0,9.0,8.0,9.0,1.479167
75%,94.0,8.0,9.0,9.0,9.0,1.833333
max,97.0,10.0,10.0,10.0,10.0,9.9875


In [5]:
# So ratings should be above 94 and prices should be above 1.833333.
top_rated = coffee_words[coffee_words['rating'] > 94]
top_rated.describe()


Unnamed: 0,rating,aftertaste,aroma,body,flavor,dollars_per_ounce
count,390.0,390.0,390.0,390.0,390.0,390.0
mean,95.294872,8.894872,9.117949,8.928205,9.328205,3.286165
std,0.548636,0.338945,0.360571,0.286768,0.470163,2.346476
min,95.0,8.0,8.0,8.0,9.0,0.790833
25%,95.0,9.0,9.0,9.0,9.0,1.5625
50%,95.0,9.0,9.0,9.0,9.0,2.079167
75%,95.0,9.0,9.0,9.0,10.0,4.749219
max,97.0,10.0,10.0,10.0,10.0,9.922334


In [6]:
top_priced = coffee_words[coffee_words['dollars_per_ounce'] > 1.83333]
top_priced.describe()


Unnamed: 0,rating,aftertaste,aroma,body,flavor,dollars_per_ounce
count,911.0,911.0,911.0,911.0,911.0,911.0
mean,93.468716,8.230516,8.913282,8.654226,9.041712,3.607751
std,1.534532,0.521612,0.408897,0.49178,0.375653,1.959754
min,84.0,7.0,7.0,7.0,7.0,1.833333
25%,93.0,8.0,9.0,8.0,9.0,2.082917
50%,94.0,8.0,9.0,9.0,9.0,2.825
75%,94.0,9.0,9.0,9.0,9.0,4.682586
max,97.0,10.0,10.0,10.0,10.0,9.9875


In [7]:
# I'm going to put the stopwords in a list so I can use the list to remove them from my strings later.
stopwords = list(STOPWORDS)
print(stopwords)


['any', 'shall', 'http', 'these', "couldn't", "didn't", 'my', 'no', 'therefore', 'further', 'hers', 'this', 'the', 'it', 'only', 'same', "weren't", "let's", 'was', 'an', 'as', 'after', 'since', 'however', "what's", "hadn't", 'once', 'there', 'not', 'too', 'himself', 'what', 'and', 'yourself', 'just', "can't", 'all', 'yours', 'but', 'into', "you'll", "i'm", 'should', "doesn't", "they've", 'www', "you'd", 'me', "how's", 'be', 'by', 'have', 'both', 'ourselves', "she'll", 'below', 'before', 'whom', 'under', 'so', "we're", 'here', 'at', 'where', "shan't", "we've", 'with', "they'll", "there's", "isn't", 'during', 'down', "i'll", 'our', 'is', 'would', 'while', 'such', 'own', 'why', 'themselves', 'because', 'doing', 'against', 'their', 'been', 'does', 'how', 'yourselves', 'they', "he'd", 'other', "we'll", "you're", "wouldn't", 'myself', 'we', 'then', 'very', 'off', 'about', 'her', 'if', 'k', "when's", 'ever', 'who', 'hence', "it's", "where's", "shouldn't", 'having', 'can', "mustn't", 'did', "t

In [8]:
# Let's put all of the blind_assessment strings from the top rated coffees in a list, then we'll concatenate them into 
# one long string.
top_rated_assessments = top_rated['blind_assessment'].tolist()
print(len(top_rated_assessments))
top_rated_assessments


390


['intricate vibrantly floral citrusy lilac tangerine almond brittle sage wild honey in aroma and cup high  harmonious structure with elegantly juicy acidity; full creamy mouthfeel flavor saturated very long finish ',
 'massively floral juicy cocoa  distinct cocoa honeysuckle tangerine caramel cashew in aroma and cup deeply sweet savory edged in structure with deep vibrant acidity full syrupy mouthfeel the finish is sweet and rich carrying most notes from the cup far into the long ',
 'richly sweet tart floral and fruit driven mango cocoa nib ginger blossom lime zest cedar in aroma and cup balanced tart leaning structure with vibrant juicy acidity; lightly viscous silky mouthfeel long flavor saturated finish centered around fruit and florals',
 'high  chocolaty and richly aromatic malted chocolate black cherry cashew butter pink grapefruit toffee in aroma and cup brightly sweet structure with lively but restrained acidity; plush creamy mouthfeel the integrated finish carries over all th

In [9]:
# Here's our long string. Next we'll remove the stopwords.
top_rated_text = ""
for item in top_rated_assessments:
    top_rated_text += "".join(item) + ' '
    
print(top_rated_text)
    

intricate vibrantly floral citrusy lilac tangerine almond brittle sage wild honey in aroma and cup high  harmonious structure with elegantly juicy acidity; full creamy mouthfeel flavor saturated very long finish  massively floral juicy cocoa  distinct cocoa honeysuckle tangerine caramel cashew in aroma and cup deeply sweet savory edged in structure with deep vibrant acidity full syrupy mouthfeel the finish is sweet and rich carrying most notes from the cup far into the long  richly sweet tart floral and fruit driven mango cocoa nib ginger blossom lime zest cedar in aroma and cup balanced tart leaning structure with vibrant juicy acidity; lightly viscous silky mouthfeel long flavor saturated finish centered around fruit and florals high  chocolaty and richly aromatic malted chocolate black cherry cashew butter pink grapefruit toffee in aroma and cup brightly sweet structure with lively but restrained acidity; plush creamy mouthfeel the integrated finish carries over all the aroma and fl

In [10]:
# Ok, I see some punctuation I didn't remove at the top. Let's get those now, then remove stopwords.
top_rated_text = top_rated_text.replace(';', '')
top_rated_text = top_rated_text.replace(':', '')
top_rated_text = top_rated_text.replace('/', ' ')
top_rated_text = top_rated_text.replace('(', '')
top_rated_text = top_rated_text.replace(')', '')


In [11]:
top_rated_list = top_rated_text.split()

top_rated = [word for word in top_rated_list if word not in stopwords]

top_rated_string = ' '.join(top_rated)
top_rated_string


"intricate vibrantly floral citrusy lilac tangerine almond brittle sage wild honey aroma cup high harmonious structure elegantly juicy acidity full creamy mouthfeel flavor saturated long finish massively floral juicy cocoa distinct cocoa honeysuckle tangerine caramel cashew aroma cup deeply sweet savory edged structure deep vibrant acidity full syrupy mouthfeel finish sweet rich carrying notes cup far long richly sweet tart floral fruit driven mango cocoa nib ginger blossom lime zest cedar aroma cup balanced tart leaning structure vibrant juicy acidity lightly viscous silky mouthfeel long flavor saturated finish centered around fruit florals high chocolaty richly aromatic malted chocolate black cherry cashew butter pink grapefruit toffee aroma cup brightly sweet structure lively restrained acidity plush creamy mouthfeel integrated finish carries aroma flavor notes cup richly sweet floral fruit apricot star jasmine almond nougat lemon balm sandalwood aroma cup sweetly tart structure del

In [12]:
# I'm satisfied with that! Let's save it as a txt file so we can make a wordcloud in Tableau.
top_rated_list = top_rated_string.split()
top_rated_s = pd.Series(top_rated_list)
top_rated_s.to_csv('top_rated_text.csv')


In [13]:
# Now to do the same with the top priced coffees.
top_priced_assessments = top_priced['blind_assessment'].tolist()
print(len(top_priced_assessments))
top_priced_assessments


911


['richly aromatic floral  magnolia cocoa nib cane sugar red plum nutmeg in aroma and cup elegantly sweet structure with delicate bright acidity; silky vibrant mouthfeel flavor saturated finish centered around notes of magnolia and cocoa nib complicated by undertones of nutmeg ',
 'delicately aromatic complex lilac cocoa nib pink grapefruit zest shishito pepper apricot in aroma and cup sweetly tart with juicy bright acidity; plush syrupy smooth mouthfeel floral finish with cocoa undertones',
 'fruit driven crisply chocolaty goji berry dried plum baking chocolate amber narcissus in aroma and cup crisply sweet structure with balanced acidity; lightly satiny mouthfeel fruit  finish supported by notes of baking chocolate',
 'berry driven invitingly sweet tart dried mulberry cocoa nib pink grapefruit zest tea rose fresh cut oak in aroma and cup sweet  structure with brisk acidity; plush syrupy mouthfeel the long lingering finish consolidates to notes of cocoa  mulberry and oak',
 'crisply sw

In [15]:
# Here's our long string. Next we'll remove the stopwords.
top_priced_text = ""
for item in top_priced_assessments:
    top_priced_text += "".join(item) + ' '
    
# And while we'll here, let's remove the punctuation we missed at the beginning, then the stopwords.
top_priced_text = top_priced_text.replace(';', '')
top_priced_text = top_priced_text.replace(':', '')
top_priced_text = top_priced_text.replace('/', ' ')
top_priced_text = top_priced_text.replace('(', '')
top_priced_text = top_priced_text.replace(')', '')

top_priced_list = top_priced_text.split()

top_priced = [word for word in top_priced_list if word not in stopwords]

top_priced_string = ' '.join(top_priced)
top_priced_string


"richly aromatic floral magnolia cocoa nib cane sugar red plum nutmeg aroma cup elegantly sweet structure delicate bright acidity silky vibrant mouthfeel flavor saturated finish centered around notes magnolia cocoa nib complicated undertones nutmeg delicately aromatic complex lilac cocoa nib pink grapefruit zest shishito pepper apricot aroma cup sweetly tart juicy bright acidity plush syrupy smooth mouthfeel floral finish cocoa undertones fruit driven crisply chocolaty goji berry dried plum baking chocolate amber narcissus aroma cup crisply sweet structure balanced acidity lightly satiny mouthfeel fruit finish supported notes baking chocolate berry driven invitingly sweet tart dried mulberry cocoa nib pink grapefruit zest tea rose fresh cut oak aroma cup sweet structure brisk acidity plush syrupy mouthfeel long lingering finish consolidates notes cocoa mulberry oak crisply sweet tart richly nut pomegranate baking chocolate hazelnut lime zest cedar aroma cup sweet tart structure gently 

In [19]:
# Lovely!  Let's save it as a txt file so we can make a wordcloud in Tableau.
top_priced_list = top_priced_string.split()
top_priced_s = pd.Series(top_priced_list)
top_priced_s.to_csv('top_priced_text.csv')
