In [1]:
import os
import pandas as pd
import numpy as np
import string
from operator import itemgetter
from collections import Counter, OrderedDict

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

In [2]:
wine = pd.read_csv('../raw_data/winemag-data-130k-v2.csv')
wine_list = list(wine['description'])
#wine_list_normalized = [clean_description(w) for w in wine_list]
#wine_list_preprocessed = list(set(wine_list_normalized))

In [3]:
full_wine_reviews_list = [str(r) for r in wine_list]
full_wine_corpus = ' '.join(wine_list)
wine_sentences_tokenized = sent_tokenize(full_wine_corpus)

In [4]:
stop_words = set(stopwords.words('english')) 

punctuation_table = str.maketrans({key: None for key in string.punctuation})
sno = SnowballStemmer('english')

def normalize_text(raw_text):
    try:
        word_list = word_tokenize(raw_text)
        normalized_sentence = []
        for w in word_list:
            try:
                w = str(w)
                lower_case_word = str.lower(w)
                stemmed_word = sno.stem(lower_case_word)
                no_punctuation = stemmed_word.translate(punctuation_table)
                if len(no_punctuation) > 1 and no_punctuation not in stop_words:
                    normalized_sentence.append(no_punctuation)
            except:
                continue
        return normalized_sentence
    except:
        return ''

# normalized_sentences = []
# for s in sentences_tokenized:
#     normalized_text = normalize_text(s)
#     normalized_sentences.append(normalized_text)

normalized_wine_sentences = []
for s in wine_sentences_tokenized:
    normalized_text = normalize_text(s)
    normalized_wine_sentences.append(normalized_text)

In [5]:
import pickle

In [7]:
with open("../raw_data/g_wine_preprocessing.txt", "rb") as fp:   # Unpickling
    wine_sentences = pickle.load(fp)

In [8]:
wine_word2vec_model = Word2Vec.load("../raw_data/food_word2vec_model.bin")

In [9]:
descriptor_mapping = pd.read_csv('../raw_data/descriptor_mapping.csv',delimiter=';', encoding='latin1').set_index('raw descriptor')

In [10]:
descriptor_mapping

Unnamed: 0_level_0,level_3,level_2,level_1,type
raw descriptor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abras,abrasive,high_tannin,tannin,bitter
acacia,acacia,flowery,flower,fat
acacia_flower,acacia,flowery,flower,fat
aciddriven,acid_driven,high_acid,acid,acid
aggress,aggressive,high_acid,acid,acid
...,...,...,...,...
zest,zest,citrus_fruit,fruit,sweet
zesti,zesty,high_acid,acid,acid
zing,zingy,high_acid,acid,acid
zingi,zingy,high_acid,acid,acid


In [13]:
wine_trigram_model = Phraser.load('../raw_data/wine_trigrams.pkl')

In [14]:
variety_mapping = {'Shiraz': 'Syrah', 'Pinot Gris': 'Pinot Grigio', 'Pinot Grigio/Gris': 'Pinot Grigio', 
                   'Garnacha, Grenache': 'Grenache', 'Garnacha': 'Grenache', 'CarmenÃ¨re': 'Carmenere',
                    'GrÃ¼ner Veltliner': 'Gruner Veltliner', 'TorrontÃ©s': 'Torrontes', 
                   'RhÃ´ne-style Red Blend': 'Rhone-style Red Blend', 'AlbariÃ±o': 'Albarino',
                  'GewÃ¼rztraminer': 'Gewurztraminer', 'RhÃ´ne-style White Blend': 'Rhone-style White Blend',
                  'SpÃƒÂ¤tburgunder, Pinot Noir': 'Pinot Noir', 'Sauvignon, Sauvignon Blanc': 'Sauvignon Blanc',
                  'Pinot Nero, Pinot Noir': 'Pinot Noir', 'Malbec-Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                  'Meritage, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Garnacha, Grenache': 'Grenache',
                   'FumÃ© Blanc': 'Sauvignon Blanc', 'Cabernet Sauvignon-Cabernet Franc, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Cabernet Sauvignon-Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet Blend, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Malbec-Cabernet Sauvignon, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Merlot-Cabernet Franc, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Merlot-Cabernet Sauvignon, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet Franc-Merlot, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Merlot-Malbec, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend',
                   'Cabernet, Bordeaux-style Red Blend': 'Bordeaux-style Red Blend', 'Primitivo, Zinfandel': 'Zinfandel',
                   'AragonÃªs, Tempranillo': 'Aragonez, Tempranillo'
                  }

def consolidate_varieties(variety_name):
    if variety_name in variety_mapping:
        return variety_mapping[variety_name]
    else:
        return variety_name

wine_df_clean = wine.copy()
wine_df_clean['variety'] = wine_df_clean['variety'].apply(consolidate_varieties)

In [15]:
order_of_geographies = ['region_1', 'province', 'country']

# replace any nan values in the geography columns with the word none
def replace_nan_for_zero(value):
    if str(value) == '0' or str(value) == 'nan':
        return 'none'
    else:
        return value

for o in order_of_geographies:
    wine_df_clean[o] = wine_df_clean[o].apply(replace_nan_for_zero)

wine_df_clean.loc[:, order_of_geographies].fillna('none', inplace=True)

In [16]:
wine.shape

(129971, 14)

In [17]:
variety_geo = wine_df_clean.groupby(['variety', 'country', 'province', 'region_1']).size().reset_index().rename(columns={0:'count'})
variety_geo_sliced = variety_geo.loc[variety_geo['count'] > 1]

vgeos_df = pd.DataFrame(variety_geo_sliced, columns=['variety', 'country', 'province', 'region_1', 'count']) 
vgeos_df.to_csv('varieties_all_geos.csv')

In [18]:
wine_df_clean.head()

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,none,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Grigio,Rainstorm
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [19]:
wine_df_clean.replace('none', '' , inplace=True)

In [20]:
wine_df_clean['geo_normalized'] = wine_df_clean['region_1'].astype(str) + ', '+ wine_df_clean['province'].astype(str)+ ', '+ wine_df_clean['country'].astype(str)

In [23]:
wine_df_merged = wine_df_clean.copy()
wine_df_merged.drop(columns=["taster_name",'taster_twitter_handle'])

Unnamed: 0.1,Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,title,variety,winery,geo_normalized
0,0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,"Etna, Sicily & Sardinia, Italy"
1,1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,", Douro, Portugal"
2,2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Grigio,Rainstorm,"Willamette Valley, Oregon, US"
3,3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,"Lake Michigan Shore, Michigan, US"
4,4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,"Willamette Valley, Oregon, US"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
129966,129966,Germany,Notes of honeysuckle and cantaloupe sweeten th...,Brauneberger Juffer-Sonnenuhr Spätlese,90,28.0,Mosel,,,Dr. H. Thanisch (Erben Müller-Burggraef) 2013 ...,Riesling,Dr. H. Thanisch (Erben Müller-Burggraef),", Mosel, Germany"
129967,129967,US,Citation is given as much as a decade of bottl...,,90,75.0,Oregon,Oregon,Oregon Other,Citation 2004 Pinot Noir (Oregon),Pinot Noir,Citation,"Oregon, Oregon, US"
129968,129968,France,Well-drained gravel soil gives this wine its c...,Kritt,90,30.0,Alsace,Alsace,,Domaine Gresser 2013 Kritt Gewurztraminer (Als...,Gewürztraminer,Domaine Gresser,"Alsace, Alsace, France"
129969,129969,France,"A dry style of Pinot Gris, this is crisp with ...",,90,32.0,Alsace,Alsace,,Domaine Marcel Deiss 2012 Pinot Gris (Alsace),Pinot Grigio,Domaine Marcel Deiss,"Alsace, Alsace, France"


In [24]:
# variety_geo_df = pd.read_csv('../raw_data/varieties_all_geos_normalized.csv', index_col=0)
# wine_df_merged = pd.merge(left=wine_df_clean, right=variety_geo_df, left_on=['variety', 'country', 'province', 'region_1', 'region_2'],
#                          right_on=['Variety', 'Country', 'Province', 'Region', 'Subregion'])
# wine_df_merged.drop(['Unnamed: 0', 'Country', 'taster_name', 'taster_twitter_handle',
#                      'Region', 'Subregion', 'count'], 
#                     axis=1, inplace=True)
# wine_df_merged

In [25]:
variety_geo_df.Variety.unique()

NameError: name 'variety_geo_df' is not defined

In [26]:
variety_geo_df.geo_normalized.unique()

NameError: name 'variety_geo_df' is not defined

In [27]:
wine_df_merged.geo_normalized.unique()

array(['Etna, Sicily & Sardinia, Italy', ', Douro, Portugal',
       'Willamette Valley, Oregon, US', ...,
       'Del Veneto, Northeastern Italy, Italy',
       'Bardolino Superiore, Veneto, Italy',
       'Paestum, Southern Italy, Italy'], dtype=object)

In [28]:
variety_geos = wine_df_merged.groupby(['variety', 'geo_normalized']).size()
at_least_n_types = variety_geos[variety_geos > 30].reset_index()
# at_least_n_types.head(10)
wine_df_merged_filtered = pd.merge(wine_df_merged, at_least_n_types, left_on=['variety', 'geo_normalized'], right_on=['variety', 'geo_normalized'])
wine_df_merged_filtered = wine_df_merged_filtered[['title', 'variety', 'geo_normalized', 'description']]
print(wine_df_merged_filtered.shape)

(92228, 4)


In [29]:
wine_reviews = list(wine_df_merged['description'])

core_tastes = ['aroma', 'weight', 'sweet', 'acid', 'salt', 'piquant', 'fat', 'bitter']

descriptor_mappings = dict()
for c in core_tastes:
    descriptor_mapping_filtered=descriptor_mapping[descriptor_mapping['type']==c]
    descriptor_mappings[c] = descriptor_mapping_filtered                                                   
    

def return_descriptor_from_mapping(descriptor_mapping, word, core_tastes):
    if word in list(descriptor_mapping.index):
        descriptor_to_return = descriptor_mapping['level_3'][word]
        return descriptor_to_return
    else:
        return None

review_descriptors = []
for review in wine_reviews:
    taste_descriptors = []
    normalized_review = normalize_text(review)
    phrased_review = wine_trigram_model[normalized_review]
    #print(normalized_review)
    
    for c in core_tastes:                                                      
        descriptors_only = [return_descriptor_from_mapping(descriptor_mappings[c], word, c) for word in phrased_review]
        no_nones = [str(d).strip() for d in descriptors_only if d is not None]
        descriptorized_review = ' '.join(no_nones)
        taste_descriptors.append(descriptorized_review)
    review_descriptors.append(taste_descriptors)


In [30]:
review_descriptors

[['herb sage',
  '',
  'tropical_fruit fruit dry apple citrus dry',
  'brisk',
  '',
  '',
  '',
  ''],
 ['', '', 'ripe fruit juicy berry fruit', 'fresh', '', '', '', 'smooth firm'],
 ['green', '', 'lime rind pineapple', 'tart snappy crisp', '', '', '', ''],
 ['',
  'opulent',
  'pineapple rind lemon_pith orange guava mango',
  '',
  '',
  '',
  '',
  'astringent'],
 ['earth herb', 'hearty', '', '', '', '', '', 'rough tannin rustic'],
 ['tomato herb dark',
  '',
  'blackberry raspberry plum fruit',
  'fresh',
  '',
  'spice',
  '',
  'grabby'],
 ['savory herb',
  '',
  'candy berry',
  'bright fresh',
  '',
  'white_pepper',
  '',
  'soft'],
 ['restrained', '', 'dry', '', '', 'spice', '', 'firm'],
 ['savory thyme',
  'elegant',
  'dry peach off-dry fruit',
  'brisk fresh',
  '',
  '',
  '',
  ''],
 ['', '', 'apple pear fruit dry', 'fresh crisp', '', 'spice', 'depth', ''],
 ['oak coffee', '', 'plum chocolate', '', '', '', '', 'soft supple'],
 ['minerality', '', 'dry citrus', 'crisp', ''

In [31]:
taste_descriptors = []
taste_vectors = []

for n, taste in enumerate(core_tastes):
    print(taste)
    taste_words = [r[n] for r in review_descriptors]
    
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit(taste_words)
    dict_of_tfidf_weightings = dict(zip(X.get_feature_names(), X.idf_))
        
    wine_review_descriptors = []
    wine_review_vectors = []
    
    for d in taste_words:
        descriptor_count = 0
        weighted_review_terms = []
        terms = d.split(' ')
        for term in terms:
            if term in dict_of_tfidf_weightings.keys():
                tfidf_weighting = dict_of_tfidf_weightings[term]
                try:
                    word_vector = wine_word2vec_model.wv.get_vector(term).reshape(1, 300)
                    weighted_word_vector = tfidf_weighting * word_vector
                    weighted_review_terms.append(weighted_word_vector)
                    descriptor_count += 1
                except:
                    continue
            else:
                continue
        try:
            review_vector = sum(weighted_review_terms)/len(weighted_review_terms)
            review_vector = review_vector[0]
        except:
            review_vector = np.nan
#         terms_and_vec = [terms, review_vector]
        wine_review_vectors.append(review_vector)
        wine_review_descriptors.append(terms)
    
    taste_vectors.append(wine_review_vectors)
    taste_descriptors.append(wine_review_descriptors)
    

taste_vectors_t = list(map(list, zip(*taste_vectors)))
taste_descriptors_t = list(map(list, zip(*taste_descriptors)))

review_vecs_df = pd.DataFrame(taste_vectors_t, columns=core_tastes)

columns_taste_descriptors = [a + '_descriptors' for a in core_tastes]
review_descriptors_df = pd.DataFrame(taste_descriptors_t, columns=columns_taste_descriptors)

wine_df_vecs = pd.concat([wine_df_merged, review_descriptors_df, review_vecs_df], axis=1)
wine_df_vecs.shape

aroma
weight
sweet
acid
salt
piquant
fat
bitter


(129971, 31)

In [32]:
wine_df_vecs['geo_normalized']

0            Etna, Sicily & Sardinia, Italy
1                         , Douro, Portugal
2             Willamette Valley, Oregon, US
3         Lake Michigan Shore, Michigan, US
4             Willamette Valley, Oregon, US
                        ...                
129966                     , Mosel, Germany
129967                   Oregon, Oregon, US
129968               Alsace, Alsace, France
129969               Alsace, Alsace, France
129970               Alsace, Alsace, France
Name: geo_normalized, Length: 129971, dtype: object

In [33]:
# pull the average embedding for the wine attribute across all wines. 
avg_taste_vecs = dict()
for t in core_tastes:
    # look at the average embedding for a taste, across all wines that have descriptors for that taste 
    review_arrays = wine_df_vecs[t].dropna()
    average_taste_vec = np.average(review_arrays)
    avg_taste_vecs[t] = average_taste_vec

In [34]:
normalized_geos = list(set(zip(wine_df_vecs['variety'], wine_df_vecs['geo_normalized'])))

def subset_wine_vectors(list_of_varieties, wine_attribute):
    wine_variety_vectors = []
    for v in list_of_varieties:

        one_var_only = wine_df_vecs.loc[(wine_df_vecs['variety'] == v[0]) & 
                                                (wine_df_vecs['geo_normalized'] == v[1])]
        if len(list(one_var_only.index)) < 1 or str(v[1][-1]) == '0':
            continue
        else:
            taste_vecs = list(one_var_only[wine_attribute])
            taste_vecs = [avg_taste_vecs[wine_attribute] if 'numpy' not in str(type(x)) else x for x in taste_vecs]
            average_variety_vec = np.average(taste_vecs, axis=0)
            
            descriptor_colname = wine_attribute + '_descriptors'
            all_descriptors = [i[0] for i in list(one_var_only[descriptor_colname])]
            word_freqs = Counter(all_descriptors)
            most_common_words = word_freqs.most_common(50)
            top_n_words = [(i[0], "{:.2f}".format(i[1]/len(taste_vecs))) for i in most_common_words]
            top_n_words = [i for i in top_n_words if len(i[0])>2]
            wine_variety_vector = [v, average_variety_vec, top_n_words]
                
            wine_variety_vectors.append(wine_variety_vector)
            
    return wine_variety_vectors


def pca_wine_variety(list_of_varieties, wine_attribute, pca=True):
    wine_var_vectors = subset_wine_vectors(normalized_geos, wine_attribute)
    
    wine_varieties = [str(w[0]).replace('(', '').replace(')', '').replace("'", '').replace('"', '') for w in wine_var_vectors]
    wine_var_vec = [w[1] for w in wine_var_vectors]
    if pca:
        pca = PCA(1)
        wine_var_vec = pca.fit_transform(wine_var_vec)
        wine_var_vec = pd.DataFrame(wine_var_vec, index=wine_varieties)
    else:
        wine_var_vec = pd.Series(wine_var_vec, index=wine_varieties)
    wine_var_vec.sort_index(inplace=True)
    
    wine_descriptors = pd.DataFrame([w[2] for w in wine_var_vectors], index=wine_varieties)
    wine_descriptors = pd.melt(wine_descriptors.reset_index(), id_vars='index')
    wine_descriptors.sort_index(inplace=True)
    
    return wine_var_vec, wine_descriptors

taste_dataframes = []
# generate the dataframe of aromas vectors as output, 
aroma_vec, aroma_descriptors = pca_wine_variety(normalized_geos, 'aroma', pca=False)
taste_dataframes.append(aroma_vec)

# generate the dataframes of nonaroma scalars
for tw in core_tastes[1:]:
    pca_w_dataframe, nonaroma_descriptors = pca_wine_variety(normalized_geos, tw, pca=True)
    taste_dataframes.append(pca_w_dataframe)
    
# combine all the dataframes created above into one 
all_nonaromas = pd.concat(taste_dataframes, axis=1)
all_nonaromas.columns = core_tastes

In [35]:
taste_dataframes[2]

Unnamed: 0,0
"Abouriou, Côtes du Marmandais, Southwest France, France",-8.063285
"Abouriou, Russian River Valley, California, US",-7.546587
"Agiorgitiko, , Arcadia, Greece",-23.464617
"Agiorgitiko, , Attica, Greece",-30.252591
"Agiorgitiko, , Corinth, Greece",-15.694755
...,...
"Zweigelt, Naches Heights, Washington, US",-18.204896
"Zweigelt, Seneca Lake, New York, US",-19.744323
"Zweigelt, Yakima Valley, Washington, US",-15.713985
"Çalkarası, , Aegean, Turkey",15.850864


In [36]:
# save the 50 top descriptors for each wine variety as a CSV file. We will us this later to dig deeper into our proposed wine recommendations.

aroma_descriptors_copy = aroma_descriptors.copy()
aroma_descriptors_copy.set_index('index', inplace=True)
aroma_descriptors_copy.dropna(inplace=True)

aroma_descriptors_copy = pd.DataFrame(aroma_descriptors_copy['value'].tolist(), index=aroma_descriptors_copy.index)
aroma_descriptors_copy.columns = ['descriptors', 'relative_frequency']
aroma_descriptors_copy.to_csv('wine_variety_descriptors.csv')

In [37]:
def normalize(df, cols_to_normalize):
    for feature_name in cols_to_normalize:
        print(feature_name)
        max_value = df[feature_name].max()
        min_value = df[feature_name].min()
        df[feature_name] = df[feature_name].apply(lambda x: (x- min_value)/(max_value-min_value))
#         (df[feature_name] - min_value) / (max_value - min_value)
    return df

all_nonaromas_normalized = normalize(all_nonaromas, cols_to_normalize=core_tastes[1:])
all_nonaromas_normalized.to_csv('wine_aromas_nonaromas.csv')



weight
sweet
acid
salt
piquant
fat
bitter
